In [None]:
import pandas as pd
import os
import gzip
import pickle
from openai import OpenAI
import torch.nn.functional as F
import torch
from tqdm import tqdm
import json
import numpy as np

In [None]:
# Shared OpenAI client used by get_openai_response() for every chat request.
# NOTE(review): constructed without arguments, so credentials presumably come
# from the environment — confirm before running on a fresh machine.
client = OpenAI()

In [None]:
def get_openai_response(prompt, model="gpt-4-turbo-preview", max_tokens=1000):
    """Send a single-turn prompt to the chat completions API and return the reply text.

    Parameters
    ----------
    prompt : str
        Text sent as the user message.
    model : str, optional
        Chat model name, e.g. "gpt-4-turbo-preview" (default) or
        "gpt-3.5-turbo-0125".
    max_tokens : int, optional
        Upper bound on the number of generated tokens (default 1000).

    Returns
    -------
    str
        Content of the first returned choice. An empty string is returned when
        the message carries no text content, so callers can concatenate or
        search the result without a None check.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": prompt},
        ],
        # temperature=0 asks for the least random sampling so that labelling
        # runs are as repeatable as the API allows.
        temperature=0,
        max_tokens=max_tokens,
        top_p=1,
    )

    text = response.choices[0].message.content
    return text if text is not None else ""

In [None]:
def process_thread(row, max_post_chars=1200, max_thread_chars=5000, min_post_chars=5):
    """Render one forum thread as the plain-text block inserted into the prompt.

    Parameters
    ----------
    row : mapping (dict or pandas Series)
        Must provide "index", "topic", "post" and "dates". "post" and "dates"
        are strings whose entries are separated by "<sep>".
    max_post_chars : int, optional
        Posts longer than this are cut and marked "<rest of post truncated>".
    max_thread_chars : int, optional
        The rendered thread is cut to this length and marked
        "<rest of thread truncated>".
    min_post_chars : int, optional
        Posts shorter than this are dropped entirely.

    Returns
    -------
    tuple
        (row["index"], thread_date, thread_text), where thread_date is the last
        entry of "dates" — the same date that appears (as YYYY-MM) in the
        header of thread_text. On any processing error the thread id is
        printed and (None, None, None) is returned.
    """
    try:
        dates = row["dates"].split("<sep>")
        posts = row["post"].split("<sep>")

        # Kept under its own name so the per-post loop cannot overwrite it; the
        # returned date is therefore always the one shown in the header.
        thread_date = dates[-1]

        header = (
            "Date: " + thread_date[:7] + "\n"
            + "Topic: " + row["topic"] + "\n"
            + "### Original post:\n"
        )

        # zip() stops at the shorter list, so a post without a matching date
        # entry is ignored.
        kept_posts = []
        for post, _post_date in zip(posts, dates):
            if len(post) > max_post_chars:
                kept_posts.append(post[:max_post_chars] + "<rest of post truncated>\n\n")
            elif len(post) >= min_post_chars:
                kept_posts.append(post + "\n\n")
            # anything shorter is treated as noise and skipped

        # The first kept post follows the "Original post" header; every later
        # one gets its own numbered reply header. Building it this way means
        # there is no trailing header to strip, so a thread with no usable
        # posts keeps its header intact.
        parts = [header]
        for position, post_text in enumerate(kept_posts):
            if position > 0:
                parts.append(f"### Reply {position}:\n")
            parts.append(post_text)
        thread = "".join(parts)

        if len(thread) > max_thread_chars:
            thread = thread[:max_thread_chars] + "<rest of thread truncated>\n"

        return row["index"], thread_date, thread

    except Exception:
        # Best-effort by design: malformed rows (e.g. NaN instead of text) are
        # reported and skipped. Exception (not a bare except) keeps Ctrl-C
        # working during long runs.
        try:
            thread_id = row["index"]
        except Exception:
            thread_id = "<unknown>"
        print("Error processing thread:" + str(thread_id))
        return None, None, None



In [None]:
# Set refresh_df to True to rebuild concatenated_threads.csv from the
# per-category pickles; leave it False to reuse the cached CSV.
refresh_df = False
# refresh_df = True

if refresh_df:
    categories = [
        "groupbuys",
        "hardware",
        "miners",
        "mining",
        "mining_support",
        # "pools",
    ]

    # Load every gzipped pickle, tag it with its category, and collect the
    # frames so they can be concatenated once (instead of growing `df` inside
    # the loop, starting from an empty frame).
    category_frames = []
    for cat in categories:
        # NOTE(security): pickle.load can execute arbitrary code contained in
        # the file — only load pickles produced by a trusted pipeline.
        with gzip.open('../1_forum_dataset/cleaned-data/'+cat+'.pkl.gz', 'rb') as f:
            df_cat = pickle.load(f)
        df_cat['category'] = cat
        category_frames.append(df_cat)

    df = pd.concat(category_frames, ignore_index=True)

    # Persist the positional index as a column; later cells use it as the
    # thread id.
    df["index"] = df.index
    df.to_csv("concatenated_threads.csv", index=False)

else:
    df = pd.read_csv("concatenated_threads.csv")

In [None]:
# Display the combined thread table loaded (or rebuilt) in the previous cell.
df

In [None]:


path = "./"
file_name = "dataset.csv"

# Thread ids that already have a saved model response, so a rerun can resume
# instead of relabelling them.
already_processed_thread_ids = []

if not os.path.exists(path+file_name):
    dataset = pd.DataFrame(columns=['index','input','output'])
else:
    dataset = pd.read_csv(path+file_name)
    already_processed_thread_ids = dataset['index'].tolist()

# for each unique year, sample x threads
x = 230
# Shuffle a fixed-seed pool of candidates; min() keeps this working when the
# corpus holds fewer than 24000 threads.
df2 = df.sample(min(24000, len(df)), random_state=44)
unique_years = np.arange(2010, 2023+1)
year_counts = {int(year): 0 for year in unique_years}

# Remember the positions of the selected threads and slice once at the end,
# rather than concatenating one transposed row at a time (quadratic, and it
# turns every column into object dtype).
selected_positions = []
for i in range(len(df2)):
    index, date, thread = process_thread(df2.iloc[i])
    if index is None:
        continue
    try:
        year = int(date[:4])
    except ValueError:
        # date does not start with a four-digit year
        continue
    if year not in year_counts:
        # outside the sampled year range
        continue
    if year_counts[year] < x:
        selected_positions.append(i)
        year_counts[year] += 1

rows = df2.iloc[selected_positions].reset_index(drop=True)

In [None]:
# Display the per-year sample of threads selected in the previous cell.
rows

In [None]:
# pd.set_option('display.max_rows', None)
# rows.sort_values(by="date", inplace=True)
# rows

In [None]:
prompt = """User:
In the given Bitcoin forum thread, pay close attention to the language used when mentioning hardware pieces. Look for explicit statements indicating ownership or hypothetical discussions.

{}



Reply with a formatted JSON document containing an array of objects. Each object should represent a piece of hardware mentioned in the thread and include the following fields:
- hardware_name: A string containing the name of the hardware.
- hardware_is_owned: A boolean. If the mention suggests concrete ownership by any user, write true. If the hardware is discussed in a hypothetical or speculative way, write false. 


Assistant:
Sure! Here is the requested JSON file, with the correct ownership status for each piece of hardware:"""

In [None]:
# Build the queue of threads that still need a model response.
indices, dates, threads = [], [], []
# Set lookup keeps the "already processed" check cheap for large datasets.
processed_ids = set(already_processed_thread_ids)
for j in range(len(rows)):
    row = rows.iloc[j]
    if(len(row["post"]) < 20):
        print(f"skipping {j} as it is too short")
        continue

    # Named thread_id rather than `id` to avoid shadowing the builtin.
    thread_id, date, thread = process_thread(row)

    if thread_id is None:
        # process_thread already printed the error; queuing None would crash
        # later when the thread text is formatted into the prompt.
        continue

    if thread_id in processed_ids:
        print(f"Skipping thread {thread_id} as it is already processed")
        continue

    indices.append(thread_id)
    dates.append(date)
    threads.append(thread)

In [None]:
# Number of threads queued for labelling (long enough and not already saved).
len(indices)

In [None]:




# Query the model for every queued thread and append each valid JSON answer to
# the dataset file. The CSV is rewritten after every thread so an interrupted
# run can be resumed.
for (threadid, date, thread) in zip(indices, dates, threads):

    print(f"processing thread id {threadid}\n\n"+ thread + "\n\n")

    prompt2 = prompt.format(thread)

    response = get_openai_response(prompt2)
    if response is None:
        # Treat a missing reply as empty text so the checks below handle it
        # instead of the string concatenation raising.
        response = ""

    print("model response: \n\n"+response+"\n\n\n")

    if "```json" not in response:
        print("ERROR: response does not contain JSON")
        continue

    # Keep only what sits between the opening ```json fence and the next
    # fence, so any text the model writes before or after the block is
    # discarded rather than breaking the JSON parse.
    response = response.split("```json", 1)[1].split("```", 1)[0].strip()

    print("parsed response: \n\n"+response+"\n\n\n")

    try:
        json.loads(response)
    except json.JSONDecodeError:
        print("ERROR: could not parse response as JSON")
        continue

    # Append the new row to the dataset and checkpoint it to disk.
    new_row = pd.DataFrame({'index': [threadid], 'input': [thread], 'output': [response]})
    dataset = pd.concat([dataset, new_row], ignore_index=True)

    dataset.to_csv(path+file_name, index=False)

In [None]:
# Reload the labelled dataset from disk to inspect what has been saved so far.
# (pandas is already imported in the first cell; this import is redundant.)
import pandas as pd
dataset = pd.read_csv("dataset.csv")
dataset

In [None]:
# create inputs that will be given to trained model
inputs = pd.DataFrame()
for i in range(len(df)):
    date, thread = process_thread(df.iloc[i])
    prompt2 = prompt.format(thread)
    inputs = pd.concat([inputs, pd.DataFrame({'input': [prompt2]})], ignore_index=True)


inputs.to_csv("inputs.csv", index=False)

In [None]:
# Print the first generated prompt in full as a sanity check.
print(inputs.iloc[0].values[0])

In [None]:
# Display the table of generated prompts.
inputs