In [None]:
import pandas as pd
import os
import gzip
import pickle
from openai import OpenAI
import torch.nn.functional as F
import torch
from tqdm import tqdm
import json
import numpy as np

In [None]:
# Shared API client used by get_openai_response. No key is passed explicitly here;
# presumably the library picks up credentials from the environment
# (e.g. OPENAI_API_KEY) — TODO confirm.
client = OpenAI()

In [None]:
def get_openai_response(prompt, model="gpt-4-turbo-preview", max_tokens=1000,
                        system_prompt="You are a helpful assistant"):
    """Send ``prompt`` as a single user message and return the model's text reply.

    The request goes through the module-level ``client``. Sampling is pinned to
    ``temperature=0`` and ``top_p=1``.

    Args:
        prompt: Text placed in the user message.
        model: Chat model name. The default matches the model this notebook
            was originally hard-coded to use.
        max_tokens: Upper bound on the number of generated tokens.
        system_prompt: Text placed in the system message.

    Returns:
        str: The content of the first choice, or ``""`` when the API returns a
        choice without text content, so callers can safely concatenate and
        search the result.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        temperature=0,
        max_tokens=max_tokens,
        top_p=1,
    )
    text = response.choices[0].message.content
    # message.content is optional in the API response; normalise it to a string.
    return text if text is not None else ""

In [None]:
def process_thread(row, max_post_chars=1200, max_thread_chars=5000, min_post_chars=5):
    """Render one forum thread as the plain-text block that is embedded in the prompt.

    Args:
        row: Mapping-like object (dict or pandas Series) providing the keys
            ``"date"``, ``"topic"``, ``"post"`` and ``"index"``. ``"post"`` holds
            every message of the thread joined with ``"<sep>"``.
        max_post_chars: Posts longer than this are cut and marked as truncated.
        max_thread_chars: The rendered thread is cut to this length and marked
            as truncated.
        min_post_chars: Fragments shorter than this are dropped.

    Returns:
        tuple: ``(row["index"], row["date"], thread_text)`` on success, or
        ``(None, None, None)`` when the row cannot be rendered (for example a
        missing key or a non-string date); the failure is printed.
    """
    try:
        date = row["date"]

        thread = ""
        thread += "Date: " + date[:7] + "\n"
        thread += "Topic: " + row["topic"] + "\n"
        thread += "### Original post:\n"

        reply_number = 1
        for post in row["post"].split("<sep>"):
            if len(post) < min_post_chars:
                continue
            if len(post) > max_post_chars:
                thread += post[:max_post_chars] + "<rest of post truncated>\n\n"
            else:
                thread += post + "\n\n"
            # Open the section for the next message; the header after the last
            # message is dangling and is stripped below.
            thread += f"### Reply {reply_number}:\n"
            reply_number += 1

        # Only strip when a header was actually appended. Stripping
        # unconditionally ate into "### Original post:" for threads without any
        # usable post.
        if reply_number > 1:
            thread = thread[:-len(f"### Reply {reply_number - 1}:\n")]

        if len(thread) > max_thread_chars:
            thread = thread[:max_thread_chars] + "<rest of thread truncated>\n"

        return row["index"], date, thread

    except Exception as exc:  # deliberately best-effort, but lets KeyboardInterrupt through
        # The id lookup itself may be what failed, so guard it.
        try:
            thread_id = row["index"]
        except Exception:
            thread_id = "<unknown>"
        print("Error processing thread:" + str(thread_id) + f" ({type(exc).__name__}: {exc})")
        return None, None, None



In [None]:
# Expected columns: topic, date, post, dates, index
df = pd.read_csv("../1_forum_dataset/cleaned-data.csv")
# Keep only threads dated 2010-01-01 or later. The comparison is a plain string
# comparison, so it presumably relies on dates being stored as YYYY-MM-DD text.
is_2010_or_later = df["date"] >= "2010-01-01"
df = df.loc[is_2010_or_later]

In [None]:
# Inspect the filtered threads (dates from 2010-01-01 onward).
df

In [None]:


# The labelled dataset doubles as a checkpoint: thread ids already present in
# the file are remembered so they are not sent to the model again.
path = "./"
file_name = "dataset.csv"

already_processed_thread_ids = []

if not os.path.exists(path+file_name):
    dataset = pd.DataFrame(columns=['index','input','output'])
else:
    dataset = pd.read_csv(path+file_name)
    already_processed_thread_ids = dataset['index'].tolist()

# Keep at most x renderable threads per calendar year, taken from a
# reproducible shuffle of the corpus.
x = 100
# Cap the draw at the number of available rows: DataFrame.sample raises when
# asked for more rows than exist (sampling without replacement).
df2 = df.sample(min(60000, len(df)), random_state=44)
unique_years = np.arange(2009, 2024+1)
year_counts = {int(year): 0 for year in unique_years}
selected_positions = []
for i in range(len(df2)):
    index, date, thread = process_thread(df2.iloc[i])
    if index is None:
        # process_thread could not render this row; it has already reported it.
        continue
    year = int(date[:4])
    # .get() so a year outside the pre-seeded range is counted instead of
    # raising KeyError.
    if year_counts.get(year, 0) < x:
        selected_positions.append(i)
        year_counts[year] = year_counts.get(year, 0) + 1

# Select every kept row in one step rather than growing a frame with pd.concat
# inside the loop (quadratic copying). This also keeps the original column
# dtypes instead of degrading them to object.
rows = df2.iloc[selected_positions].reset_index(drop=True)

In [None]:
# Inspect the per-year sample selected in the previous cell.
rows

In [None]:
# pd.set_option('display.max_rows', None)
# rows.sort_values(by="date", inplace=True)
# rows

In [None]:
# prompt = """User:
# Here is a bitcoin forum thread:

# ```thread
# {}
# ```


# Here is a list of categories:

# optimistic_speculation
#    - Optimistic threads discussing investment strategies / price predictions.
   
# pessimistic_speculation
#    - Pessimistic threads discussing investment strategies / price predictions.

# bitcoin_adoption
#    - Discussions on countries adopting Bitcoin as legal tender and regulatory changes.

# bitcoin_technology
#    - Technical discussions on Bitcoin's underlying technology and security.

# financial_products
#    - Threads related to Bitcoin exchanges, ETFs, and other financial products related to Bitcoin.

# bitcoin_challenges
#    - Debates and discussions on the challenges facing Bitcoin.

# scams
#    - Threads discussing scams, fraud and ransoms in the Bitcoin space.

# bitcoin_mining
#    - Discussions on the process of mining Bitcoin.

# altcoins
#    - Discussions about other cryptocurrencies.

# educational_resources
#     - Threads offering educational resources for new Bitcoin users and investors.

# other
#     - For threads that do not fit into any of the above categories.


# Reply with a formatted JSON document containing a single field called "categories". This field should be an array of strings with 1 to 3 categories that best describe the thread.


# Assistant:
# Sure! Here is the requested JSON document:"""

In [None]:
prompt = """User:
Here is a bitcoin forum thread:

```thread
{}
```


Here is a list of categories:

speculation
- Discussions about speculations on price movements.
 
adoption
- Discussions about Bitcoin's adoption, underlying technology, mining process and security.
 
altcoins
- Discussions about cryptocurrencies other than Bitcoin.

none
- Discussions that do not fit into any of the above categories.



Reply with a formatted JSON document containing the following fields:
-A field called "category". This field should be a string with the category that best describes the thread.
-A field called "sentiment". This field should be a string with the sentiment of the thread. The possible sentiments are "strongly positive", "positive", "neutral", "negative", and "strongly negative".


Assistant:
Sure! Here is the requested JSON document:"""

In [None]:
def get_stuff(data, skip=True):
    """Render every usable thread of ``data`` into prompt-ready text.

    Rows are dropped (with a printed note) when their ``"post"`` text is shorter
    than 50 characters, when ``process_thread`` fails to render them, or — if
    ``skip`` is true — when their id is listed in the module-level
    ``already_processed_thread_ids``.

    Args:
        data: DataFrame whose rows provide the keys ``process_thread`` reads.
        skip: Whether to leave out threads that were already processed.

    Returns:
        tuple: Three parallel lists ``(indices, dates, threads)``.
    """
    # Set membership is O(1); the id list can grow to the size of the dataset.
    processed_ids = set(already_processed_thread_ids)

    indices, dates, threads = [], [], []
    for j in range(len(data)):
        row = data.iloc[j]
        if len(str(row["post"])) < 50:
            print(f"skipping {j} as it is too short")
            continue

        thread_id, date, thread = process_thread(row)

        # process_thread signals failure with (None, None, None) and has already
        # printed the error. Passing those placeholders on would crash the
        # labelling loop on string concatenation and write "None" prompts.
        if thread_id is None:
            continue

        if skip and thread_id in processed_ids:
            print(f"Skipping thread {thread_id} as it is already processed")
            continue

        indices.append(thread_id)
        dates.append(date)
        threads.append(thread)

    return indices, dates, threads

# Render the sampled rows; with the default skip=True, threads whose id is in
# already_processed_thread_ids are left out.
indices, dates, threads = get_stuff(rows)

In [None]:
# Number of threads queued for labelling in the loop below.
len(indices)

In [None]:



def _extract_json_block(response):
    """Return the text inside the first ```json fence of ``response``, or None if there is no fence."""
    if not response or "```json" not in response:
        return None
    # Take only what lies between the opening fence and the next closing fence,
    # so any text the model writes before or after the block is ignored.
    after_fence = response.split("```json", 1)[1]
    return after_fence.split("```", 1)[0].strip()


# Label every queued thread with the model and checkpoint each accepted answer.
done = 0
for (threadid, date, thread) in zip(indices, dates, threads):

    print(f"processing thread id {threadid}\n\n"+ thread + "\n\n")
    print(f"done {done}/{len(indices)}\n\n")

    prompt2 = prompt.format(thread)

    response = get_openai_response(prompt2)

    # str() guards against a response without text content.
    print("model response: \n\n"+str(response)+"\n\n\n")

    parsed = _extract_json_block(response)
    if parsed is None:
        print("ERROR: response does not contain JSON")
        continue

    print("parsed response: \n\n"+parsed+"\n\n\n")

    # Validate only; the raw JSON text is what gets stored.
    try:
        json.loads(parsed)
    except json.JSONDecodeError:
        print("ERROR: could not parse response as JSON")
        continue

    # Append the new row to the dataset
    dataset = pd.concat([dataset, pd.DataFrame({'index': [threadid],'input': [prompt2], 'output': [parsed]})], ignore_index=True)

    # Rewrite the file after every accepted thread so an interrupted run keeps
    # everything labelled so far.
    dataset.to_csv(path+file_name, index=False)
    # Remember the id so re-rendering with skip=True in this kernel leaves it out.
    already_processed_thread_ids.append(threadid)
    done+=1

In [None]:
# Render every thread in df to build the inputs for later inference;
# skip=False keeps threads that are already in the labelled dataset.
indices, dates, threads = get_stuff(df, skip=False)

In [None]:
# Start with an empty inference-input table (columns only); the next cell fills it.
inputs = pd.DataFrame(columns=['index','date','input'])

In [None]:
# Format the prompt for every rendered thread, then build the table in a single
# call. Appending with pd.concat once per thread copies the whole frame on every
# iteration, which is quadratic over the full corpus.
# dtype=object keeps the values exactly as collected, so the CSV is written the
# same way as when the frame was grown from an empty object-typed frame.
inputs = pd.DataFrame(
    {
        'index': list(indices),
        'date': list(dates),
        'input': [prompt.format(thread) for thread in threads],
    },
    columns=['index','date','input'],
    dtype=object,
)

inputs.to_csv(path+"inputs.csv", index=False)

In [None]:
# Inspect the table that was just written to inputs.csv.
inputs