In [1]:
import pandas as pd
import os
import gzip
import pickle
from openai import OpenAI
import torch.nn.functional as F
import torch
from tqdm import tqdm
import json
import numpy as np

In [2]:
# Module-level OpenAI client; get_openai_response (defined in the next cell) sends its requests through it.
# It is constructed with no explicit arguments, so credentials are presumably picked up from
# the environment (e.g. OPENAI_API_KEY) — TODO confirm against the openai package docs.
client = OpenAI()

In [3]:
def get_openai_response(
    prompt,
    model="gpt-3.5-turbo-0125",
    max_tokens=1000,
    system_prompt="You are a helpful assistant",
):
    """Send a single-turn chat completion request and return the reply text.

    The request is issued through the module-level ``client`` with
    ``temperature=0`` and ``top_p=1``.

    Args:
        prompt: Text sent as the user message.
        model: Chat model name. Defaults to the model the notebook was
            written against ("gpt-4-turbo-preview" was the alternative that
            had been tried).
        max_tokens: Upper bound on the number of generated tokens.
        system_prompt: Text sent as the system message.

    Returns:
        The content of the first choice as a string. An empty string is
        returned when the API yields no choices or a message without text
        content, so callers can always treat the result as ``str``.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": prompt
            }
            ],
        temperature=0,
        max_tokens=max_tokens,
        top_p=1,
    )

    if not response.choices:
        return ""

    text = response.choices[0].message.content
    # content may be None (e.g. a filtered reply); normalise so that string
    # concatenation in the caller does not raise TypeError.
    return text if text is not None else ""

In [4]:
def process_thread(row, max_post_chars=1200, max_thread_chars=5000, min_post_chars=5):
    """Render one forum thread row as a plain-text transcript.

    Args:
        row: Mapping-like object (a dict or a pandas Series row) with string
            fields ``"post"`` and ``"dates"``, each a ``"<sep>"``-joined
            list, and ``"topic"``.
        max_post_chars: Posts longer than this are cut to this length and
            marked with ``<rest of post truncated>``.
        max_thread_chars: The finished transcript is cut to this length and
            marked with ``<rest of thread truncated>``.
        min_post_chars: Posts shorter than this are left out entirely.

    Returns:
        ``(date, thread)`` where ``date`` is the last entry of ``"dates"``
        (the same value whose first seven characters form the ``Date:``
        header) and ``thread`` is the transcript text.
    """
    posts = row["post"].split("<sep>")
    dates = row["dates"].split("<sep>")
    # Kept in its own name so the loop below can never overwrite the value
    # that is returned; the header and the return value always agree.
    last_date = dates[-1]

    parts = [
        "Date: " + last_date[:7] + "\n",
        "Topic: " + row["topic"] + "\n",
        "### Original post:\n",
    ]

    reply_no = 1
    # Posts are paired with dates positionally; posts beyond the number of
    # available dates are ignored (same truncation zip() would apply).
    for post in posts[:len(dates)]:
        if len(post) > max_post_chars:
            parts.append(post[:max_post_chars] + "<rest of post truncated>\n\n")
        elif len(post) < min_post_chars:
            continue
        else:
            parts.append(post + "\n\n")
        # Header for whatever post comes next; the dangling one is removed below.
        parts.append(f"### Reply {reply_no}:\n")
        reply_no += 1

    # Drop the trailing reply header, but only if one was actually emitted.
    # Otherwise the "### Original post:" header would be damaged.
    if reply_no > 1:
        parts.pop()

    thread = "".join(parts)
    if len(thread) > max_thread_chars:
        thread = thread[:max_thread_chars] + "<rest of thread truncated>\n"

    return last_date, thread

In [5]:
# Forum categories whose cleaned thread dumps are loaded below.
categories = [
    "groupbuys",
    "hardware",
    "miners",
    "mining",
    "mining_support",
    # "pools",
]

# Load every category dump, tag its rows with the category name, and
# concatenate once at the end instead of growing the frame inside the loop.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only point this at dumps produced by a trusted pipeline.
category_frames = []
for cat in categories:
    with gzip.open('../1_forum_dataset/cleaned-data/'+cat+'.pkl.gz', 'rb') as f:
        df_cat = pickle.load(f)
    df_cat['category'] = cat
    category_frames.append(df_cat)

df = pd.concat(category_frames, ignore_index=True) if category_frames else pd.DataFrame()

path = "./"
file_name = "dataset.csv"

# Resume from a previously written dataset if there is one; otherwise start empty.
if not os.path.exists(path+file_name):
    dataset = pd.DataFrame(columns=['input','output'])
else:
    dataset = pd.read_csv(path+file_name)

# For each year in unique_years, keep at most x threads from a random sample.
x = 5
sample_size = 24000
seed = 42  # fixed so that Restart & Run All selects the same threads

# Never ask for more rows than exist (DataFrame.sample raises ValueError otherwise).
df2 = df.sample(min(sample_size, len(df)), random_state=seed)

unique_years = np.arange(2010, 2023+1)
year_counts = {int(year): 0 for year in unique_years}
open_slots = x * len(year_counts)

selected_positions = []
for i in range(len(df2)):
    if open_slots <= 0:
        # Every year already has x threads; nothing left to collect.
        break

    date, _ = process_thread(df2.iloc[i])
    try:
        year = int(date[:4])
    except ValueError:
        # Unparseable date: the thread cannot be assigned to a year.
        continue

    # Years outside unique_years are skipped instead of raising KeyError.
    if year in year_counts and year_counts[year] < x:
        selected_positions.append(i)
        year_counts[year] += 1
        open_slots -= 1

# Materialise the selection in one step rather than concatenating per row.
rows = df2.iloc[selected_positions].reset_index(drop=True)

In [6]:
# pd.set_option('display.max_rows', None)
# rows.sort_values(by="date", inplace=True)
# rows

In [7]:
def _extract_json_text(response):
    """Return the payload of the first ``` fenced block in response, stripped.

    An optional ``json`` language tag right after the opening fence is
    dropped. If there is no fence at all, the whole response is returned
    stripped; if the closing fence is missing, everything after the opening
    fence is returned.
    """
    fence = "```"
    start = response.find(fence)
    if start == -1:
        return response.strip()

    payload = response[start + len(fence):]
    if payload.startswith("json"):
        payload = payload[len("json"):]

    end = payload.find(fence)
    if end != -1:
        payload = payload[:end]
    return payload.strip()


# Format every sampled row once, leaving out rows with almost no text.
dates, threads = [], []
for j in range(len(rows)):
    row = rows.iloc[j]
    if len(row["post"]) < 20:
        print(f"skipping {j}")
        continue

    date, thread = process_thread(row)

    dates.append(date)
    threads.append(thread)


# threadid is the position of the thread in `threads`, so the log line
# identifies the same thread whether or not earlier ones failed to parse.
for threadid, (date, thread) in enumerate(zip(dates, threads)):

    print(f"processing thread id {threadid}\n\n"+ thread + "\n\n")

    prompt = f"""
User:
In the given Bitcoin forum thread, pay close attention to the language used when mentioning hardware pieces. Look for explicit statements indicating ownership or hypothetical discussions.

{thread}



Reply with a formatted JSON document containing an array of objects. Each object should represent a piece of hardware mentioned in the thread and include the following fields:
- hardware_name: A string containing the name of the hardware.
- hardware_is_owned: A boolean. If the mention suggests concrete ownership by any user, write true. If the hardware is discussed in a hypothetical or speculative way, write false. 


A:
Sure! Here is the requested JSON file, with the correct ownership status for each piece of hardware:
""".strip()

    # A reply without text content is treated as an empty string so the
    # concatenations below cannot raise TypeError.
    response = get_openai_response(prompt) or ""

    print("model response: \n\n"+response+"\n\n\n")

    if "```json" not in response:
        print("ERROR: response does not contain JSON")

    response = _extract_json_text(response)

    print("parsed response: \n\n"+response+"\n\n\n")

    # Only keep replies that are valid JSON. Catch the decoder's own error
    # so that KeyboardInterrupt and real bugs still surface.
    try:
        json.loads(response)
    except json.JSONDecodeError:
        print("ERROR: could not parse response as JSON")
        continue

    # Append the new example and write the file straight away, so an
    # interrupted run keeps every example produced so far.
    new_example = pd.DataFrame({'input': [thread], 'output': [response]})
    dataset = pd.concat([dataset, new_example], ignore_index=True)

    dataset.to_csv(path+file_name, index=False)

processing thread id 0

Date: 2014-01
Topic: [OPEN][Worldwide] Dualminer USB ASIC sha/scrypt miner 0.105btc
### Original post:
UPDATE:ORDERS ARE OPENWE HAVE 600 AVAILABLE SHIPPING FROM CHINA 9TH FEB

### Reply 1:
so what are you anticipating the weight will be if ordering under 25 units?

### Reply 2:
reservedInterested in about 100-200

### Reply 3:
less then 2kg.I will add you to the list for 150 you can change it later.

### Reply 4:
Put me in for 3

### Reply 5:
reserve 20.Thank you!

### Reply 6:
DoneDoneI have included a new device that people might be interested in it is a 5 chip miner based off the same Gridseed dual algorithm capable chips.Video of the device can be found here.<link>




model response: 

```json
[
    {
        "hardware_name": "Dualminer USB ASIC sha/scrypt miner",
        "hardware_is_owned": false
    },
    {
        "hardware_name": "5 chip miner based off the same Gridseed dual algorithm capable chips",
        "hardware_is_owned": false
    }
]
```





In [8]:
# pandas is already imported in the first cell; this repeated import is harmless.
import pandas as pd
# Re-read the CSV from disk and rebind `dataset` to it. "dataset.csv" in the working
# directory is the same file the generation cell writes while `path` is "./" and
# `file_name` is "dataset.csv".
dataset = pd.read_csv("dataset.csv")
# Bare last expression so the notebook renders the frame as the cell output.
dataset

Unnamed: 0,input,output
0,Date: 2014-01\nTopic: [OPEN][Worldwide] Dualmi...,"[\n {\n ""hardware_name"": ""Dualminer ..."
1,Date: 2014-08\nTopic: [OPEN - CUBE] batch #34 ...,"[\n {\n ""hardware_name"": ""Cube"",\n ..."
2,Date: 2011-06\nTopic: Silly linux scripts and ...,"[\n {\n ""hardware_name"": ""AMDOverdri..."
3,Date: 2017-02\nTopic: Have to send Antminer S9...,"[\n {\n ""hardware_name"": ""Antminer S..."
4,Date: 2015-04\nTopic: Trying to get a controll...,"[\n {\n ""hardware_name"": ""zen contro..."
...,...,...
61,Date: 2020-02\nTopic: S17 : WARN: chain[1] - 0...,"[\n {\n ""hardware_name"": ""ASIC chips..."
62,Date: 2020-03\nTopic: Canaan A1066 Pro Review...,"[\n {\n ""hardware_name"": ""A1041"",\n ..."
63,Date: 2020-03\nTopic: Flashing S9 Control Boar...,"[\n {\n ""hardware_name"": ""SD chip"",\..."
64,Date: 2020-01\nTopic: nicehash not working wit...,"[\n {\n ""hardware_name"": ""T17 antmin..."
