In [1]:
import pandas as pd
import os
import gzip
import pickle
from openai import OpenAI
import torch.nn.functional as F
import torch
from tqdm import tqdm
import json
import numpy as np

In [2]:
# Shared OpenAI API client; get_openai_response sends its requests through it.
# NOTE(review): no API key is passed here, so credentials presumably come from
# the environment (e.g. OPENAI_API_KEY) — confirm before running.
client = OpenAI()

In [3]:
def get_openai_response(prompt, model="gpt-4-turbo-preview", max_tokens=1000):
    """Send ``prompt`` as a single user message and return the reply text.

    The request goes through the module-level ``client`` with a fixed system
    message, ``temperature=0`` and ``top_p=1``.

    Args:
        prompt: Text sent as the user message.
        model: Chat model name. Defaults to the model this notebook was
            written against ("gpt-3.5-turbo-0125" was the alternative tried).
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The text content of the first choice. An empty string is returned
        when the API reports no text content (``None``), so callers can
        always treat the result as a ``str``.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": prompt},
        ],
        temperature=0,
        max_tokens=max_tokens,
        top_p=1,
    )
    text = response.choices[0].message.content
    # The caller concatenates the result into strings; None would raise there.
    return text if text is not None else ""

In [4]:
def process_thread(row, max_post_chars=1200, min_post_chars=5, max_thread_chars=5000):
    """Render one forum thread as the plain-text block that is sent to the model.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            ``"index"``, ``"topic"``, ``"post"`` and ``"dates"``. ``"post"``
            and ``"dates"`` are strings whose entries are joined by ``<sep>``.
        max_post_chars: Posts longer than this are cut and marked as truncated.
        min_post_chars: Posts shorter than this are dropped.
        max_thread_chars: The rendered thread is cut to this length and marked
            as truncated.

    Returns:
        ``(index, date, thread)`` where ``date`` is the last entry of
        ``row["dates"]`` (the same date whose ``YYYY-MM`` prefix appears in the
        header) and ``thread`` is the rendered text. If anything goes wrong
        (for example a non-string ``post`` or ``topic``), an error line is
        printed and ``(None, None, None)`` is returned.
    """
    try:
        # The thread is dated by the last entry of the dates field. It lives in
        # its own name so that the loop below cannot overwrite it.
        thread_date = row["dates"].split("<sep>")[-1]

        header = (
            "Date: " + thread_date[:7] + "\n"
            + "Topic: " + row["topic"] + "\n"
            + "### Original post:\n"
        )

        # Posts stay paired with the dates so that, as before, posts beyond the
        # number of recorded dates are ignored.
        kept_posts = []
        for post, _post_date in zip(row["post"].split("<sep>"), row["dates"].split("<sep>")):
            if len(post) < min_post_chars:
                continue
            if len(post) > max_post_chars:
                post = post[:max_post_chars] + "<rest of post truncated>"
            kept_posts.append(post)

        # The first kept post follows the "Original post" heading; every later
        # one gets its own numbered "Reply" heading. Building the text this way
        # means there is no trailing heading to strip, so a thread with no
        # usable posts keeps its header intact.
        parts = [header]
        for position, post in enumerate(kept_posts):
            if position > 0:
                parts.append(f"### Reply {position}:\n")
            parts.append(post + "\n\n")
        thread = "".join(parts)

        if len(thread) > max_thread_chars:
            thread = thread[:max_thread_chars] + "<rest of thread truncated>\n"

        return row["index"], thread_date, thread

    except Exception:
        # Best-effort: malformed rows are reported and skipped by the callers.
        print("Error processing thread:" + str(row["index"]))
        return None, None, None



In [5]:
# Set to True to rebuild forum_threads.csv from the per-category pickles;
# leave False to reuse the cached CSV.
refresh_df = False
# refresh_df = True

if refresh_df:
    categories = [
        "groupbuys",
        "hardware",
        "miners",
        "mining",
        "mining_support",
        # "pools",
    ]

    # Load every category file, tag it with its category, and concatenate once
    # at the end instead of growing a DataFrame from an empty one in the loop.
    category_frames = []
    for cat in categories:
        # NOTE(review): pickle.load can execute arbitrary code from the file;
        # only point this at trusted, locally produced files.
        with gzip.open('../1_forum_dataset/cleaned-data/'+cat+'.pkl.gz', 'rb') as f:
            df_cat = pickle.load(f)
        df_cat['category'] = cat
        category_frames.append(df_cat)

    df = pd.concat(category_frames, ignore_index=True)

    # Stable thread id: the row position in the combined table.
    df["index"] = df.index
    df.to_csv("forum_threads.csv", index=False)

else:
    df = pd.read_csv("forum_threads.csv")

In [6]:
# Preview the loaded thread table (a bare last expression is shown as cell output).
df

Unnamed: 0,topic,post,date,dates,category,index
0,[Group Buy in China] Avalon batch #3,List of <truncated> Forum Name - Bitcoin Addre...,2013-03-25 11:06:33,2013-03-25 10:16:23<sep>2013-03-25 10:42:45<se...,groupbuys,0
1,Avalon Group buy - Germany,"Hi, I'm in time to invest?<sep>too late, sorry",2013-03-25 13:13:44,2013-03-25 12:47:34<sep>2013-03-25 13:13:44,groupbuys,1
2,Avalon Group buy - Germany - closed,I am willing to organize a community purchase ...,2013-03-25 13:14:29,2013-03-25 13:14:29,groupbuys,2
3,I'm about to buy Avalon asic 85,"it sold out, its gone.<sep>People just order b...",2013-03-25 14:26:29,2013-03-25 13:46:12<sep>2013-03-25 14:26:29,groupbuys,3
4,CANCELLED [Group Buy in China] Avalon batch #3,Do you not have enough BTC to buy a batch 3 Av...,2013-03-25 16:42:09,2013-03-25 16:42:09,groupbuys,4
...,...,...,...,...,...,...
24184,S19 Amlogic control board brick ?,"Hello everyone, I have an amlogic s19 j pro ca...",2023-10-23 15:51:19,2023-08-30 13:53:04<sep>2023-08-30 17:45:01<se...,mining_support,24184
24185,S19jPro abnormal network - (Will not connect t...,Driving me nuts... after changing pool address...,2023-10-24 14:00:23,2023-02-24 15:37:21<sep>2023-02-24 15:59:12<se...,mining_support,24185
24186,Flashing control Board S9 to R4,"Hello,I recently purchased a control board tha...",2023-10-25 23:42:40,2023-10-05 14:14:14<sep>2023-10-09 05:06:58<se...,mining_support,24186
24187,AvalonMiner 1166Pro-72,"Hello,can someone help me find the problem wit...",2023-10-26 08:40:14,2023-10-09 07:54:00<sep>2023-10-09 11:50:47<se...,mining_support,24187


In [7]:


# Location of the labelled dataset that the labelling loop appends to.
path = "./"
file_name = "dataset.csv"

# Resume support: ids of threads that already have a saved model response.
already_processed_thread_ids = []

if not os.path.exists(path+file_name):
    dataset = pd.DataFrame(columns=['index','input','output'])
else:
    dataset = pd.read_csv(path+file_name)
    already_processed_thread_ids = dataset['index'].tolist()

# for each unique year, sample x threads
x = 230
# Shuffle with a fixed seed; never ask for more rows than the table holds.
df2 = df.sample(min(24000, len(df)), random_state=44)
unique_years = np.arange(2010, 2023+1)
year_counts = {int(year): 0 for year in unique_years}

# Collect the positions of the selected rows and slice once at the end,
# instead of concatenating one-row frames inside the loop.
selected_positions = []
for i in range(len(df2)):
    index, date, thread = process_thread(df2.iloc[i])
    if index is None:
        # process_thread already reported the failure for this row.
        continue
    year = int(date[:4])
    if year not in year_counts:
        # Threads dated outside the sampled year range are not sampled.
        continue
    if year_counts[year] < x:
        selected_positions.append(i)
        year_counts[year] += 1

rows = df2.iloc[selected_positions].reset_index(drop=True)

Error processing thread:2967
Error processing thread:13251
Error processing thread:9306
Error processing thread:9152
Error processing thread:6543
Error processing thread:2907
Error processing thread:4609
Error processing thread:6620
Error processing thread:2344
Error processing thread:12206
Error processing thread:7170
Error processing thread:6956
Error processing thread:5493
Error processing thread:14915


In [8]:
# Preview the per-year sample of threads selected above.
rows

Unnamed: 0,topic,post,date,dates,category,index
0,Selling another batch of 40 ICARUS - sold out,"Those Icarus are batch 3, tested for several w...",2012-06-15 10:53:37,2012-06-15 03:19:09<sep>2012-06-15 10:53:12<se...,hardware,4109
1,Advice -Adding additional 6 pin PCIE outputs o...,Hi AllI have a few modular/semi modular PC psu...,2022-09-22 01:12:31,2022-09-20 19:44:16<sep>2022-09-21 01:29:09<se...,hardware,11208
2,Please Help: Mining with both cores of 5970 wi...,"Hi, I'm having trouble setting up a 5970 and m...",2013-07-04 15:42:38,2013-03-27 13:50:47<sep>2013-03-27 20:16:41<se...,miners,12483
3,ASIC Not working!!! HELP,An ASIC is an Application Specific Integrated ...,2019-09-12 00:27:02,2019-09-10 15:56:23<sep>2019-09-10 16:04:48<se...,mining_support,23073
4,USB Block Erupter showing up as generic USB De...,I've been mining with two USB Block Erupter Sa...,2014-03-19 04:09:42,2014-03-18 05:11:47<sep>2014-03-19 03:06:55<se...,mining_support,19054
...,...,...,...,...,...,...
2986,Used Antminer S17 Pro - Having issues and PO f...,I just bought a used Antminer S17Pro and it sh...,2020-04-28 05:06:41,2020-04-07 14:23:01<sep>2020-04-07 15:35:54<se...,mining_support,23258
2987,And one more S15 question...,I'm getting 1960w draw at the wall running at ...,2020-03-04 00:09:31,2020-02-29 02:52:52<sep>2020-02-29 03:08:20<se...,mining_support,23219
2988,Innosilicon Miner not connecting to INTERNET,Have you tried to replace the ethernet cable w...,2020-06-11 00:37:58,2020-06-08 17:20:16<sep>2020-06-09 02:15:06<se...,mining_support,23284
2989,Antminer T9+ Low hashrate problem!!!,Your temp sensor has failed. You can play with...,2020-06-22 00:46:48,2020-06-22 00:31:03<sep>2020-06-22 00:32:14<se...,mining_support,23288


In [9]:
# pd.set_option('display.max_rows', None)
# rows.sort_values(by="date", inplace=True)
# rows

In [10]:
prompt = """User:
In the given Bitcoin forum thread, pay close attention to the language used when mentioning hardware pieces. Look for explicit statements indicating ownership or hypothetical discussions.

{}



Reply with a formatted JSON document containing an array of objects. Each object should represent a piece of hardware mentioned in the thread and include the following fields:
- hardware_name: A string containing the name of the hardware.
- hardware_is_owned: A boolean. If the mention suggests concrete ownership by any user, write true. If the hardware is discussed in a hypothetical or speculative way, write false. 


Assistant:
Sure! Here is the requested JSON file, with the correct ownership status for each piece of hardware:"""

In [11]:
# Render every sampled thread and queue the ones that still need a model response.
indices, dates, threads = [], [], []
# Set lookup keeps the "already processed" check cheap for every row.
processed_ids = set(already_processed_thread_ids)
for j in range(len(rows)):
    row = rows.iloc[j]
    if(len(row["post"]) < 20):
        print(f"skipping {j} as it is too short")
        continue

    # Named thread_id rather than id so the builtin id() is not shadowed.
    thread_id, date, thread = process_thread(row)

    if thread_id is None:
        # process_thread already reported the failure; a None thread must not
        # be queued because the labelling loop concatenates it into strings.
        continue

    if thread_id in processed_ids:
        print(f"Skipping thread {thread_id} as it is already processed")
        continue

    indices.append(thread_id)
    dates.append(date)
    threads.append(thread)

Skipping thread 4109 as it is already processed
Skipping thread 11208 as it is already processed
Skipping thread 12483 as it is already processed
Skipping thread 23073 as it is already processed
Skipping thread 19054 as it is already processed
Skipping thread 19099 as it is already processed
Skipping thread 9265 as it is already processed
Skipping thread 12145 as it is already processed
Skipping thread 4600 as it is already processed
Skipping thread 8577 as it is already processed
Skipping thread 5258 as it is already processed
Skipping thread 11087 as it is already processed
Skipping thread 21036 as it is already processed
Skipping thread 11702 as it is already processed
Skipping thread 19166 as it is already processed
Skipping thread 14240 as it is already processed
Skipping thread 8155 as it is already processed
Skipping thread 14160 as it is already processed
Skipping thread 1844 as it is already processed
Skipping thread 1084 as it is already processed
Skipping thread 15240 as it 

In [12]:
# Number of threads queued for labelling (those not skipped above).
len(indices)

2928

In [13]:




def _extract_json_block(response):
    """Return the text inside the first ```json fenced block of ``response``.

    Only the text between the opening ```json fence and the next ``` is
    returned (stripped), so any preamble the model writes before the fence is
    discarded. Returns None when ``response`` contains no ```json fence.
    """
    fence = "```json"
    if fence not in response:
        return None
    after_fence = response.split(fence, 1)[1]
    return after_fence.split("```", 1)[0].strip()


# Ask the model to label every queued thread. The dataset is written back to
# disk after each accepted response so an interrupted run can be resumed.
for (threadid, date, thread) in zip(indices, dates, threads):

    print(f"processing thread id {threadid}\n\n"+ thread + "\n\n")

    response = get_openai_response(prompt.format(thread))
    if response is None:
        # No text content returned; treat it like a response without JSON.
        response = ""

    print("model response: \n\n"+response+"\n\n\n")

    parsed = _extract_json_block(response)
    if parsed is None:
        print("ERROR: response does not contain JSON")
        continue

    print("parsed response: \n\n"+parsed+"\n\n\n")

    # Validation only: the raw JSON text is what gets stored.
    try:
        json.loads(parsed)
    except json.JSONDecodeError:
        print("ERROR: could not parse response as JSON")
        continue

    # Append the new row to the dataset
    new_row = pd.DataFrame({'index': [threadid], 'input': [thread], 'output': [parsed]})
    dataset = pd.concat([dataset, new_row], ignore_index=True)

    dataset.to_csv(path+file_name, index=False)

processing thread id 9724

Date: 2016-02
Topic: [GUIDE] Undervolt antminer s1 [1.19W/GH at the wall]
### Original post:
meanwhile, I did the opposite mod and am pushing near 220GH out of some of my ants ... =) hope they dont die! ... been a week or so ... still goin strong.

### Reply 1:
Holly!We need some explanation here!

### Reply 2:
thats very good - i did not expect them to be capable of much more than 205GH even with pencil mod. did you have to add a second fan or shrouding at all?

### Reply 3:
The same as I did but with the resistor R12 (the one at the left of R3).It will increase the voltage at the chip, so be carefull

### Reply 4:
Yeah, I just vmodded one of my '14 units to around 1.15-1.16v per 'module'Its doing 431mhz @ .55% HW @ 219.5GH for the last 4hrs ... =)(previously the same unit has been at 425mhz @ .18% HW @ 216GH for a few days...)I may try it on my other 2 '14 units tonight.My '13 units dont vmod as well =/NOTE: no, didnt modify cooling and this particular ant 

In [None]:
# Reload the saved labelled dataset from disk and display it.
# pandas is imported again here, so this cell does not depend on the import cell.
import pandas as pd
dataset = pd.read_csv("dataset.csv")
dataset

In [None]:
# create inputs that will be given to trained model
inputs = pd.DataFrame()
for i in range(len(df)):
    date, thread = process_thread(df.iloc[i])
    prompt2 = prompt.format(thread)
    inputs = pd.concat([inputs, pd.DataFrame({'input': [prompt2]})], ignore_index=True)


inputs.to_csv("inputs.csv", index=False)

In [None]:
# Print the first value of the first row of inputs (print renders its newlines).
print(inputs.iloc[0].values[0])

In [None]:
# Preview the inputs table.
inputs