In [6]:
import pandas as pd
import os
import gzip
import pickle
import openai
import torch.nn.functional as F
import torch
from tqdm import tqdm
import json

In [7]:
def get_openai_response(prompt, tokens = 2000, model="gpt-3.5-turbo-instruct"):
    """Query an OpenAI model deterministically (temperature 0, top_p 1).

    Args:
        prompt: Text sent to the model.
        tokens: Upper bound passed as `max_tokens`.
        model: Model name. Names containing "instruct" use the completion
            endpoint; every other name uses the chat endpoint.

    Returns:
        For "instruct" models: a `(text, top_logprobs)` tuple, where
        `top_logprobs` is taken from the first choice and was requested with
        `logprobs=4`.
        For all other models: only the reply text (a plain string).
        Callers must therefore unpack according to the model they pass.
    """
    uses_completion_endpoint = "instruct" in model

    if not uses_completion_endpoint:
        # Chat models: wrap the prompt in a fixed system/user conversation.
        conversation = [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": prompt},
        ]
        chat = openai.ChatCompletion.create(
            model=model,
            messages=conversation,
            temperature=0,
            max_tokens=tokens,
            top_p=1,
        )
        return chat.choices[0].message.content

    # Completion models: also ask for the 4 most likely candidates per token.
    completion = openai.Completion.create(
        model=model,
        prompt=prompt,
        temperature=0,
        max_tokens=tokens,
        top_p=1,
        logprobs=4,
    )
    first_choice = completion.choices[0]
    return first_choice.text, first_choice.logprobs.top_logprobs

In [8]:
def process_thread(row):
    """Render one forum thread as the text block that is embedded in the prompt.

    Args:
        row: Mapping-like record with string fields "dates" and "post"
            (each a "<sep>"-joined list, one entry per post) and "topic".

    Returns:
        A `(date, thread)` tuple. `date` is the last entry of "dates", the same
        one whose first 7 characters appear in the "Date:" header. `thread` is
        the header followed by the kept posts, separated by "### Reply N:"
        lines.

    Posts shorter than 5 characters are dropped, posts longer than 800
    characters are cut, and the whole thread is cut at 4000 characters.
    """
    post_dates = row["dates"].split("<sep>")
    # Previously the loop variable shadowed this value, so the returned date
    # could differ from the one in the header when post/date counts disagreed.
    last_date = post_dates[-1]

    thread = (
        "Date: " + last_date[:7] + "\n"
        + "Topic: " + row["topic"] + "\n"
        + "### Original post:\n"
    )

    # As before, posts beyond the number of available dates are ignored.
    posts = row["post"].split("<sep>")[:len(post_dates)]

    bodies = []
    for post in posts:
        if len(post) < 5:
            # Too short to carry information; it also does not consume a reply number.
            continue
        if len(post) > 800:
            bodies.append(post[:800] + "<rest of post truncated>\n\n")
        else:
            bodies.append(post + "\n\n")

    # Put reply headers only *between* kept posts. This replaces the old
    # "append a header after every post, then slice the last one off" approach,
    # which chopped into the "### Original post:" line when no post was kept.
    for index, body in enumerate(bodies):
        if index:
            thread += f"### Reply {index}:\n"
        thread += body

    if len(thread) > 4000:
        thread = thread[:4000] + "<rest of thread truncated>\n"

    return last_date, thread

In [9]:
# Forum sections whose cleaned dumps are loaded; each one is expected at
# cleaned-data/<name>.pkl.gz.
categories = [
    "groupbuys",
    "hardware",
    "miners",
    "mining",
    "mining_support",
    # "pools",
]

# Load every category dump, tag its rows with the category name, and combine
# them with a single concat (growing a frame inside the loop re-copies all
# previously loaded rows on every pass).
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# run this on dumps you produced yourself.
category_frames = []
for cat in categories:
    with gzip.open('cleaned-data/'+cat+'.pkl.gz', 'rb') as f:
        df_cat = pickle.load(f)  # presumably a DataFrame of threads; verify against the dump writer
    df_cat['category'] = cat
    category_frames.append(df_cat)

# pd.concat rejects an empty list, so keep the old "empty frame" result for that case.
df = pd.concat(category_frames, ignore_index=True) if category_frames else pd.DataFrame()

path = "datasets/extracted/"
file_name = "1_original.csv"

# Resume from an existing results file if present, otherwise start with an
# empty table that has the expected columns.
if os.path.exists(path+file_name):
    dataset = pd.read_csv(path+file_name)
else:
    dataset = pd.DataFrame(columns=['date','hardware_name','owned'])

# Swap in `df.sample(30)` here for a quick trial run on a subset.
rows = df

In [10]:
# Selection thresholds: threads whose raw "post" field is shorter than
# MIN_POST_CHARS, or whose date is before MIN_YEAR, are not sent to the model.
MIN_POST_CHARS = 100
MIN_YEAR = 2018

# The prompt ends with the opening of the JSON answer, so the completion only
# contains the continuation. This prefix is glued back on before parsing.
JSON_PREFIX = "[{\"hardware_name\": \""

# --- Stage 1: pick and format the threads to process ------------------------
dates, threads = [], []
for j in range(len(rows)):
    row = rows.iloc[j]
    if len(row["post"]) < MIN_POST_CHARS:
        continue

    date, thread = process_thread(row)

    if int(date[:4]) < MIN_YEAR:
        continue

    dates.append(date)
    threads.append(thread)

# Create the output directory up front so the first checkpoint write cannot
# fail after an API call has already been made.
os.makedirs(path, exist_ok=True)

# --- Stage 2: query the model and collect ownership probabilities -----------
for (date, thread) in tqdm(zip(dates, threads), total=len(dates)):

    prompt = f"""
In the given Bitcoin forum thread, pay close attention to the language used when mentioning hardware pieces.

{thread}



Reply with a formatted JSON document containing an array of objects. Each object should represent a piece of hardware mentioned in the thread and include the following fields: 
- hardware_name: A string containing the name of the hardware. 
- hardware_is_owned: A boolean. If the mention suggests concrete ownership by any user, write true. If the hardware is discussed in a hypothetical or speculative way, write false. 


[
  {{
    "hardware_name": "
""".strip()

    # Keep the raw completion under its own name: `response` is rebound to the
    # parsed JSON below, and the raw text is still needed for error reporting.
    raw_response, logprobs = get_openai_response(prompt, tokens = 500)

    # `logprobs` holds one dict per generated token, mapping candidate tokens
    # to log-probabilities, e.g.
    #   [{"IN": -0.005, "T": -6.28, ...},
    #    {" true": -0.35, " false": -1.22, "true": -7.55, "false": -7.85}]

    try:
        response = json.loads(JSON_PREFIX + raw_response)
    except json.JSONDecodeError:
        # Malformed or cut-off answer: skip this thread (best effort).
        continue

    try:
        hardware_names = [item['hardware_name'] for item in response]
    except (KeyError, TypeError):
        # Valid JSON but not the requested list-of-objects shape: skip as well
        # rather than aborting a multi-hour run.
        continue

    # Every token position where " true" is among the candidates is treated as
    # one hardware_is_owned value; P(true) is renormalised over {" true", " false"}.
    true_probabilities = []
    for logprob in logprobs:
        if " true" in logprob:
            true_logprob = logprob[" true"]
            false_logprob = logprob.get(" false", float('-inf'))
            probs = F.softmax(torch.tensor([true_logprob, false_logprob]), dim=0)
            true_probabilities.append(probs[0].item())

    if len(hardware_names) != len(true_probabilities):
        print("ERROR: length of hardware names and probabilities do not match")
        print(hardware_names)
        print(true_probabilities)
        # Print the raw text here: concatenating the parsed list to a str
        # raised TypeError on this path.
        print("\n\n\n\n\n"+prompt + raw_response)
        continue

    new_rows = pd.DataFrame({
        'date': [date] * len(hardware_names),
        'hardware_name': hardware_names,
        'owned': true_probabilities
    })

    # Concatenating onto a still-empty table is what triggered the pandas
    # warning that used to be silenced globally; skip that case instead.
    if dataset.empty:
        dataset = new_rows
    else:
        dataset = pd.concat([dataset, new_rows], ignore_index=True)

    # Checkpoint after every thread so an interrupted run keeps its results.
    dataset.to_csv(path+file_name, index=False)

100%|██████████| 3606/3606 [2:41:45<00:00,  2.69s/it]  


In [11]:
# Show the accumulated extraction table (date, hardware_name, owned) as the cell output.
dataset

Unnamed: 0,date,hardware_name,owned
0,2018-01-05 23:21:15,E10 18T,0.090441
1,2018-01-05 23:21:15,Ebit,0.633453
2,2018-01-05 23:21:15,s9,0.235280
3,2018-01-19 04:11:17,Sidehack 15gh BM1384 stick,0.649974
4,2018-01-19 04:11:17,2pac,0.591880
...,...,...,...
23725,2023-10-27 16:39:20,PC,0.911859
23726,2023-10-27 16:39:20,Avalonminer 1166 pro,0.974937
23727,2023-10-27 16:39:20,DIY soundproof box,0.203321
23728,2023-10-27 16:39:20,VPN,0.076580


In [12]:
# Debug peek: `response` is whatever the extraction loop above last assigned to it,
# so this only works after that cell has run in the same kernel session.
print(response)

[{'hardware_name': 'Awesomeminer', 'hardware_is_owned': False}, {'hardware_name': 'FMS tool', 'hardware_is_owned': False}, {'hardware_name': 'VPN', 'hardware_is_owned': False}, {'hardware_name': 'SSH tunnel', 'hardware_is_owned': False}, {'hardware_name': 'TeamViewer', 'hardware_is_owned': False}, {'hardware_name': 'PC', 'hardware_is_owned': True}, {'hardware_name': 'Avalonminer 1166 pro', 'hardware_is_owned': True}, {'hardware_name': 'DIY soundproof box', 'hardware_is_owned': False}, {'hardware_name': 'VPN', 'hardware_is_owned': False}, {'hardware_name': 'ASIC miners', 'hardware_is_owned': True}]
