In [32]:
import pandas as pd
import os
import gzip
import pickle
from openai import OpenAI
import torch.nn.functional as F
import torch
from tqdm import tqdm
import json

In [33]:
# Shared OpenAI client used by every request in this notebook. The key is read
# from the OPENAI_API_KEY environment variable; passing it explicitly mirrors
# the library default and could be omitted.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [34]:
def get_openai_response(prompt, model="gpt-4-turbo-preview", max_tokens=1000, top_logprobs=5):
    """Send `prompt` as a single user message and return the reply with its token log-probabilities.

    The request is made through the module-level `client` with `temperature=0`
    and `top_p=1`, and asks the API to return log-probabilities for each
    generated token together with the `top_logprobs` most likely alternatives
    at each position.

    Args:
        prompt: Text sent as the user message.
        model: Name of the chat model to query.
        max_tokens: Upper bound on the number of generated tokens.
        top_logprobs: How many alternative tokens to report per position.

    Returns:
        A `(text, logprobs)` tuple. `text` is the message content of the first
        choice. `logprobs` is the list of per-token entries of that choice
        (each exposing `token`, `bytes`, `logprob` and `top_logprobs`), or an
        empty list when the response carries no log-probability data.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": prompt},
        ],
        temperature=0,
        max_tokens=max_tokens,
        top_p=1,
        logprobs=True,
        top_logprobs=top_logprobs,
    )
    choice = response.choices[0]

    text = choice.message.content
    # Both `choice.logprobs` and its `.content` are optional in the response;
    # fall back to an empty list so callers can always iterate the result.
    token_logprobs = choice.logprobs.content if choice.logprobs is not None else None

    return text, (token_logprobs if token_logprobs is not None else [])

In [35]:
def process_thread(row, max_post_chars=800, max_thread_chars=4000, min_post_chars=5):
    """Render one forum thread row as a plain-text prompt section.

    `row["post"]` and `row["dates"]` are `<sep>`-joined strings; `row["topic"]`
    is the thread title. The rendered text starts with a `Date:` line (first
    7 characters of the last date), a `Topic:` line and an
    `### Original post:` header, followed by the kept posts separated by
    `### Reply N:` headers.

    Args:
        row: Mapping (dict or DataFrame row) with "post", "dates" and "topic".
        max_post_chars: Posts longer than this are cut and marked as truncated.
        max_thread_chars: The rendered thread is cut to this length and marked.
        min_post_chars: Posts shorter than this are dropped entirely.

    Returns:
        `(date, thread)`: the last entry of `row["dates"]` and the rendered text.
    """
    dates = row["dates"].split("<sep>")
    posts = row["post"].split("<sep>")
    # The thread is dated by its last entry; keep it in its own name so the
    # loop below cannot overwrite the value that is returned.
    thread_date = dates[-1]

    thread = ""
    thread += "Date: " + thread_date[:7] + "\n"
    thread += "Topic: " + row["topic"] + "\n"
    thread += "### Original post:\n"

    reply_no = 1
    trailing_header = ""
    # zip() is kept so that posts without a matching date are still ignored.
    for post, _ in zip(posts, dates):
        if len(post) < min_post_chars:
            continue
        if len(post) > max_post_chars:
            thread += post[:max_post_chars] + "<rest of post truncated>\n\n"
        else:
            thread += post + "\n\n"
        # Optimistically open the header for the next reply; the final one is
        # removed after the loop because nothing follows it.
        trailing_header = f"### Reply {reply_no}:\n"
        thread += trailing_header
        reply_no += 1

    # Only strip when a header was actually appended; otherwise the slice
    # would eat into the "### Original post:" line.
    if trailing_header:
        thread = thread[:-len(trailing_header)]
    if len(thread) > max_thread_chars:
        thread = thread[:max_thread_chars] + "<rest of thread truncated>\n"

    return thread_date, thread

In [36]:
# Forum categories to load; each one maps to `<category>.pkl.gz` in the
# cleaned-data folder.
categories = [
    "groupbuys",
    "hardware",
    "miners",
    "mining",
    "mining_support",
    # "pools",
]

# Load every gzipped pickle and tag its rows with the category it came from.
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only run this against files you produced yourself or otherwise trust.
category_frames = []
for cat in categories:
    with gzip.open('../1_forum_dataset/cleaned-data/'+cat+'.pkl.gz', 'rb') as f:
        df_cat = pickle.load(f)
    df_cat['category'] = cat
    category_frames.append(df_cat)

# Concatenate once instead of growing a frame inside the loop: repeated concat
# re-copies all previous rows each time and starts from an empty frame.
df = pd.concat(category_frames, ignore_index=True) if category_frames else pd.DataFrame()

# Location of the CSV that accumulates extraction results across runs.
path = "./"
file_name = "1_extracted.csv"

# Resume from an existing results file when there is one, otherwise start empty.
if not os.path.exists(path+file_name):
    dataset = pd.DataFrame(columns=['date','hardware_name','owned'])
else:
    dataset = pd.read_csv(path+file_name)

# Threads to process; swap in `df.sample(30)` for a quick random subset.
rows = df

In [40]:
# Number of leading rows of `rows` to turn into prompts; use len(rows) for a full run.
MAX_THREADS = 50


def _ownership_probabilities(token_logprobs):
    """Return P(" true") for every token position where " true" is a listed candidate.

    For each generated token the reported alternatives are scanned; whenever
    " true" is among them, a softmax over the log-probabilities of " true" and
    " false" (treated as -inf when " false" is not listed) yields the
    probability assigned to " true" relative to " false".
    """
    probabilities = []
    for token_info in token_logprobs:
        candidates = {alt.token: alt.logprob for alt in (token_info.top_logprobs or [])}
        if " true" not in candidates:
            continue
        true_logprob = candidates[" true"]
        false_logprob = candidates.get(" false", float('-inf'))
        # Softmax over just these two values renormalises them to sum to 1.
        probs = F.softmax(torch.tensor([true_logprob, false_logprob]), dim=0)
        probabilities.append(probs[0].item())
    return probabilities


# Step 1: render the selected threads as text, skipping rows with almost no content.
dates, threads = [], []
# Bound by len(rows) so a dataset smaller than MAX_THREADS does not raise IndexError.
for j in range(min(MAX_THREADS, len(rows))):
    row = rows.iloc[j]
    if len(row["post"]) < 20:
        print(f"skipping {j}")
        continue

    date, thread = process_thread(row)
    dates.append(date)
    threads.append(thread)


# Step 2: query the model for each thread and append the parsed results to `dataset`.
# enumerate() keeps the logged id in step with the thread even when an
# iteration bails out early via `continue`.
for threadid, (date, thread) in enumerate(zip(dates, threads)):

    print(f"processing thread id {threadid}, date: "+ date +" thread: "+ thread + "\n\n")

    prompt = f"""
User:
In the given Bitcoin forum thread, pay close attention to the language used when mentioning hardware pieces. Look for explicit statements indicating ownership or hypothetical discussions.

{thread}



Reply with a formatted JSON document containing an array of objects. Each object should represent a piece of hardware mentioned in the thread and include the following fields:
- hardware_name: A string containing the name of the hardware.
- hardware_is_owned: A boolean. If the mention suggests concrete ownership by any user, write true. If the hardware is discussed in a hypothetical or speculative way, write false. 


A:
Sure! Here is the requested JSON file, with the correct ownership status for each piece of hardware:
""".strip()

    # Keep the raw text under its own name: `response` is reserved for the
    # parsed JSON, and the raw text is still needed for error reporting below.
    response_text, logprobs = get_openai_response(prompt)
    if response_text is None:
        print("ERROR: model returned no text content")
        continue

    print("model response: \n\n"+response_text+"\n\n\n")

    # The model usually wraps its answer in a ```json fence; strip it before parsing.
    try:
        response = json.loads(response_text.replace("```json\n","").replace("\n```","").strip())
    except json.JSONDecodeError:
        print("ERROR: could not parse response as JSON")
        continue

    # Guard against valid JSON of an unexpected shape (e.g. a single object).
    if not isinstance(response, list) or not all(
        isinstance(item, dict) and "hardware_name" in item for item in response
    ):
        print("ERROR: response JSON is not a list of hardware objects")
        continue

    true_probabilities = _ownership_probabilities(logprobs or [])
    hardware_names = [item['hardware_name'] for item in response]

    # Probabilities are matched to hardware entries purely by position, so the
    # two lists must have the same length for the pairing to be meaningful.
    if len(hardware_names) != len(true_probabilities):
        print("ERROR: length of hardware names and probabilities do not match")
        print(hardware_names)
        print(true_probabilities)
        print("\n\n\n\n\n"+prompt + response_text)
        continue

    print("probabilities:")
    for (name, prob) in zip(hardware_names, true_probabilities):
        print(name, prob)
    print("\n\n\n\n\n")

    # One output row per hardware mention, all stamped with the thread date.
    new_rows = pd.DataFrame({
        'date': [date] * len(hardware_names),
        'hardware_name': hardware_names,
        'owned': true_probabilities
    })

    # Never concatenate an empty frame: that is what used to emit pandas
    # warnings, which were previously hidden by silencing all warnings globally.
    if not new_rows.empty:
        if dataset.empty:
            dataset = new_rows
        else:
            dataset = pd.concat([dataset, new_rows], ignore_index=True)

    # Persist after every thread so an interrupted run keeps its progress.
    dataset.to_csv(path+file_name, index=False)

skipping 0
skipping 1
skipping 2
skipping 3
skipping 4
skipping 5
skipping 6
skipping 7
skipping 8
skipping 9
skipping 10
skipping 11
skipping 12
skipping 13
skipping 14
skipping 15
skipping 16
skipping 17
skipping 18
skipping 19
skipping 20
skipping 21
skipping 22
skipping 23
skipping 24
skipping 25
processing thread id 0, date: 2013-04-01 01:56:04 thread: Date: 2013-04
Topic: Looking for two people to group buy Avalon unit in Batch3
### Original post:
You should verify yourself with a senior or preferred hero member being that you are pretty new. Due Diligence please.

### Reply 1:
Got few PMs with some interesting counter offers, at this point just looking to pair up with two people @ 25 coins each, no smaller amounts.There are few other Group buys for people who are interested in smaller coin contributions.Art

### Reply 2:
to late. sold out

### Reply 3:
Its been sold out on and off for last 4 hours I also got one in my profile on hold. Either way if i dont get one with two other 

In [38]:
# Display the accumulated extraction results (date, hardware_name, owned).
dataset

Unnamed: 0,date,hardware_name,owned
0,2013-03-25 16:42:09,Avalon batch #3 Avalon ASIC,2.430024e-05
1,2013-03-25 13:14:29,Avalon ASIC,7.33821e-07
2,2013-03-25 14:26:29,Avalon asic 85,1.342001e-05
3,2013-03-25 16:42:09,Avalon batch #3 Avalon ASIC,2.2828e-05
4,2013-03-25 22:23:10,Avalon unit,0.0001713082
5,2013-03-25 23:51:49,AVALON,3.647854e-05
6,2013-03-25 23:51:49,unit,0.994615
7,2013-03-26 00:19:57,Avalon batch #3,0.01205378
8,2013-03-26 00:19:57,3-module one,0.9999982
9,2013-03-26 00:41:12,Avalon,7.183261e-06


In [39]:
# Inspect `response` as left behind by the most recent run of the extraction loop.
print(response)

[]
