In [None]:
import pandas as pd
import os
import gzip
import pickle
import openai
import torch.nn.functional as F
import torch

In [None]:
# Forum sections to load; each one is stored as cleaned-data/<name>.pkl.gz
categories = [
    "groupbuys",
    "hardware",
    "miners",
    "mining",
    "mining_support",
    "pools",
]

# Load the gzip-compressed pickle of every category, tag its rows with the
# category name, and collect the frames so they can be stacked in one pass.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only run this cell against files from a trusted source.
category_frames = []
for cat in categories:
    with gzip.open('cleaned-data/'+cat+'.pkl.gz', 'rb') as f:
        df_cat = pickle.load(f)
    df_cat['category'] = cat
    category_frames.append(df_cat)

# A single concat avoids re-copying the accumulated rows on every iteration
# (the previous concat-inside-the-loop pattern was quadratic in total rows).
df = pd.concat(category_frames, ignore_index=True)
    

In [None]:
def get_openai_response(prompt, tokens = 2000, model="gpt-3.5-turbo-instruct"):
    """Send `prompt` to OpenAI and return the text of the first choice.

    Models whose name contains "instruct" are called through
    `openai.Completion.create`; every other model is called through
    `openai.ChatCompletion.create` with a fixed system message followed by
    the prompt as the user message. Both calls use temperature 0 and top_p 1.

    Args:
        prompt: The text to send to the model.
        tokens: Value passed as `max_tokens` (upper bound on generated tokens).
        model: Model name; also selects which endpoint is used.

    Returns:
        The generated text of `choices[0]`.
    """
    # Chat-style models: wrap the prompt in a two-message conversation.
    if "instruct" not in model:
        conversation = [
            {
                "role": "system",
                "content": "You are a helpful assistant"
            },
            {
                "role": "user",
                "content": prompt
            },
        ]
        chat_result = openai.ChatCompletion.create(
            model=model,
            messages=conversation,
            temperature=0,
            max_tokens=tokens,
            top_p=1,
        )
        return chat_result.choices[0].message.content

    # Instruct-style models: plain completion request.
    completion = openai.Completion.create(
        model=model,
        prompt=prompt,
        temperature=0,
        max_tokens=tokens,
        top_p=1,
        # token "198" is biased to -100 to suppress new lines in the answer
        logit_bias = {
            "198": -100,
        },
        # log-probabilities are requested, but only the text is read below
        logprobs= 15,
    )
    return completion.choices[0].text

# hardware name

In [None]:
# Where the extracted hardware-name dataset is cached on disk.
path = "datasets/extracted/"
file_name = "hardware_name.csv"

# Text placed before the rendered thread. Written as explicit literals so the
# trailing space at the end of the first line stays visible and intact.
prompt_start = (
    "You are a bitcoin mining expert. Your task is to analyze if a forum thread contains a name of mining hardware. \n"
    "\n"
    "Here is a thread from a bitcoin forum:"
)

# Text placed after the rendered thread: restates the task, gives example
# hardware names, and fixes the answer format the caller parses.
prompt_end = (
    "Your task is to analyze if the thread contains a name of mining hardware. \n"
    "\n"
    "Example hardware names are ARM Cortex A9, X6500, AvalonMiner 1, Jupiter, RockerBox, BE300, PickAxe, Antminer S9 (there are many more)\n"
    'Sometimes common hardware is mentioned by its number only. For example, "5870" may refer to a Radeon card. "S9" may refer to the Antminer S9. Those are also valid hardware names.\n'
    "\n"
    'After reading the thread, write either "Hardware found: Nothing" or "Hardware found: <hardware name>".'
)


def _format_thread(row, max_post_chars=800, max_thread_chars=4000):
    """Render one thread row as the plain-text block that is sent to the model.

    Args:
        row: Mapping with string fields "topic", "post" and "dates"; "post"
            and "dates" hold one entry per forum post joined by "<sep>".
        max_post_chars: Posts longer than this are cut and marked as truncated.
        max_thread_chars: The rendered thread is cut to this many characters
            and marked as truncated.

    Returns:
        A string with a Topic/Date header, the original post, and each further
        post introduced by a "### Reply <n>:" line.
    """
    posts = row["post"].split("<sep>")
    dates = row["dates"].split("<sep>")

    thread = "Topic: " + row["topic"] + "\n"
    # presumably the first 7 characters are the "YYYY-MM" part — TODO confirm date format
    thread += "Date: " + dates[-1][:7] + "\n\n"
    thread += "### Original post:\n"

    sections = []
    # zip() stops at the shorter list, so posts without a date entry are ignored
    for post, _post_date in zip(posts, dates):
        if len(post) < 5:
            # near-empty posts are skipped entirely
            continue
        if len(post) > max_post_chars:
            post = post[:max_post_chars] + "<rest of post truncated>"
        sections.append(post + "\n\n")

    # The first kept post follows the "Original post" header directly; every
    # later one gets its own reply header. Emitting headers only where needed
    # replaces the old "append a header, then slice the last one off" step,
    # which damaged the header when no post was kept.
    for reply_no, section in enumerate(sections):
        if reply_no > 0:
            thread += f"### Reply {reply_no}:\n"
        thread += section

    if len(thread) > max_thread_chars:
        thread = thread[:max_thread_chars] + "<rest of thread truncated>\n"
    return thread


# Build the hardware-name dataset only when no cached CSV exists yet.
if not os.path.exists(path+file_name):
    # Create the output directory up front so a missing folder fails before
    # any API calls are made rather than at the final save.
    os.makedirs(path, exist_ok=True)

    records = []

    # df.sample raises when asked for more rows than the frame has
    sample_size = min(10000, len(df))
    # `_` instead of `id`: avoids rebinding the builtin at notebook scope
    for (_, row) in df.sample(sample_size).iterrows():
        # skip threads with too little text to be worth a request
        if(len(row["post"]) < 100):
            continue

        thread = _format_thread(row)
        print(thread)

        actual_prompt = prompt_start + "\n" + thread + "\n\n\n\n\n\n\n" + prompt_end + "\n\n\n\n\n\n\n"

        response = get_openai_response(actual_prompt, tokens = 20)
        print("response:", response, "\n\n\n\n\n\n")

        # the prompt asks for "Hardware found: Nothing" when no name is present
        if "Nothing" in response:
            continue

        # Date of the last entry in the thread, matching the "Date:" header.
        # Previously this name was overwritten by the per-post loop variable.
        thread_date = row["dates"].split("<sep>")[-1]
        records.append({
            # presumably drops a trailing time-of-day part — TODO confirm date format
            "date": thread_date[:-9],
            # strip surrounding whitespace left by the completion
            "hardware_name": response.replace("Hardware found: ", "").strip(),
        })

        # stop once 21 hits have been collected
        if len(records) > 20:
            break

    # Build the frame once; explicit columns keep it well-formed when no
    # hardware name was found at all.
    dataset = pd.DataFrame(records, columns=["date", "hardware_name"])
    print(dataset.value_counts())

    #save the dataset
    dataset.to_csv(path+file_name, index=True)

# hardware price

In [None]:
# file_name = "hardware_price.csv"

# if not os.path.exists(path+file_name):
#     prompt = """
# You are a bitcoin mining expert.

# You will be given a thread from a bitcoin forum.

# Your task is to analyze if the thread mentions a hardware price.

# After reading the thread, write either "Hardware price found: Nothing" or "Hardware price found: <hardware price>".

# The thread:
#     """.strip()

#     dataset = pd.DataFrame()

#     thread_count = 0
#     for (id,row) in df.sample(10000).iterrows():
#         if(len(row["post"]) < 100):
#             continue
#         # continue

#         date = row["dates"].split("<sep>")[-1]

#         thread = ""
#         # thread += "Category: " + row["category"] + "\n"
#         thread += "Topic: " + row["topic"] + "\n"
#         thread += "Date: " + date[:7] + "\n\n"
#         thread += "### Original post:\n"
#         i = 1
#         for (post, date) in zip(row["post"].split("<sep>"), row["dates"].split("<sep>")):
#             if len(post) > 800:
#                 thread += post[:800] + "<rest of post truncated>\n\n"
#                 thread += f"### Reply {i}:\n"
#                 i += 1
#             elif len(post) < 5:
#                 pass
#             else:
#                 thread += post + "\n\n"
#                 thread += f"### Reply {i}:\n"
#                 i += 1
#         #remove the last line
#         thread = thread[:-len(f"### Reply {i-1}:\n")]
#         if len(thread) > 4000:
#             thread = thread[:4000] + "<rest of thread truncated>\n"

#         # if not "$" in thread.lower():
#         #     continue

#         whitelist = [
#             "$",
#             "usd",
#             "dollar",
#             "€",
#             "eur",
#             "euro",
#             "£",
#             "gbp",
#             "pound",
#             "yen",
#             "jpy",
#             "cny",
#             "rmb",
#             "yuan",
#             "ruble",
#         ]

#         if not any([x in thread.lower() for x in whitelist]):
#             continue

          
#         print(thread)

#         actual_prompt = prompt + "\n" + thread + "\n\n\n\n\n\n\n\n" + ""
#         # print(actual_prompt)

#         response = get_openai_response(actual_prompt, tokens = 20)
#         print("response:", response, "\n\n\n\n\n\n")

#         if "Nothing" in response:
#             continue

#         dataset = pd.concat([dataset, pd.DataFrame([{
#             "date": date[:-9],
#             "hardware_price": response.replace("Hardware price found: ", "").replace("<hardware price>", "").strip(),
#             }])], ignore_index=True)
        
#         thread_count += 1  
#         if thread_count > 20:
#             break
        
#     print(dataset.value_counts())

#     #save the dataset
#     dataset.to_csv(path+file_name, index=True)