In [None]:
import pandas as pd
import os
import gzip
import pickle
import openai
import torch.nn.functional as F
import torch

In [None]:
# Forum sections to load. Each one is read from the gzip-compressed pickle
# at cleaned-data/<category>.pkl.gz.
categories = [
    "groupbuys",
    "hardware",
    "miners",
    "mining",
    "mining_support",
    "pools",
]

# Load every category's pickle, tag its rows with the category name, and
# combine them with a single concat. Concatenating inside the loop would
# re-copy all previously accumulated rows on every iteration.
frames = []
for cat in categories:
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only point this at files you produced or otherwise trust.
    with gzip.open('cleaned-data/' + cat + '.pkl.gz', 'rb') as f:
        df_cat = pickle.load(f)  # presumably a DataFrame - it is column-assigned and concatenated below
    df_cat['category'] = cat
    frames.append(df_cat)

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    

In [None]:
# Preview ten randomly chosen rows of the combined frame.
df.sample(n=10)

In [None]:
def get_openai_response(prompt, tokens = 2000, model="gpt-3.5-turbo-instruct"):
    """Send ``prompt`` to an OpenAI model and return its answer.

    Args:
        prompt: Text sent to the model.
        tokens: Value passed as ``max_tokens``.
        model: Model name. Names containing ``"instruct"`` go through
            ``openai.Completion``; all others go through
            ``openai.ChatCompletion``.

    Returns:
        For instruct models, a dict with:
            ``"yes"`` / ``"no"``: log-probabilities of the tokens ``"Yes"``
                and ``"No"`` at the first generated position. A token that
                is absent from the returned top log-probabilities is
                reported as ``-inf``.
            ``"positive"``: softmax over ``[yes, no]``, i.e. the probability
                of "Yes" relative to "No".
        For every other model, the reply text as a ``str``.

    Raises:
        KeyError: Instruct models only - neither ``"Yes"`` nor ``"No"`` is
            among the first position's top log-probabilities.
    """
    if "instruct" in model:
        response = openai.Completion.create(
            model=model,
            prompt=prompt,
            temperature=0,
            max_tokens=tokens,
            top_p=1,
            logit_bias = {
                "198": -100, # new lines
                },
            logprobs= 15,
        )
        # Candidate tokens (and their log-probabilities) for the first
        # generated position only.
        first_token_logprobs = response.choices[0].logprobs.top_logprobs[0]

        if "Yes" not in first_token_logprobs and "No" not in first_token_logprobs:
            raise KeyError(
                "neither 'Yes' nor 'No' is among the first-token candidates: "
                + repr(sorted(first_token_logprobs))
            )

        # A token missing from the candidate list is treated as having zero
        # probability, so the softmax below still yields a usable 0.0 / 1.0
        # instead of aborting the whole run with a bare KeyError.
        absent = float("-inf")
        yes = first_token_logprobs.get("Yes", absent)
        no = first_token_logprobs.get("No", absent)

        positive = F.softmax(torch.tensor([yes, no]), dim=0)[0].item()
        return {"yes": yes, "no": no, "positive": positive}

    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant"
            },
            {
                "role": "user",
                "content": prompt
            }
            ],
        temperature=0,
        max_tokens=tokens,
        top_p=1,
    )
    return response.choices[0].message.content

In [None]:
# Instruction text placed in front of every formatted thread.
prompt = """
You are a bitcoin mining expert.

You will be given a thread from a bitcoin forum.

Your task is to analyze if the thread contains a name of mining hardware. 

Example hardware names are ARM Cortex A9, X6500, AvalonMiner 1, Jupiter, RockerBox, BE300, PickAxe, Antminer S9 (there are many more)
Sometimes common hardware is mentioned by its number only. For example, "5870" may refer to a Radeon card. "S9" may refer to the Antminer S9. Those are also valid hardware names.

If the thread contains a name of mining hardware, say "Yes", followed by its name. Otherwise, say "No".

The thread:
""".strip()

N_SAMPLED_THREADS = 2     # rows drawn at random from df per run
MIN_THREAD_CHARS = 100    # rows whose raw "post" field is shorter are skipped
MIN_POST_CHARS = 5        # individual posts shorter than this are dropped
MAX_POST_CHARS = 800      # longer posts are cut and marked as truncated
MAX_THREAD_CHARS = 4000   # longer formatted threads are cut and marked as truncated


def _format_thread(row):
    """Render one row as the plain-text thread shown to the model.

    ``row["post"]`` and ``row["dates"]`` are ``<sep>``-joined strings. The
    result is a category/topic/date header followed by an "### Original post:"
    section and numbered "### Reply N:" sections.

    Returns:
        The formatted thread, or ``None`` when no post is long enough to keep.
    """
    dates = row["dates"].split("<sep>")
    header = (
        "category: " + row["category"] + "\n"
        + "topic: " + row["topic"] + "\n"
        # First 7 characters of the last date entry (presumably "YYYY-MM").
        + "date: " + dates[-1][:7] + "\n\n"
    )

    sections = []
    # zip() stops at the shorter list, so posts without a matching date entry
    # are ignored; the date values themselves are not used.
    for post, _date in zip(row["post"].split("<sep>"), dates):
        if len(post) < MIN_POST_CHARS:
            continue
        if sections:
            label = f"### Reply {len(sections)}:\n"
        else:
            label = "### Original post:\n"
        if len(post) > MAX_POST_CHARS:
            body = post[:MAX_POST_CHARS] + "<rest of post truncated>\n\n"
        else:
            body = post + "\n\n"
        sections.append(label + body)

    if not sections:
        return None

    # Each label is emitted together with its post, so there is never a
    # dangling "### Reply N:" header that has to be sliced off afterwards.
    thread = header + "".join(sections)
    if len(thread) > MAX_THREAD_CHARS:
        thread = thread[:MAX_THREAD_CHARS] + "<rest of thread truncated>\n"
    return thread


# One {"thread", "response"} record per sampled thread that passes the filters.
dataset = []

for _, row in df.sample(N_SAMPLED_THREADS).iterrows():
    if len(row["post"]) < MIN_THREAD_CHARS:
        continue

    thread = _format_thread(row)
    if thread is None:
        # Every post was too short to keep; nothing to classify.
        continue

    actual_prompt = prompt + "\n" + thread + "\n\n\n\n"

    response = get_openai_response(actual_prompt, tokens = 3)

    dataset.append({
        "thread": thread,
        "response": response,
    })

In [None]:
# Display the collected {"thread", "response"} records.
dataset

In [None]:
# Save the dataset. torch.save does not create missing parent directories,
# so make sure the output folder exists first.
os.makedirs("datasets", exist_ok=True)
torch.save(dataset, "datasets/hardware_name.pt")

# testing

In [None]:
# prompt = """
# You are a bitcoin mining expert.

# You will be given a thread from a bitcoin forum.

# Your task is to analyze if the thread contains a name of mining hardware. 

# Example hardware names are ARM Cortex A9, X6500, AvalonMiner 1, Jupiter, RockerBox, BE300, PickAxe, Antminer S9 (there are many more)
# Sometimes common hardware is mentioned by its number only. For example, "5870" may refer to a Radeon card. "S9" may refer to the Antminer S9. Those are also valid hardware names.

# If the thread contains a name of mining hardware, say "Yes", followed by its name. Otherwise, say "No".

# The thread:
# """.strip()

# # thread = """
# # category: mining_support
# # topic: Any help? - 5970 not hitting ~700 mhash/s
# # date: 2012-02

# # original post:
# # Just got my 5970 in the mail and got it set up and it's getting me only ~550 mhash/s using GUIMiner.

# # reply 1:
# # Look for SDK 2.1.
# # """.strip()

# # thread = """
# # category: pools
# # topic: Ghash.io good or bad
# # date: 2014-01
# # original post:
# # do you think that ghash.iois good or bad pool??//

# # reply 1:
# # The pool is not bad but be prepared to sometime lose and other times to win. As a mining pool not bad as very good total hash.

# # reply 2:
# # Good if we use great hardware for mining :d
# # Nothing wrong with the pool itself, but their closing in on 51% is VERY BAD.
# # """.strip()

# thread = """
# category: mining
# topic: Recommended Safety & Cooling
# date: 2015-10

# ### Original post:
# Hey Again Everyone!Well, I am up and running now! I did shutdown my unit last night because the units fan was LOUD! I want to make sure I am running this safe and have proper cooling! I have a extra room in the grudge that is air conditioned but I would need to connect the S5 via Wi-Fi. Any tips?
# """.strip()

# actual_prompt = prompt + "\n" + thread + "\n\n\n\n\n\n\n\n" + ""

# print(actual_prompt)


# response = get_openai_response(actual_prompt, tokens = 10)
# # response = get_openai_response(prompt + "\n" + thread, tokens = 3, model="gpt-4")
# print(response)