In [6]:
import pandas as pd
import os
import gzip
import pickle
import openai
import torch.nn.functional as F
import torch

In [7]:
def get_openai_response(prompt, tokens = 2000, model="gpt-4"):
    """Send a single user prompt to the OpenAI chat completion API and return the reply text.

    Args:
        prompt: Text placed in the "user" message of the conversation.
        tokens: Value forwarded as ``max_tokens`` to the API call.
        model: Model identifier forwarded to the API call
            (e.g. "gpt-4", or "gpt-3.5-turbo" as an alternative).

    Returns:
        The ``content`` of the message in the first returned choice.
    """
    # Fixed system prompt followed by the caller's prompt as the only user turn.
    system_message = {"role": "system", "content": "You are a helpful assistant"}
    user_message = {"role": "user", "content": prompt}

    completion = openai.ChatCompletion.create(
        model=model,
        messages=[system_message, user_message],
        temperature=0,
        max_tokens=tokens,
        top_p=1,
    )

    # Only the first choice is used.
    return completion.choices[0].message.content

In [8]:
# Load the extracted name/price rows and order them by the 'date' column.
df = pd.read_csv('datasets/extracted/name_and_price.csv').sort_values(by=['date'])

# Keep only the rows whose hardware_price text does not contain "unknown".
# NOTE(review): the explicit comparison against False also decides what happens to rows
# where str.contains does not yield a plain True/False (e.g. a missing price) — confirm
# that this is the intended handling for such rows.
price_is_unknown = df['hardware_price'].str.contains("unknown")
df = df[price_is_unknown.eq(False)]

# Write the filtered rows out (without the index) for the next step.
df.to_csv('datasets/extracted/name_and_price_no_unknown.csv', index=False)

# remove ambiguous hardware

In [9]:
# Read the filtered CSV back as one raw string so it can be embedded verbatim in the prompt.
with open('datasets/extracted/name_and_price_no_unknown.csv', 'r') as file:
    data = file.read()


prompt_start = """
I have analyzed posts from a bitcoin forum and created this csv:
""".strip()

prompt_end = """
Rewrite this csv, so that it includes a new column called "clean"

Rows should get the value "true" if:
The hardware name is well defined and not ambiguous
The hardware name is something that is directly used for mining bitcoin, like an ASIC or a GPU
The hardware name does not appear to be a subscription or a piece of software
It is realistically possible to obtain the hashrate of this hardware by looking it up from a table

Other rows should get the value "false"

""".strip()


# Prompt layout: intro line, blank line, the CSV text, blank line, instructions.
actual_prompt = prompt_start + "\n\n" + data + "\n\n" + prompt_end

print(actual_prompt)

response = get_openai_response(actual_prompt) # the response is a csv as a string with columns date,hardware_name,hardware_price,clean


# Save the raw model response, then parse it back with pandas.
with open('datasets/extracted/name_and_price_no_ambiguous.csv', 'w') as file:
    file.write(response)

df = pd.read_csv('datasets/extracted/name_and_price_no_ambiguous.csv')

# Keep only the rows flagged as clean.
# The flag is normalised to lower-case text before comparing: read_csv only yields real
# booleans when every value in the column is a recognised true/false token. A single stray
# value from the model (extra whitespace, different wording) leaves the column as strings,
# and a direct `== True` comparison would then silently match no rows at all.
is_clean = df['clean'].astype(str).str.strip().str.lower() == 'true'
df = df[is_clean]

# The helper column is not part of the dataset; drop it before saving.
df = df.drop(columns=['clean'])

# Overwrite the raw response with the cleaned rows.
df.to_csv('datasets/extracted/name_and_price_no_ambiguous.csv', index=False)

I have analyzed posts from a bitcoin forum and created this csv:

date,hardware_name,hardware_price
2013-06-09,Bitfury-Metabank ASIC,3000$
2013-06-17,Avalon chips,40 EUR
2013-07-01,BFL,0.29 BTC
2013-07-09,ASICMiner USB,0.99 BTC
2013-07-12,BFL ASIC Chips,$59/chip
2013-07-16,ASICMINER ERUPTER USB,1 BTC
2013-07-23,KnCMiner Jupiter,1.2 BTC
2013-07-23,KnCMiner Jupiter,1.1 BTC
2013-08-16,KNCminer Jupiter,4 bitcoin per share
2013-08-29,ASICMINER USB Block Erupter,0.4BTC
2013-08-30,CoinTerra,1 BTC/12 GH/s
2013-09-03,Hashfast Babyjet,1.09 btc
2013-09-04,NEW Blade Miners,4 BTC or less
2013-09-04,CoinTerra,"$13,999"
2013-09-09,XCrowd Cloud Hosting,"$85,000 USD"
2013-10-08,BFL ASICs,$12.5/(GH/s)
2013-10-18,Terraminer IV,1.5BTC
2013-10-26,Black Arrow Bullet Run,$123
2013-11-08,ASICMiner Blades (Rev2),.80 BTC
2013-11-28,Neptune GB,4 BTC
2013-12-02,KNCminer Neptune,0.66 BTC per share
2013-12-14,KnC NEPTUNE,0.05 B/Sh
2014-02-04,ANTMINER S1 Dual Blades,1.35btc
2014-02-05,Neptune,0.07 BTC
2014-02-17,Ant

# fix price formatting

In [11]:
# Load the CSV produced by the previous step as one string so it can be pasted into the prompt.
with open('datasets/extracted/name_and_price_no_ambiguous.csv', 'r') as csv_in:
    data = csv_in.read()

prompt_start = """
I have analyzed posts from a bitcoin forum and created this csv:
""".strip()

prompt_end = """
Rewrite the csv with the following change:
Rewrite the price in a standard format, use this format for all rows: "1234$" or "1234BTC" or "1234€" etc. depending on the currency
If a price is per unit, keep only the price
If a price is a range, keep only the first price
If a price has no currency, assume it is in $


""".strip()

# Intro, CSV text and instructions, each separated by one blank line.
actual_prompt = "\n\n".join([prompt_start, data, prompt_end])

print(actual_prompt)

response = get_openai_response(actual_prompt)

# Store the model's reply as-is; it is not parsed or validated here.
with open('datasets/extracted/name_and_price_standardized.csv', 'w') as csv_out:
    csv_out.write(response)

I have analyzed posts from a bitcoin forum and created this csv:

date,hardware_name,hardware_price
2013-06-09,Bitfury-Metabank ASIC,3000$
2013-06-17,Avalon chips,40 EUR
2013-07-09,ASICMiner USB,0.99 BTC
2013-07-12,BFL ASIC Chips,$59/chip
2013-07-16,ASICMINER ERUPTER USB,1 BTC
2013-07-23,KnCMiner Jupiter,1.2 BTC
2013-07-23,KnCMiner Jupiter,1.1 BTC
2013-08-29,ASICMINER USB Block Erupter,0.4BTC
2013-09-03,Hashfast Babyjet,1.09 btc
2013-10-08,BFL ASICs,$12.5/(GH/s)
2013-10-18,Terraminer IV,1.5BTC
2013-11-08,ASICMiner Blades (Rev2),.80 BTC
2014-02-04,ANTMINER S1 Dual Blades,1.35btc
2014-02-17,AntminerS1,0.0205BTC
2014-07-11,TECHNOBIT's HEX16A,"49,30"
2017-07-05,Antminer S7,$400
2017-07-05,Antminer S5,$250
2022-02-27,Bitmain S17 pro 56th,$35
2023-09-08,Antminer S9,$25-35


Rewrite the csv with the following change:
Rewrite the price in a standard format, use this format for all rows: "1234$" or "1234BTC" or "1234€" etc. depending on the currency
If a price is per unit, keep only the price
I