In [24]:
import pandas as pd
import os
import gzip
import pickle
import openai
import torch.nn.functional as F
import torch
import re

In [25]:
def get_openai_response(prompt, tokens = 2000, model="gpt-4"):
    """Send a single-turn chat request to OpenAI and return the reply text.

    Args:
        prompt: Text sent as the user message.
        tokens: Upper bound on the number of tokens in the completion.
        model: Chat model name (for example "gpt-4" or "gpt-3.5-turbo").

    Returns:
        The message content of the first choice in the response.
    """
    system_message = {"role": "system", "content": "You are a helpful assistant"}
    user_message = {"role": "user", "content": prompt}

    # temperature=0 together with top_p=1 asks for the least random sampling.
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[system_message, user_message],
        temperature=0,
        max_tokens=tokens,
        top_p=1,
    )
    return completion.choices[0].message.content

In [26]:
# Load the raw extraction, order it chronologically and keep only the rows
# whose price text does not contain the string "unknown".
df = (
    pd.read_csv('datasets/extracted/date_name_price_original.csv')
    .sort_values(by=['date'])
    .loc[lambda frame: frame['hardware_price'].str.contains("unknown") == False]
)

# Snapshot 1: date, name and price for every row with a usable price.
df.to_csv('datasets/extracted/date_name_price_no_unknown_prices.csv', index=False)

# Snapshot 2: only the hardware names remain once date and price are dropped.
df = df.drop(columns=['date', 'hardware_price'])
df.to_csv('datasets/extracted/name_only.csv', index=False)

# GPT-4 processing (prices)

In [27]:
#read the file back as a string
with open('datasets/extracted/date_name_price_no_unknown_prices.csv', 'r') as file:
    table_text = file.read()

prompt = f"""
{table_text}


I have analyzed posts from a bitcoin forum and created this csv.
Rewrite the csv with the following change:
Rewrite the price in a standard format, use this format for all rows: "1234$" or "1234BTC" or "1234€" etc. depending on the currency
If a price is per unit, keep only the price
If a price is a range, keep only the first price
If a price has no currency, assume it is in $


""".strip()



print(prompt)

# response = get_openai_response(prompt) # the response is a csv as a string with columns date,hardware_name,hardware_price,hashrate,efficiency


# # save the response
# with open('datasets/extracted/name_and_price_cleaned.csv', 'w') as file:
#     file.write(response)


# df = pd.read_csv('datasets/extracted/date_name_price_cleaned_prices.csv')


date,hardware_name,hardware_price
2011-03-06,5850,$70
2011-05-06,Amazon EC2 Cluster GPU Instances,$2.10 per hour
2011-05-16,Thermaltake W0133RU ToughPower 1200W Modular Power Supply,$200
2011-05-16,ToughPower 1350W supply,$220
2011-05-16,Intel D975XBX,$55
2011-05-16,Coolermaster 700W Silent Pro,$79
2011-05-22,5850,$185
2011-05-22,5830,$99
2011-06-01,HD5830,$280
2011-06-01,HD5830,$126
2011-06-01,HD5850,$288
2011-06-03,XFX 5830,$109
2011-06-07,5830,$150
2011-06-07,5850,$200-300
2011-07-13,5830,$800
2012-04-21,x6500,107 BTC
2012-04-21,x6500 fpga,$580
2012-05-13,BeagleBone,$89
2012-07-14,BitForce Jalapeno,$150
2013-02-02,Icarus FPGA,26 BTC
2013-03-25,AVALON,$75 and $101
2013-04-02,6870,$80
2013-04-12,BFL 5 gh/s units,$275
2013-04-14,25 GH/s ASIC miner,"$2,000"
2013-04-15,Avalon ASIC,$1200
2013-04-15,ztex 1.15x FGPA,$200
2013-04-27,Avalon ASICs,807.1501152800002 BTC
2013-05-13,Ztex 1.15y,$599
2013-05-19,Avalon Chips,0.079 BTC
2013-05-22,Sapphire 7950 Vapor X,$320
2013-05-30,ASICMiner Erupte

# adding efficiency and hashrate

In [30]:
# Hardware names extracted from the forum posts, as plain text without the
# CSV header word.
with open('datasets/extracted/name_only.csv', 'r') as file:
    table_text = file.read().replace("hardware_name", "").strip()

# Reference list of mining hardware: one "name;hashrate;efficiency" record per line.
with open('hardware.txt', 'r') as file:
    hardware_txt = file.read()

hardware_rows = [line.split(';') for line in hardware_txt.split('\n')]
hardware_df = pd.DataFrame(hardware_rows, columns=['hardware_name', 'hashrate', 'efficiency'])

# Discard multi-card entries: names containing "x2", "x3", ... or the word "cards".
is_multiplied = hardware_df['hardware_name'].str.contains("x[0-9]")
mentions_cards = hardware_df['hardware_name'].str.contains("cards")
hardware_df = hardware_df[~(is_multiplied | mentions_cards)]


def _strip_name_noise(name):
    """Remove bracketed text, "OC" and "<n> GH/s" style suffixes from a hardware name."""
    # NOTE(review): the bare "OC" pattern also removes those letters inside
    # longer words - confirm that is intended before tightening it.
    for pattern in (r"\(.*\)", r"OC", r"\d+ *Gh/s", r"\d+ *GH/S", r"\d+ *GH/s"):
        name = re.sub(pattern, "", name).strip()
    return name


hardware_df['hardware_name'] = hardware_df['hardware_name'].apply(_strip_name_noise)

# After cleaning, several entries can share a name; keep the first of each and
# number the survivors 0..n-1 in an explicit "index" column.
hardware_df = hardware_df.drop_duplicates(subset=['hardware_name']).reset_index(drop=True)
hardware_df["index"] = hardware_df.index
hardware_df.to_csv('hardware_full.csv', index=False)

# Compact "index,hardware_name" lookup table, also read back as raw text.
hardware_df = hardware_df[['index', 'hardware_name']]
hardware_df.to_csv('hardware_index.csv', index=False)

with open('hardware_index.csv', 'r') as file:
    hardware = file.read()

def split_string(s, num_chunks):
    """Split a multi-line string into at most ``num_chunks`` groups of whole lines.

    Every chunk except the last holds ``len(lines) // num_chunks`` lines; the
    last chunk also receives the leftover lines, so each input line appears
    exactly once in the result. When there are fewer lines than requested
    chunks, one chunk per line is returned.

    Args:
        s: Text to split on line boundaries.
        num_chunks: Desired number of chunks; must be at least 1.

    Returns:
        A list of strings, each the newline-joined lines of one chunk. An
        empty input yields an empty list.

    Raises:
        ValueError: If ``num_chunks`` is smaller than 1.
    """
    if num_chunks < 1:
        raise ValueError("num_chunks must be at least 1")

    lines = s.splitlines()
    if not lines:
        return []

    # Never ask for more chunks than there are lines; otherwise the chunk
    # size would be zero.
    num_chunks = min(num_chunks, len(lines))
    chunk_size = len(lines) // num_chunks

    # The first num_chunks - 1 chunks are full-sized; the last one takes
    # everything that remains (its own share plus any leftover lines).
    chunks = [lines[i * chunk_size:(i + 1) * chunk_size] for i in range(num_chunks - 1)]
    chunks.append(lines[(num_chunks - 1) * chunk_size:])
    return ['\n'.join(chunk) for chunk in chunks]

In [33]:
# Build one prompt per chunk of the extracted hardware-name list (split into 3
# parts). Each prompt pairs the full "index,hardware_name" reference table with
# one chunk of names and asks the model to map every name to a table index.
for text in split_string(table_text, 3):

    prompt = f"""
Here is a table of hardware that is used for mining bitcoin:

{hardware}




I have analyzed posts from a bitcoin forum and created this list of hardware:
{text}



Using the hardware table, rewrite this list as a csv with columns: hardware_name,hardware_index
If a hardware name is unclear or not in the table of mining hardware, write "unknown" for the index value. The names don't need to match exactly, use common sense. For example, "Sapphire 7950 Vapor X" refers to the 7950 in the hardware table.

    """.strip()



    # Only the prompt is printed; 100 blank lines follow it, presumably to
    # keep consecutive prompts visually apart in the cell output.
    print(prompt)
    print("\n"*100)

    # NOTE(review): the disabled code below would open the same file in 'w'
    # mode on every iteration, so each chunk's response would overwrite the
    # previous one. The file name also differs from 'name_index.csv', which the
    # join step reads - presumably that file was assembled by hand; TODO confirm.
    # The prompt asks for columns hardware_name,hardware_index.
    # response = get_openai_response(prompt) # the response is a csv as a string


    # # save the response
    # with open('datasets/extracted/name_and_price_cleaned.csv', 'w') as file:
    #     file.write(response)




Here is a table of hardware that is used for mining bitcoin:

index,hardware_name
0,AntMiner S1
1,AntMiner S2
2,AntMiner S3
3,AntMiner S3+
4,AntMiner S4
5,AntMiner S5
6,AntMiner S5+
7,AntMiner S7
8,AntMiner S9
9,AntMiner U1
10,AntMiner U2+
11,AntMiner U3
12,ASICMiner BE Blade
13,ASICMiner BE Cube
14,ASICMiner BE Prisma
15,ASICMiner BE Sapphire
16,ASICMiner BE Tube
17,Avalon Batch 1
18,Avalon Batch 2
19,Avalon Batch 3
20,Avalon2
21,Avalon3
22,Avalon6
23,Avalon721
24,Avalon741
25,Avalon761
26,Avalon821
27,BFL  Rack Mount
28,BFL  Mini Rig SC
29,BFL Little Single
30,BFL Monarch
31,BFL SC
32,BFL Single 'SC'
33,bi*fury
34,BitFury S.B.
35,Bitmine.ch Avalon Clone 85GH
36,Black Arrow Prospero X-1
37,Black Arrow Prospero X-3
38,Blue Fury
39,BTC Garden AM-V1
40,CoinTerra TerraMiner IV
41,Drillbit
42,Ebit E10
43,Ebit E9
44,Ebit E9+
45,Ebit E9++
46,HashBuster Micro
47,HashBuster Nano
48,HashCoins Apollo v3
49,HashCoins Zeus v3
50,HashFast Baby Jet
51,HashFast Sierra
52,HashFast Sierra Evo 3
53,Klon

# join the tables

In [58]:
# Join three tables into one: the name -> hardware index mapping, the cleaned
# date/name/price table, and the hardware reference table with hashrate and
# efficiency.
name_index = pd.read_csv('datasets/extracted/name_index.csv')
hardware_full = pd.read_csv('hardware_full.csv')
date_name_price_cleaned_prices = pd.read_csv('datasets/extracted/date_name_price_cleaned_prices.csv')

# The first two tables are glued together purely by row position, so a row
# count mismatch would silently pair names with the wrong dates and prices.
assert len(name_index) == len(date_name_price_cleaned_prices), (
    "name_index.csv and date_name_price_cleaned_prices.csv must have the same "
    f"number of rows ({len(name_index)} != {len(date_name_price_cleaned_prices)})"
)

# hardware_name is kept from name_index only, to avoid a duplicated column.
date_name_price_cleaned_prices = date_name_price_cleaned_prices.drop(columns=['hardware_name'])

name_index_prices = pd.concat([name_index, date_name_price_cleaned_prices], axis=1)

# The reference table contributes hashrate/efficiency; its own name column
# would clash with the forum name during the merge.
hardware_full = hardware_full.drop(columns=['hardware_name'])

# Use the same key name ("index") on both sides of the merge.
name_index_prices = name_index_prices.rename(columns={"hardware_index": "index"})

# Rows whose hardware could not be matched are marked "unknown"; drop them.
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the original frame.
name_index_prices = name_index_prices[name_index_prices['index'] != "unknown"].copy()

name_index_prices["index"] = name_index_prices["index"].astype(int)
hardware_full["index"] = hardware_full["index"].astype(int)

# Left join: every forum row is kept even if its index is missing from the
# reference table (hashrate/efficiency are then NaN).
name_index_prices = name_index_prices.merge(hardware_full, on='index', how='left')
name_index_prices = name_index_prices[['date', 'hardware_name', 'hardware_price', 'hashrate', 'efficiency']]

# save it
name_index_prices.to_csv('datasets/extracted/full_table.csv', index=False)

In [57]:
# Display the joined table (date, name, price, hashrate, efficiency) to eyeball the merge result.
name_index_prices

Unnamed: 0,date,hardware_name,hardware_price,hashrate,efficiency
0,2011-03-06,5850,70$,411.67,1.9060
1,2011-05-22,5850,185$,411.67,1.9060
2,2011-05-22,5830,99$,300.18,1.5452
3,2011-06-01,HD5830,280$,300.18,1.5452
4,2011-06-01,HD5830,126$,300.18,1.5452
...,...,...,...,...,...
108,2017-11-29,Antminer S9 controller board,100$,140000,10182
109,2018-01-16,ANTMINER S5,0.319$,115500,1957
110,2018-12-07,S9 Rackmount Self,3600$,140000,10182
111,2023-02-20,Antminer S9,200$,140000,10182


# fix btc prices

In [69]:
# Start the price fix from the joined table saved on disk; this rebinds `df`.
df = pd.read_csv('datasets/extracted/full_table.csv')

In [70]:
# NOTE(review): `save` is not used in any cell visible here.
from torch import save, load
# SECURITY: torch.load unpickles the file, which can execute arbitrary code -
# only load 'date_open_dict.pt' if it comes from a trusted source.
# The loaded object is indexed by a row's `date` value and multiplied by a BTC
# amount in the conversion loop, so it is presumably a dict mapping a date
# string to the BTC price in USD on that day - TODO confirm.
date_to_btc_price = load('date_open_dict.pt')

In [71]:
# for every row where hardware_price contains "BTC", get the price in dollars using the dict and the amount of BTC
for index, row in df.iterrows():
    if "BTC" in row['hardware_price']:
        btc_price = row['hardware_price'].replace(",", "").split("BTC")[0].strip()
        try:
            btc_price = float(btc_price)
        except:
            btc_price = -1
        date = row['date']
        
        # get the price in dollars
        dollars_price = date_to_btc_price[date] * btc_price
        dollars_price = round(dollars_price, 3)

        # replace the price in the dataframe
        if not btc_price == -1:
            df.at[index, 'hardware_price'] = f"{dollars_price}$"
        else:
            df.at[index, 'hardware_price'] = f""

# drop rows where hardware_price contains €
df = df[df['hardware_price'].str.contains("€") == False]

#remove all characters except for numbers and .
df['hardware_price'] = df['hardware_price'].apply(lambda x: re.sub(r"[^0-9.]", "", x).strip())

#columns are date,hardware_name,hardware_price,hashrate,efficiency
#they should be date,hardware_name,hardware_price_usd,Mhash/s/$,Mhash/J
#rename columns
df = df.rename(columns={"hardware_price": "hardware_price_usd", "hashrate": "Mhash/s/$", "efficiency": "Mhash/J"})

# save the dataframe
df.to_csv('datasets/extracted/final.csv', index=False)
        