In [81]:
import builtins

# Save the original print function
original_print = print

# Override the print function
def custom_print(*args, **kwargs):
    new_args = []
    for arg in args:
        if isinstance(arg, float):
            # Customize the format here
            new_args.append(f'{arg:.10f}')
        else:
            new_args.append(arg)
    original_print(*new_args, **kwargs)

# Override the built-in print with the custom one
builtins.print = custom_print

In [31]:
import pandas as pd
import os
import gzip
import pickle
import openai
import torch.nn.functional as F
import torch
import re
import copy
import tqdm

In [32]:
def get_openai_response(prompt, tokens = 99999, model="gpt-4"):
    # model="gpt-3.5-turbo-16k"
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "Always answer with the requested csv file."
            },
            {
                "role": "user",
                "content": prompt
            }
            ],
        temperature=0,
        # max_tokens=tokens,
    )
    text = response.choices[0].message.content


    # print("returning ai text:", text)
    return text

In [33]:
df = pd.read_csv('datasets/extracted/date_name_price_original.csv')

# sort by date
df = df.sort_values(by=['date'])

# add row index
df = df.reset_index(drop=True)
df["row_index"] = df.index

# move row index left
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]

df_price_only = copy.deepcopy(df)
df_name_only = copy.deepcopy(df)
df_date_only = copy.deepcopy(df)

In [34]:
# remove rows where hardware_price contains the string "unknown"
df_price_only = df_price_only[df_price_only['hardware_price'].str.contains("unknown") == False]

df_price_only = df_price_only.drop(columns=['date'])
df_price_only = df_price_only.drop(columns=['hardware_name'])

# save
df_price_only.to_csv('datasets/extracted/price_only.csv', index=False)

In [35]:
# drop the date, hardware_price columns
df_name_only = df_name_only.drop(columns=['date'])
df_name_only = df_name_only.drop(columns=['hardware_price'])

# save
df_name_only.to_csv('datasets/extracted/name_only.csv', index=False)

In [36]:
# drop the hardware_price, hardware_name columns
df_date_only = df_date_only.drop(columns=['hardware_price'])
df_date_only = df_date_only.drop(columns=['hardware_name'])

# save
df_date_only.to_csv('datasets/extracted/date_only.csv', index=False)

# GPT-4 processing (prices)

In [6]:
#read the file back as a string
with open('datasets/extracted/price_only.csv', 'r') as file:
    table_text = file.read()

prompt = f"""
{table_text}


The prices in this csv need to be cleaned.
Rewrite the csv with the following change:
Rewrite the price in a standard format, use this format for all rows: "1234$" or "1234BTC" or "1234€" etc. depending on the currency
If a price is a range, keep only the first price. For example "6$-7$" should be written as "6$"
If a price has no currency then assume it is in dollars if it is above 5, otherwise assume it is in BTC
If a price is not valid, for example "USB bitcoin miners", write "unknown"
If a price is per unit of something then here are some examples of how to handle them:
"1.1 btc/Blade" should be written as "1.1BTC per Blade"
".071 / 13 GH/s" should be written as "0.071BTC per 13 GH/s"
"0.40btc each" should be written as "0.40BTC per unit"
"0.5 BTC shares" should be written as "0.5BTC per share"
etc.

""".strip()



print(prompt)

# response = get_openai_response(prompt) # the response is a csv as a string with columns date,hardware_name,hardware_price,hashrate,efficiency


# # save the response
# with open('datasets/extracted/name_and_price_cleaned.csv', 'w') as file:
#     file.write(response)


# df = pd.read_csv('datasets/extracted/price_only_cleaned.csv')


row_index,hardware_price
5,$70
11,$2.10 per hour
13,$200
17,$220
22,$55
23,$79
33,$185
36,$99
45,$280
46,$126
47,$288
52,$109
58,$150
59,$200-300
85,$800
102,107 BTC
103,$580
104,$89
113,$150
135,26 BTC
142,$75 and $101
148,$80
150,$275
151,"$2,000"
152,$1200
153,$200
158,807.1501152800002 BTC
164,$599
171,0.079 BTC
173,$320
176,$2.03636
177,$2.03636
182,$5000
183,4.5 Ghz
184,0.5 BTC shares
185,60EUR
190,0.082BTC
191,0.5 BTC shares
192,2.3 BTC
193,0.5 BTC shares
196,20.7
197,60EUR
198,0.082BTC
199,$2.04236
200,2.04236 btc
209,1 BTC per share
210,0.65 BTC
213,175 GHs Miner
214,.57 btc per chip
215,4.56 BTC
217,4 Btc
224,1.01618 ea.
226,0.082BTC
227,75$
228,60EUR
230,0.081 BTC
235,$7k
237,0.99 BTC
238,1.01693878 ea.
239,$59/chip
240,0.89BTC
241,1BTC/share
243,1btc= 3 GH/s
244,$0.95 per device
245,BTC1.15
246,BTC1.15
247,0.95 BTC
251,4BTC
253,$65
262,"$27,5/chip"
265,.41 to .55 btc
266,$58
267,$20
271,$58
272,0.35 BTC
273,$45
274,1.475 BTC
275,0.10 btc
276,1.65 btc
277,฿1.65
282,1k
285,$4

# adding efficiency and hashrate

In [7]:
#read the file back as a string
with open('datasets/extracted/name_only.csv', 'r') as file:
    table_text = file.read()
    # table_text = table_text.replace("hardware_name", "").strip()



with open('hardware_index.csv', 'r') as file:
    hardware = file.read()

def split_string(s, num_chunks):
    lines = s.splitlines()
    num_lines = len(lines)
    chunk_size = num_lines // num_chunks
    chunks = [lines[i:i+chunk_size] for i in range(0, num_lines, chunk_size)]
    # If there are leftover lines, add them to the last chunk
    if num_lines % num_chunks != 0:
        chunks[-1] += lines[num_chunks*chunk_size:]
    return ['\n'.join(chunk) for chunk in chunks]

In [None]:
output_csv = ""

# for text in split_string(table_text, 5):
for (i,text) in enumerate(tqdm.tqdm(split_string(table_text, 12))):

    prompt = f"""
Here is a table of hardware that is used for mining bitcoin:

{hardware}




I have analyzed posts from a bitcoin forum and created this csv of hardware:

{text.replace("Bitmain ","").replace("  "," ")}



Using the hardware table, rewrite this csv with columns: row_index,hardware_name,hardware_index
If a hardware name is unclear or not in the table of mining hardware, write "unknown" for the hardware_index value. The names don't need to match exactly, use common sense. For example, "Sapphire 7950 Vapor X" refers to the 7950 in the hardware table.

    """.strip()



    # print(prompt)
    # print("\n"*100)

    response = get_openai_response(prompt) # the response is a csv as a string with columns row_index,hardware_name,hardware_index

    print(response)
    print("\n"*100)

    if i == 0:
        output_csv = response
    else:
        output_csv += "\n" + response.replace("row_index,hardware_name,hardware_index", "").strip()


print(output_csv)
# save the output_csv
with open('datasets/extracted/name_index.csv', 'w') as file:
    file.write(output_csv)




# join the tables

In [37]:
name_index = pd.read_csv('datasets/extracted/name_index.csv') # columns: row_index,hardware_name,hardware_index
hardware_full = pd.read_csv('hardware_full.csv') # columns: hardware_name,hashrate,efficiency,hardware_index
price_only_cleaned = pd.read_csv('datasets/extracted/price_only_cleaned.csv') # columns: row_index,hardware_price
date_only = pd.read_csv('datasets/extracted/date_only.csv') # columns: row_index,date

# left join name_index and price_only_cleaned on row_index
df = pd.merge(name_index, price_only_cleaned, on='row_index', how='left')

# left join df and date_only on row_index
df = pd.merge(df, date_only, on='row_index', how='left')

# replace "unknown" with -1
df["hardware_index"]  = df["hardware_index"].replace("unknown", -1)

# left join df and hardware_full on hardware_index
df["hardware_index"] = df["hardware_index"].astype(int)
hardware_full["hardware_index"] = hardware_full["hardware_index"].astype(int)
hardware_full = hardware_full.drop(columns=['hardware_name'])
df = pd.merge(df, hardware_full, on='hardware_index', how='left')

# save it
df.to_csv('datasets/extracted/full_table.csv', index=False)

In [38]:
df.sample(20)

Unnamed: 0,row_index,hardware_name,hardware_index,hardware_price,date,hashrate,efficiency
951,951,Antminers,-1,,2017-05-23,,
371,371,BLACK ARROW Bullet Run,-1,123$,2013-10-28,,
744,744,SP20,-1,,2015-02-13,,
558,558,hex miner 15 Gigahashes USB Stick,-1,,2014-03-12,,
143,143,Avalon,20,,2013-03-26,300000.0,unknown
725,725,KNC Neptune,55,0.6BTC,2014-11-20,300000.0,1429
635,635,Corsair CX750M,-1,129.99$,2014-06-04,,
475,475,P2Pool,-1,,2014-01-15,,
721,721,Spondoolies Tech SP20,70,,2014-11-10,170000.0,1545
871,871,r7 370,-1,,2016-03-01,,


# fix btc prices

In [39]:
df = pd.read_csv('datasets/extracted/full_table.csv')
df['hardware_price'] = df['hardware_price'].astype(str)

In [40]:
from torch import save, load
date_to_btc_price = load('date_open_dict.pt')

In [41]:
# for every row where hardware_price contains "BTC", get the price in dollars using the dict and the amount of BTC
for index, row in df.iterrows():
    if "BTC" in row['hardware_price']:
        btc_price = row['hardware_price'].replace(",", "").split("BTC")[0].strip()
        try:
            btc_price = float(btc_price)
        except:
            btc_price = -1
        date = row['date']
        
        # get the price in dollars
        dollars_price = date_to_btc_price[date] * btc_price
        dollars_price = round(dollars_price, 3)

        # replace the price in the dataframe
        if not btc_price == -1:
            df.at[index, 'hardware_price'] = f"{dollars_price}$"
        else:
            df.at[index, 'hardware_price'] = f""

# drop rows where hardware_price contains €
df = df[df['hardware_price'].str.contains("€") == False]

#remove all characters except for numbers and .
df['hardware_price'] = df['hardware_price'].apply(lambda x: re.sub(r"[^0-9.]", "", x).strip())

#columns are date,hardware_name,hardware_price,hashrate,efficiency
#they should be date,hardware_name,hardware_price_usd,Mhash/s/$,Mhash/J
#rename columns
df = df.rename(columns={"hardware_price": "hardware_price_usd", "hashrate": "Mhash/s/$", "efficiency": "Mhash/J"})

# save the dataframe
df.to_csv('datasets/extracted/final.csv', index=False)
        

In [42]:
#remove rows where hardware_index is -1
df = df[df['hardware_index'] != -1]

# save the dataframe
df.to_csv('datasets/extracted/final2.csv', index=False)

In [78]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.5f' % x)
df = pd.read_csv('datasets/extracted/final2.csv')

# remove all rows that are unknown for the power efficiency
df = df[df['Mhash/J'] != "unknown"]

# change Mhash/J to TH/J
df['Mhash/J'] = df['Mhash/J'].astype(float) / 1000000
df = df.rename(columns={"Mhash/J": "TH/J"})

# # change Mhash/J to GH/J
# df['Mhash/J'] = df['Mhash/J'].astype(float) / 1000
# df = df.rename(columns={"Mhash/J": "GH/J"})


# columns are now row_index,hardware_name,hardware_index,hardware_price_usd,date,Mhash/s/$,TH/J
# move the date to the left
cols = df.columns.tolist()
cols = [cols[-3]] + cols[:-3] + cols[-2:]
df = df[cols]

# remove hardware_index
df = df.drop(columns=['hardware_index'])

# print the row index when the efficiency is over 50 times smaller than in the previous row
last_efficiency = 0
for index, row in df.iterrows():
    if index == 0:
        last_efficiency = row['TH/J']
    else:
        if row['TH/J'] < last_efficiency / 50:
            print("row index:", row['row_index'])
            print("hardware_name:", row['hardware_name'])
            print("hardware_price_usd:", row['hardware_price_usd'])
            print("date:", row['date'])
            print("TH/J:", row['TH/J'])
            print("last_efficiency:", last_efficiency)
            print("\n"*3)
        last_efficiency = row['TH/J']

# Format the column to avoid scientific notation
# df["TH/J"] = df["TH/J"].apply(lambda x: f"{x:.9f}".rstrip('0').rstrip('.'))

# save the dataframe
df.to_csv('datasets/extracted/final3.csv', index=False)

row index: 127
hardware_name: 5830
hardware_price_usd: nan
date: 2012-12-31
TH/J: 1.5451999999999998e-06
last_efficiency: 0.00025




row index: 139
hardware_name: 5970
hardware_price_usd: nan
date: 2013-02-13
TH/J: 2.0053e-06
last_efficiency: 0.00025




row index: 162
hardware_name: AMD Radeon HD 6750M
hardware_price_usd: nan
date: 2013-05-02
TH/J: 1.467e-06
last_efficiency: 0.00025




row index: 172
hardware_name: 6950s
hardware_price_usd: nan
date: 2013-05-20
TH/J: 1.9399e-06
last_efficiency: 0.00025




row index: 447
hardware_name: Sapphire 7950s
hardware_price_usd: nan
date: 2014-01-07
TH/J: 1.9401e-06
last_efficiency: 0.001429




row index: 478
hardware_name: Raedon 7970 cards
hardware_price_usd: nan
date: 2014-01-17
TH/J: 1.9059999999999998e-06
last_efficiency: 0.000185




row index: 509
hardware_name: Nvidia GeForce GTX 275
hardware_price_usd: nan
date: 2014-02-25
TH/J: 2.32e-07
last_efficiency: 0.0005






In [94]:
from datetime import datetime
max_efficiency = pd.read_csv('../hardwarelist/Bitcoin Paper Datasheet - Exponential Daily.csv') # rows are Date, Hardware (TH/J)

# loop through rows
for i in range(len(max_efficiency)):
    # get the date
    date = max_efficiency.iloc[i]['Date']
    # convert d/m/y to y/m/d
    date = date.split(".")
    date = date[2] + "-" + date[1] + "-" + date[0]
    max_efficiency.loc[i, 'Date'] = date
    date_format = "%Y-%m-%d"
    # if datetime.strptime(date, date_format) > datetime.strptime("2012-11-19", date_format):
    #     max_efficiency.loc[i, 'Hardware (TH/J)'] = max(max_efficiency.loc[i, 'Hardware (TH/J)'], 0.00025)
    # if datetime.strptime(date, date_format) > datetime.strptime("2013-06-22", date_format):
    #     max_efficiency.loc[i, 'Hardware (TH/J)'] = max(max_efficiency.loc[i, 'Hardware (TH/J)'], 0.0004)
    updates = [
        ("2012-11-19", 0.00025), # BFL
        ("2013-06-22", 0.0004), # KNC
        ("2013-09-18", 	0.000909), #hashfast
        ("2013-11-26", 	0.001429), # KNC neptune
    ]
    for (update_date, update_value) in updates:
        if datetime.strptime(date, date_format) > datetime.strptime(update_date, date_format):
            max_efficiency.loc[i, 'Hardware (TH/J)'] = max(max_efficiency.loc[i, 'Hardware (TH/J)'], update_value)

max_efficiency.to_csv('../hardwarelist/Bitcoin max updated.csv', index=False)

def get_max_efficiency(date):
    date = str(date)
    try:
        return max_efficiency[max_efficiency['Date'] == date]['Hardware (TH/J)'].values[0]
    except:
        return -1
    
print(get_max_efficiency("2012-11-23"))
    
# print rows where TH/J is larger than the max efficiency for that date
df['max_efficiency'] = df['date'].apply(lambda x: get_max_efficiency(x))
df_bad = df[df['TH/J'] > df['max_efficiency']*1.5]
# df_bad

0.0002500000


In [93]:
df_bad = df_bad[['date','hardware_name','TH/J','max_efficiency']]
df_bad["TH/J"] = df_bad["TH/J"].apply(lambda x: f"{x:.9f}".rstrip('0').rstrip('.'))
df_bad["max_efficiency"] = df_bad["max_efficiency"].apply(lambda x: f"{x:.9f}".rstrip('0').rstrip('.'))
df_bad.to_csv('datasets/extracted/bad.csv', index=False)
df_bad

Unnamed: 0,date,hardware_name,TH/J,max_efficiency
40,2011-06-08,i5-2500k,2.06e-05,3.78e-06
71,2012-09-11,BFL Single,0.00025,2.33e-05
128,2013-08-16,Baby Jet,0.000909,0.0004
129,2013-08-20,Monarch,0.001428,0.0004
165,2013-11-26,KnCMiner Neptune,0.001429,0.000909
274,2015-09-11,S9,0.010182,0.00535
377,2023-02-18,S9,0.010182,-1.0
378,2023-02-20,Antminer S9,0.010182,-1.0
379,2023-02-20,Antminer S9,0.010182,-1.0
380,2023-05-09,Antminer S1,0.0005,-1.0
