In [1]:
import builtins

# Save the original print function
original_print = print

# Override the print function
def custom_print(*args, **kwargs):
    new_args = []
    for arg in args:
        if isinstance(arg, float):
            # Customize the format here
            new_args.append(f'{arg:.10f}')
        else:
            new_args.append(arg)
    original_print(*new_args, **kwargs)

# Override the built-in print with the custom one
builtins.print = custom_print

In [2]:
import pandas as pd
import os
import gzip
import pickle
import openai
import torch.nn.functional as F
import torch
import re
import copy
from tqdm import tqdm

In [3]:
def get_openai_response(prompt, tokens = 99999, model="gpt-4-1106-preview"):
    # model="gpt-3.5-turbo-16k"
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "Always answer with the requested csv file."
            },
            {
                "role": "user",
                "content": prompt
            }
            ],
        temperature=0,
        # max_tokens=tokens,
    )
    text = response.choices[0].message.content


    # print("returning ai text:", text)
    return text

In [4]:
df = pd.read_csv('datasets/extracted/1_original.csv') # columns: date,hardware_name,owned

# keep only rows where owned > 0.8
df = df[df['owned'] > 0.8]

# sort by date
df = df.sort_values(by=['date'])

# add row index
df = df.reset_index(drop=True)
df["row_index"] = df.index
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]


# load the not_hardware.txt file
not_hardware = []
with open('not_hardware.txt') as f:
    not_hardware = f.read().splitlines()

# remove rows that contain not_hardware
indices_to_drop = []  # List to store indices of rows to drop
for row in df.iterrows():
    for not_hw in not_hardware:
        if not_hw.strip() in row[1]["hardware_name"]:
            indices_to_drop.append(row[0])  # Add index to the list instead of dropping immediately

# Drop all the collected indices at once
df = df.drop(index=indices_to_drop)

# df_price_only = copy.deepcopy(df)
df_name_only = copy.deepcopy(df)
df_date_only = copy.deepcopy(df)

In [5]:
# # remove rows where hardware_price contains the string "unknown"
# df_price_only = df_price_only[df_price_only['hardware_price'].str.contains("unknown") == False]

# df_price_only = df_price_only.drop(columns=['date'])
# df_price_only = df_price_only.drop(columns=['hardware_name'])

# # save
# df_price_only.to_csv('datasets/extracted/price_only.csv', index=False)

In [6]:
# drop the date, hardware_price columns
df_name_only = df_name_only.drop(columns=['date'])
df_name_only = df_name_only.drop(columns=['owned'])
# df_name_only = df_name_only.drop(columns=['hardware_price'])

# save
df_name_only.to_csv('datasets/extracted/2_name_only.csv', index=False)

In [7]:
# drop the hardware_price, hardware_name columns
# df_date_only = df_date_only.drop(columns=['hardware_price'])
df_date_only = df_date_only.drop(columns=['owned'])
df_date_only = df_date_only.drop(columns=['hardware_name'])

# save
df_date_only.to_csv('datasets/extracted/3_date_only.csv', index=False)

In [8]:
# for row in df.iterrows():
#     print(row[1]["hardware_name"])

# GPT-4 processing (prices)

In [9]:
# #read the file back as a string
# with open('datasets/extracted/price_only.csv', 'r') as file:
#     table_text = file.read()

# prompt = f"""
# {table_text}


# The prices in this csv need to be cleaned.
# Rewrite the csv with the following change:
# Rewrite the price in a standard format, use this format for all rows: "1234$" or "1234BTC" or "1234€" etc. depending on the currency
# If a price is a range, keep only the first price. For example "6$-7$" should be written as "6$"
# If a price has no currency then assume it is in dollars if it is above 5, otherwise assume it is in BTC
# If a price is not valid, for example "USB bitcoin miners", write "unknown"
# If a price is per unit of something then here are some examples of how to handle them:
# "1.1 btc/Blade" should be written as "1.1BTC per Blade"
# ".071 / 13 GH/s" should be written as "0.071BTC per 13 GH/s"
# "0.40btc each" should be written as "0.40BTC per unit"
# "0.5 BTC shares" should be written as "0.5BTC per share"
# etc.

# """.strip()



# print(prompt)

# # response = get_openai_response(prompt) # the response is a csv as a string with columns date,hardware_name,hardware_price,hashrate,efficiency


# # # save the response
# # with open('datasets/extracted/name_and_price_cleaned.csv', 'w') as file:
# #     file.write(response)


# # df = pd.read_csv('datasets/extracted/price_only_cleaned.csv')


# adding efficiency and hashrate

In [10]:
#read the file back as a string
with open('datasets/extracted/2_name_only.csv', 'r') as file:
    table_text = file.read()
    # table_text = table_text.replace("hardware_name", "").strip()


with open('../hardwarelist/hardware_index_new.csv', 'r') as file:
    hardware = file.read()

output_csv = ""

In [11]:
lines = table_text.split('\n')

# Calculate the number of chunks
num_chunks = len(lines) // 100 + (1 if len(lines) % 100 != 0 else 0)

# Iterate through the lines 100 at a time
# for i in tqdm(range(num_chunks)):
for i in range(num_chunks):
    # Calculate start and end indices for slicing
    start_index = i * 100
    end_index = start_index + 100
    chunk = lines[start_index:end_index]
    text = "\n".join(chunk)

    if not text:  # Make sure the chunk is not empty
        continue

    prompt = f"""
Here is a table of hardware that is used for mining bitcoin:

{hardware}




I have analyzed posts from a bitcoin forum and created this csv of hardware:

{text.replace("  "," ")}



Using the hardware table, rewrite this csv with columns: row_index,hardware_name,hardware_index
If a hardware name is unclear or not in the table of mining hardware, write "unknown" for the hardware_index value. The names don't need to match exactly, use common sense. For example, "Sapphire 7950 Vapor X" refers to the 7950 in the hardware table.

    """.strip()



    print(prompt)
    print("\n"*10)

    response = get_openai_response(prompt) # the response is a csv as a string with columns row_index,hardware_name,hardware_index
    response = response.replace("```csv\n","").replace("```","").strip()

    print(response)
    print("\n"*10)

    if i == 0:
        output_csv = response
    else:
        output_csv += "\n" + response.replace("row_index,hardware_name,hardware_index", "").strip()


print(output_csv)
# save the output_csv
with open('datasets/extracted/4_joined.csv', 'w') as file:
    file.write(output_csv)




Here is a table of hardware that is used for mining bitcoin:

hardware_index,hardware_name
0,Bitmain Antminer S21 Hyd
1,Bitmain Antminer S21
2,MicroBT WhatsMiner M60S
3,Bitmain Antminer T21
4,MicroBT WhatsMiner M60
5,Canaan Avalon Made A1466
6,Canaan Avalon Made A1446
7,MicroBT Whatsminer M53S
8,MicroBT Whatsminer M53
9,MicroBT Whatsminer M36S+
10,Bitmain Antminer S19k Pro
11,MicroBT WhatsMiner M56S
12,MicroBT WhatsMiner M56
13,Bitmain Antminer S19 Pro Hyd
14,Bitmain Antminer S19j Pro+
15,MicroBT Whatsminer M33S++
16,Canaan Avalon Made A1366
17,Canaan Avalon Made A1346
18,Bitmain Antminer S19 XP Hyd
19,Bitmain Antminer S19 Hydro
20,Bitmain Antminer T19 Hydro
21,Bitmain Antminer T19 Hydro
22,Bitmain Antminer S19 XP
23,MicroBT Whatsminer M50S
24,MicroBT Whatsminer M50
25,Bitmain Antminer S19 Pro+ Hyd
26,iPollo B1L
27,Ebang Ebit E10D
28,Canaan AvalonMiner 1126 Pro
29,Bitmain Antminer S19j Pro
30,Bolon Miner B11
31,Bitmain Antminer T19
32,Innosilicon T2 Turbo HF+
33,Bitmain Antminer S19j P

# join the tables

In [45]:
name_index = pd.read_csv('datasets/extracted/4_joined.csv') # columns: row_index,hardware_name,hardware_index
hardware_full = pd.read_csv('../hardwarelist/hardware_full_new.csv') # columns: hardware_name,hashrate,efficiency,hardware_index
# price_only_cleaned = pd.read_csv('datasets/extracted/price_only_cleaned.csv') # columns: row_index,hardware_price
date_only = pd.read_csv('datasets/extracted/3_date_only.csv') # columns: row_index,date

# # left join name_index and price_only_cleaned on row_index
# df = pd.merge(name_index, price_only_cleaned, on='row_index', how='left')
df = name_index

# left join df and date_only on row_index
df = pd.merge(df, date_only, on='row_index', how='left')

# replace "unknown" with -1
df["hardware_index"]  = df["hardware_index"].replace("unknown", -1)

# left join df and hardware_full on hardware_index
df["hardware_index"] = df["hardware_index"].astype(int)
hardware_full["hardware_index"] = hardware_full["hardware_index"].astype(int)
hardware_full = hardware_full.drop(columns=['hardware_name'])
df = pd.merge(df, hardware_full, on='hardware_index', how='left')

# save it
df.to_csv('datasets/extracted/full_table.csv', index=False)

In [46]:
df.sample(20)

Unnamed: 0,row_index,hardware_name,hardware_index,date,hardware_release_date,speed,power,noise,hash,profit,Mhash/J
37,61,Avalon,-1,2021-11-24 00:16:11,,,,,,,
28,45,Asic miner,-1,2019-06-26 03:14:49,,,,,,,
27,44,Antminer S9,115,2019-06-24 06:08:48,Nov 2017,14Th/s,1372W,85db,SHA-256,-$2.72/day,10204.08163
6,14,Bitmain AW3+++ PSU,-1,2018-02-23 22:47:34,,,,,,,
16,33,Antminer S9,115,2018-04-30 15:34:15,Nov 2017,14Th/s,1372W,85db,SHA-256,-$2.72/day,10204.08163
14,30,miner,-1,2018-03-10 16:57:56,,,,,,,
10,22,L3+,-1,2018-02-26 16:34:51,,,,,,,
7,15,S9 Miners,115,2018-02-23 22:47:34,Nov 2017,14Th/s,1372W,85db,SHA-256,-$2.72/day,10204.08163
32,53,S9,115,2019-10-06 21:48:11,Nov 2017,14Th/s,1372W,85db,SHA-256,-$2.72/day,10204.08163
3,4,S7's,126,2018-01-09 05:31:31,Sep 2015,4.73Th/s,1293W,62db,SHA-256,-$3.31/day,3658.15932


# fix btc prices

In [47]:
df = pd.read_csv('datasets/extracted/full_table.csv')
# df['hardware_price'] = df['hardware_price'].astype(str)

In [48]:
# from torch import save, load
# date_to_btc_price = load('date_open_dict.pt')

In [49]:
# # for every row where hardware_price contains "BTC", get the price in dollars using the dict and the amount of BTC
# for index, row in df.iterrows():
#     if "BTC" in row['hardware_price']:
#         btc_price = row['hardware_price'].replace(",", "").split("BTC")[0].strip()
#         try:
#             btc_price = float(btc_price)
#         except:
#             btc_price = -1
#         date = row['date']
        
#         # get the price in dollars
#         dollars_price = date_to_btc_price[date] * btc_price
#         dollars_price = round(dollars_price, 3)

#         # replace the price in the dataframe
#         if not btc_price == -1:
#             df.at[index, 'hardware_price'] = f"{dollars_price}$"
#         else:
#             df.at[index, 'hardware_price'] = f""

# # drop rows where hardware_price contains €
# df = df[df['hardware_price'].str.contains("€") == False]

# #remove all characters except for numbers and .
# df['hardware_price'] = df['hardware_price'].apply(lambda x: re.sub(r"[^0-9.]", "", x).strip())


        

In [50]:
#columns are date,hardware_name,hardware_price,hashrate,efficiency
#they should be date,hardware_name,hardware_price_usd,Mhash/s/$,Mhash/J
#rename columns
df = df.rename(columns={"hardware_price": "hardware_price_usd", "hashrate": "Mhash/s/$", "efficiency": "Mhash/J"})

# # save the dataframe
df.to_csv('datasets/extracted/final.csv', index=False)

In [51]:
#remove rows where hardware_index is -1
df = df[df['hardware_index'] != -1]

# save the dataframe
df.to_csv('datasets/extracted/final2.csv', index=False)

In [52]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.5f' % x)
df = pd.read_csv('datasets/extracted/final2.csv')

# remove all rows that are unknown for the power efficiency
df = df[df['Mhash/J'] != "unknown"]

# change Mhash/J to TH/J
df['Mhash/J'] = df['Mhash/J'].astype(float) / 1000000
df = df.rename(columns={"Mhash/J": "TH/J"})

# # change Mhash/J to GH/J
# df['Mhash/J'] = df['Mhash/J'].astype(float) / 1000
# df = df.rename(columns={"Mhash/J": "GH/J"})


# columns are now row_index,hardware_name,hardware_index,hardware_price_usd,date,Mhash/s/$,TH/J
# move the date to the left
cols = df.columns.tolist()
cols = [cols[3]] + cols[:3] + cols[4:]
df = df[cols]

# remove hardware_index
df = df.drop(columns=['hardware_index'])

# print the row index when the efficiency is over 50 times smaller than in the previous row
last_efficiency = 0
for index, row in df.iterrows():
    if index == 0:
        last_efficiency = row['TH/J']
    else:
        if row['TH/J'] < last_efficiency / 50:
            print("row index:", row['row_index'])
            print("hardware_name:", row['hardware_name'])
            print("hardware_price_usd:", row['hardware_price_usd'])
            print("date:", row['date'])
            print("TH/J:", row['TH/J'])
            print("last_efficiency:", last_efficiency)
            print("\n"*3)
        last_efficiency = row['TH/J']

# Format the column to avoid scientific notation
# df["TH/J"] = df["TH/J"].apply(lambda x: f"{x:.9f}".rstrip('0').rstrip('.'))

# save the dataframe
df.to_csv('datasets/extracted/final3.csv', index=False)

In [53]:
# from datetime import datetime
# max_efficiency = pd.read_csv('../hardwarelist/Bitcoin Paper Datasheet - Exponential Daily.csv') # rows are Date, Hardware (TH/J)

# # loop through rows
# for i in range(len(max_efficiency)):
#     # get the date
#     date = max_efficiency.iloc[i]['Date']
#     # convert d/m/y to y/m/d
#     date = date.split(".")
#     date = date[2] + "-" + date[1] + "-" + date[0]
#     max_efficiency.loc[i, 'Date'] = date
#     date_format = "%Y-%m-%d"
#     # if datetime.strptime(date, date_format) > datetime.strptime("2012-11-19", date_format):
#     #     max_efficiency.loc[i, 'Hardware (TH/J)'] = max(max_efficiency.loc[i, 'Hardware (TH/J)'], 0.00025)
#     # if datetime.strptime(date, date_format) > datetime.strptime("2013-06-22", date_format):
#     #     max_efficiency.loc[i, 'Hardware (TH/J)'] = max(max_efficiency.loc[i, 'Hardware (TH/J)'], 0.0004)
#     updates = [
#         ("2012-11-19", 0.00025), # BFL
#         ("2013-06-22", 0.0004), # KNC
#         ("2013-09-18", 	0.000909), #hashfast
#         ("2013-11-26", 	0.001429), # KNC neptune
#     ]
#     for (update_date, update_value) in updates:
#         if datetime.strptime(date, date_format) > datetime.strptime(update_date, date_format):
#             max_efficiency.loc[i, 'Hardware (TH/J)'] = max(max_efficiency.loc[i, 'Hardware (TH/J)'], update_value)

# max_efficiency.to_csv('../hardwarelist/Bitcoin max updated.csv', index=False)

In [54]:
max_efficiency = pd.read_csv('../hardwarelist/Bitcoin max updated2.csv') # rows are date,max (TH/J),archaicity (TH/J)

def get_max_efficiency(date):
    date = str(date)[:10]
    try:
        return max_efficiency[max_efficiency['date'] == date]['max (TH/J)'].values[0]
    except:
        return -1
    

# print rows where TH/J is larger than the max efficiency for that date
df['max_efficiency'] = df['date'].apply(lambda x: get_max_efficiency(x))
df_bad = df[df['TH/J'] > df['max_efficiency']*1.5]
# df_bad

In [55]:
print(get_max_efficiency("2012-11-23"))
print(get_max_efficiency("2018-01-05"))

0.0002500000
0.0102000000


In [56]:
df_bad = df_bad[['date','hardware_name','TH/J','max_efficiency']]
df_bad["TH/J"] = df_bad["TH/J"].apply(lambda x: f"{x:.9f}".rstrip('0').rstrip('.'))
df_bad["max_efficiency"] = df_bad["max_efficiency"].apply(lambda x: f"{x:.9f}".rstrip('0').rstrip('.'))
df_bad.to_csv('datasets/extracted/bad.csv', index=False)
df_bad

Unnamed: 0,date,hardware_name,TH/J,max_efficiency
24,2023-02-13 20:06:05,s19Jpro,0.033898305,-1
25,2023-08-02 23:27:17,Innosilicon T2T-26T,0.012380952,-1
