In [1]:
import pandas as pd
from datetime import datetime, timedelta

# Load the datasets. Only the columns used below are read, and every date
# column is reduced to plain datetime.date objects so the tables can later be
# joined on calendar dates.
plotdata_df = pd.read_csv('../5_processing_extracted_data/plotdata.csv', usecols=['date', 'TH/J'])
plotdata_df['date'] = pd.to_datetime(plotdata_df['date']).dt.date
max_efficiency_df = pd.read_csv('../../hardwarelist/Bitcoin max updated2.csv', usecols=['date', 'max (TH/J)'])
max_efficiency_df['date'] = pd.to_datetime(max_efficiency_df['date']).dt.date
price_df = pd.read_csv('../../pricehistory/price_full.csv', usecols=['Start', 'Open'])
price_df['Start'] = pd.to_datetime(price_df['Start']).dt.date

# Process plotdata.csv to get average power efficiency per day:
# the mean of 'TH/J' over all rows sharing the same date.
avg_efficiency = plotdata_df.groupby('date')['TH/J'].mean().reset_index()
avg_efficiency.columns = ['date', 'avg_efficiency']

# Number of plotdata entries per day, ordered by descending frequency.
# The axis and value column are named explicitly: a bare
# value_counts().reset_index() only produces ('date', 'count') on pandas >= 2.0,
# while older releases produce ('index', 'date'). The later merge joins on
# 'date' and reads 'count', so the names must not depend on the pandas version.
counts = plotdata_df['date'].value_counts().rename_axis('date').reset_index(name='count')

In [2]:
# Display the per-date entry counts computed above
counts

Unnamed: 0,date,count
0,2011-05-23,39
1,2017-12-21,34
2,2011-06-10,34
3,2011-06-07,31
4,2014-08-07,28
...,...,...
3120,2021-02-21,1
3121,2021-02-27,1
3122,2014-12-01,1
3123,2021-03-02,1


In [3]:
# Read general.csv, convert 'date' to datetime.date objects, and keep only the
# rows dated 1 January 2011 or later (the cutoff itself is included).
general_df = (
    pd.read_csv('../../bitcoinforum_general/5_processing_extracted_data/general.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']).dt.date)
    .loc[lambda frame: frame['date'] >= datetime(2011, 1, 1).date()]
)
general_df

Unnamed: 0,date,optimistic_speculation,pessimistic_speculation,bitcoin_adoption,bitcoin_technology,bitcoin_challenges,altcoins,educational_resources,posts_count
12,2011-01-01,0.043919,0.000000,0.108108,0.760761,0.277428,0.228434,0.759301,0.036770
13,2011-02-01,0.182487,0.089906,0.235294,0.654783,0.346352,0.271189,0.631424,0.062887
14,2011-03-01,0.050958,0.000000,0.139373,0.713511,0.380042,0.088349,0.766594,0.097251
15,2011-04-01,0.114035,0.122898,0.198830,0.598765,0.618097,0.172995,0.589012,0.116151
16,2011-05-01,0.080277,0.113258,0.215569,0.594145,0.575501,0.177139,0.638004,0.228179
...,...,...,...,...,...,...,...,...,...
166,2023-11-01,0.711246,0.127755,0.279635,0.417089,0.535635,0.308282,0.439779,0.111684
167,2023-12-01,0.678969,0.280989,0.267409,0.405757,0.463757,0.470867,0.429969,0.121993
168,2024-01-01,0.721154,0.298447,0.331361,0.393491,0.337393,0.300073,0.525151,0.114777
169,2024-02-01,0.717532,0.036391,0.225108,0.374699,0.553019,0.329301,0.544901,0.078007


In [4]:

# Build a continuous daily calendar from the first to the last date that has
# an average-efficiency value.
all_dates = pd.date_range(start=avg_efficiency['date'].min(), end=avg_efficiency['date'].max(), freq='D')
all_dates_df = pd.DataFrame(all_dates, columns=['date'])
all_dates_df['date'] = all_dates_df['date'].dt.date  # Ensure 'date' is of type date

# Forward fill to handle days without entries: left-join onto the calendar so
# every day has a row, then carry the last known value forward.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
avg_efficiency = pd.merge(all_dates_df, avg_efficiency, on='date', how='left').ffill()

# Merge with max efficiency
max_efficiency_df = max_efficiency_df.rename(columns={'max (TH/J)': 'max_efficiency'})
merged_df = pd.merge(avg_efficiency, max_efficiency_df, on='date', how='left')

# Merge with open price
price_df = price_df.rename(columns={'Start': 'date', 'Open': 'open_price'})
merged_df = pd.merge(merged_df, price_df, on='date', how='left')

# Merge with general. The rows of general_df displayed in this notebook fall on
# the first day of a month, so most daily rows get no match and are filled
# from the latest earlier row.
# NOTE: ffill() is applied to the whole merged frame, so gaps in
# max_efficiency and open_price are forward-filled here as well.
merged_df = pd.merge(merged_df, general_df, on='date', how='left').ffill()

  avg_efficiency = pd.merge(all_dates_df, avg_efficiency, on='date', how='left').fillna(method='ffill')
  merged_df = pd.merge(merged_df, general_df, on='date', how='left').fillna(method='ffill')


In [5]:
# Restrict the merged table to rows dated 1 January 2011 or later
# (the cutoff day itself is kept).
merged_df = merged_df.loc[lambda frame: frame['date'] >= datetime(2011, 1, 1).date()]
merged_df

Unnamed: 0,date,avg_efficiency,max_efficiency,open_price,optimistic_speculation,pessimistic_speculation,bitcoin_adoption,bitcoin_technology,bitcoin_challenges,altcoins,educational_resources,posts_count
114,2011-01-01,2.810000e-07,0.000004,0.300,0.043919,0.000000,0.108108,0.760761,0.277428,0.228434,0.759301,0.036770
115,2011-01-02,2.810000e-07,0.000004,0.300,0.043919,0.000000,0.108108,0.760761,0.277428,0.228434,0.759301,0.036770
116,2011-01-03,2.810000e-07,0.000004,0.295,0.043919,0.000000,0.108108,0.760761,0.277428,0.228434,0.759301,0.036770
117,2011-01-04,2.810000e-07,0.000004,0.299,0.043919,0.000000,0.108108,0.760761,0.277428,0.228434,0.759301,0.036770
118,2011-01-05,2.810000e-07,0.000004,0.299,0.043919,0.000000,0.108108,0.760761,0.277428,0.228434,0.759301,0.036770
...,...,...,...,...,...,...,...,...,...,...,...,...
4794,2023-10-25,2.068425e-02,0.046500,33948.120,0.802407,0.208851,0.285714,0.347481,0.328861,0.314983,0.605574,0.109278
4795,2023-10-26,2.068425e-02,0.046500,34467.340,0.802407,0.208851,0.285714,0.347481,0.328861,0.314983,0.605574,0.109278
4796,2023-10-27,2.382353e-02,0.046500,34132.800,0.802407,0.208851,0.285714,0.347481,0.328861,0.314983,0.605574,0.109278
4797,2023-10-28,2.382353e-02,0.046500,33899.700,0.802407,0.208851,0.285714,0.347481,0.328861,0.314983,0.605574,0.109278


In [6]:
# Merge with counts. The left join leaves NaN in 'count' for dates that have
# no row in counts; those are set to 0.
merged_df = pd.merge(merged_df, counts, on='date', how='left')
merged_df['count'] = merged_df['count'].fillna(0)

# Fill NaN values for open_price with the most recent non-NaN value.
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
merged_df['open_price'] = merged_df['open_price'].ffill()

# Save to inputs.csv (the DataFrame index is not written)
merged_df.to_csv('inputs.csv', index=False)

print("Merging complete. Data saved to inputs.csv.")

Merging complete. Data saved to inputs.csv.


  merged_df['open_price'] = merged_df['open_price'].fillna(method='ffill')
