In [2]:
import pandas as pd
# Read the cleaned dataset from the Parquet file
bitcoin_tweets_cleaned = pd.read_parquet("bitcoin_tweets_cleaned.parquet", engine='pyarrow')

# Display the first few rows of the dataframe
bitcoin_tweets_cleaned.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,,,,NaT,0,0,0,True,NaT,35,20,False,True
1,Iconic Holding,"Frankfurt am Main, Germany",Professional Crypto Asset Ventures \nhttps://t...,2021-01-05 13:22:24,301,1075,361,True,2021-02-05 10:52:04,📖 Weekend Read 📖\n\nKeen to learn about #cryp...,['crypto'],Twitter Web App,False
2,Iconic Holding,"Frankfurt am Main, Germany",Professional Crypto Asset Ventures \nhttps://t...,2021-01-05 13:22:24,301,1075,361,True,2021-02-05 10:52:04,2⃣ Debunking 9 #Bitcoin Myths by @Patrick_Lo...,"['Bitcoin', 'cryptocurrency', 'bitcoin', 'cryp...",Twitter Web App,False
3,Iconic Holding,"Frankfurt am Main, Germany",Professional Crypto Asset Ventures \nhttps://t...,2021-01-05 13:22:24,301,1075,361,True,2021-02-05 10:52:06,4⃣ 🎙️ Bloomberg LP #CryptoOutlook 2021 with @...,"['CryptoOutlook', 'cryptocurrency', 'bitcoin',...",Twitter Web App,False
4,Iconic Holding,"Frankfurt am Main, Germany",Professional Crypto Asset Ventures \nhttps://t...,2021-01-05 13:22:24,301,1075,361,True,2021-02-05 10:52:07,"5⃣ #Blockchain 50 2021 by @DelRayMan, @Forbe...","['Blockchain', 'cryptocurrency', 'bitcoin', 'c...",Twitter Web App,False


In [3]:
# Count the number of records with null values in the 'date' column
null_date_count = bitcoin_tweets_cleaned['date'].isnull().sum()
print(f"Number of records with null 'date' values: {null_date_count}")

Number of records with null 'date' values: 82958687


In [4]:
# Filter the DataFrame to get rows where 'date' is not null
bitcoin_tweets_not_null_date = bitcoin_tweets_cleaned[bitcoin_tweets_cleaned['date'].notnull()]

# Display the first few rows of the filtered DataFrame
len(bitcoin_tweets_not_null_date)

12803781

In [5]:
min_date = bitcoin_tweets_cleaned['date'].min()
max_date = bitcoin_tweets_cleaned['date'].max()

print(f"Minimum date: {min_date}")
print(f"Maximum date: {max_date}")

Minimum date: 2021-02-05 10:52:04
Maximum date: 2023-01-09 23:59:54


In [6]:
# Ensure the 'date' column is in datetime format

# Extract the month and year from the 'date' column
bitcoin_tweets_not_null_date['year_month'] = bitcoin_tweets_not_null_date['date'].dt.to_period('M')

# Count the number of records for each month
monthly_counts = bitcoin_tweets_not_null_date['year_month'].value_counts().sort_index()

# Calculate the percentage of records for each month
monthly_percentage = (monthly_counts / len(bitcoin_tweets_not_null_date)) * 100

# Display the percentage of records for each month
print(monthly_percentage)

year_month
2021-02     0.347108
2021-03     0.032334
2021-04     0.453468
2021-05     0.170122
2021-06     0.982499
2021-07     3.640183
2021-08     3.819091
2021-09     0.183618
2021-10     2.747618
2021-11     2.808795
2021-12     0.431911
2022-01     2.031330
2022-02     0.620715
2022-03     2.812146
2022-04     3.258061
2022-05     2.780585
2022-06     2.703983
2022-07     1.512475
2022-08     0.327442
2022-09     1.421814
2022-10     1.146169
2022-11     1.579713
2022-12     7.078214
2023-01    57.110607
Freq: M, Name: count, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bitcoin_tweets_not_null_date['year_month'] = bitcoin_tweets_not_null_date['date'].dt.to_period('M')


In [7]:
# Count the number of records with non-null dates and null text
null_text_count = bitcoin_tweets_not_null_date['text'].isnull().sum()
print(f"Number of records with non-null dates and null text values: {null_text_count}")

Number of records with non-null dates and null text values: 0


In [8]:
# Calculate the average percentage of records for months other than January 2023
average_percentage = monthly_percentage[monthly_percentage.index != '2023-01'].mean()

# Calculate the number of records to keep for January 2023
num_records_to_keep = int(average_percentage / 100 * len(bitcoin_tweets_not_null_date))

# Filter the data for January 2023
january_2023_data = bitcoin_tweets_not_null_date[bitcoin_tweets_not_null_date['year_month'] == '2023-01']

# Randomly sample the required number of records from January 2023 data
january_2023_sampled = january_2023_data.sample(n=num_records_to_keep, random_state=42)

# Filter the data for months other than January 2023
other_months_data = bitcoin_tweets_not_null_date[bitcoin_tweets_not_null_date['year_month'] != '2023-01']

# Concatenate the sampled January 2023 data with the data from other months
balanced_data = pd.concat([other_months_data, january_2023_sampled])

# Display the percentage of records for each month in the balanced dataset
balanced_monthly_percentage = (balanced_data['year_month'].value_counts().sort_index() / len(balanced_data)) * 100
print(balanced_monthly_percentage)

year_month
2021-02     0.775589
2021-03     0.072248
2021-04     1.013242
2021-05     0.380125
2021-06     2.195325
2021-07     8.133732
2021-08     8.533490
2021-09     0.410281
2021-10     6.139360
2021-11     6.276056
2021-12     0.965076
2022-01     4.538863
2022-02     1.386944
2022-03     6.283543
2022-04     7.279909
2022-05     6.213022
2022-06     6.041859
2022-07     3.379519
2022-08     0.731647
2022-09     3.176944
2022-10     2.561035
2022-11     3.529758
2022-12    15.815772
2023-01     4.166662
Freq: M, Name: count, dtype: float64


In [11]:
balanced_monthly_percentage

year_month
2021-02     0.775589
2021-03     0.072248
2021-04     1.013242
2021-05     0.380125
2021-06     2.195325
2021-07     8.133732
2021-08     8.533490
2021-09     0.410281
2021-10     6.139360
2021-11     6.276056
2021-12     0.965076
2022-01     4.538863
2022-02     1.386944
2022-03     6.283543
2022-04     7.279909
2022-05     6.213022
2022-06     6.041859
2022-07     3.379519
2022-08     0.731647
2022-09     3.176944
2022-10     2.561035
2022-11     3.529758
2022-12    15.815772
2023-01     4.166662
Freq: M, Name: count, dtype: float64

In [None]:
# Calculate the average percentage of records for months other than December 2022
average_percentage_dec = monthly_percentage[monthly_percentage.index != '2022-12'].mean()

# Calculate the number of records to keep for December 2022
num_records_to_keep_dec = int(average_percentage_dec / 100 * len(balanced_data))

# Filter the data for December 2022
december_2022_data = balanced_data[balanced_data['year_month'] == '2022-12']

# Randomly sample the required number of records from December 2022 data
december_2022_sampled = december_2022_data.sample(n=num_records_to_keep_dec, random_state=42)

# Filter the data for months other than December 2022
other_months_data_dec = balanced_data[balanced_data['year_month'] != '2022-12']

# Concatenate the sampled December 2022 data with the data from other months
balanced_data_dec = pd.concat([other_months_data_dec, december_2022_sampled])

# Display the percentage of records for each month in the balanced dataset
balanced_monthly_percentage_dec = (balanced_data_dec['year_month'].value_counts().sort_index() / len(balanced_data_dec)) * 100
print(balanced_monthly_percentage_dec)


year_month
2021-02    0.879111
2021-03    0.081892
2021-04    1.148484
2021-05    0.430862
2021-06    2.488345
2021-07    9.219379
2021-08    9.672494
2021-09    0.465043
2021-10    6.958808
2021-11    7.113750
2021-12    1.093889
2022-01    5.144686
2022-02    1.572066
2022-03    7.122236
2022-04    8.251592
2022-05    7.042302
2022-06    6.848294
2022-07    3.830599
2022-08    0.829303
2022-09    3.600986
2022-10    2.902868
2022-11    4.000891
2022-12    4.579316
2023-01    4.722805
Freq: M, Name: count, dtype: float64


In [None]:
balanced_data_dec.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet,year_month
1,Iconic Holding,"Frankfurt am Main, Germany",Professional Crypto Asset Ventures \nhttps://t...,2021-01-05 13:22:24,301,1075,361,True,2021-02-05 10:52:04,📖 Weekend Read 📖\n\nKeen to learn about #cryp...,['crypto'],Twitter Web App,False,2021-02
2,Iconic Holding,"Frankfurt am Main, Germany",Professional Crypto Asset Ventures \nhttps://t...,2021-01-05 13:22:24,301,1075,361,True,2021-02-05 10:52:04,2⃣ Debunking 9 #Bitcoin Myths by @Patrick_Lo...,"['Bitcoin', 'cryptocurrency', 'bitcoin', 'cryp...",Twitter Web App,False,2021-02
3,Iconic Holding,"Frankfurt am Main, Germany",Professional Crypto Asset Ventures \nhttps://t...,2021-01-05 13:22:24,301,1075,361,True,2021-02-05 10:52:06,4⃣ 🎙️ Bloomberg LP #CryptoOutlook 2021 with @...,"['CryptoOutlook', 'cryptocurrency', 'bitcoin',...",Twitter Web App,False,2021-02
4,Iconic Holding,"Frankfurt am Main, Germany",Professional Crypto Asset Ventures \nhttps://t...,2021-01-05 13:22:24,301,1075,361,True,2021-02-05 10:52:07,"5⃣ #Blockchain 50 2021 by @DelRayMan, @Forbe...","['Blockchain', 'cryptocurrency', 'bitcoin', 'c...",Twitter Web App,False,2021-02
5,Nick Doevendans,"Edam-Volendam, Nederland","Amateur historicus m.n. WW2, schrijver, muziek...",2020-06-12 16:50:07,37,123,410,True,2021-02-05 10:52:26,#reddcoin #rdd @reddcoin to the moon #altcoin ...,"['reddcoin', 'rdd', 'altcoin', 'turnreddcoinin...",Twitter for iPhone,False,2021-02


In [14]:
balanced_data_dec.tail()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet,year_month
4612131,,,,NaT,716,5,4,True,2022-12-25 22:37:46,$PRQ price go up 0.71% in last minute on #OKX,,,True,2022-12
4656751,,,,NaT,716,5,4,True,2022-12-25 22:57:47,$FAME price go up 0.93% in last minute on #OKX,,,True,2022-12
5237727,,,,NaT,713,5,4,True,2022-12-27 22:33:50,Top 3 #cryptocurrency #price jumps in last min...,,,True,2022-12
4739623,,,,NaT,716,5,4,True,2022-12-25 23:23:47,Top 3 #cryptocurrency #price jumps in last min...,,,True,2022-12
5468905,,,,NaT,713,5,4,True,2022-12-27 23:43:49,Top 3 #cryptocurrency #price jumps in last min...,,,True,2022-12


In [15]:
balanced_data_dec.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5055449 entries, 1 to 5468905
Data columns (total 14 columns):
 #   Column            Dtype         
---  ------            -----         
 0   user_name         object        
 1   user_location     object        
 2   user_description  object        
 3   user_created      datetime64[ns]
 4   user_followers    int64         
 5   user_friends      int64         
 6   user_favourites   int64         
 7   user_verified     bool          
 8   date              datetime64[ns]
 9   text              object        
 10  hashtags          object        
 11  source            category      
 12  is_retweet        bool          
 13  year_month        period[M]     
dtypes: bool(2), category(1), datetime64[ns](2), int64(3), object(5), period[M](1)
memory usage: 482.2+ MB


In [16]:
len(balanced_data_dec)

5055449

In [17]:
balanced_data_dec.describe()

Unnamed: 0,user_created,user_followers,user_friends,user_favourites,date
count,4598248,5055449.0,5055449.0,5055449.0,5055449
mean,2018-09-25 05:59:05.137139712,7514.433,989.9971,6024.156,2022-03-03 12:37:19.042106112
min,1970-01-01 00:00:00,0.0,0.0,0.0,2021-02-05 10:52:04
25%,2016-08-23 03:11:10,61.0,17.0,22.0,2021-10-19 05:31:07
50%,2020-09-19 07:21:11,317.0,156.0,418.0,2022-03-17 19:29:42
75%,2021-08-13 21:07:41,1146.0,650.0,3145.0,2022-06-29 12:09:01
max,2023-01-08 22:19:14,24131350.0,4201104.0,1567894.0,2023-01-09 23:56:52
std,,95109.11,5761.52,21896.24,


In [18]:
# Save the balanced_data_dec DataFrame as a Parquet file
balanced_data_dec.to_parquet("curated_final_dataset.parquet", engine='pyarrow')