In [1]:
!pip install nltk gensim lxml pandas numpy scikit_learn matplotlib keras tensorflow scipy yfinance

Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting gensim
  Using cached gensim-4.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.3 kB)
Collecting lxml
  Using cached lxml-5.2.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pandas
  Using cached pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting scikit_learn
  Using cached scikit_learn-1.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting matplotlib
  Using cached matplotlib-3.8.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)
Collecting keras
  Using cached keras-3.2.1-py3-none-any.whl.metadata (5.6 kB)
Collecting tensorflow
  Using cached tensorflow-2.16.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting scipy
  Using cached scipy-1.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.

In [2]:
import pandas as pd
import numpy as np
from numpy import random
# import gensim
import nltk
import lxml
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import yfinance as yf

# Data

In [3]:
df_tweet = pd.read_csv('dataset_52-person-from-2021-02-05_2023-06-12_21-34-17-266_with_sentiment.csv')
del df_tweet['Unnamed: 0']

#Standardise date format
df_tweet['created_at'] = pd.to_datetime(df_tweet.created_at)

#Rename date column
df_tweet = df_tweet.rename(columns={'created_at': 'Date'})

print(df_tweet.shape)
df_tweet.head(3)

(16512, 12)


Unnamed: 0,Date,favorite_count,full_text,reply_count,retweet_count,clean_text,importance_coefficient,importance_coefficient_normalized,new_coins,scores,compound,sentiment_type
0,2021-02-01,154,#privacy is a human right. learn how to make y...,18,23,privacy human right learn make bitcoin transac...,340.0,0.000588,(bitcoin),"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,NEUTRAL
1,2021-02-01,17,"overall btc trading volume has increased, but ...",1,5,overall btc trading volume increased average t...,39.5,6.8e-05,(btc),"{'neg': 0.0, 'neu': 0.95, 'pos': 0.05, 'compou...",0.2124,POSITIVE
2,2021-02-01,3,"on average, the return distribution of btc ske...",0,1,average return distribution btc skews slightly...,7.0,1.2e-05,(btc),"{'neg': 0.053, 'neu': 0.769, 'pos': 0.177, 'co...",0.701,POSITIVE


In [4]:
df_btcusd = pd.read_csv('BTC-USD.csv')
df_btcusd.columns = ['Date','BTCOpen','BTCHigh','BTCLow','BTCClose','BTCAdjCLose','BTCVolume']

#Standardise date format
df_btcusd['Date'] = pd.to_datetime(df_btcusd.Date)

print(df_btcusd.shape)
df_btcusd.head(1)

(799, 7)


Unnamed: 0,Date,BTCOpen,BTCHigh,BTCLow,BTCClose,BTCAdjCLose,BTCVolume
0,2021-10-12,57526.832031,57627.878906,54477.972656,56041.058594,56041.058594,41083758949


In [5]:
# Trim the df_tweet to match number of rows with df_btcusd

#Obtain the beginning and end date of examples in df_btcusd
print ('df_btcusd')
print ('Start: ',df_btcusd.iloc[0, 0])
print ('End: ',df_btcusd.iloc[798, 0])

# Obtain the beginning and end date of examples in df_btcusd
print ('df_tweet')
print ('Start: ',df_tweet.iloc[0, 0])
print ('End: ',df_tweet.iloc[16511, 0])

# Trim some head of df_tweet
#Obtain the row number in df_tweet for Start date of df_btcusd
print ('Row numbers in df_tweet: ', df_tweet[df_tweet['Date'] == '2021-10-12 00:00:00'].index)
trim_df_tweet = df_tweet.iloc[981:]

# Trim some tail of df_btcusd
#Obtain the row number in df_btcused for End date of df_tweet
print ('Row number in df_btcusd: ', df_btcusd[df_btcusd['Date'] == '2023-06-12 00:00:00'].index)
trim_df_btcusd = df_btcusd.iloc[:609]

df_btcusd
Start:  2021-10-12 00:00:00
End:  2023-12-19 00:00:00
df_tweet
Start:  2021-02-01 00:00:00
End:  2023-06-12 00:00:00
Row numbers in df_tweet:  Index([981, 982], dtype='int64')
Row number in df_btcusd:  Index([608], dtype='int64')


In [6]:
# Inner join the two trimmed dfs together on 'Date'
join_df = pd.merge(trim_df_tweet,trim_df_btcusd, on="Date" , how="inner")
print(join_df.shape)
join_df.head(1)

(15531, 18)


Unnamed: 0,Date,favorite_count,full_text,reply_count,retweet_count,clean_text,importance_coefficient,importance_coefficient_normalized,new_coins,scores,compound,sentiment_type,BTCOpen,BTCHigh,BTCLow,BTCClose,BTCAdjCLose,BTCVolume
0,2021-10-12,5,#bitcoin is king,1,0,bitcoin king,10.5,1.8e-05,(bitcoin),"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,NEUTRAL,57526.832031,57627.878906,54477.972656,56041.058594,56041.058594,41083758949


In [7]:
# Create normalised BTC Volume
min_val = join_df['BTCVolume'].min()
max_val = join_df['BTCVolume'].max()
join_df['normalized_vol'] = (join_df['BTCVolume'] - min_val) / (max_val - min_val)

#Create a new column for the next day's BTC-USD closing price
# Shift the 'Close' column by one row to get the next day's Close value
join_df['BTC-USD Next Day Close'] = join_df['BTCClose'].shift(-1)
join_df.head(1)


Unnamed: 0,Date,favorite_count,full_text,reply_count,retweet_count,clean_text,importance_coefficient,importance_coefficient_normalized,new_coins,scores,compound,sentiment_type,BTCOpen,BTCHigh,BTCLow,BTCClose,BTCAdjCLose,BTCVolume,normalized_vol,BTC-USD Next Day Close
0,2021-10-12,5,#bitcoin is king,1,0,bitcoin king,10.5,1.8e-05,(bitcoin),"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,NEUTRAL,57526.832031,57627.878906,54477.972656,56041.058594,56041.058594,41083758949,0.304061,56041.058594


In [13]:
# Replace the NaN value in the last row of 'Next Day Close' column with 43,652.25
join_df.loc[join_df['Date'] == '2023-12-19', 'BTC-USD Next Day Close'] = 43652.25
join_df

Unnamed: 0,Date,favorite_count,full_text,reply_count,retweet_count,clean_text,importance_coefficient,importance_coefficient_normalized,new_coins,scores,...,sentiment_type,BTCOpen,BTCHigh,BTCLow,BTCClose,BTCAdjCLose,BTCVolume,normalized_vol,BTC-USD Next Day Close,Output
0,2021-10-12,5,#bitcoin is king,1,0,bitcoin king,10.5,0.000018,(bitcoin),"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",...,NEUTRAL,57526.832031,57627.878906,54477.972656,56041.058594,56041.058594,41083758949,0.304061,56041.058594,0
1,2021-10-12,1296,#fantom is one of the most mentioned ecosystem...,61,385,fantom one mentioned ecosystem recently let se...,3007.5,0.005199,"(fantom,ftm)","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",...,NEUTRAL,57526.832031,57627.878906,54477.972656,56041.058594,56041.058594,41083758949,0.304061,57401.097656,1
2,2021-10-13,0,rt @coin98_wallet: ð¥ boom! the coin98 walle...,0,155,rt coin98_wallet ð boom coin98 wallet universe...,155.0,0.000268,(coin98),"{'neg': 0.0, 'neu': 0.87, 'pos': 0.13, 'compou...",...,POSITIVE,56038.257813,57688.660156,54370.972656,57401.097656,57401.097656,41684252783,0.309425,57401.097656,0
3,2021-10-13,15,memecoin $shibâs price has increased by more...,1,6,memecoin shibâs price increased 300 past month...,36.5,0.000063,"(doge,btc,shiba)","{'neg': 0.048, 'neu': 0.873, 'pos': 0.079, 'co...",...,POSITIVE,56038.257813,57688.660156,54370.972656,57401.097656,57401.097656,41684252783,0.309425,57401.097656,0
4,2021-10-13,54,#bitcoin below 100k is cheap,2,4,bitcoin 100k cheap,113.0,0.000195,(bitcoin),"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",...,NEUTRAL,56038.257813,57688.660156,54370.972656,57401.097656,57401.097656,41684252783,0.309425,57321.523438,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15526,2023-06-12,3,booomð¥\n\nour #ai bot/indicator crushes ano...,5,0,booomð ai botindicator crush another lina trad...,8.5,0.000015,"(doge,hbar,inj,usdt,matic,ftm)","{'neg': 0.092, 'neu': 0.75, 'pos': 0.158, 'com...",...,POSITIVE,25934.285156,26087.919922,25675.197266,25902.500000,25902.500000,11677889997,0.041385,25902.500000,0
15527,2023-06-12,0,rt @crypto_crib_: the deadline is today for bi...,0,4,rt crypto_crib_ deadline today binance binance...,4.0,0.000007,"(binance,request)","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",...,NEUTRAL,25934.285156,26087.919922,25675.197266,25902.500000,25902.500000,11677889997,0.041385,25902.500000,0
15528,2023-06-12,0,rt @crypto_crib_: ð²chinese bank boci issues...,0,8,rt crypto_crib_ ð²chinese bank boci issue coun...,8.0,0.000014,(ethereum),"{'neg': 0.0, 'neu': 0.844, 'pos': 0.156, 'comp...",...,POSITIVE,25934.285156,26087.919922,25675.197266,25902.500000,25902.500000,11677889997,0.041385,25902.500000,0
15529,2023-06-12,56,"bitcoin, not crypto.\n\ncrypto, not security.",16,7,bitcoin crypto crypto security,127.0,0.000220,(bitcoin),"{'neg': 0.289, 'neu': 0.711, 'pos': 0.0, 'comp...",...,NEGATIVE,25934.285156,26087.919922,25675.197266,25902.500000,25902.500000,11677889997,0.041385,25902.500000,0


In [14]:
join_df['Output'] = pd.cut(join_df['BTC-USD Next Day Close'] - join_df['BTCClose'], bins=[float('-inf'), 0, float('inf')], labels=[0, 1])
join_df['Output'] = join_df['Output'].cat.codes
join_df

Unnamed: 0,Date,favorite_count,full_text,reply_count,retweet_count,clean_text,importance_coefficient,importance_coefficient_normalized,new_coins,scores,...,sentiment_type,BTCOpen,BTCHigh,BTCLow,BTCClose,BTCAdjCLose,BTCVolume,normalized_vol,BTC-USD Next Day Close,Output
0,2021-10-12,5,#bitcoin is king,1,0,bitcoin king,10.5,0.000018,(bitcoin),"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",...,NEUTRAL,57526.832031,57627.878906,54477.972656,56041.058594,56041.058594,41083758949,0.304061,56041.058594,0
1,2021-10-12,1296,#fantom is one of the most mentioned ecosystem...,61,385,fantom one mentioned ecosystem recently let se...,3007.5,0.005199,"(fantom,ftm)","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",...,NEUTRAL,57526.832031,57627.878906,54477.972656,56041.058594,56041.058594,41083758949,0.304061,57401.097656,1
2,2021-10-13,0,rt @coin98_wallet: ð¥ boom! the coin98 walle...,0,155,rt coin98_wallet ð boom coin98 wallet universe...,155.0,0.000268,(coin98),"{'neg': 0.0, 'neu': 0.87, 'pos': 0.13, 'compou...",...,POSITIVE,56038.257813,57688.660156,54370.972656,57401.097656,57401.097656,41684252783,0.309425,57401.097656,0
3,2021-10-13,15,memecoin $shibâs price has increased by more...,1,6,memecoin shibâs price increased 300 past month...,36.5,0.000063,"(doge,btc,shiba)","{'neg': 0.048, 'neu': 0.873, 'pos': 0.079, 'co...",...,POSITIVE,56038.257813,57688.660156,54370.972656,57401.097656,57401.097656,41684252783,0.309425,57401.097656,0
4,2021-10-13,54,#bitcoin below 100k is cheap,2,4,bitcoin 100k cheap,113.0,0.000195,(bitcoin),"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",...,NEUTRAL,56038.257813,57688.660156,54370.972656,57401.097656,57401.097656,41684252783,0.309425,57321.523438,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15526,2023-06-12,3,booomð¥\n\nour #ai bot/indicator crushes ano...,5,0,booomð ai botindicator crush another lina trad...,8.5,0.000015,"(doge,hbar,inj,usdt,matic,ftm)","{'neg': 0.092, 'neu': 0.75, 'pos': 0.158, 'com...",...,POSITIVE,25934.285156,26087.919922,25675.197266,25902.500000,25902.500000,11677889997,0.041385,25902.500000,0
15527,2023-06-12,0,rt @crypto_crib_: the deadline is today for bi...,0,4,rt crypto_crib_ deadline today binance binance...,4.0,0.000007,"(binance,request)","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",...,NEUTRAL,25934.285156,26087.919922,25675.197266,25902.500000,25902.500000,11677889997,0.041385,25902.500000,0
15528,2023-06-12,0,rt @crypto_crib_: ð²chinese bank boci issues...,0,8,rt crypto_crib_ ð²chinese bank boci issue coun...,8.0,0.000014,(ethereum),"{'neg': 0.0, 'neu': 0.844, 'pos': 0.156, 'comp...",...,POSITIVE,25934.285156,26087.919922,25675.197266,25902.500000,25902.500000,11677889997,0.041385,25902.500000,0
15529,2023-06-12,56,"bitcoin, not crypto.\n\ncrypto, not security.",16,7,bitcoin crypto crypto security,127.0,0.000220,(bitcoin),"{'neg': 0.289, 'neu': 0.711, 'pos': 0.0, 'comp...",...,NEGATIVE,25934.285156,26087.919922,25675.197266,25902.500000,25902.500000,11677889997,0.041385,25902.500000,0


In [15]:
join_df_subset = join_df[['clean_text','importance_coefficient_normalized','normalized_vol','Output']]
join_df_subset.head(3)

Unnamed: 0,clean_text,importance_coefficient_normalized,normalized_vol,Output
0,bitcoin king,1.8e-05,0.304061,0
1,fantom one mentioned ecosystem recently let se...,0.005199,0.304061,1
2,rt coin98_wallet ð boom coin98 wallet universe...,0.000268,0.309425,0


In [16]:
join_df_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15531 entries, 0 to 15530
Data columns (total 4 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   clean_text                         15531 non-null  object 
 1   importance_coefficient_normalized  15531 non-null  float64
 2   normalized_vol                     15531 non-null  float64
 3   Output                             15531 non-null  int8   
dtypes: float64(2), int8(1), object(1)
memory usage: 379.3+ KB


#### Extracting Dataset for Vinny

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(join_df_subset['clean_text'])
sequences = tokenizer.texts_to_sequences(join_df_subset['clean_text'])
max_len = max([len(seq) for seq in sequences])
tokenised_clean_text = pad_sequences(sequences, maxlen=max_len)

tokenised_clean_text

2024-04-17 13:15:43.550999: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-17 13:15:43.554777: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-17 13:15:43.601526: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


array([[    0,     0,     0, ...,     0,     2,  1640],
       [    0,     0,     0, ...,   299,   299, 11379],
       [    0,     0,     0, ...,  2134,   402,  3199],
       ...,
       [    0,     0,     0, ...,    31,   213, 34513],
       [    0,     0,     0, ...,     9,     9,   210],
       [    0,     0,     0, ...,  5588,    68, 34514]], dtype=int32)

In [13]:
join_df['tokenised_clean_text'] = tokenised_clean_text.tolist()

In [14]:
join_df

Unnamed: 0,Date,favorite_count,full_text,reply_count,retweet_count,clean_text,importance_coefficient,importance_coefficient_normalized,new_coins,scores,...,BTCOpen,BTCHigh,BTCLow,BTCClose,BTCAdjCLose,BTCVolume,normalized_vol,BTC-USD Next Day Close,Output,tokenised_clean_text
0,2021-10-12,5,#bitcoin is king,1,0,bitcoin king,10.5,0.000018,(bitcoin),"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",...,57526.832031,57627.878906,54477.972656,56041.058594,56041.058594,41083758949,0.304061,56041.058594,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2021-10-12,1296,#fantom is one of the most mentioned ecosystem...,61,385,fantom one mentioned ecosystem recently let se...,3007.5,0.005199,"(fantom,ftm)","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",...,57526.832031,57627.878906,54477.972656,56041.058594,56041.058594,41083758949,0.304061,57401.097656,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2021-10-13,0,rt @coin98_wallet: ð¥ boom! the coin98 walle...,0,155,rt coin98_wallet ð boom coin98 wallet universe...,155.0,0.000268,(coin98),"{'neg': 0.0, 'neu': 0.87, 'pos': 0.13, 'compou...",...,56038.257813,57688.660156,54370.972656,57401.097656,57401.097656,41684252783,0.309425,57401.097656,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,2021-10-13,15,memecoin $shibâs price has increased by more...,1,6,memecoin shibâs price increased 300 past month...,36.5,0.000063,"(doge,btc,shiba)","{'neg': 0.048, 'neu': 0.873, 'pos': 0.079, 'co...",...,56038.257813,57688.660156,54370.972656,57401.097656,57401.097656,41684252783,0.309425,57401.097656,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,2021-10-13,54,#bitcoin below 100k is cheap,2,4,bitcoin 100k cheap,113.0,0.000195,(bitcoin),"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",...,56038.257813,57688.660156,54370.972656,57401.097656,57401.097656,41684252783,0.309425,57321.523438,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15526,2023-06-12,3,booomð¥\n\nour #ai bot/indicator crushes ano...,5,0,booomð ai botindicator crush another lina trad...,8.5,0.000015,"(doge,hbar,inj,usdt,matic,ftm)","{'neg': 0.092, 'neu': 0.75, 'pos': 0.158, 'com...",...,25934.285156,26087.919922,25675.197266,25902.500000,25902.500000,11677889997,0.041385,25902.500000,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1372, 170, 34508, ..."
15527,2023-06-12,0,rt @crypto_crib_: the deadline is today for bi...,0,4,rt crypto_crib_ deadline today binance binance...,4.0,0.000007,"(binance,request)","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",...,25934.285156,26087.919922,25675.197266,25902.500000,25902.500000,11677889997,0.041385,25902.500000,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
15528,2023-06-12,0,rt @crypto_crib_: ð²chinese bank boci issues...,0,8,rt crypto_crib_ ð²chinese bank boci issue coun...,8.0,0.000014,(ethereum),"{'neg': 0.0, 'neu': 0.844, 'pos': 0.156, 'comp...",...,25934.285156,26087.919922,25675.197266,25902.500000,25902.500000,11677889997,0.041385,25902.500000,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
15529,2023-06-12,56,"bitcoin, not crypto.\n\ncrypto, not security.",16,7,bitcoin crypto crypto security,127.0,0.000220,(bitcoin),"{'neg': 0.289, 'neu': 0.711, 'pos': 0.0, 'comp...",...,25934.285156,26087.919922,25675.197266,25902.500000,25902.500000,11677889997,0.041385,25902.500000,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [15]:
vinny_data = join_df[['Date','clean_text','tokenised_clean_text','importance_coefficient','importance_coefficient_normalized','BTCVolume','normalized_vol','BTC-USD Next Day Close','Output']]
vinny_data

Unnamed: 0,Date,clean_text,tokenised_clean_text,importance_coefficient,importance_coefficient_normalized,BTCVolume,normalized_vol,BTC-USD Next Day Close,Output
0,2021-10-12,bitcoin king,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",10.5,0.000018,41083758949,0.304061,56041.058594,0
1,2021-10-12,fantom one mentioned ecosystem recently let se...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3007.5,0.005199,41083758949,0.304061,57401.097656,1
2,2021-10-13,rt coin98_wallet ð boom coin98 wallet universe...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",155.0,0.000268,41684252783,0.309425,57401.097656,0
3,2021-10-13,memecoin shibâs price increased 300 past month...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",36.5,0.000063,41684252783,0.309425,57401.097656,0
4,2021-10-13,bitcoin 100k cheap,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",113.0,0.000195,41684252783,0.309425,57321.523438,0
...,...,...,...,...,...,...,...,...,...
15526,2023-06-12,booomð ai botindicator crush another lina trad...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1372, 170, 34508, ...",8.5,0.000015,11677889997,0.041385,25902.500000,0
15527,2023-06-12,rt crypto_crib_ deadline today binance binance...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4.0,0.000007,11677889997,0.041385,25902.500000,0
15528,2023-06-12,rt crypto_crib_ ð²chinese bank boci issue coun...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8.0,0.000014,11677889997,0.041385,25902.500000,0
15529,2023-06-12,bitcoin crypto crypto security,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",127.0,0.000220,11677889997,0.041385,25902.500000,0


In [16]:

vinny_data.rename(columns={'normalized_vol': 'BTCVolume_normalized'}, inplace=True)


vinny_data.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vinny_data.rename(columns={'normalized_vol': 'BTCVolume_normalized'}, inplace=True)


Unnamed: 0,Date,clean_text,tokenised_clean_text,importance_coefficient,importance_coefficient_normalized,BTCVolume,BTCVolume_normalized,BTC-USD Next Day Close,Output
0,2021-10-12,bitcoin king,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",10.5,1.8e-05,41083758949,0.304061,56041.058594,0
1,2021-10-12,fantom one mentioned ecosystem recently let se...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3007.5,0.005199,41083758949,0.304061,57401.097656,1
2,2021-10-13,rt coin98_wallet ð boom coin98 wallet universe...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",155.0,0.000268,41684252783,0.309425,57401.097656,0
3,2021-10-13,memecoin shibâs price increased 300 past month...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",36.5,6.3e-05,41684252783,0.309425,57401.097656,0
4,2021-10-13,bitcoin 100k cheap,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",113.0,0.000195,41684252783,0.309425,57321.523438,0


In [17]:
vinny_data.to_csv('vinny_data.csv', index=False)

# Model

## LSTM Approach

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

### Taking Tweet Importance + BTC Volume + Clean Text as Features

In [18]:
tokenizer = Tokenizer()

vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
max_len = 100

# Tokenizes texts into numbers
tokenizer.fit_on_texts(join_df_subset['clean_text'])
sequences = tokenizer.texts_to_sequences(join_df_subset['clean_text'])
max_len = max([len(seq) for seq in sequences])
tweet_data = pad_sequences(sequences, maxlen=max_len)

# Scale numerical features
scaler = StandardScaler()
numerical_data = scaler.fit_transform(join_df_subset[['importance_coefficient_normalized','normalized_vol']])

# Combine text and numerical data
X = np.hstack((tweet_data, numerical_data))

# Target variable
y = join_df_subset['Output']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the RNN model
model = Sequential()
# model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim)) 
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_shape=(max_len,))) 
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True , kernel_regularizer=regularizers.l2(0.01),))  # Add another LSTM layer
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2, kernel_regularizer=regularizers.l2(0.05)))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)  # Change learning rate
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model  .fit(X_train, y_train, epochs=10, batch_size=10, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

# Make predictions
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype("int32")

# Round y_pred_prob to obtain y_pred
y_pred = y_pred.round()
# Print classification report
print(classification_report(y_test, y_pred))



Epoch 1/10


  super().__init__(**kwargs)
2024-04-17 19:54:07.442585: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


[1m1243/1243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 26ms/step - accuracy: 0.9808 - loss: 0.9816 - val_accuracy: 0.9797 - val_loss: 0.0996
Epoch 2/10
[1m1243/1243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 25ms/step - accuracy: 0.9824 - loss: 0.0891 - val_accuracy: 0.9797 - val_loss: 0.0997
Epoch 3/10
[1m1243/1243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 26ms/step - accuracy: 0.9832 - loss: 0.0855 - val_accuracy: 0.9797 - val_loss: 0.0993
Epoch 4/10
[1m1243/1243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 26ms/step - accuracy: 0.9814 - loss: 0.0933 - val_accuracy: 0.9797 - val_loss: 0.1016
Epoch 5/10
[1m1243/1243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 26ms/step - accuracy: 0.9817 - loss: 0.0916 - val_accuracy: 0.9797 - val_loss: 0.1000
Epoch 6/10
[1m1243/1243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 25ms/step - accuracy: 0.9818 - loss: 0.0914 - val_accuracy: 0.9797 - val_loss: 0.0991
Epoch 7/10
[1m

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [29]:
# Backtesting

import re
def remove_special_characters(text):
    pattern = r'[^a-zA-Z0-9\s]'
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text

df_backtest_subset = pd.read_csv('new_and_tweets_backtest.csv')
df_backtest_subset['cleaned_tweets'] = df_backtest_subset['tweets'].apply(lambda x: remove_special_characters(x))
df_backtest_subset['cleaned_news'] = df_backtest_subset['news'].apply(lambda x: remove_special_characters(x))



# Tokenizes texts into numbers
tokenizer.fit_on_texts(df_backtest_subset['cleaned_news'])
sequences = tokenizer.texts_to_sequences(df_backtest_subset['cleaned_news'])
max_len = max([len(seq) for seq in sequences])
tweet_data = pad_sequences(sequences, maxlen=max_len)

# Target variable
# y = df_backtest_subset['Output']

# Evaluate the model
# loss, accuracy = model.evaluate(X_test, y_test)
# print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

# Make predictions
y_pred_prob = model.predict(tweet_data)
y_pred = (y_pred_prob > 0.5).astype("int32")

# Round y_pred_prob to obtain y_pred
y_pred = y_pred.round()
print(y_pred)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[[0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]]


In [22]:
import pickle

# Save the model
model.save("model_tweet.h5")

# Save the tokenizer
with open('tokenizer_tweet.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)




# Backtesting

In [19]:
df_backtest_subset = pd.read_csv('1new_and_tweets_backtest.csv')
#Standardise date format
df_backtest_subset['Date'] = pd.to_datetime(df_backtest_subset.Date)

df_backtest_subset.info()
df_backtest_subset.tail(3)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               31 non-null     datetime64[ns]
 1   news               31 non-null     object        
 2   news_polarity      31 non-null     float64       
 3   news_subjectivity  31 non-null     float64       
 4   news_sentiment     31 non-null     object        
 5   tweets             31 non-null     object        
 6   source             31 non-null     object        
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 1.8+ KB


Unnamed: 0,Date,news,news_polarity,news_subjectivity,news_sentiment,tweets,source
28,2020-07-02,"Nexus Mutual is seeing a surge in demand. ""Our...",0.3,0.95,Neutral,"2020 is 2016 If you know, you know. If you don...",https://x.com/RookieXBT/status/127841753021485...
29,2020-07-03,"The crypto market is bloody today, with all th...",-0.32,0.38,Negative,Few understand this.. yet.. #Bitcoin is ready ...,https://x.com/RD_btc/status/1278799043280482305
30,2020-07-04,ICON reveals exciting news for the community. ...,0.18,0.51,Neutral,Why trust banks when #bitcoin requires no trust?,https://twitter.com/LuchoPoletti/status/127915...


In [20]:
# Get bitcoin data
bitcoin_data = yf.download("BTC-USD", start="2020-06-04", end="2020-07-04")
bitcoin_data.info()
bitcoin_data.head(1)

[*********************100%%**********************]  1 of 1 completed

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 30 entries, 2020-06-04 to 2020-07-03
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       30 non-null     float64
 1   High       30 non-null     float64
 2   Low        30 non-null     float64
 3   Close      30 non-null     float64
 4   Adj Close  30 non-null     float64
 5   Volume     30 non-null     int64  
dtypes: float64(5), int64(1)
memory usage: 1.6 KB





Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-06-04,9655.854492,9887.610352,9525.24707,9800.636719,9800.636719,25921805072


In [21]:
# Inner join
backtest_join_df = pd.merge(df_backtest_subset,bitcoin_data, on="Date" , how="inner")
print(join_df.shape)
backtest_join_df.head(1)

(15531, 21)


Unnamed: 0,Date,news,news_polarity,news_subjectivity,news_sentiment,tweets,source,Open,High,Low,Close,Adj Close,Volume
0,2020-06-04,"DeFi stands for decentralized finance, and itâ...",-0.03,0.07,Negative,Nothing more bullish for #gold and #bitcoinBeg...,https://twitter.com/DTAPCAP/status/12685922950...,9655.854492,9887.610352,9525.24707,9800.636719,9800.636719,25921805072


In [22]:
#Create a new column for the next day's BTC-USD closing price
# Shift the 'Close' column by one row to get the next day's Close value
backtest_join_df['BTC-USD Next Day Close'] = backtest_join_df['Close'].shift(-1)
backtest_join_df.head(1)

# Replace the NaN value in the last row 
backtest_join_df.loc[backtest_join_df['Date'] == '2020-07-02', 'BTC-USD Next Day Close'] = 0.095603

backtest_join_df['Output'] = pd.cut(backtest_join_df['BTC-USD Next Day Close'] - backtest_join_df['Close'], bins=[float('-inf'), 0, float('inf')], labels=[0, 1])
backtest_join_df['Output'] = backtest_join_df['Output'].cat.codes
backtest_join_df.to_csv('news_tweets_output_backtest.csv', index=False)




In [23]:

# Tokenizes texts into numbers
tokenizer.fit_on_texts(backtest_join_df['news'])
sequences = tokenizer.texts_to_sequences(backtest_join_df['news'])
max_len = max([len(seq) for seq in sequences])
tweet_data = pad_sequences(sequences, maxlen=max_len)

# Target variable
y = backtest_join_df['Output']

# # Evaluate the model
# loss, accuracy = model.evaluate(X_test, y_test)
# print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

# Make predictions
y_pred_prob = model.predict(tweet_data)
y_pred = (y_pred_prob > 0.5).astype("int32")

# Round y_pred_prob to obtain y_pred
y_pred = y_pred.round()

# Print classification report
print(classification_report(y_pred, y))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 258ms/step
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           0       1.00      0.57      0.72        30
           1       0.00      0.00      0.00         0

    accuracy                           0.57        30
   macro avg       0.33      0.19      0.24        30
weighted avg       1.00      0.57      0.72        30



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Only clean text as features

In [41]:
tokenizer = Tokenizer()

vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
max_len = 100


tokenizer.fit_on_texts(join_df_subset['clean_text'])
sequences = tokenizer.texts_to_sequences(join_df_subset['clean_text'])
max_len = max([len(seq) for seq in sequences])
tweet_data = pad_sequences(sequences, maxlen=max_len)

# Scale numerical features
scaler = StandardScaler()
numerical_data = scaler.fit_transform(join_df_subset[['importance_coefficient_normalized','normalized_vol']])

# Combine text and numerical data
# X = np.hstack((tweet_data, numerical_data))
X = tweet_data

# Target variable
y = join_df_subset['Output']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the RNN model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim)) 
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True , kernel_regularizer=regularizers.l2(0.01),))  # Add another LSTM layer
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2, kernel_regularizer=regularizers.l2(0.05)))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)  # Change learning rate
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=10, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

# Make predictions
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype("int32")

# Round y_pred_prob to obtain y_pred
y_pred = y_pred.round()
# Print classification report
print(classification_report(y_test, y_pred))


Epoch 1/10
[1m1243/1243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 25ms/step - accuracy: 0.9779 - loss: 0.9592 - val_accuracy: 0.9797 - val_loss: 0.0998
Epoch 2/10
[1m1243/1243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 25ms/step - accuracy: 0.9824 - loss: 0.0894 - val_accuracy: 0.9797 - val_loss: 0.1024
Epoch 3/10
[1m1243/1243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 25ms/step - accuracy: 0.9835 - loss: 0.0855 - val_accuracy: 0.9797 - val_loss: 0.0991
Epoch 4/10
[1m1243/1243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 25ms/step - accuracy: 0.9817 - loss: 0.0904 - val_accuracy: 0.9797 - val_loss: 0.0995
Epoch 5/10
[1m1243/1243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 25ms/step - accuracy: 0.9842 - loss: 0.0811 - val_accuracy: 0.9797 - val_loss: 0.1007
Epoch 6/10
[1m1243/1243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 25ms/step - accuracy: 0.9821 - loss: 0.0903 - val_accuracy: 0.9797 - val_loss: 0.1020
Epoc

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pickle

# Save the model
model.save("model_tweet_textonly.h5")

# Save the tokenizer
with open('tokenizer_tweet_textonly.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


# News

## Data

In [43]:
df_news = pd.read_csv('cryptonews.csv')
# del df_tweet['Unnamed: 0']

#Standardise date format
df_news['date'] = df_news['date'].apply(lambda x: x[:10] if isinstance(x, str) else None)
df_news['date'] = pd.to_datetime(df_news.date, format='mixed')

# Sort rows in ascending date
df_news = df_news.sort_values(by='date')
df_news = df_news.rename(columns={'date': 'Date'})


df_news.head(3)

Unnamed: 0,Date,sentiment,source,subject,text,title,url
31036,2021-10-12,"{'class': 'positive', 'polarity': 0.16, 'subje...",CryptoNews,blockchain,"Within a little more than a year, Celo aims to...","Celo to Be Fastest EVM Chain by End of 2022, C...",https://cryptonews.com/news/celo-to-be-fastest...
31035,2021-10-15,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CryptoNews,blockchain,Chinese companies are still topping the blockc...,Tech Crackdown Hasn't Halted Chinese Firms' Bl...,https://cryptonews.com/news/tech-crackdown-has...
31034,2021-10-18,"{'class': 'positive', 'polarity': 0.14, 'subje...",CryptoNews,blockchain,Advancing its project to become \x9caÂ\xa0meta...,"Facebook To Add 10,000 Jobs In EU For Metavers...",https://cryptonews.com/news/facebook-to-add-10...


In [44]:
df_news.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31037 entries, 31036 to 0
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       31037 non-null  datetime64[ns]
 1   sentiment  31037 non-null  object        
 2   source     31037 non-null  object        
 3   subject    31037 non-null  object        
 4   text       31037 non-null  object        
 5   title      31037 non-null  object        
 6   url        31037 non-null  object        
dtypes: datetime64[ns](1), object(6)
memory usage: 1.9+ MB


In [45]:
#Obtain the beginning and end date of examples in df_btcusd
print ('df_btcusd')
print ('Start: ',df_btcusd.iloc[0, 0])
print ('End: ',df_btcusd.iloc[798, 0])

# Obtain the beginning and end date of examples in df_btcusd
print ('df_news')
print ('Start: ',df_news.iloc[0, 0])
print ('End: ',df_news.iloc[31036, 0])

df_btcusd
Start:  2021-10-12 00:00:00
End:  2023-12-19 00:00:00
df_news
Start:  2021-10-12 00:00:00
End:  2023-12-19 00:00:00


In [46]:
# Inner join the two trimmed dfs together on 'Date'
news_join_df = pd.merge(df_news,trim_df_btcusd, on="Date" , how="inner")
# Create a new column for text+title
news_join_df['title_text'] = news_join_df['title']+news_join_df['text']
print(news_join_df.shape)
news_join_df.head(1)

(21568, 14)


Unnamed: 0,Date,sentiment,source,subject,text,title,url,BTCOpen,BTCHigh,BTCLow,BTCClose,BTCAdjCLose,BTCVolume,title_text
0,2021-10-12,"{'class': 'positive', 'polarity': 0.16, 'subje...",CryptoNews,blockchain,"Within a little more than a year, Celo aims to...","Celo to Be Fastest EVM Chain by End of 2022, C...",https://cryptonews.com/news/celo-to-be-fastest...,57526.832031,57627.878906,54477.972656,56041.058594,56041.058594,41083758949,"Celo to Be Fastest EVM Chain by End of 2022, C..."


In [47]:
# Create normalised BTC Volume
min_val = join_df['BTCVolume'].min()
max_val = join_df['BTCVolume'].max()
news_join_df['normalized_vol'] = (news_join_df['BTCVolume'] - min_val) / (max_val - min_val)

#Create a new column for the next day's BTC-USD closing price
# Shift the 'Close' column by one row to get the next day's Close value
news_join_df['BTC-USD Next Day Close'] = news_join_df['BTCClose'].shift(-1)
news_join_df.head(1)


Unnamed: 0,Date,sentiment,source,subject,text,title,url,BTCOpen,BTCHigh,BTCLow,BTCClose,BTCAdjCLose,BTCVolume,title_text,normalized_vol,BTC-USD Next Day Close
0,2021-10-12,"{'class': 'positive', 'polarity': 0.16, 'subje...",CryptoNews,blockchain,"Within a little more than a year, Celo aims to...","Celo to Be Fastest EVM Chain by End of 2022, C...",https://cryptonews.com/news/celo-to-be-fastest...,57526.832031,57627.878906,54477.972656,56041.058594,56041.058594,41083758949,"Celo to Be Fastest EVM Chain by End of 2022, C...",0.304061,61593.949219


In [48]:

# Replace the NaN value in the last row of 'Next Day Close' column with 43,652.25
news_join_df.loc[news_join_df['Date'] == '2023-12-19', 'BTC-USD Next Day Close'] = 43652.25
news_join_df
news_join_df['Output'] = pd.cut(news_join_df['BTC-USD Next Day Close'] - news_join_df['BTCClose'], bins=[float('-inf'), 0, float('inf')], labels=[0, 1])
news_join_df['Output'] = news_join_df['Output'].cat.codes
news_join_df.head(20)

Unnamed: 0,Date,sentiment,source,subject,text,title,url,BTCOpen,BTCHigh,BTCLow,BTCClose,BTCAdjCLose,BTCVolume,title_text,normalized_vol,BTC-USD Next Day Close,Output
0,2021-10-12,"{'class': 'positive', 'polarity': 0.16, 'subje...",CryptoNews,blockchain,"Within a little more than a year, Celo aims to...","Celo to Be Fastest EVM Chain by End of 2022, C...",https://cryptonews.com/news/celo-to-be-fastest...,57526.832031,57627.878906,54477.972656,56041.058594,56041.058594,41083758949,"Celo to Be Fastest EVM Chain by End of 2022, C...",0.304061,61593.949219,1
1,2021-10-15,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CryptoNews,blockchain,Chinese companies are still topping the blockc...,Tech Crackdown Hasn't Halted Chinese Firms' Bl...,https://cryptonews.com/news/tech-crackdown-has...,57345.902344,62757.128906,56868.144531,61593.949219,61593.949219,51780081801,Tech Crackdown Hasn't Halted Chinese Firms' Bl...,0.399608,62026.078125,1
2,2021-10-18,"{'class': 'positive', 'polarity': 0.14, 'subje...",CryptoNews,blockchain,Advancing its project to become \x9caÂ\xa0meta...,"Facebook To Add 10,000 Jobs In EU For Metavers...",https://cryptonews.com/news/facebook-to-add-10...,61548.804688,62614.660156,60012.757813,62026.078125,62026.078125,38055562075,"Facebook To Add 10,000 Jobs In EU For Metavers...",0.277011,64261.992188,1
3,2021-10-19,"{'class': 'positive', 'polarity': 0.1, 'subjec...",CryptoNews,blockchain,Banque de France disclosed the results of its ...,French Central Bank's Blockchain Bond Trial Br...,https://cryptonews.com/news/french-central-ban...,62043.164063,64434.535156,61622.933594,64261.992188,64261.992188,40471196346,French Central Bank's Blockchain Bond Trial Br...,0.298589,58482.386719,0
4,2021-10-27,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CryptoNews,defi,Cream Finance (CREAM) suffered another flash l...,Cream Finance Suffers Another Exploit as Attac...,https://cryptonews.com/news/cream-finance-suff...,60352.0,61435.183594,58208.1875,58482.386719,58482.386719,43657076893,Cream Finance Suffers Another Exploit as Attac...,0.327047,62227.964844,1
5,2021-10-29,"{'class': 'positive', 'polarity': 0.2, 'subjec...",CryptoNews,defi,The crypto community has issued a withering re...,FATF Wants to 'Gut' DeFi with 'Vague' New Guid...,https://cryptonews.com/news/fatf-wants-to-gut-...,60624.871094,62927.609375,60329.964844,62227.964844,62227.964844,36856881767,FATF Wants to 'Gut' DeFi with 'Vague' New Guid...,0.266303,61004.40625,0
6,2021-11-01,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CryptoNews,defi,'This is finally getting to the point where cr...,Google's Parent Increases its Crypto Bet by Jo...,https://cryptonews.com/news/google-increases-i...,61320.449219,62419.003906,59695.183594,61004.40625,61004.40625,36150572843,Google's Parent Increases its Crypto Bet by Jo...,0.259994,63226.402344,1
7,2021-11-02,"{'class': 'positive', 'polarity': 0.15, 'subje...",CryptoNews,nft,"The buyer, confronting an over 99% discount on...",CryptoPunk Mistakenly Sells at Over 99% Discou...,https://cryptonews.com/news/cryptopunk-mistake...,60963.253906,64242.792969,60673.054688,63226.402344,63226.402344,37746665647,CryptoPunk Mistakenly Sells at Over 99% Discou...,0.274251,63226.402344,0
8,2021-11-02,"{'class': 'negative', 'polarity': -0.2, 'subje...",CryptoNews,nft,'Each NFT at auction contains 'secret' content...,SCRT Rallies As Quentin Tarantino Releases NFT...,https://cryptonews.com/news/scrt-rallies-as-qu...,60963.253906,64242.792969,60673.054688,63226.402344,63226.402344,37746665647,SCRT Rallies As Quentin Tarantino Releases NFT...,0.274251,62970.046875,0
9,2021-11-03,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CryptoNews,nft,The Matrix NFT owners will be given the choice...,NFTs Tied to 'The Matrix Resurrections' to Lau...,https://cryptonews.com/news/nfts-tied-to-the-m...,63254.335938,63516.9375,61184.238281,62970.046875,62970.046875,36124731509,NFTs Tied to 'The Matrix Resurrections' to Lau...,0.259763,62970.046875,0


### Extracting Data for Vinny

In [49]:
news_join_df.head(1)

Unnamed: 0,Date,sentiment,source,subject,text,title,url,BTCOpen,BTCHigh,BTCLow,BTCClose,BTCAdjCLose,BTCVolume,title_text,normalized_vol,BTC-USD Next Day Close,Output
0,2021-10-12,"{'class': 'positive', 'polarity': 0.16, 'subje...",CryptoNews,blockchain,"Within a little more than a year, Celo aims to...","Celo to Be Fastest EVM Chain by End of 2022, C...",https://cryptonews.com/news/celo-to-be-fastest...,57526.832031,57627.878906,54477.972656,56041.058594,56041.058594,41083758949,"Celo to Be Fastest EVM Chain by End of 2022, C...",0.304061,61593.949219,1


In [50]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(news_join_df['title_text'])
sequences = tokenizer.texts_to_sequences(news_join_df['title_text'])
max_len = max([len(seq) for seq in sequences])
tokenised_clean_text = pad_sequences(sequences, maxlen=max_len)

tokenised_clean_text

array([[    0,     0,     0, ...,  2494,  3359,  2106],
       [    0,     0,     0, ...,    28,  6628,  2241],
       [    0,     0,     0, ...,   128, 10200,  1677],
       ...,
       [    0,     0,     0, ...,  6490,   338,   261],
       [    0,     0,     0, ...,    37,     1,   797],
       [    0,     0,     0, ...,   541,    47,  2067]], dtype=int32)

In [51]:
news_join_df['tokenised_clean_text'] = tokenised_clean_text.tolist()
news_join_df.head()

Unnamed: 0,Date,sentiment,source,subject,text,title,url,BTCOpen,BTCHigh,BTCLow,BTCClose,BTCAdjCLose,BTCVolume,title_text,normalized_vol,BTC-USD Next Day Close,Output,tokenised_clean_text
0,2021-10-12,"{'class': 'positive', 'polarity': 0.16, 'subje...",CryptoNews,blockchain,"Within a little more than a year, Celo aims to...","Celo to Be Fastest EVM Chain by End of 2022, C...",https://cryptonews.com/news/celo-to-be-fastest...,57526.832031,57627.878906,54477.972656,56041.058594,56041.058594,41083758949,"Celo to Be Fastest EVM Chain by End of 2022, C...",0.304061,61593.949219,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2021-10-15,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CryptoNews,blockchain,Chinese companies are still topping the blockc...,Tech Crackdown Hasn't Halted Chinese Firms' Bl...,https://cryptonews.com/news/tech-crackdown-has...,57345.902344,62757.128906,56868.144531,61593.949219,61593.949219,51780081801,Tech Crackdown Hasn't Halted Chinese Firms' Bl...,0.399608,62026.078125,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2021-10-18,"{'class': 'positive', 'polarity': 0.14, 'subje...",CryptoNews,blockchain,Advancing its project to become \x9caÂ\xa0meta...,"Facebook To Add 10,000 Jobs In EU For Metavers...",https://cryptonews.com/news/facebook-to-add-10...,61548.804688,62614.660156,60012.757813,62026.078125,62026.078125,38055562075,"Facebook To Add 10,000 Jobs In EU For Metavers...",0.277011,64261.992188,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,2021-10-19,"{'class': 'positive', 'polarity': 0.1, 'subjec...",CryptoNews,blockchain,Banque de France disclosed the results of its ...,French Central Bank's Blockchain Bond Trial Br...,https://cryptonews.com/news/french-central-ban...,62043.164063,64434.535156,61622.933594,64261.992188,64261.992188,40471196346,French Central Bank's Blockchain Bond Trial Br...,0.298589,58482.386719,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,2021-10-27,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CryptoNews,defi,Cream Finance (CREAM) suffered another flash l...,Cream Finance Suffers Another Exploit as Attac...,https://cryptonews.com/news/cream-finance-suff...,60352.0,61435.183594,58208.1875,58482.386719,58482.386719,43657076893,Cream Finance Suffers Another Exploit as Attac...,0.327047,62227.964844,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [52]:
vinny_news_data = news_join_df[['Date','title_text','tokenised_clean_text','BTCVolume','normalized_vol','Output']]
vinny_news_data.head()

Unnamed: 0,Date,title_text,tokenised_clean_text,BTCVolume,normalized_vol,Output
0,2021-10-12,"Celo to Be Fastest EVM Chain by End of 2022, C...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",41083758949,0.304061,1
1,2021-10-15,Tech Crackdown Hasn't Halted Chinese Firms' Bl...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",51780081801,0.399608,1
2,2021-10-18,"Facebook To Add 10,000 Jobs In EU For Metavers...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",38055562075,0.277011,1
3,2021-10-19,French Central Bank's Blockchain Bond Trial Br...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",40471196346,0.298589,0
4,2021-10-27,Cream Finance Suffers Another Exploit as Attac...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",43657076893,0.327047,1


In [53]:
vinny_news_data.to_csv('vinny_news_data.csv', index=False)

## Model

### Taking Tokenised text as the only feature

In [54]:
tokenizer = Tokenizer()

vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
max_len = 100


tokenizer.fit_on_texts(news_join_df['text'])
sequences = tokenizer.texts_to_sequences(news_join_df['text'])
max_len = max([len(seq) for seq in sequences])
news_data = pad_sequences(sequences, maxlen=max_len)

# Scale numerical features
scaler = StandardScaler()

# Combine text and numerical data
# X = np.hstack((tweet_data, numerical_data))
X = news_data

# Target variable
y = news_join_df['Output']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the RNN model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim)) 
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True , kernel_regularizer=regularizers.l2(0.01),))  # Add another LSTM layer
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2, kernel_regularizer=regularizers.l2(0.05)))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)  # Change learning rate
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=10, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

# Make predictions
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype("int32")

# Round y_pred_prob to obtain y_pred
y_pred = y_pred.round()
# Print classification report
print(classification_report(y_test, y_pred))

Epoch 1/10
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 44ms/step - accuracy: 0.9824 - loss: 0.7574 - val_accuracy: 0.9873 - val_loss: 0.0687
Epoch 2/10
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 44ms/step - accuracy: 0.9870 - loss: 0.0701 - val_accuracy: 0.9873 - val_loss: 0.0686
Epoch 3/10
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 43ms/step - accuracy: 0.9874 - loss: 0.0679 - val_accuracy: 0.9873 - val_loss: 0.0692
Epoch 4/10
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 41ms/step - accuracy: 0.9879 - loss: 0.0657 - val_accuracy: 0.9873 - val_loss: 0.0691
Epoch 5/10
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 41ms/step - accuracy: 0.9878 - loss: 0.0662 - val_accuracy: 0.9873 - val_loss: 0.0688
Epoch 6/10
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 41ms/step - accuracy: 0.9884 - loss: 0.0637 - val_accuracy: 0.9873 - val_loss: 0.0683
Epoc

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [55]:
import pickle

# Save the model
model.save("model_news_textonly.h5")

# Save the tokenizer
with open('tokenizer_news_textonly.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


