Comparing the actual label with the VADER prediction, we get an accuracy score of 0.768

# Imports

In [75]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string

In [76]:
# Imports
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

### Stopwords
Stopwords are words that are very common and add little meaning
examples: a, of, the

In [77]:
stopwords = nltk.corpus.stopwords.words('english')

### PorterStemmer
Words that have the same stem, typically have the same meaning
PorterStemmer cuts off the affixes so you just use the stem -> reduces word count (features)

In [78]:
ps = nltk.PorterStemmer()

# Read in Data

In [79]:
data_news = pd.read_csv("Financial_News_Data_NLP.csv", encoding = "ISO-8859-1")
data_news.columns = ["sentiment", "headlines"]
data_news.head()

Unnamed: 0,sentiment,headlines
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [80]:
# What is the length of the dataset?
len(data_news)

4845

# Prepare Data

## Feature Creation
1. Punctuation percentage
2. Text Length
3. Captialization percentage

In [81]:
# Function to count punctuation
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation]) # Count how many characters are punctuation
    if (len(text) - text.count(" ")) ==0:   # Need to avoid dividing by 0
        return 0
    return round(count/(len(text) - text.count(" ")), 3)*100    # Determined the percentage of the headlines that is punctuation. Make sure not to include spaces. 

# Apply function to make new column
data_news['punct%'] = data_news['headlines'].apply(lambda x: count_punct(x))

In [82]:
# Determine length of headline and make it a column
data_news['text_len'] = data_news['headlines'].apply(lambda x: len(x) - x.count(" "))

In [83]:
# Function to determine capitalization percentage
def capital_percent(text):
    count = sum([1 for char in text if char.isupper()]) # Count how many characters are capitalized. 
    if (len(text) - text.count(" ")) == 0:      # Avoid dividing by 0
        return 0
    return round(count/(len(text) - text.count(" ")), 3)*100    # Calculate percentage of characters in headline that are uppercase. Make sure to not include spaces.

# Apply function to make new column
data_news['capital%'] = data_news['headlines'].apply(lambda x: capital_percent(x))

## Clean Data

In [84]:
# Function to clean up data
# Eliminate punctuation
# Make everything lowercase
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]  # Use portstemmer
    return text

## Vectorization

### TF-IDF
Inverse Document Frequency Weighting 
* Creates a document-term matrix where the cells contain a weighting of how important that word is to the text
* How much does a word differentiate a text message from othes? Pulls out important but seldom used words

In [85]:
# Instantiate the object and state our parameters. Pass in the function we created to clean the text (clean_text)
tfidf_vect_news = TfidfVectorizer(analyzer=clean_text)

# Fit and transform model 
X_tfidf_news = tfidf_vect_news.fit_transform(data_news['headlines'])

# Create df to see vectorization and concatenated created features
X_tfidf_feat_news = pd.concat([data_news['text_len'], data_news['punct%'], data_news['capital%'], pd.DataFrame(X_tfidf_news.toarray())], axis=1)
X_tfidf_feat_news.head(5)

Unnamed: 0,text_len,punct%,capital%,0,1,2,3,4,5,6,...,8901,8902,8903,8904,8905,8906,8907,8908,8909,8910
0,160,1.9,0.6,0.039911,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,193,1.6,2.1,0.036051,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,174,0.6,0.6,0.039099,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,163,6.7,1.2,0.03829,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,154,1.3,21.4,0.033612,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Count Vectorizer
* Creates a document term matrix where the entry of each cell will be a count of the number of times that word occurred in that document

In [89]:
# Instantiate the object and state our parameters. Pass in the function we created to clean the text (clean_text)
count_vect_news = CountVectorizer(analyzer=clean_text)

# Fit and Transform model
X_count_news = count_vect_news.fit_transform(data_news['headlines'])

# Create df to see vectorization and concatenated created features
X_count_feat_news = pd.concat([data_news['text_len'], data_news['punct%'], data_news['capital%'], pd.DataFrame(X_count_news.toarray())], axis=1)

X_count_feat_news.head()

Unnamed: 0,text_len,punct%,capital%,0,1,2,3,4,5,6,...,8901,8902,8903,8904,8905,8906,8907,8908,8909,8910
0,160,1.9,0.6,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,193,1.6,2.1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,174,0.6,0.6,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,163,6.7,1.2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,154,1.3,21.4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Grid Search to find Optimal Parameters

### Search with TF-IDF

In [None]:
# Set instance of the randomforest algorithm
# Feed in a list of n_estimators and max depths to test 
# Use the TF-IDF vectorizer 
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_tfidf_feat_news, data_news['sentiment'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,39.483314,1.167703,0.522142,0.087084,,300,"{'max_depth': None, 'n_estimators': 300}",0.69453,0.688338,0.635707,0.652219,0.663571,0.666873,0.022012,1
8,51.573586,2.324569,1.263336,0.273069,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.696594,0.698658,0.629515,0.639835,0.663571,0.665635,0.028368,2
10,34.212505,2.768572,0.906632,0.386668,,150,"{'max_depth': None, 'n_estimators': 150}",0.692466,0.691434,0.635707,0.640867,0.665635,0.665222,0.024058,3
5,42.425284,1.016262,1.271919,0.36479,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.713106,0.683179,0.623323,0.631579,0.653251,0.660888,0.033323,4
7,26.763226,1.563173,1.016106,0.264996,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.702786,0.682147,0.630547,0.626419,0.659443,0.660268,0.029381,5


mean_test_score = 0.666873
param_max_depth = None
param_n_estimators = 300

## Search with Count Vector

In [90]:
# Set instance of the randomforest algorithm
# Feed in a list of n_estimators and max depths to test 
# Use the Count vectorizer 
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_count_feat_news, data_news['sentiment'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,20.065043,0.529723,0.455671,0.123835,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.703818,0.68937,0.625387,0.648091,0.663571,0.666047,0.028116,1
10,29.943766,1.905171,0.571874,0.228395,,150,"{'max_depth': None, 'n_estimators': 150}",0.685243,0.69453,0.635707,0.654283,0.655315,0.665015,0.021673,2
11,36.099276,3.116764,0.320528,0.047059,,300,"{'max_depth': None, 'n_estimators': 300}",0.692466,0.684211,0.629515,0.659443,0.656347,0.664396,0.022301,3
5,31.55895,1.492394,0.432839,0.098882,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.707946,0.686275,0.626419,0.641899,0.656347,0.663777,0.029619,4
8,38.304628,1.36587,0.664323,0.208403,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.69453,0.685243,0.628483,0.648091,0.662539,0.663777,0.024082,4


# Random Forest on Holdout Test Set

In [91]:
# X_tfidf_feat_twitter
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [92]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_feat_news, data_news['sentiment'], test_size=0.2)

In [93]:
from sklearn.ensemble import RandomForestClassifier

# The number of jobs to run in parallel for both fit and predict. If -1, then the number of jobs is set to the number of cores. Training the Random Forest model with more than one core is obviously more performant than on a single core.
# Used optimal parametes found by the grid search
rf = RandomForestClassifier(n_estimators=300, max_depth=None, n_jobs=-1)
rf_model = rf.fit(X_train, y_train)



In [94]:
# Apply the model to the holdout test data
# Calculate accuracy metrics
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred)



In [183]:
from sklearn.metrics import confusion_matrix

In [186]:
# Create nicer confusion matrix
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, y_pred)

print("Accuracy: {}\n".format(accuracy))

# Show the classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.7585139318885449

              precision    recall  f1-score   support

    negative       0.75      0.35      0.47       113
     neutral       0.76      0.96      0.85       593
    positive       0.77      0.48      0.59       263

    accuracy                           0.76       969
   macro avg       0.76      0.60      0.64       969
weighted avg       0.76      0.76      0.73       969



### Save the model and vectorizer

In [161]:
import joblib

# Save the model and vectorizer
joblib.dump(rf, "NLP.joblib")
joblib.dump(tfidf_vect_news, "NLP_vectorizer.joblib")

# Compare with Vader

In [97]:
import nltk

# Download the lexicon
#nltk.download("vader_lexicon")

# Import the lexicon 
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# SentimentIntensityAnalyzer.polarity_score()function provides the polarity of the text rendering the dictionary format
# of 4 keys neg, neu, pos and compound
# neg, neu, and pos should add to 1
# Compound is overall and is between -1 and 1
# Create an instance of SentimentIntensityAnalyzer
sent_analyzer = SentimentIntensityAnalyzer()

### Functions for Vader

In [105]:
# Function to gives a polarity rather than numerical output
def format_output(prediction):
  
  polarity = "neutral"

  if(prediction>= 0.05):
    polarity = "positive"

  elif(prediction<= -0.05):
    polarity = "negative"

  return polarity

In [101]:
def apply_vader(df):
    # Predict sentiment for each article
    df["vader_prediction"] = df["headlines"].apply(lambda text: sent_analyzer.polarity_scores(text)['compound'])

    return df

### Use Vader on Financial News Training Data

In [106]:
data_news_vader = apply_vader(data_news)

In [107]:
data_news_vader["vader_sentiment"] = data_news_vader["vader_prediction"].apply(lambda x: format_output(x))
data_news_vader

Unnamed: 0,sentiment,headlines,punct%,text_len,capital%,vader_prediction,vader_sentiment
0,neutral,Technopolis plans to develop in stages an area...,1.9,160,0.6,-0.2960,negative
1,negative,The international electronic industry company ...,1.6,193,2.1,0.0000,neutral
2,positive,With the new production plant the company woul...,0.6,174,0.6,0.8555,positive
3,positive,According to the company 's updated strategy f...,6.7,163,1.2,0.6705,positive
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,1.3,154,21.4,0.3485,positive
...,...,...,...,...,...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...,2.5,119,12.6,-0.7269,negative
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...,6.7,119,2.5,0.0000,neutral
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...,5.8,86,11.6,0.7430,positive
4843,negative,Net sales of the Paper segment decreased to EU...,3.8,183,7.7,0.4404,positive


### Evaluate how Vader did! 

In [110]:
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(data_news_vader['sentiment'], data_news_vader['vader_sentiment'])

print("Accuracy: {}\n".format(accuracy))

# Show the classification report
print(classification_report(data_news_vader['sentiment'], data_news_vader['vader_sentiment']))

Accuracy: 0.5430340557275541

              precision    recall  f1-score   support

    negative       0.41      0.30      0.34       604
     neutral       0.74      0.52      0.61      2878
    positive       0.40      0.71      0.51      1363

    accuracy                           0.54      4845
   macro avg       0.52      0.51      0.49      4845
weighted avg       0.60      0.54      0.55      4845



# Use NLP model on BTC news after SVB

### Import Data

In [152]:
btc_after_SVB_df = pd.read_csv("../Resources_crypto_articles/btc_snapshot_after_SVB.csv", index_col=[0])
btc_after_SVB_df.rename(columns={"summary": "headlines", "date":"begins_at"}, inplace=True)
btc_after_SVB_df.head(2)

Unnamed: 0,begins_at,headlines
0,2023-04-13,Twitter Partners with eToro to Let Users Buy a...
1,2023-04-06,Here’s How to Find the Original Bitcoin Manife...


In [124]:
len(btc_after_SVB_df)

100

### Feature Creation

Use same feature engineering as training set

In [153]:
# Apply function to calculate punctuation
btc_after_SVB_df['punct%'] = btc_after_SVB_df['headlines'].apply(lambda x: count_punct(x))

# Determine length of headline and make it a column
btc_after_SVB_df['text_len'] = btc_after_SVB_df['headlines'].apply(lambda x: len(x) - x.count(" "))

# Apply function to make new column
btc_after_SVB_df['capital%'] = btc_after_SVB_df['headlines'].apply(lambda x: capital_percent(x))


### Vectorization and model

In [169]:
# Import model and vectorizer fit to training data
nlp_jenn = joblib.load("NLP.joblib")
vectorizer_nlp = joblib.load("NLP_vectorizer.joblib")

In [170]:
# vectorize the BTC news headlines to feed into the model
X_new = vectorizer_nlp.transform(btc_after_SVB_df['headlines'])

In [173]:
# Create df of engineered features to the vectorized headlines 
X_new_with_features = pd.concat([btc_after_SVB_df['text_len'], btc_after_SVB_df['punct%'], btc_after_SVB_df['capital%'], pd.DataFrame(X_new.toarray())], axis=1)
X_new_with_features.head(5)

Unnamed: 0,text_len,punct%,capital%,0,1,2,3,4,5,6,...,8901,8902,8903,8904,8905,8906,8907,8908,8909,8910
0,268,3.0,5.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,275,1.5,5.5,0.032834,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,101,3.0,5.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,79,2.5,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,200,4.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Apply the model

In [174]:
# Apply model to predict sentiment 
btc_jenn_nlp_predictions = nlp_jenn.predict(X_new_with_features)



In [175]:
# Add predictions to dataset
btc_after_SVB_df["sentiment_NLP"] = btc_jenn_nlp_predictions
btc_after_SVB_df

Unnamed: 0,begins_at,headlines,punct%,text_len,capital%,sentiment_NLP
0,2023-04-13,Twitter Partners with eToro to Let Users Buy a...,3.0,268,5.2,neutral
1,2023-04-06,Here’s How to Find the Original Bitcoin Manife...,1.5,275,5.5,neutral
2,2023-04-04,Cryptoverse: Bitcoin traders like their option...,3.0,101,5.9,neutral
3,2023-03-20,In which Balaji gives away at least a million ...,2.5,79,3.8,neutral
4,2023-04-05,Michael Saylor's MicroStrategy adds to its bit...,4.5,200,2.5,neutral
...,...,...,...,...,...,...
95,2023-04-05,Bitcoin: MicroStrategy Buys 1045 BTC for $29.3...,2.6,268,4.1,neutral
96,2023-04-10,"Cathie Wood Backs Balaji’s $1M BTC Forecast, D...",2.5,276,16.7,neutral
97,2023-03-17,Why Bitcoin Miner Stocks Soared This WeekBitco...,3.7,134,7.5,neutral
98,2023-03-17,Over $55M Crypto Shorts Blown Away In 12 Hours...,4.3,276,9.4,neutral


In [176]:
# Examine the distribution of polarities 
btc_after_SVB_df["sentiment_NLP"].value_counts()

neutral     93
positive     7
Name: sentiment_NLP, dtype: int64

## Compare NLP Vader Results

In [178]:
btc_after_SVB_df["headlines"]

0     Twitter Partners with eToro to Let Users Buy a...
1     Here’s How to Find the Original Bitcoin Manife...
2     Cryptoverse: Bitcoin traders like their option...
3     In which Balaji gives away at least a million ...
4     Michael Saylor's MicroStrategy adds to its bit...
                            ...                        
95    Bitcoin: MicroStrategy Buys 1045 BTC for $29.3...
96    Cathie Wood Backs Balaji’s $1M BTC Forecast, D...
97    Why Bitcoin Miner Stocks Soared This WeekBitco...
98    Over $55M Crypto Shorts Blown Away In 12 Hours...
99    Analyst Warns: Bitcoin Price Surge Above $26,0...
Name: headlines, Length: 100, dtype: object

In [179]:
# Apply vader to the btc dataset 
btc_news_vader = apply_vader(btc_after_SVB_df)

In [181]:
# Add column to dataframe to directly compare results with the model
btc_news_vader["vader_sentiment"] = btc_news_vader["vader_prediction"].apply(lambda x: format_output(x))
btc_news_vader

Unnamed: 0,begins_at,headlines,punct%,text_len,capital%,sentiment_NLP,vader_prediction,vader_sentiment
0,2023-04-13,Twitter Partners with eToro to Let Users Buy a...,3.0,268,5.2,neutral,0.0000,neutral
1,2023-04-06,Here’s How to Find the Original Bitcoin Manife...,1.5,275,5.5,neutral,0.7506,positive
2,2023-04-04,Cryptoverse: Bitcoin traders like their option...,3.0,101,5.9,neutral,0.6124,positive
3,2023-03-20,In which Balaji gives away at least a million ...,2.5,79,3.8,neutral,0.3875,positive
4,2023-04-05,Michael Saylor's MicroStrategy adds to its bit...,4.5,200,2.5,neutral,0.3400,positive
...,...,...,...,...,...,...,...,...
95,2023-04-05,Bitcoin: MicroStrategy Buys 1045 BTC for $29.3...,2.6,268,4.1,neutral,0.2411,positive
96,2023-04-10,"Cathie Wood Backs Balaji’s $1M BTC Forecast, D...",2.5,276,16.7,neutral,0.0772,positive
97,2023-03-17,Why Bitcoin Miner Stocks Soared This WeekBitco...,3.7,134,7.5,neutral,0.3400,positive
98,2023-03-17,Over $55M Crypto Shorts Blown Away In 12 Hours...,4.3,276,9.4,neutral,0.2263,positive


In [182]:
# Examine the distribution of polarities 
btc_news_vader["vader_sentiment"].value_counts()

positive    58
negative    31
neutral     11
Name: vader_sentiment, dtype: int64