<a href="https://colab.research.google.com/github/tomalexsmith/Bitcoin-sentiment-analysis/blob/main/Vader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import corpus

In [None]:
!pip install vaderSentiment

import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score
import numpy as np

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
from google.colab import drive

# Attach Google Drive to the Colab VM; the CSV paths used in this notebook
# are located under this mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Location of the preprocessed tweet corpus on the mounted Drive.
path = '/content/drive/My Drive/preprocessedCorpus.csv'

# The file is decoded as latin-1; the first CSV column becomes the index.
with open(path, mode='r', encoding='latin-1') as corpus_file:
  tweets_df = pd.read_csv(corpus_file, index_col=0)

# Display the loaded frame (the rendered output abbreviates long frames).
tweets_df.head(len(tweets_df))

Unnamed: 0,polarity_label,content
0,Positive,devilsnight hey bug martini glass
1,Positive,lumifish prejudice
2,Negative,bed get drive dallas tomorrow
3,Negative,get feeling job hunt never come close
4,Negative,man damn weather suppose oahu right
...,...,...
499995,Positive,mikerelm show granada last night sick del hard...
499996,Negative,cannot find telephone
499997,Positive,littledotty enjoy cuppa go put kettle
499998,Negative,finally sidekick stuff yay go sf pride weekend...


In [None]:
# Pull the tweet text and its sentiment label out as separate columns.
content, polarity = tweets_df['content'], tweets_df['polarity_label']

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split: first keep 80% of the corpus for training, then divide the
# remaining hold-out evenly into validation and test sets.
# Both stages use the same fixed seed so the partition is reproducible.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    content, polarity, train_size=.8, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, train_size=.5, random_state=42
)

In [None]:
# Confirm dataset sizes (features and labels of each split should match).
# Each label is paired with its own split: the test line reports the test
# set and the validation line reports the validation set.
print('Train: ', len(X_train), len(y_train))
print('Test: ', len(X_test), len(y_test))
print('Validation: ', len(X_val), len(y_val))

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn an integer code for every distinct polarity label in the corpus;
# the encoder is later used to turn the string labels into numbers.
polarity_labels = tweets_df['polarity_label'].to_list()
encoder = LabelEncoder()
encoder.fit(polarity_labels)

In [None]:
# Pair each test tweet with its encoded ground-truth label.
data = {
    'Actual polarity': encoder.transform(y_test.to_list()),
    'Content': X_test,
}

# Build the results table and discard rows with missing values in one step.
results = pd.DataFrame(data).dropna()

In [None]:
# Display the results table (the rendered output abbreviates long frames).
results.head(n=len(results))

Unnamed: 0,Actual polarity,Content
32134,0,clairenick lose follower laugh loud miss clair...
469167,0,think I loose
59371,0,another dead turtle one last week half
427358,1,dwighthoward congratulation guy awesome
183371,1,laurenconrad cute lovely
...,...,...
378588,1,caitra really ah man end summer approaching
67919,0,sabrinabryan booo fail
428822,1,akr nokia green room
356555,0,friday almost follower today guy like


# Run VADER sentiment analysis

In [None]:
# Create instance of VADER sentiment analyser.
# This single module-level instance is the one calculate_polarity calls
# for every tweet.
analyser = SentimentIntensityAnalyzer()

In [None]:
def calculate_polarity(content):
    """Return VADER's compound sentiment score for one piece of text.

    The text is scored with the module-level ``analyser`` and only the
    'compound' entry of the resulting score dictionary is returned.
    """
    return analyser.polarity_scores(content)['compound']

In [None]:
def adjust_score(score):
  """Binarise a sentiment score from the -1 to 1 range.

  Scores of zero or above map to 1; anything below zero maps to 0.
  """
  return 1 if score >= 0 else 0

In [None]:
# Score every tweet with VADER, then collapse each score to a 0/1 label.
results['Predicted_polarity'] = (
    results['Content']
    .apply(calculate_polarity)
    .apply(adjust_score)
)

In [None]:
# Display the results table, sized by its own length rather than the length
# of the full corpus frame.
results.head(len(results))

Unnamed: 0,Actual polarity,Content,Predicted_polarity
32134,0,clairenick lose follower laugh loud miss clair...,1
469167,0,think I loose,0
59371,0,another dead turtle one last week half,0
427358,1,dwighthoward congratulation guy awesome,1
183371,1,laurenconrad cute lovely,1
...,...,...,...
378588,1,caitra really ah man end summer approaching,1
67919,0,sabrinabryan booo fail,0
428822,1,akr nokia green room,1
356555,0,friday almost follower today guy like,1


# Calculate performance metrics

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

def calculate_metrics(test_var, pred_var):
  """Score predictions against the true labels.

  Args:
    test_var: true labels.
    pred_var: predicted labels.

  Returns:
    Tuple ordered as (accuracy, precision, recall, f1). Precision, recall
    and F1 are macro-averaged.
  """
  return (
      accuracy_score(test_var, pred_var),
      precision_score(test_var, pred_var, average='macro'),
      recall_score(test_var, pred_var, average='macro'),
      f1_score(test_var, pred_var, average='macro'),
  )

accuracy, precision, recall, f1 = calculate_metrics(
    results["Actual polarity"],
    results["Predicted_polarity"],
)

# Report each metric on its own line.
for label, value in (
    ("Accuracy:", accuracy),
    ("Recall:", recall),
    ("Precision:", precision),
    ("F1:", f1),
):
  print(label, value)

Accuracy: 0.6438032734403137
Recall: 0.6432369973913827
Precision: 0.7005527288179814
F1: 0.616160678695751


In [None]:
# One-row summary of the VADER scores. Every metric needs its own column
# name: a repeated key in a dict literal silently overwrites the earlier
# entry, which would drop the accuracy value and store precision under the
# "Accuracy" column.
comparison_metrics = pd.DataFrame({'Accuracy':[accuracy],
                                  'Recall':[recall],
                                  'Precision':[precision],
                                  'F1':[f1]},
                                   index=['VADER']
                                  )

path = '/content/drive/My Drive/comparison_metrics.csv'

# Append (mode 'a') without a header row, so each run adds one line.
# The row is written as: index label, Accuracy, Recall, Precision, F1.
with open(path, 'a', encoding = 'utf-8') as f:
  comparison_metrics.to_csv(f, header=False)