# Data analytics coursework 

In [61]:
%load_ext autoreload
%autoreload 2

# Use HuggingFace's datasets library to access the financial_phrasebank dataset
from datasets import load_dataset

import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [62]:
train_files = [
    'data/FiQA_ABSA_task1/task1_headline_ABSA_train.json',
    'data/FiQA_ABSA_task1/task1_post_ABSA_train.json'
]

In [63]:
## Loading data from JSON
import json

def load_fiqa_sa_from_json(json_files):
    train_text = []
    train_labels = []

    for file in json_files:
        with open(file, 'r', encoding='UTF-8') as handle:
            dataf = json.load(handle)

        dataf_text = [dataf[k]["sentence"] for k in dataf.keys()]
        # print(len(dataf_text))
        train_text.extend(dataf_text)

        dataf_labels = [float(dataf[k]["info"][0]["sentiment_score"]) for k in dataf.keys()]
        # print(len(dataf_labels))
        train_labels.extend(dataf_labels)

    train_text = np.array(train_text)
    train_labels = np.array(train_labels)
    
    return train_text, train_labels


def threshold_scores(scores):
    """
    Convert sentiment scores to discrete labels.
    0 = negative.
    1 = neutral.
    2 = positive.
    """
    labels = []
    for score in scores:
        if score < -0.2:
            labels.append(0)
        elif score > 0.2:
            labels.append(2)
        else:
            labels.append(1)
            
    return np.array(labels)


all_text, all_labels = load_fiqa_sa_from_json(train_files)
    
print(f'Number of instances: {len(all_text)}')
print(f'Number of labels: {len(all_labels)}')

all_labels = threshold_scores(all_labels)
print(f'Number of negative labels: {np.sum(all_labels==0)}')
print(f'Number of neutral labels: {np.sum(all_labels==1)}')
print(f'Number of positive labels: {np.sum(all_labels==2)}')

Number of instances: 1111
Number of labels: 1111
Number of negative labels: 310
Number of neutral labels: 195
Number of positive labels: 606


In [64]:
from sklearn.model_selection import train_test_split

# Split test data from training data
train_documents, test_documents, train_labels, test_labels = train_test_split(
    all_text, 
    all_labels, 
    test_size=0.2, 
    stratify=all_labels  # make sure the same proportion of labels is in the test set and training set
)

# Split validation data from training data
train_documents, val_documents, train_labels, val_labels = train_test_split(
    train_documents, 
    train_labels, 
    test_size=0.15, 
    stratify=train_labels  # make sure the same proportion of labels is in the test set and training set
)

print(f'Number of training instances = {len(train_documents)}')
print(f'Number of validation instances = {len(val_documents)}')
print(f'Number of test instances = {len(test_documents)}')


Number of training instances = 754
Number of validation instances = 134
Number of test instances = 223


In [68]:
print(f'What does one instance look like from the training set? \n\n{train_documents[234]}')
print(f'...and here is its corresponding label \n\n{train_labels[234]}')

What does one instance look like from the training set? 

$ETN UPGRADE today by MS to overweight.  Excellent company and leadership
...and here is its corresponding label 

2


# Normalization using Lemmatization and bi-grams + unigrams

In [69]:
# Lemmatization and bi+uni-grams
import nltk as nltk
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adnan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adnan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [70]:
class LemmaTokenizer(object):
    
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        
    def __call__(self, tweets):
        return [self.wnl.lemmatize(self.wnl.lemmatize(self.wnl.lemmatize(tok, pos='n'), pos='v'), pos='a') for tok in word_tokenize(tweets)]
    

In [71]:
vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), ngram_range=(1,2))
vectorizer.fit(train_documents)
X_train = vectorizer.transform(train_documents)
X_test = vectorizer.transform(test_documents)

# Print out some of the features in the vocabulary:
print(list(vectorizer.vocabulary_)[:20])



['update', '3-bp', 'settle', 'oil', 'spill-related', 'claim', 'with', 'halliburton', ',', 'transocean', 'update 3-bp', '3-bp settle', 'settle oil', 'oil spill-related', 'spill-related claim', 'claim with', 'with halliburton', 'halliburton ,', ', transocean', 'u.k.']


In [51]:
print(f'Vocabulary size: {len(vectorizer.vocabulary_)}')
vocabulary = vectorizer.vocabulary_

Vocabulary size: 11151


# Using lexicon features

In [72]:
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\adnan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [74]:
vocabulary = vectorizer.vocabulary_

lex_pos_scores = np.zeros((1, len(vocabulary)))
lex_neg_scores = np.zeros((1, len(vocabulary)))

for i, term in enumerate(vocabulary):
    if term in analyser.lexicon and analyser.lexicon[term] > 0:
        lex_pos_scores[0, i] = 1
    elif term in analyser.lexicon and analyser.lexicon[term] < 0:
        lex_neg_scores[0, i] = 1

In [75]:
# This enables u to get the total positive and total negative counts for each set:
lex_pos_train = np.sum(X_train.multiply(lex_pos_scores), axis=1)
lex_pos_test = np.sum(X_test.multiply(lex_pos_scores), axis=1)

lex_neg_train = np.sum(X_train.multiply(lex_neg_scores), axis=1)
lex_neg_test = np.sum(X_test.multiply(lex_neg_scores), axis=1)

In [76]:
from scipy.sparse import hstack

X_train = hstack((X_train, lex_pos_train, lex_neg_train))
X_test = hstack((X_test, lex_pos_test, lex_neg_test))

In [77]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
classifier = LogisticRegression()
classifier.fit(X_train, train_labels)
y_test_pred = classifier.predict(X_test)

print(classification_report(test_labels, y_test_pred))

              precision    recall  f1-score   support

           0       0.72      0.45      0.55        62
           1       0.67      0.21      0.31        39
           2       0.66      0.93      0.78       122

    accuracy                           0.67       223
   macro avg       0.68      0.53      0.55       223
weighted avg       0.68      0.67      0.63       223



In [78]:
# Key part is investigating the errors, so let's do that:
error_indexes = y_test_pred != test_labels  # compare predictions to gold labels

# get the text of tweets where the classifier made an error:
tweets_err = np.array(test_documents)[error_indexes]

# WRITE YOUR CODE HERE
pred_err = y_test_pred[error_indexes]
gold_err = np.array(test_labels)[error_indexes]

for i in range(10):  # just print the first ten
    print(f'Tweet: {tweets_err[i]}; true label = {gold_err[i]}, prediction = {pred_err[i]}.')

Tweet: $EA points to the two Facebook games and accuses $ZNGA of copyright infringement. http://stks.co/g3A1; true label = 0, prediction = 2.
Tweet: $MFLX up  pre mkt. Looks like GAP has been filled....lookin 4 short opt here; true label = 0, prediction = 2.
Tweet: $AAPL afternoon selloff as usual will be brutal. get ready to lose a ton of money.; true label = 0, prediction = 2.
Tweet: @mugatushair Now it is time to short $TSLA; true label = 0, prediction = 2.
Tweet: $SPY Less than 0.2% down and people are calling it bearish. Some heading for exits already. Maybe 1% down will be "the crash"? Disturbing!; true label = 1, prediction = 0.
Tweet: @chessNwine: $IWM 30-Minute Chart. Small caps threatening descending triangle breakdown under $110.20.  http://stks.co/r0KKm; true label = 0, prediction = 2.
Tweet: Insight hires Aviva's David Hillier for multi-asset team; true label = 1, prediction = 2.
Tweet: Bilfinger Industrial Services win Â£100m BP contract extension; true label = 1, predict

In [None]:
cross_validation = KFold(n_splits=10, shuffle=True)
# Setting max_depth
max_d = 30
# Creating train and validation error arrays.
train_error = [[] for _ in range(max_d)]
val_error = [[] for _ in range(max_d)]
for d in tqdm(range(max_d)): # Using tqdm to show progress bar :)
    #Instantiate our RF regressor
    regressor = RandomForestRegressor(n_estimators= 10, max_depth=d+1)
    # Split data into training and validation
    for train_index, val_index in cross_validation.split(Xtr):
        Xtrain, Xval, Ytrain, Yval = Xtr.iloc[train_index], Xtr.iloc[val_index], Ytr.iloc[train_index], Ytr.iloc[val_index]
        # fitting the model on our data
        RFG_model=regressor.fit(Xtrain, Ytrain.values.flatten())
        # now we make predictions for training and validation splits
        y_pred_train=RFG_model.predict(Xtrain)
        y_pred_val=RFG_model.predict(Xval)
        # we get the root mean squared error
        train_error[d].append(mean_squared_error(Ytrain,y_pred_train, squared=False))
        val_error[d].append(mean_squared_error(Yval,y_pred_val, squared=False))
# Fetching the mean for training and validation errors across splits for all depths
avg_train_error = np.mean(train_error, axis=1)
avg_val_error = np.mean(val_error, axis=1)   