# Data analytics coursework 

In [133]:
%load_ext autoreload
%autoreload 2

# Use HuggingFace's datasets library to access the financial_phrasebank dataset
from datasets import load_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np
import nltk as nltk
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('punkt')
nltk.download('wordnet')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adnan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adnan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [62]:
train_files = [
    'data/FiQA_ABSA_task1/task1_headline_ABSA_train.json',
    'data/FiQA_ABSA_task1/task1_post_ABSA_train.json'
]

In [63]:
## Loading data from JSON
import json

def load_fiqa_sa_from_json(json_files):
    train_text = []
    train_labels = []

    for file in json_files:
        with open(file, 'r', encoding='UTF-8') as handle:
            dataf = json.load(handle)

        dataf_text = [dataf[k]["sentence"] for k in dataf.keys()]
        # print(len(dataf_text))
        train_text.extend(dataf_text)

        dataf_labels = [float(dataf[k]["info"][0]["sentiment_score"]) for k in dataf.keys()]
        # print(len(dataf_labels))
        train_labels.extend(dataf_labels)

    train_text = np.array(train_text)
    train_labels = np.array(train_labels)
    
    return train_text, train_labels


def threshold_scores(scores):
    """
    Convert sentiment scores to discrete labels.
    0 = negative.
    1 = neutral.
    2 = positive.
    """
    labels = []
    for score in scores:
        if score < -0.2:
            labels.append(0)
        elif score > 0.2:
            labels.append(2)
        else:
            labels.append(1)
            
    return np.array(labels)


all_text, all_labels = load_fiqa_sa_from_json(train_files)
    
print(f'Number of instances: {len(all_text)}')
print(f'Number of labels: {len(all_labels)}')

all_labels = threshold_scores(all_labels)
print(f'Number of negative labels: {np.sum(all_labels==0)}')
print(f'Number of neutral labels: {np.sum(all_labels==1)}')
print(f'Number of positive labels: {np.sum(all_labels==2)}')

Number of instances: 1111
Number of labels: 1111
Number of negative labels: 310
Number of neutral labels: 195
Number of positive labels: 606


In [138]:
from sklearn.model_selection import train_test_split

# Split test data from training data
train_documents, test_documents, train_labels, test_labels = train_test_split(
    all_text, 
    all_labels, 
    test_size=0.2, 
    stratify=all_labels  # make sure the same proportion of labels is in the test set and training set
)

# Split validation data from training data
train_documents, val_documents, train_labels, val_labels = train_test_split(
    train_documents, 
    train_labels, 
    test_size=0.15, 
    stratify=train_labels  # make sure the same proportion of labels is in the test set and training set
)

print(f'Number of training instances = {len(train_documents)}')
print(f'Number of validation instances = {len(val_documents)}')
print(f'Number of test instances = {len(test_documents)}')


Number of training instances = 754
Number of validation instances = 134
Number of test instances = 223


In [68]:
print(f'What does one instance look like from the training set? \n\n{train_documents[234]}')
print(f'...and here is its corresponding label \n\n{train_labels[234]}')

What does one instance look like from the training set? 

$ETN UPGRADE today by MS to overweight.  Excellent company and leadership
...and here is its corresponding label 

2


# Normalization using Lemmatization and bi-grams + unigrams

In [151]:
# Lemmatization and bi+uni-grams


In [152]:
class LemmaTokenizer(object):
    
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        
    def __call__(self, tweets):
        return [self.wnl.lemmatize(self.wnl.lemmatize(self.wnl.lemmatize(tok, pos='n'), pos='v'), pos='a') for tok in word_tokenize(tweets)]
    

In [153]:
vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), ngram_range=(1,2))
vectorizer.fit(train_documents)
X_train = vectorizer.transform(train_documents)
X_val = vectorizer.transform(val_documents)

# Print out some of the features in the vocabulary:
print(list(vectorizer.vocabulary_)[:20])

['$', 'aapl', 'reject', 'the', 'hod', '...', 'should', 'head', 'low', '.', 'sit', 'on', 'sideline', 'for', 'now', '$ aapl', 'aapl reject', 'reject the', 'the hod', 'hod ...']


In [154]:
print(f'Vocabulary size: {len(vectorizer.vocabulary_)}')
vocabulary = vectorizer.vocabulary_

Vocabulary size: 11170


In [157]:
classifier = LogisticRegression()
classifier.fit(X_train, train_labels)
y_test_pred = classifier.predict(X_val)
print(classification_report(val_labels, y_test_pred))

              precision    recall  f1-score   support

           0       0.74      0.46      0.57        37
           1       0.55      0.25      0.34        24
           2       0.68      0.93      0.79        73

    accuracy                           0.68       134
   macro avg       0.65      0.55      0.57       134
weighted avg       0.67      0.68      0.65       134



# Using lexicon features

In [111]:
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\adnan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [112]:
vocabulary = vectorizer.vocabulary_

lex_pos_scores = np.zeros((1, len(vocabulary)))
lex_neg_scores = np.zeros((1, len(vocabulary)))

for i, term in enumerate(vocabulary):
    if term in analyser.lexicon and analyser.lexicon[term] > 0:
        lex_pos_scores[0, i] = 1
    elif term in analyser.lexicon and analyser.lexicon[term] < 0:
        lex_neg_scores[0, i] = 1

In [113]:
# This enables u to get the total positive and total negative counts for each set:
lex_pos_train = np.sum(X_train.multiply(lex_pos_scores), axis=1)
lex_pos_test = np.sum(X_val.multiply(lex_pos_scores), axis=1)

lex_neg_train = np.sum(X_train.multiply(lex_neg_scores), axis=1)
lex_neg_test = np.sum(X_val.multiply(lex_neg_scores), axis=1)

In [114]:
from scipy.sparse import hstack

X_train = hstack((X_train, lex_pos_train, lex_neg_train))
X_val = hstack((X_val, lex_pos_test, lex_neg_test))

# Using LogisticRegression model

In [116]:

classifier = LogisticRegression()
classifier.fit(X_train, train_labels)
y_test_pred = classifier.predict(X_val)

print(classification_report(val_labels, y_test_pred))

              precision    recall  f1-score   support

           0       0.48      0.38      0.42        37
           1       0.75      0.25      0.38        24
           2       0.64      0.85      0.73        73

    accuracy                           0.61       134
   macro avg       0.62      0.49      0.51       134
weighted avg       0.62      0.61      0.58       134



In [121]:
# Key part is investigating the errors, so let's do that:
error_indexes = y_test_pred != val_labels  # compare predictions to gold labels

# get the text of tweets where the classifier made an error:
tweets_err = np.array(val_documents)[error_indexes]

# WRITE YOUR CODE HERE
pred_err = y_test_pred[error_indexes]
gold_err = np.array(val_labels)[error_indexes]

for i in range(10):  # just print the first ten
    print(f'Tweet: {tweets_err[i]}; true label = {gold_err[i]}, prediction = {pred_err[i]}.')

Tweet: Apple: Another attempt to break out on Apple but with weak PVT   $AAPL #Trading #investing #aapl https://t.co/DNh7Hgv22V; true label = 1, prediction = 2.
Tweet: @Stockoptionexpert: $MAT - 6%, big trader added 10000 April put contracts 3 days ago  http://stks.co/c1Ols; true label = 0, prediction = 2.
Tweet: Covered my small $MWW short @ 7.99 for a .16 loss. Flat on the day. All cash now.; true label = 2, prediction = 0.
Tweet: StanChart and RBS struggle in Bank of England stress tests; true label = 0, prediction = 2.
Tweet: FTSE rallies off three-month low, boosted by StanChart, Sainsbury; true label = 2, prediction = 0.
Tweet: Randgold profit hit by poor gold price but dividend still increases; true label = 1, prediction = 0.
Tweet: $XLB the weakest sector this year and possible false breakout  http://stks.co/dL1Z; true label = 0, prediction = 2.
Tweet: The Boeing Company (NYSE:BA) Bearish Trader bets $550K that Stock Will Sell Off 9.79% by July Expiry $BA http://stks.co/tCSE; t