In [1]:
# This block is from https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python

#load packages
import sys #access to system parameters https://docs.python.org/3/library/sys.html
print("Python version: {}". format(sys.version))

import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features
print("pandas version: {}". format(pd.__version__))

import matplotlib #collection of functions for scientific and publication-ready visualization
print("matplotlib version: {}". format(matplotlib.__version__))

import numpy as np #foundational package for scientific computing
print("NumPy version: {}". format(np.__version__))

import scipy as sp #collection of functions for scientific computing and advance mathematics
print("SciPy version: {}". format(sp.__version__)) 

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook
print("IPython version: {}". format(IPython.__version__)) 

import sklearn #collection of machine learning algorithms
print("scikit-learn version: {}". format(sklearn.__version__))

import seaborn as sns #collection of functions for data visualization
print("seaborn version: {}". format(sns.__version__))

from sklearn.preprocessing import OneHotEncoder #OneHot Encoder
from sklearn.impute import SimpleImputer
%matplotlib inline

#misc libraries
import random
import time


#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)



Python version: 3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]
pandas version: 1.0.1
matplotlib version: 3.1.3
NumPy version: 1.18.1
SciPy version: 1.4.1
IPython version: 7.12.0
scikit-learn version: 0.22.1
seaborn version: 0.10.0
-------------------------


In [2]:
#load data
test_raw = pd.read_csv('../input/Corona_NLP_test.csv')
train_raw = pd.read_csv('../input/Corona_NLP_train_utf8.csv')

In [3]:
train_raw.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [4]:
test_raw.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [5]:
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


In [6]:
test_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3798 entries, 0 to 3797
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       3798 non-null   int64 
 1   ScreenName     3798 non-null   int64 
 2   Location       2964 non-null   object
 3   TweetAt        3798 non-null   object
 4   OriginalTweet  3798 non-null   object
 5   Sentiment      3798 non-null   object
dtypes: int64(2), object(4)
memory usage: 178.2+ KB


## We could extract a lot of information from this data via visualization, but for the purpose of this project, we want to focus on building a sentiment model

In [7]:
train_clean = train_raw.drop(['UserName', 'ScreenName', 'Location', 'TweetAt'], axis=1)
test_clean = test_raw.drop(['UserName', 'ScreenName', 'Location', 'TweetAt'], axis=1)

In [8]:
train_clean.head()

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [9]:
train_clean.iloc[1]['OriginalTweet']

'advice Talk to your neighbours family to exchange phone numbers create contact list with phone numbers of neighbours schools employer chemist GP set up online shopping accounts if poss adequate supplies of regular meds but not over order'

In [13]:
#borrowed from the Text Classification Kaggle course
import spacy

nlp = spacy.blank("en")

# Create the TextCategorizer with exclusive classes and "bow" architecture
textcat = nlp.create_pipe(
              "textcat",
              config={
                "architecture": "bow"})

# Add the TextCategorizer to the empty model
nlp.add_pipe(textcat)

In [14]:
labels = train_clean.Sentiment.unique()

In [17]:
for lable in labels:
    textcat.add_label(lable)

In [18]:
labels

array(['Neutral', 'Positive', 'Extremely Negative', 'Negative',
       'Extremely Positive'], dtype=object)

In [19]:
train_texts = train_clean['OriginalTweet'].values
train_labels = [{'cats': {'Neutral': label == 'Neutral',
                          'Positive': label == 'Positive',
                         'Extremely Negative': label == 'Extremely Negative',
                         'Negative': label == 'Negative',
                         'Extremely Positive': label == 'Extremely Positive',}} 
                for label in train_clean['Sentiment']]

In [22]:
train_data = list(zip(train_texts, train_labels))

In [24]:
from spacy.util import minibatch
import random

#borrowed from the Kaggle TextCatgorization course 
def train_cat (nlp_model, data, b_size, ep):
    random.seed(1)
    spacy.util.fix_random_seed(1)
    optimizer = nlp_model.begin_training()
    
    losses = {}
    for epoch in range(ep):
        random.shuffle(data)
        # Create the batch generator with batch size = 8
        batches = minibatch(data, size=b_size)
        # Iterate through minibatches
        for batch in batches:
            # Each batch is a list of (text, label) but we need to
            # send separate lists for texts and labels to update().
            # This is a quick way to split a list of tuples into lists
            texts, labels = zip(*batch)
            nlp_model.update(texts, labels, sgd=optimizer, losses=losses)
        print(losses)
    
    return nlp_model

In [25]:
nlp_cat_trained = train_cat (nlp, train_data, 10, 3)

{'textcat': 28.74246430164203}
{'textcat': 50.08783368370496}
{'textcat': 67.78798086429015}


In [37]:
#let's see how well our model performs
texts = test_clean.OriginalTweet[:3].tolist()
docs = [nlp.tokenizer(text) for text in texts]
    
# Use textcat to get the scores for each doc
textcat = nlp.get_pipe('textcat')
scores, _ = textcat.predict(docs)

print(scores)

[[0.20689538 0.18483678 0.16582394 0.07810719 0.02132244]
 [0.04228055 0.6522597  0.00823701 0.08055886 0.43530792]
 [0.07611483 0.49128523 0.03388204 0.3087234  0.555106  ]]


In [38]:
predicted_labels = scores.argmax(axis=1)
print([textcat.labels[label] for label in predicted_labels])

['Neutral', 'Positive', 'Extremely Positive']


In [39]:
test_clean.Sentiment[:3]

0    Extremely Negative
1              Positive
2    Extremely Positive
Name: Sentiment, dtype: object

### Observation
We extracted the frist three tweets from the test data, and two out of three right, which isn't as good as we expected, let's do a proper evaluation of the accuracy

In [42]:
def pred_results (model,test_texts):
    docs = [nlp.tokenizer(text) for text in test_texts]
    textcat = model.get_pipe('textcat')
    scores, _ = textcat.predict(docs)
    predicted_labels = scores.argmax(axis=1)
    results = [textcat.labels[label] for label in predicted_labels]
    return results
    
    

In [43]:
y_pred = pred_results (nlp,test_clean.OriginalTweet.tolist())

In [45]:
y_test = test_clean.Sentiment.tolist()

In [52]:
from sklearn.metrics import accuracy_score

In [53]:
a_score = accuracy_score(y_test, y_pred)

In [54]:
a_score

0.5284360189573459

### Observation

The accuracy score is only slightly above 50%, which isn't that goodd, this could be due to the 'bow' architecture we are using for our model, for efficientcy, let's try a better architecture. Based on the spaCy document, seems like 'ensemble' is the best architecture, let's give it a try

In [61]:
nlp_en = spacy.blank("en")

# Create the TextCategorizer with exclusive classes and "bow" architecture
textcat_en = nlp.create_pipe(
              "textcat",
              config={
                "architecture": "ensemble"})

# Add the TextCategorizer to the empty model
nlp_en.add_pipe(textcat_en)

In [63]:
for lable in labels:
    textcat_en.add_label(lable)

In [64]:
nlp_en_cat = train_cat (nlp_en, train_data, 10, 3)

{'textcat': 25.404834992950782}
{'textcat': 41.282424603850814}
{'textcat': 52.998172056817566}


In [65]:
y_pred_en = pred_results (nlp_en_cat,test_clean.OriginalTweet.tolist())

In [66]:
a_score_en = accuracy_score(y_test, y_pred_en)

In [67]:
a_score_en

0.6977356503422855

### Observation

The ensemble architecture took a long time to run, but the result was great, over 20% improvement, with an overall score of 70%.

# Document vectors and standard ML models approach

In [76]:
# Need to load the large model to get the vectors
!python -m spacy download en_core_web_lg

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_lg')


In [26]:
import spacy

# Need to load the large model to get the vectors
nlp_lg = spacy.load('en_core_web_lg')

In [28]:
#for faster performance 
with nlp_lg.disable_pipes():
    doc_vectors = np.array([nlp_lg(text).vector for text in train_clean.OriginalTweet])
    X_test_vectors = np.array([nlp_lg(text).vector for text in test_clean.OriginalTweet])

In [12]:
doc_vectors.shape

(41157, 300)

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(doc_vectors, train_clean.Sentiment,
                                                    test_size=0.1, random_state=1)

In [22]:
#this is from https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy

#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [15]:
def base_model (X_tr, y_tr):
    #this is from https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy
    #Machine Learning Algorithm (MLA) Selection and Initialization
    MLA = [
        #Ensemble Methods
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),


        #GLM
        linear_model.LogisticRegressionCV(),


        #SVM
        svm.SVC(probability=True),
        svm.LinearSVC(),

        XGBClassifier()
 
        ]
    
    #split dataset in cross-validation with this splitter class: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html#sklearn.model_selection.ShuffleSplit
    #note: this is an alternative to train_test_split
    #cv_split = model_selection.ShuffleSplit(test_size = .2, train_size = .8, random_state = 0 ) # run model 10x with 80/20 split intentionally leaving out 10%

    #create table to compare MLA metrics
    MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD' ,'MLA Time']
    MLA_compare = pd.DataFrame(columns = MLA_columns)

    #create table to compare MLA predictions
    MLA_predict = y_tr

    #index through MLA and save performance to table
    row_index = 0
    for alg in MLA:

        #set name and parameters
        MLA_name = alg.__class__.__name__
        #print(MLA_name)
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
        #print(y_tr.shape)

        #score model with cross validation: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
        cv_results = model_selection.cross_validate(alg, X_tr, y_tr, cv = 5, scoring='accuracy', return_train_score=True)
        

        MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
        MLA_compare.loc[row_index, 'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
        MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()   
        #if this is a non-bias random sample, then +/-3 standard deviations (std) from the mean, should statistically capture 99.7% of the subsets
        MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results['test_score'].std()*3   #let's know the worst that can happen!


        #save MLA predictions - see section 6 for usage
        #alg.fit(X_tr, y_tr)
        #MLA_predict[MLA_name] = alg.predict(X_tr)

        row_index+=1


    #print and sort table: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html
    MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False, inplace = True)
    return MLA_compare
    #MLA_predict

In [16]:
base_models = base_model(X_train, y_train)

In [17]:
base_models

Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy Mean,MLA Test Accuracy Mean,MLA Test Accuracy 3*STD,MLA Time
3,SVC,"{'C': 1.0, 'break_ties': False, 'cache_size': ...",0.523325,0.487406,0.0146761,2369.49
2,LogisticRegressionCV,"{'Cs': 10, 'class_weight': None, 'cv': None, '...",0.50548,0.482843,0.0113096,75.4839
4,LinearSVC,"{'C': 1.0, 'class_weight': None, 'dual': True,...",0.493872,0.467455,0.0088838,57.2498
0,GradientBoostingClassifier,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...",0.544451,0.433627,0.0133705,1686.61
5,XGBClassifier,"{'base_score': 0.5, 'booster': 'gbtree', 'cols...",0.501201,0.421965,0.0208606,238.867
1,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.999946,0.412705,0.0177953,51.1937


In [24]:
#I have been getting good results from RF model, though it didnt score the best in the above table, we will still use it to test how much imporvement we could get from doing hypterparameter tunning
rf = ensemble.RandomForestClassifier(random_state = 1)
param_grid =  {'n_estimators': [400,450,500,550],
               'criterion':['gini','entropy'],
                                  'bootstrap': [True],
                                  'max_depth': [15, 20, 25],
                                  'max_features': ['auto','sqrt', 10],
                                  'min_samples_leaf': [2,3],
                                  'min_samples_split': [2,3]}
#GS took too long, going to use rnd search instead                                  
#gs_cv_rf = GridSearchCV(rf, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)

rs_rf = RandomizedSearchCV(rf, param_distributions = param_grid, n_iter = 5, cv = 2, verbose = True, n_jobs = -1)
best_rf = rs_rf.fit(X_train,y_train)
print('Best Score: ' + str(best_rf.best_score_))

Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  9.4min finished


Best Score: 0.4172132418597161


In [25]:
best_rf.best_params_

{'n_estimators': 550,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 15,
 'criterion': 'entropy',
 'bootstrap': True}

### Observation

The result is worst than the default model. For the sake of completing this analysis, we will run an evaluation against the SVC model, which the top model in our base model anlaysis. However, we don't expect this will be better than the spaCy text pipe approach.

In [29]:
y_test = test_clean.Sentiment

In [30]:
svm_ml = svm.SVC(probability=True)

In [31]:
svm_ml = svm_ml.fit(X_train,y_train)

In [32]:
y_pred = svm_ml.predict(X_test_vectors)

In [34]:
from sklearn.metrics import accuracy_score
a_score_v = accuracy_score(y_test, y_pred)

In [35]:
a_score_v

0.47130068457082674

### Observation

The score is slightly better than the one we observed in the base model analysis for the SVM model, but still much worst than the spaCy model using the ensemble architecture. In a way, we can conclude that vectorization of docs is less accurate when compared to the spaCy model with an ensemble architecture that uses a combination of bow and CNN.