# 1. Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split

from io import StringIO
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import math
import json

import warnings
warnings.filterwarnings('ignore')

import fasttext
import pickle

[nltk_data] Downloading package punkt to /Users/zazhu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/zazhu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 2. Data import

In [2]:
train = pd.read_csv("../data/Corona_NLP_train.csv",encoding='latin1')
test = pd.read_csv("../data/Corona_NLP_test.csv",encoding='latin1')

In [3]:
train.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [4]:
test

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral
...,...,...,...,...,...,...
3793,3794,48746,Israel ??,16-03-2020,Meanwhile In A Supermarket in Israel -- People...,Positive
3794,3795,48747,"Farmington, NM",16-03-2020,Did you panic buy a lot of non-perishable item...,Negative
3795,3796,48748,"Haverford, PA",16-03-2020,Asst Prof of Economics @cconces was on @NBCPhi...,Neutral
3796,3797,48749,,16-03-2020,Gov need to do somethings instead of biar je r...,Extremely Negative


In [5]:
train['text'] = train.OriginalTweet
train["text"] = train["text"].astype(str)

test['text'] = test.OriginalTweet
test["text"] = test["text"].astype(str)

# Data has 5 classes
train.Sentiment.value_counts(normalize= True)

Positive              0.277523
Negative              0.240955
Neutral               0.187404
Extremely Positive    0.160945
Extremely Negative    0.133173
Name: Sentiment, dtype: float64

In [6]:
train.isna().sum() 

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
text                0
dtype: int64

In [7]:
test.isna().sum() 

UserName           0
ScreenName         0
Location         834
TweetAt            0
OriginalTweet      0
Sentiment          0
text               0
dtype: int64

# 3. Text Cleaning

In [8]:
def preprocess(x):
    '''tokenize and normalize'''

    stop_words = set(stopwords.words('english')) 

    # convert to dataframe
    data = pd.DataFrame({'text': x.text, 'label': x.Sentiment})

    # remove html
    data['text'] = data.apply(lambda t: re.sub(r'https?://\S+|www\.\S+', '', str(t['text'])), axis=1)

    # remove stopwords, number, and convert to lower case
    data['text'] = data.apply(lambda r: ' '.join(w.lower() for w in r['text'].split() if (w.lower() not in stop_words) & (w.isalpha())),axis=1)
    data['text'] = data[data['text'] != '']
    
    # discard NA reviews
    data = data.dropna()

    return data

In [9]:
train_new = preprocess(train)
test_new = preprocess(test)

In [10]:
train_new.text.apply(lambda x: len(x)).mean()

90.88795477252236

In [11]:
train_new.head()

Unnamed: 0,text,label
1,advice talk neighbours family exchange phone n...,Positive
2,coronavirus woolworths give disabled dedicated...,Positive
3,food stock one enough food everyone take stay ...,Positive
4,ready go supermarket food stock litteraly seri...,Extremely Negative
5,news first confirmed case came sullivan county...,Positive


# 4.TF-IDF

In [12]:
ngram = (1,2)
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,ngram_range=ngram, stop_words='english')
tfidf.fit_transform(train_new.text.values)

# We transform each text into a vector
x_train = tfidf.transform(train_new.text.values)
x_test = tfidf.transform(test_new.text.values)
y_train = train_new.label.values
y_test = test_new.label.values

In [11]:
# save best performing svm model
with open('tfidf_vec.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# 5. Models

## Logistic

In [49]:
lr1 = LogisticRegression(random_state=66,solver='lbfgs')  # fit logistic
lr1.fit(x_train, y_train)
y_pred = lr1.predict(x_test) # predict

# evaluation metrics
print("Accuracy: %0.4f"%accuracy_score(y_test, y_pred))
print("Micro-averaged F1 score: %0.4f"%f1_score(y_test, y_pred, average='micro'))

Accuracy: 0.5004
Micro-averaged F1 score: 0.5004


In [50]:
lr2 = LogisticRegression(random_state=66, C=15, penalty='l2',solver='lbfgs')  # fit logistic
lr2.fit(x_train, y_train)
y_pred = lr2.predict(x_test) # predict

# evaluation metrics
print("Accuracy: %0.4f"%accuracy_score(y_test, y_pred))
print("Micro-averaged F1 score: %0.4f"%f1_score(y_test, y_pred, average='micro'))

Accuracy: 0.5036
Micro-averaged F1 score: 0.5036


In [51]:
lr3 = LogisticRegression(random_state=66, C=10, penalty='l2',solver='lbfgs')
lr3.fit(x_train, y_train)
y_pred = lr3.predict(x_test) # predict

# evaluation metrics
print("Accuracy: %0.4f"%accuracy_score(y_test, y_pred))
print("Micro-averaged F1 score: %0.4f"%f1_score(y_test, y_pred, average='micro'))

Accuracy: 0.5104
Micro-averaged F1 score: 0.5104


In [62]:
lr4 =LogisticRegression(random_state=66, C=15, penalty='l2',solver='liblinear')
lr4.fit(x_train, y_train)
y_pred = lr4.predict(x_test) # predict

# evaluation metrics
print("Accuracy: %0.4f"%accuracy_score(y_test, y_pred))
print("Micro-averaged F1 score: %0.4f"%f1_score(y_test, y_pred, average='micro'))

Accuracy: 0.4714
Micro-averaged F1 score: 0.4714


In [59]:
lr5 = LogisticRegression(random_state=66, C=2, penalty='l1',solver='liblinear')
lr5.fit(x_train, y_train)
y_pred = lr5.predict(x_test) # predict

# evaluation metrics
print("Accuracy: %0.4f"%accuracy_score(y_test, y_pred))
print("Micro-averaged F1 score: %0.4f"%f1_score(y_test, y_pred, average='micro'))

Accuracy: 0.5210
Micro-averaged F1 score: 0.5210


## fasttext

In [103]:
# fasttext requires data to be in the format of: __label__1 text
train_fasttext = train_new.apply(lambda t: '__label__' + str(t['label']) + ' ' + str(t['text']), axis=1)
test_fasttext = test_new.apply(lambda t: '__label__' + str(t['label']) + ' ' + str(t['text']), axis=1)
train_fasttext.to_csv('fasttext_train.txt',index=False, header=False)
test_fasttext.to_csv('fasttext_test.txt',index=False, header=False)

# fasttext model - default
ft_model1 = fasttext.train_supervised('fasttext_train.txt')

# calculate evaluation metrics
result = ft_model1.test('fasttext_test.txt')
precision = result[1]
recall = result[2]
print("F1 score: %0.4f"%(2*precision*recall/(precision+recall)))

F1 score: 0.7829


In [67]:
# fasttext model - setting 1
ft_model2 = fasttext.train_supervised('fasttext_train.txt',wordNgrams=2)
result = ft_model2.test('fasttext_test.txt')
precision = result[1]
recall = result[2]
print("F1 score: %0.4f"%(2*precision*recall/(precision+recall)))

F1 score: 0.7642


In [81]:
# fasttext model - setting 2
ft_model3 = fasttext.train_supervised('fasttext_train.txt',lr=0.2, wordNgrams=2)
result = ft_model3.test('fasttext_test.txt')
precision = result[1]
recall = result[2]
print("F1 score: %0.4f"%(2*precision*recall/(precision+recall)))

F1 score: 0.7629


In [80]:
# fasttext model - setting 3
ft_model4 = fasttext.train_supervised('fasttext_train.txt', lr=0.5, wordNgrams=2)
result = ft_model4.test('fasttext_test.txt')
precision = result[1]
recall = result[2]
print("F1 score: %0.4f"%(2*precision*recall/(precision+recall)))

F1 score: 0.7586


## SVM

In [93]:
svm1 = LinearSVC(random_state=66)
svm1.fit(x_train, y_train)
y_pred = svm1.predict(x_test)

print("Accuracy: %0.4f"%accuracy_score(y_test, y_pred))
print("Micro-averaged F1 score: %0.4f"%f1_score(y_test, y_pred, average='micro'))

Accuracy: 0.4730
Micro-averaged F1 score: 0.4730


In [96]:
svm2 = LinearSVC(random_state=66, penalty='l2', C=10, loss='hinge')
svm2.fit(x_train, y_train)
y_pred = svm2.predict(x_test)

print("Accuracy: %0.4f"%accuracy_score(y_test, y_pred))
print("Micro-averaged F1 score: %0.4f"%f1_score(y_test, y_pred, average='micro'))

Accuracy: 0.4495
Micro-averaged F1 score: 0.4495


In [100]:
svm3 = LinearSVC(random_state=66, penalty='l2', loss='squared_hinge', dual=False)
svm3.fit(x_train, y_train)
y_pred = svm3.predict(x_test)

print("Accuracy: %0.4f"%accuracy_score(y_test, y_pred))
print("Micro-averaged F1 score: %0.4f"%f1_score(y_test, y_pred, average='micro'))

Accuracy: 0.4727
Micro-averaged F1 score: 0.4727


In [101]:
svm4 = LinearSVC(random_state=66, penalty='l1', loss='squared_hinge', dual=False)
svm4.fit(x_train, y_train)
y_pred = svm4.predict(x_test)

print("Accuracy: %0.4f"%accuracy_score(y_test, y_pred))
print("Micro-averaged F1 score: %0.4f"%f1_score(y_test, y_pred, average='micro'))

Accuracy: 0.4996
Micro-averaged F1 score: 0.4996


## Random Forest

In [12]:
max_depth = [10,30,50]
n_estimators = [200,500]
grid_params ={'max_depth':max_depth,'n_estimators':n_estimators}

RandomFoest_model = GridSearchCV(RandomForestClassifier(class_weight = 'balanced'), grid_params,
                  scoring = 'accuracy', cv=5,n_jobs=-1, return_train_score=True)
RandomFoest_model.fit(x_train, y_train)

results = pd.DataFrame.from_dict(RandomFoest_model.cv_results_)
print(RandomFoest_model.best_estimator_)

RandomForestClassifier(class_weight='balanced', max_depth=50, n_estimators=200)


In [16]:
RandomFoest_model = RandomForestClassifier(bootstrap=True, class_weight='balanced',
                                      max_depth=50, n_estimators=200,  random_state=66, verbose=0)
RandomFoest_model.fit(x_train,y_train)

y_pred = RandomFoest_model.predict(x_test)
print("Accuracy: %0.4f"%accuracy_score(y_test, y_pred))
print("Micro-averaged F1 score: %0.4f"%f1_score(y_test, y_pred, average='micro'))

Accuracy: 0.4289
Micro-averaged F1 score: 0.4289


In [23]:
# store best model
ft_model1.save_model('fasttext_model')

## Prediction

In [80]:
# read fasttext model and tfidf tranformer
loaded_model = fasttext.load_model('model/fasttext_model')

comment = ['I HATE COVID',
        'Stay strong and we can make it!',
        "It's fucking horrible that I can't even breath inside the mask.",
        'Life is bad but we have to stay at home.',
        'Life will be better when the vaccine comes out.']

label = ['Extremely Negative', 'Extremely Positive', 'Negative', 'Neutral', 'Positive']


result = {}
for i in range(len(label)):
    predict = loaded_model.predict(comment[i])
    result[str(i) + '__label: ' + str(label[i])] = {'predicted label': predict[0][0],
                                           'Probability': predict[1][0]}

# save results
with open('predcition.json', 'w') as f:
    json.dump(result, f)

print(json.dumps(result, indent=2))

{
  "0__label: Extremely Negative": {
    "predicted label": "__label__Neutral",
    "Probability": 0.9999134540557861
  },
  "1__label: Extremely Positive": {
    "predicted label": "__label__Positive",
    "Probability": 0.9997422099113464
  },
  "2__label: Negative": {
    "predicted label": "__label__Negative",
    "Probability": 0.8929682970046997
  },
  "3__label: Neutral": {
    "predicted label": "__label__Negative",
    "Probability": 0.9983593225479126
  },
  "4__label: Positive": {
    "predicted label": "__label__Positive",
    "Probability": 0.999428391456604
  }
}


