In [1]:
import seaborn
import numpy as np
import pandas as pd
import seaborn as sns

import nltk
from sklearn.model_selection import train_test_split

import os

In [2]:
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/pataoengineer/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pataoengineer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# 1. Data Preperation & Exploration

In [3]:
dfRaw = pd.read_csv(
    os.path.join(os.environ['HOME'], 'data', 'tweets', 'training.1600000.processed.noemoticon.csv'),
    header=None, usecols=[0,2,4,5], names=['sentiment','date','user','text'],
    parse_dates=['date'])



In [4]:
dfRaw.dtypes

sentiment             int64
date         datetime64[ns]
user                 object
text                 object
dtype: object

In [5]:
dfRaw[:3]

Unnamed: 0,sentiment,date,user,text
0,0,2009-04-06 22:19:45,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his Facebook by ...
2,0,2009-04-06 22:19:53,mattycus,@Kenichan I dived many times for the ball. Man...


## Exploring sentiments

In [6]:
dfRaw.sentiment.value_counts()

0    800000
4    800000
Name: sentiment, dtype: int64

In [7]:
dfRaw[dfRaw.sentiment==0][:5]['text']

0    @switchfoot http://twitpic.com/2y1zl - Awww, t...
1    is upset that he can't update his Facebook by ...
2    @Kenichan I dived many times for the ball. Man...
3      my whole body feels itchy and like its on fire 
4    @nationwideclass no, it's not behaving at all....
Name: text, dtype: object

In [8]:
dfRaw[dfRaw.sentiment==4][:5]['text']

800000         I LOVE @Health4UandPets u guys r the best!! 
800001    im meeting up with one of my besties tonight! ...
800002    @DaRealSunisaKim Thanks for the Twitter add, S...
800003    Being sick can be really cheap when it hurts t...
800004      @LovesBrooklyn2 he has that effect on everyone 
Name: text, dtype: object

## Distribution of most common words in each sentiments
By getting rid of following:
- mentions
- stopwords
- symbols
- top N common words in both first

In [9]:
dfWords = dfRaw[dfRaw.columns]
dfWords.loc[:,'words'] = dfWords['text'].str.split(' ')
dfWords['words'][:5]

0    [@switchfoot, http://twitpic.com/2y1zl, -, Aww...
1    [is, upset, that, he, can't, update, his, Face...
2    [@Kenichan, I, dived, many, times, for, the, b...
3    [my, whole, body, feels, itchy, and, like, its...
4    [@nationwideclass, no,, it's, not, behaving, a...
Name: words, dtype: object

Clean text by removing stopwords, symbols, and mentions

In [10]:
import re
from functools import reduce
def clean(ss):
    def clean_and_drop(a,b):
        def cl(s):
            if re.match(r'^\W+$', s):
                return []
            elif len(s)>0 and s[0]=='@':
                return ['@MENTION']
            elif re.match(r'^http\:w*', s):
                return ['@URL']
            elif all(['0'<=c<='9' or c in ['.',','] for c in s]):
                return ['@NUMBER']
            else:
                a = re.sub(r'[\t|,|\.]','', s)
                a = re.sub(r'[\!]+', ' !', a)
                a = re.sub(r'[\?]+', ' ?', a)
                return a.split(' ')
        if len(b)==0:
            return a
        else:
            return a + cl(b)
    return reduce(clean_and_drop, [[]]+ss)
    
dfWords.loc[:,'words'] = dfWords['words'].apply(clean)

In [11]:
topwords = dfWords['words'].explode()
topwords = topwords.value_counts()
topwords[:25]

@MENTION    783109
!           606423
to          556733
I           498055
the         487747
a           366409
my          280574
and         276336
i           250770
you         243743
is          221963
for         211048
it          209821
in          205962
of          180215
?           173001
on          159336
@NUMBER     153621
me          151411
have        133318
that        130601
so          128602
with        112024
be          110004
but         107809
Name: words, dtype: int64

In [12]:
stopwords = topwords[:30].reset_index(drop=False)
', '.join(stopwords['index'].tolist())

"@MENTION, !, to, I, the, a, my, and, i, you, is, for, it, in, of, ?, on, @NUMBER, me, have, that, so, with, be, but, at, was, I'm, just, not"

## Clean tweets
Remove stopwords

In [13]:
def clean_stopwords(ws):
    return [w for w in ws if w not in stopwords]

dfClean = dfWords[['sentiment','words']]
dfClean.loc[:,'words'] = dfClean['words'].apply(clean_stopwords)
dfClean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,sentiment,words
0,0,"[@MENTION, @URL, Awww, that's, a, bummer, You,..."
1,0,"[is, upset, that, he, can't, update, his, Face..."
2,0,"[@MENTION, I, dived, many, times, for, the, ba..."
3,0,"[my, whole, body, feels, itchy, and, like, its..."
4,0,"[@MENTION, no, it's, not, behaving, at, all, i..."
...,...,...
1599995,4,"[Just, woke, up, Having, no, school, is, the, ..."
1599996,4,"[TheWDBcom, Very, cool, to, hear, old, Walt, i..."
1599997,4,"[Are, you, ready, for, your, MoJo, Makeover, ?..."
1599998,4,"[Happy, 38th, Birthday, to, my, boo, of, alll,..."


## Check distribution of words in each sentiment

In [14]:
newtopwords = dfClean[['words','sentiment']]
newtopwords.loc[:, 'cnt'] = 1
newtopwords = newtopwords.explode(column='words')
newtopwords = newtopwords.groupby(['sentiment','words']).agg('sum').reset_index(drop=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [15]:
newtopwords[newtopwords.sentiment==4].sort_values(['cnt'], ascending=False)[:5]

Unnamed: 0,sentiment,words,cnt
413566,4,@MENTION,468845
362189,4,!,361363
731701,4,to,248111
727622,4,the,246624
466221,4,I,197484


In [16]:
newtopwords[newtopwords.sentiment==0].sort_values(['cnt'], ascending=False)[:5]

Unnamed: 0,sentiment,words,cnt
41316,0,@MENTION,314264
331751,0,to,308622
85429,0,I,300571
1,0,!,245060
327515,0,the,241123


# 2. Topic Modeling
Trying NLP and contextual information

## TF-IDF
Try term frequency

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, make_pipeline

In [18]:
df = dfClean[['sentiment','words']]
df.loc[:,'text'] = df['words'].apply(lambda xs: ' '.join(xs))
df[df.sentiment == 4][:10]

Unnamed: 0,sentiment,words,text
800000,4,"[I, LOVE, @MENTION, u, guys, r, the, best, !]",I LOVE @MENTION u guys r the best !
800001,4,"[im, meeting, up, with, one, of, my, besties, ...",im meeting up with one of my besties tonight !...
800002,4,"[@MENTION, Thanks, for, the, Twitter, add, Sun...",@MENTION Thanks for the Twitter add Sunisa ! I...
800003,4,"[Being, sick, can, be, really, cheap, when, it...",Being sick can be really cheap when it hurts t...
800004,4,"[@MENTION, he, has, that, effect, on, everyone]",@MENTION he has that effect on everyone
800005,4,"[@MENTION, You, can, tell, him, that, I, just,...",@MENTION You can tell him that I just burst ou...
800006,4,"[@MENTION, Thans, for, your, response, Ihad, a...",@MENTION Thans for your response Ihad already ...
800007,4,"[@MENTION, I, am, so, jealous, hope, you, had,...",@MENTION I am so jealous hope you had a great ...
800008,4,"[@MENTION, ah, congrats, mr, fletcher, for, fi...",@MENTION ah congrats mr fletcher for finally j...
800009,4,"[@MENTION, I, RESPONDED, Stupid, cat, is, help...",@MENTION I RESPONDED Stupid cat is helping me ...


In [19]:
dfTrain, dfTest = train_test_split(df, test_size=0.1)
len(dfTrain), len(dfTest)

(1440000, 160000)

In [20]:
def vectoriser(ngram=(1,1), min_df=0.0, max_df=1.0):
    return TfidfVectorizer(
        lowercase=True, ngram_range=ngram, min_df=min_df, max_df=max_df,
        max_features=10000)

pipesTfidf = {
    "logit": make_pipeline(vectoriser(ngram=(1,2)), LogisticRegression()),
    "logit-3gram": make_pipeline(vectoriser(ngram=(1,3)), LogisticRegression()),
    #"sgsvc": make_pipeline(vectoriser(ngram=(1,2)), 
    #    SGDClassifier(loss='hinge', max_iter=200, penalty='l2', n_jobs=3)),
    "randomforest": make_pipeline(vectoriser(ngram=(1,2)), RandomForestClassifier(
        n_estimators=25, max_depth=10, n_jobs=3
    ))
}

In [21]:
def train(m, p):
    print(f'Training : {m}')
    return p.fit(X=dfTrain['text'], y=dfTrain['sentiment'])

tfidfModels = {m: train(m, p) for m,p in pipesTfidf.items()}

Training : logit


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training : logit-3gram


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training : randomforest


Validating the TFIDF-based classifier

In [22]:
# AUC
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import plot_roc_curve

In [23]:
for m,p in tfidfModels.items():
    pred = p.predict_proba(dfTest['text'])
    score = roc_auc_score(dfTest['sentiment'], pred[:,1])
    print(f'AUC of {m} = {score:.2f}')

AUC of logit = 0.89
AUC of logit-3gram = 0.88
AUC of randomforest = 0.76


Test the models in action

In [24]:
positives = dfTest[dfTest.sentiment==4].sample(n=100)
negatives = dfTest[dfTest.sentiment==0].sample(n=100)
for m,p in tfidfModels.items():
    pred_pos = p.predict(positives['text'])
    pred_neg = p.predict(negatives['text'])
    n_correct_pos = len(pred_pos[pred_pos==4])
    n_correct_neg = len(pred_neg[pred_neg==0])
    print(f'{m} : correct positive = {n_correct_pos}, correct_negative = {n_correct_neg}')

logit : correct positive = 86, correct_negative = 76
logit-3gram : correct positive = 84, correct_negative = 76
randomforest : correct positive = 64, correct_negative = 73


## TFIDF - measure recall rate 
With confidence interval of 95%

In [25]:
from sklearn.metrics import recall_score

n_trials = 20
sample_size = 100
samples = {m: [] for m,p in tfidfModels.items()}

for n in range(n_trials):
    dfSample = []
    dfSample.append(dfTest[dfTest.sentiment==4].sample(n=sample_size//2))
    dfSample.append(dfTest[dfTest.sentiment==0].sample(n=sample_size//2))
    dfSample = pd.concat(dfSample)
    for m,p in tfidfModels.items():
        pred = p.predict(dfSample['text'])
        expected = dfSample['sentiment']
        samples[m].append(recall_score(expected, pred, pos_label=4))
        
# Calculate means with confidence interval of 95%
z = 1.960
print('Confidence interval of 95%')
print('---------------------------')
for m in tfidfModels.keys():
    mean = np.mean(samples[m])
    std = np.std(samples[m])
    stderr = std/np.sqrt(sample_size)
    margin = z * stderr
    lb, ub = mean - margin, mean + margin
    print(f'{m} : mean recall between {lb:.2f} and {ub:.2f}')


Confidence interval of 95%
---------------------------
logit : mean recall between 0.80 and 0.82
logit-3gram : mean recall between 0.80 and 0.82
randomforest : mean recall between 0.63 and 0.66


## Word Embedding

Using pretrained English wiki word embedding from Google

In [26]:
len(dfTrain), len(dfTest)

(1440000, 160000)

In [27]:
num_distinct_words = len(dfClean['words'].explode().drop_duplicates())
print(f'Number of distinct words : {num_distinct_words}')

Number of distinct words : 637598


Use word embedding to vectorise the dataset

## Fasttext
Using pretrained English wiki wordembedding

In [28]:
import fasttext

Preparation of fasttext K-fold data

In [34]:
K = 5

texts = dfTrain['text'].tolist()
labels = dfTrain['sentiment'].tolist()
chunksize = len(texts)//K
ktest = []
for k in range(K):
    a = k*chunksize
    b = (k+1)*chunksize
    test_text = texts[a:b]
    test_label = labels[a:b]
    train_text = texts[:a] + texts[b:]
    train_label = labels[:a] + labels[b:]
    with open(f'fasttext_train_{k}.txt', 'w') as f:
        for lbl, txt in zip(train_label, train_text):
            f.write(f'__label__{lbl} {txt}\n')
    ktest.append([])
    for lbl, txt in zip(test_label, test_text):
        ktest[-1].append((lbl, txt))


In [30]:
!wc -l fasttext*

  288000 fasttext_test_0.txt
  288000 fasttext_test_1.txt
  288000 fasttext_test_2.txt
  288000 fasttext_test_3.txt
  288000 fasttext_test_4.txt
 1152000 fasttext_train_0.txt
 1152000 fasttext_train_1.txt
 1152000 fasttext_train_2.txt
 1152000 fasttext_train_3.txt
 1152000 fasttext_train_4.txt
 7200000 total


In [48]:
# Cross validation K-Fold (manual way)
from sklearn.metrics import precision_score, recall_score
for k in range(5):
    print(f'Fold {k} : ')
    mk = fasttext.train_supervised(
        f'fasttext_train_{k}.txt', 
        wordNgrams=2, 
        lr=0.75)
    labels = [f'__label__{lbl}' for lbl, t in ktest[k]]
    pred = [mk.predict(t)[0][0] for lbl, t in ktest[k]]
    rc = recall_score(labels, pred, pos_label='__label__4')
    pc = precision_score(labels, pred, pos_label='__label__4')
    print(f'\rRecall = {rc:.3f}, precision = {pc:3f}')

Fold 0 : 
Recall = 0.811, precision = 0.811394
Fold 1 : 
Recall = 0.818, precision = 0.805819
Fold 2 : 
Recall = 0.814, precision = 0.809776
Fold 3 : 
Recall = 0.817, precision = 0.808384
Fold 4 : 
Recall = 0.816, precision = 0.807780
