# Identifying Sentiments (Part 4)

This section checks out the performance of the analysis when Stemming and Lemmatization were used to determine the roots of each word. The idea here is that word with similar root words may boost the probability of prediction for each class label.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
% matplotlib inline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn import metrics
from textblob import TextBlob
from pandas.tools.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectFromModel

import spacy
nlp = spacy.load('en_core_web_sm')

from nltk.stem.snowball import SnowballStemmer

import random
random.seed (1)

train = pd.read_csv('Train_Data.csv')
test = pd.read_csv('Test_Data.csv')

  from numpy.core.umath_tests import inner1d


In [2]:
pd.set_option('display.max_colwidth', 1000)

In [3]:
# Instantiating Stemmer
s_stemmer = SnowballStemmer(language = 'english')

In [4]:
def do_stem(sente):
    '''
    Function used to change each word to the root form through stemming
    Stop words are not used in this process
    '''
    out = []
    sente_li = sente.split(' ')
    for word in sente_li:
        if word not in nlp.Defaults.stop_words:
            out.append(s_stemmer.stem(word))
    return ' '.join(out)

In [5]:
def do_lemma(senten):
    '''
    Function used to change each word to the root form through lemmatization
    Stop words are not used in this process    
    '''
    out = []
    sente = nlp(senten)
    for word in sente:
        if word not in nlp.Defaults.stop_words:
            out.append(word.lemma_)
    return ' '.join(out)

In [6]:
train.head(2)

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/


In [7]:
test.head(2)

Unnamed: 0,id,tweet
0,7921,I hate the new #iphone upgrade. Won't let me download apps. #ugh #apple sucks
1,7922,currently shitting my fucking pants. #apple #iMac #cashmoney #raddest #swagswagswag http://instagr.am/p/UUIS0bIBZo/


In [8]:
train['tweet_stem'] = train['tweet'].apply(do_stem)
test['tweet_stem'] = test['tweet']

train['tweet_lemma'] = train['tweet'].apply(do_lemma)
test['tweet_lemma'] = test['tweet']

In [9]:
train.head(2)

Unnamed: 0,id,label,tweet,tweet_stem,tweet_lemma
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,#fingerprint #pregnanc test https://goo.gl/h1mfqv #android #app #beauti #cute #health #iger #iphoneon #iphonesia #iphon,# fingerprint # pregnancy Test https://goo.gl/h1MfQV # android # app # beautiful # cute # health # iger # iphoneonly # iphonesia # iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,final transpar silicon case ^^ thank uncl :) #yay #soni #xperia #s #sonyexperias… http://instagram.com/p/yget5jc6jm/,finally a transparant silicon case ^^ thank to -PRON- uncle :) # yay # Sony # Xperia # s # sonyexperias … http://instagram.com/p/yget5jc6jm/


In [10]:
test.head(2)

Unnamed: 0,id,tweet,tweet_stem,tweet_lemma
0,7921,I hate the new #iphone upgrade. Won't let me download apps. #ugh #apple sucks,I hate the new #iphone upgrade. Won't let me download apps. #ugh #apple sucks,I hate the new #iphone upgrade. Won't let me download apps. #ugh #apple sucks
1,7922,currently shitting my fucking pants. #apple #iMac #cashmoney #raddest #swagswagswag http://instagr.am/p/UUIS0bIBZo/,currently shitting my fucking pants. #apple #iMac #cashmoney #raddest #swagswagswag http://instagr.am/p/UUIS0bIBZo/,currently shitting my fucking pants. #apple #iMac #cashmoney #raddest #swagswagswag http://instagr.am/p/UUIS0bIBZo/


In [11]:
vect = CountVectorizer(stop_words='english')
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [12]:
nb = MultinomialNB()
nb

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [47]:
def model_test(vec, model):
    '''
    This function is used to tokenize text-data (train.tweet) using countvectorizer {vec}, 
    and a Machine Learning algorithm {model}, to understand the relationship between the tweet and its label.
    It is also used to predict the label of the given test data.
    The result of the prediction is now output in a csv file which can be uploaded unto Analytics Vidhya website 
    to determine its score.
    The function also returns the prediction probabilities which can be further combine to see if there could be improvement in 
    performance.
    '''
    
    print ('Tokenization:\n ', vec)
    print ()
    print ('Model:', model)
    print ()
    X_traindata_dtm = vec.fit_transform(train['tweet'])
    model.fit(X_traindata_dtm, train.label)
    X_testdata_dtm = vec.transform(test['tweet'])
    y_result = model.predict(X_testdata_dtm)
     
    D = pd.DataFrame({'id':[i for i in range(7921, 9874)], 'label':y_result})
    D.to_csv('Result.csv', index=False)
    
    return model.predict_proba(X_testdata_dtm)[:, 1]

In [25]:
def model_test_lemma(vec, model):
    '''
    Function used with lemmatization
    This function is used to tokenize text-data (train.tweet) using countvectorizer {vec}, 
    and a Machine Learning algorithm {model}, to understand the relationship between the tweet and its label.
    It is also used to predict the label of the given test data.
    The result of the prediction is now output in a csv file which can be uploaded unto Analytics Vidhya to determine its score.
    The function also returns the prediction probabilities which can be further combine to see if there could be improvement in 
    performance.
    '''
    
    print ('Tokenization:\n ', vec)
    print ()
    print ('Model:', model)
    print ()
    X_traindata_dtm = vec.fit_transform(train['tweet_lemma'])
    model.fit(X_traindata_dtm, train.label)
    X_testdata_dtm = vec.transform(test['tweet_lemma'])
    y_result = model.predict(X_testdata_dtm)
     
    D = pd.DataFrame({'id':[i for i in range(7921, 9874)], 'label':y_result})
    D.to_csv('Result.csv', index=False)
    
    return model.predict_proba(X_testdata_dtm)[:, 1]

In [26]:
def model_test_stem(vec, model):
    '''
    Function used with stemming
    This function is used to tokenize text-data (train.tweet) using countvectorizer {vec}, 
    and a Machine Learning algorithm {model}, to understand the relationship between the tweet and its label.
    It is also used to predict the label of the given test data.
    The result of the prediction is now output in a csv file which can be uploaded unto Analytics Vidhya to determine its score.
    The function also returns the prediction probabilities which can be further combine to see if there could be improvement in 
    performance.
    '''
    
    print ('Tokenization:\n ', vec)
    print ()
    print ('Model:', model)
    print ()
    X_traindata_dtm = vec.fit_transform(train['tweet_stem'])
    model.fit(X_traindata_dtm, train.label)
    X_testdata_dtm = vec.transform(test['tweet_stem'])
    y_result = model.predict(X_testdata_dtm)
    
    D = pd.DataFrame({'id':[i for i in range(7921, 9874)], 'label':y_result})
    D.to_csv('Result.csv', index=False)
    
    return model.predict_proba(X_testdata_dtm)[:, 1]

In [15]:
# F1-score = 0.889762413131238
model_test_lemma(CountVectorizer(max_df = 0.5, stop_words='english'), MultinomialNB())

Tokenization:
  CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

Model: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)



In [16]:
# F1-score = 0.870341292341096
model_test_lemma(CountVectorizer(max_df = 0.5), MultinomialNB())

Tokenization:
  CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

Model: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)



In [17]:
# F1-score = 0.877160712143345
model_test_stem(CountVectorizer(max_df = 0.5, stop_words='english'), MultinomialNB())

Tokenization:
  CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

Model: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)



In [18]:
# F1-score = 0.875749865803659
model_test_stem(CountVectorizer(max_df = 0.5), MultinomialNB())

Tokenization:
  CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

Model: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)



In [28]:
# Probabilities of the test data --> Lemmatization
a_lemma = model_test_lemma(CountVectorizer(max_df = 0.5, stop_words='english'), MultinomialNB())

Tokenization:
  CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

Model: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)



In [29]:
# Probabilities of the test data --> Stemming
a_stem = model_test_stem(CountVectorizer(max_df = 0.5, stop_words='english'), MultinomialNB())

Tokenization:
  CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

Model: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)



In [48]:
# Probabilities of the test data --> No Stemming OR Lemmatization
a_ = model_test(CountVectorizer(max_df = 0.5, stop_words='english'), MultinomialNB())

Tokenization:
  CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

Model: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)



In [36]:
a_mean.shape

(1953,)

In [33]:
a_mean = (a_stem + a_lemma)/2

In [49]:
all_mean = (a_stem + a_lemma + a_)/3

In [37]:
a_mean[9]

3.3421177861827135e-10

In [38]:
# F1-score = 0.895872763551162
a_mean_class = [0 if a_mean[i] < 0.5 else 1 for i in range(1953)]
D = pd.DataFrame({'id':[i for i in range(7921, 9874)], 'label':a_mean_class})
D.to_csv('Result.csv', index=False)

In [40]:
# F1-score = 0.890699776085467
a_mean_class = [0 if a_mean[i] < 0.45 else 1 for i in range(1953)]
D = pd.DataFrame({'id':[i for i in range(7921, 9874)], 'label':a_mean_class})
D.to_csv('Result.csv', index=False)

In [41]:
# F1-score = 0.888227350335826
a_mean_class = [0 if a_mean[i] < 0.55 else 1 for i in range(1953)]
D = pd.DataFrame({'id':[i for i in range(7921, 9874)], 'label':a_mean_class})
D.to_csv('Result.csv', index=False)

In [42]:
# F1-score = 0.895872763551162
a_mean_class = [0 if a_mean[i] < 0.49 else 1 for i in range(1953)]
D = pd.DataFrame({'id':[i for i in range(7921, 9874)], 'label':a_mean_class})
D.to_csv('Result.csv', index=False)

In [43]:
# F1-score = 0.895872763551162
a_mean_class = [0 if a_mean[i] < 0.51 else 1 for i in range(1953)]
D = pd.DataFrame({'id':[i for i in range(7921, 9874)], 'label':a_mean_class})
D.to_csv('Result.csv', index=False)

In [44]:
# F1-score = 0.894652734138471
a_mean_class = [0 if a_mean[i] < 0.48 else 1 for i in range(1953)]
D = pd.DataFrame({'id':[i for i in range(7921, 9874)], 'label':a_mean_class})
D.to_csv('Result.csv', index=False)

In [45]:
# F1-score = 0.892823492211137
a_mean_class = [0 if a_mean[i] < 0.52 else 1 for i in range(1953)]
D = pd.DataFrame({'id':[i for i in range(7921, 9874)], 'label':a_mean_class})
D.to_csv('Result.csv', index=False)

In [50]:
# F1-score = 0.891766507676512
a_mean_class = [0 if all_mean[i] < 0.5 else 1 for i in range(1953)]
D = pd.DataFrame({'id':[i for i in range(7921, 9874)], 'label':a_mean_class})
D.to_csv('Result.csv', index=False)

In [53]:
a = 1 
b = 1
c = 0
weig_aver_prob = (a*a_ + b*a_lemma + c*a_stem)/(a + b + c)
ave_pred = [0 if weig_aver_prob[i] < 0.5 else 1 for i in range(1953)]
D = pd.DataFrame({'id':[i for i in range(7921, 9874)], 'label':ave_pred})
D.to_csv('Result.csv', index=False)

In [None]:
# a = 1, b = 1; F1 = 0.897538960600431 {Current best result from combination of two good models}
# a = 1, c = 1; F1 = 0.890087917081782