# Identifying Sentiments
The data analyzed in this presentation were obtained from Analytics Vidhya. 

The metric used to evaluate performance is the F1-score.

As of the time of writing this report (04/08/2019), 215 teams/individuals had registered for this learning competition. 
The leaderboard scores at selected positions are shown below:
    1.    0.9164
    25.   0.8985
    50.   0.8925
    75.   0.8863
    100.  0.8813
The best submission based on the analysis in this report scored 0.8966, which placed it at #33 on the leaderboard (roughly the top 15% of the 215 participants).

This report will show some of the different techniques that were used and the corresponding F1-scores based on the test data set that was provided.

This report is divided into three sections:
* Using the Count Vectorizer and various Machine Learning algorithms
* Checking the possibilities of ensembling different ML algorithms
* Feature Engineering and using various ML algorithms

Finally, there will be some future work to be done.

In [55]:
# Record the scikit-learn version this notebook was run with, so the results
# can be reproduced against the same library release.
import sklearn
sklearn.__version__

'0.19.2'

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
% matplotlib inline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn import metrics
from textblob import TextBlob
from pandas.tools.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectFromModel

import random
random.seed (1)

train = pd.read_csv('Train_Data.csv')
test = pd.read_csv('Test_Data.csv')

  from numpy.core.umath_tests import inner1d


In [2]:
# Report (rows, columns) for the training frame first, then the test frame
for frame in (train, test):
    print(frame.shape)

(7920, 3)
(1953, 2)


In [3]:
# Widen text columns to 1000 characters so full tweets are shown when frames are displayed
pd.options.display.max_colwidth = 1000

In [4]:
# Preview the first two rows of the training data (columns: id, label, tweet)
train.head(2)

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/


In [5]:
# Preview the first two rows of the test data (columns: id, tweet; no label column)
test.head(2)

Unnamed: 0,id,tweet
0,7921,I hate the new #iphone upgrade. Won't let me download apps. #ugh #apple sucks
1,7922,currently shitting my fucking pants. #apple #iMac #cashmoney #raddest #swagswagswag http://instagr.am/p/UUIS0bIBZo/


In [6]:
# Shape of the tweet column, i.e. the number of training tweets
train.tweet.shape

(7920,)

In [7]:
# Create a CountVectorizer with default settings (single-word tokens);
# ending the cell with `vect` displays its parameters
vect = CountVectorizer()
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [8]:
# Learn the vocabulary from the training tweets and, in the same step,
# convert the tweets into a document-term matrix of token counts
train_dtm = vect.fit_transform(train.tweet)

In [9]:
# Dimensions of the document-term matrix: (tweets, vocabulary tokens)
train_dtm.shape

(7920, 23090)

In [10]:
# The learned vocabulary, one token per column of train_dtm.
# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(), so use whichever this installation provides; either
# way `tokens` ends up as a plain list of strings.
if hasattr(vect, 'get_feature_names_out'):
    tokens = vect.get_feature_names_out().tolist()
else:
    tokens = vect.get_feature_names()

In [11]:
# Number of tokens in the vocabulary; this equals the number of columns of train_dtm
len(tokens)

23090

In [12]:
# train_dtm is a 7920 x 23090 sparse matrix that is almost entirely zeros:
# only 137,638 of its roughly 182.9 million entries are stored (non-zero),
# which is about 0.075%
train_dtm

<7920x23090 sparse matrix of type '<class 'numpy.int64'>'
	with 137638 stored elements in Compressed Sparse Row format>

In [13]:
# print (train_dtm)

In [14]:
# Instantiate a Multinomial Naive Bayes classifier with its default parameters
# (nothing is displayed here; the parameters are shown when the model is fitted)
nb = MultinomialNB()

In [15]:
# Fit the classifier on the single-word count matrix and the training labels
nb.fit(train_dtm, train.label)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
# Per-class token counts learned by the model:
# the first row holds the counts for class 0 and the second row for class 1;
# each entry is the number of times that token occurred in tweets of that class
nb.feature_count_

array([[12.,  5.,  2., ...,  1.,  1.,  1.],
       [ 1.,  4.,  0., ...,  0.,  0.,  0.]])

In [17]:
# (number of classes, number of vocabulary tokens)
nb.feature_count_.shape

(2, 23090)

In [18]:
# Per-token counts for each class, indexed by the token itself
class0_counts, class1_counts = nb.feature_count_
words_by_class = pd.DataFrame(
    {'class 0': class0_counts, 'class 1': class1_counts},
    index=pd.Index(tokens, name='vocabularies'),
)

In [19]:
# Preview the first rows of the raw per-class token counts
words_by_class.head()

Unnamed: 0_level_0,class 0,class 1
vocabularies,Unnamed: 1_level_1,Unnamed: 2_level_1
0,12.0,1.0
0,5.0,4.0
2,2.0,0.0
4,1.0,0.0
51,2.0,0.0


In [20]:
# Add 1 to every count in both classes so that no count is zero,
# which avoids dividing by 0 when the class ratio is taken later.
# Run this cell only once: each run adds another 1 to the counts.
for class_col in ('class 0', 'class 1'):
    words_by_class[class_col] = words_by_class[class_col] + 1

In [21]:
# Preview the same rows after 1 was added to every count
words_by_class.head()

Unnamed: 0_level_0,class 0,class 1
vocabularies,Unnamed: 1_level_1,Unnamed: 2_level_1
0,13.0,2.0
0,6.0,5.0
2,3.0,1.0
4,2.0,1.0
51,3.0,1.0


In [22]:
# Number of documents (tweets) in each class.
# Of the 7920 documents in the training data set,
# 5894 are in class 0 and
# 2026 are in class 1,
# so the classes are quite imbalanced (roughly 3 to 1).
nb.class_count_

array([5894., 2026.])

In [23]:
# Divide each class's (adjusted) token counts by the number of documents in that
# class, so the two columns are comparable despite the class imbalance.
# Run this cell only once: each run divides the columns again.
for class_idx, class_col in enumerate(('class 0', 'class 1')):
    words_by_class[class_col] = words_by_class[class_col] / nb.class_count_[class_idx]

In [24]:
# Ratio of each token's normalized class 1 frequency to its normalized class 0
# frequency. Despite the column name, this is a ratio and not a probability:
# values above 1 mean the token is relatively more common in class 1, values
# below 1 mean it is relatively more common in class 0.
words_by_class['class_1_prob']=words_by_class['class 1']/words_by_class['class 0']

In [25]:
# Order the tokens from the highest class 1 ratio to the lowest,
# so the strongest class 1 predictors come first
words_by_class = words_by_class.sort_values(by='class_1_prob', ascending=False)

In [28]:
# Checking out the top 50 class 1 predictors (highest class 1 / class 0 ratio)
words_by_class.head(50)

Unnamed: 0_level_0,class 0,class 1,class_1_prob
vocabularies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
reset,0.00017,0.007897,46.54689
restore,0.000339,0.014808,43.63771
deleting,0.00017,0.00691,40.728529
crashing,0.00017,0.006417,37.819348
decides,0.00017,0.005923,34.910168
contact,0.00017,0.005923,34.910168
fucking,0.001697,0.05923,34.910168
hateapple,0.000679,0.023198,34.182873
wtf,0.000339,0.011352,33.455577
failed,0.00017,0.005429,32.000987


In [29]:
# Checking out the top 50 class 0 predictors: the frame is sorted by descending
# ratio, so the last 50 rows are the tokens with the lowest class 1 / class 0 ratio
words_by_class.tail(50)

Unnamed: 0_level_0,class 0,class 1,class_1_prob
vocabularies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
sup,0.009162,0.000494,0.053874
surf,0.009332,0.000494,0.052894
followback,0.009501,0.000494,0.05195
io,0.009501,0.000494,0.05195
ig,0.009841,0.000494,0.050158
girl,0.029861,0.001481,0.049588
capetown,0.01001,0.000494,0.049308
summer,0.02019,0.000987,0.048894
present,0.01018,0.000494,0.048486
steemit,0.01018,0.000494,0.048486


In [None]:
# Repeating the same process as above, this time using bigrams (pairs of neighboring words) as tokens

In [31]:
# Vectorizer that extracts only two-word sequences: ngram_range=(2, 2) sets both
# the minimum and maximum n-gram length to 2; the trailing `vect2` displays its parameters
vect2 = CountVectorizer(ngram_range=(2, 2))
vect2

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [32]:
# Learn the two-word vocabulary and build the corresponding document-term matrix
train_dtm2 = vect2.fit_transform(train.tweet)

In [33]:
# Dimensions of the two-word document-term matrix: (tweets, two-word tokens)
train_dtm2.shape

(7920, 78193)

In [34]:
# The learned two-word vocabulary, one token per column of train_dtm2.
# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(), so use whichever this installation provides; either
# way `tokens2` ends up as a plain list of strings.
if hasattr(vect2, 'get_feature_names_out'):
    tokens2 = vect2.get_feature_names_out().tolist()
else:
    tokens2 = vect2.get_feature_names()

In [35]:
# Number of two-word tokens; this equals the number of columns of train_dtm2
len(tokens2)

78193

In [36]:
# Display the sparse-matrix summary (dimensions and number of stored, non-zero entries)
train_dtm2

<7920x78193 sparse matrix of type '<class 'numpy.int64'>'
	with 136891 stored elements in Compressed Sparse Row format>

In [37]:
# A separate Multinomial Naive Bayes classifier (default parameters) for the two-word features
nb2 = MultinomialNB()

In [38]:
# Fit the classifier on the two-word count matrix and the training labels
nb2.fit(train_dtm2, train.label)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [39]:
# Per-class counts of each two-word token: first row is class 0, second row is class 1
nb2.feature_count_

array([[1., 2., 0., ..., 1., 1., 1.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [40]:
# (number of classes, number of two-word tokens)
nb2.feature_count_.shape

(2, 78193)

In [42]:
# Per-class counts for every two-word token, indexed by the token itself
bigram_class0_counts, bigram_class1_counts = nb2.feature_count_
words_by_class2 = pd.DataFrame(
    {'class 0': bigram_class0_counts, 'class 1': bigram_class1_counts},
    index=pd.Index(tokens2, name='vocab'),
)

In [43]:
# Preview the first rows of the raw per-class two-word token counts
words_by_class2.head()

Unnamed: 0_level_0,class 0,class 1
vocab,Unnamed: 1_level_1,Unnamed: 2_level_1
00 gay,1.0,0.0
00 http,2.0,0.0
00 thanks,0.0,1.0
00 via,1.0,0.0
000 00,1.0,0.0


In [44]:
# Add 1 to every count in both classes so that no count is zero,
# which avoids dividing by 0 when the class ratio is taken later.
# Run this cell only once: each run adds another 1 to the counts.
for class_col in ('class 0', 'class 1'):
    words_by_class2[class_col] = words_by_class2[class_col] + 1

In [45]:
# Number of documents (tweets) in each class, used below to normalize the counts
nb2.class_count_

array([5894., 2026.])

In [46]:
# Divide each class's (adjusted) two-word token counts by the number of documents
# in that class, so the two columns are comparable despite the class imbalance.
# Run this cell only once: each run divides the columns again.
for class_idx, class_col in enumerate(('class 0', 'class 1')):
    words_by_class2[class_col] = words_by_class2[class_col] / nb2.class_count_[class_idx]

In [47]:
# Ratio of each two-word token's normalized class 1 frequency to its normalized
# class 0 frequency. Despite the column name, this is a ratio and not a
# probability: values above 1 lean towards class 1, values below 1 towards class 0.
words_by_class2['class_1_prob']=words_by_class2['class 1']/words_by_class2['class 0']

In [49]:
# Order the two-word tokens from the highest class 1 ratio to the lowest
words_by_class2 = words_by_class2.sort_values(by='class_1_prob', ascending=False)

In [50]:
# Top 50 class 1 predictors among two-word tokens (highest class 1 / class 0 ratio)
words_by_class2.head(50)

Unnamed: 0_level_0,class 0,class 1,class_1_prob
vocab,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
hate apple,0.00017,0.014314,84.366239
you suck,0.00017,0.006417,37.819348
because of,0.00017,0.006417,37.819348
fucking hate,0.00017,0.005429,32.000987
ipod is,0.00017,0.005429,32.000987
apple you,0.000679,0.021224,31.273692
to restore,0.000339,0.010365,30.546397
fuckyou apple,0.000509,0.015301,30.061533
new update,0.00017,0.004936,29.091807
apple your,0.00017,0.004936,29.091807


In [51]:
# Top 50 class 0 predictors among two-word tokens: the frame is sorted by descending
# ratio, so the last 50 rows have the lowest class 1 / class 0 ratio
words_by_class2.tail(50)

Unnamed: 0_level_0,class 0,class 1,class_1_prob
vocab,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
http mf,0.009162,0.000494,0.053874
mf tt,0.009162,0.000494,0.053874
zpr io,0.009162,0.000494,0.053874
sup surf,0.009162,0.000494,0.053874
fun capetown,0.009162,0.000494,0.053874
sexy me,0.009162,0.000494,0.053874
capetownsup on,0.009162,0.000494,0.053874
surf fun,0.009162,0.000494,0.053874
follow capetownsup,0.009162,0.000494,0.053874
phone case,0.009332,0.000494,0.052894
