In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.options.display.max_colwidth=100

In [2]:
import re

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [43]:
import string

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline

In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB,MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [62]:
from sklearn.model_selection import GridSearchCV

In [4]:
train_data = pd.read_csv('train.csv',encoding='ISO-8859-1')

## Methodology
1. __Loading the Data__ :<br>
The given dataset has total 99989 tweets, out of which 56457(56.46%) are positive tweets and 43532(43.54%) tweets have negative sentiment
***
2. __Approach__:<br>
After studying the data, a workflow plan for preprocessing the data, so it can be used by machine learning algorith for prediction.

***
3. __Text Preprocesing Fuction__:<br>
After identifing the approach for Text preprocessing, a Pipeline is developed which will be fed to the GridSearchCV for hyper-parameter tuning
***
4. __Model Selection__:<br>
Comparison of Multiple Classfication techniques, to find out best classification technique to use.
***
5. __Hyper-parameter Tuning__:<br>
After Model Selection,Hyper-parameter tuning to find best parameters for Sentiment prediction
***
6. __Model Evaluation With Test Data__:<br>
***
7. __Future scope for accuracy improvement__:


### 1.Loading the Data

In [5]:
train_data.head(10)

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL friend.............
1,2,0,I missed the New Moon trailer...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I've been at this dentist since 11.. I was suposed 2...
4,5,0,i think mi bf is cheating on me!!! T_T
5,6,0,or i just worry too much?
6,7,1,Juuuuuuuuuuuuuuuuussssst Chillin!!
7,8,0,Sunny Again Work Tomorrow :-| TV Tonight
8,9,1,handed in my uniform today . i miss you already
9,10,1,hmmmm.... i wonder how she my number @-)


In [6]:
train_data['Sentiment'].value_counts()

1    56457
0    43532
Name: Sentiment, dtype: int64

Here 1 denote 'Positive' , 0 denote 'Negative Tweet'

##### Lets see some random tweets

In [7]:
indices = np.random.randint(0,len(train_data),25).tolist()
train_data['SentimentText'][indices]

58168    @BeauGiles awe how cute xD I HAd a party at mine yesterday, it was awesome.  SO DRUNK. don't rem...
46468                                                   @Apple_x360a Okay. Buy it for meh if its so awesome 
82049                                                                 @chekaq Cheka!!!  Follow me! Miss you!
15136                * at home. chillen. its hot as hell in this house. summer 09' doin things differently! 
94996                             @Cianmm ok then, just as long as he does not get screwed at PC world, etc 
52808                                           @astroengine ahhh, understood. Kind of like hurricane logic 
378                                                                  I want to write a song. I think I will.
24627                                                                               @20orsomething Gracias. 
38880                                                        @amycutbill @alexparr just adding you both now 
60900              

1. There are some url in tweets which i dont think will help in sentiment analysis
2. will remove hashtags
3. stopwords will be removed

### 1.2 Data Analysis 

In [9]:
tweet_text = ''.join(train_data['SentimentText'])
tweet_text[:4000]



#### Emoticons

In [13]:
emotions = set(re.findall(r" ([xX:;][-']?.) ",tweet_text))

In [14]:
emotions

{': ',
 ':$',
 ":'",
 ":'(",
 ":')",
 ":'D",
 ":'[",
 ':(',
 ':*',
 ':-$',
 ':-*',
 ':-/',
 ':-D',
 ':-O',
 ':-P',
 ':-S',
 ':-W',
 ':-X',
 ':-[',
 ':-\\',
 ':-]',
 ':-h',
 ':-o',
 ':-p',
 ':-s',
 ':-x',
 ':-|',
 ':/',
 ':0',
 ':1',
 ':3',
 '::',
 ':;',
 ':?',
 ':@',
 ':C',
 ':D',
 ':E',
 ':H',
 ':I',
 ':L',
 ':O',
 ':S',
 ':T',
 ':X',
 ':Z',
 ':[',
 ':\\',
 ':]',
 ':b',
 ':d',
 ':l',
 ':o',
 ':p',
 ':s',
 ':x',
 ':|',
 ':}',
 ';(',
 ';)',
 ';-(',
 ';-)',
 ';-/',
 ';-;',
 ';-D',
 ';-|',
 ';.',
 ';/',
 ';3',
 ';;',
 ';D',
 ';I',
 ';P',
 ';]',
 ';d',
 ';o',
 ';p',
 ';s',
 ';t',
 'X ',
 "X's",
 'X,',
 'X-(',
 'X.',
 'X1',
 'X5',
 'XD',
 'XL',
 'XM',
 'XO',
 'XP',
 'XS',
 'XT',
 'XX',
 'Xo',
 'Xx',
 'x ',
 "x'D",
 "x'd",
 'x(',
 'x)',
 'x*',
 'x,',
 'x-',
 'x.',
 'x2',
 'x3',
 'x:',
 'x?',
 'x@',
 'xD',
 'xP',
 'xX',
 'x]',
 'xa',
 'xd',
 'xe',
 'xh',
 'xk',
 'xm',
 'xo',
 'xp',
 'xx',
 'x|'}

In [20]:
emotions_count=[]

for emo in emotions:
    emotions_count.append((emo,tweet_text.count(emo)))
    
emotions_count= sorted(emotions_count,key=lambda x:x[1],reverse=True)
emotions_count[:20]

[(':/', 3281),
 ('x ', 2874),
 (': ', 2626),
 ('x@', 1339),
 ('xx', 1214),
 ('xa', 1162),
 (';3', 984),
 ('xp', 887),
 ('xo', 842),
 (';)', 713),
 ('xe', 483),
 (';I', 431),
 (';.', 353),
 ('xD', 254),
 ('x.', 251),
 ('::', 245),
 ('X ', 234),
 (';t', 217),
 (';s', 209),
 (':O', 185)]

In [21]:
HAPPY_EMO = r" ([xX;:]-?[dD)]|:-?[\)]|[;:][pP]) "
SAD_EMO = r" (:'?[/|\(]) "
print("Happy emoticons:", set(re.findall(HAPPY_EMO, tweet_text)))
print("Sad emoticons:", set(re.findall(SAD_EMO, tweet_text)))

Happy emoticons: {':p', ';D', ';)', 'xd', ':d', ';d', ':-D', ';p', ';-D', 'XD', ';P', ';-)', ':D', 'x)', 'xD'}
Sad emoticons: {':/', ':|', ':(', ":'("}


### Most used words

In [26]:
def most_used_word(text):
    tokens = word_tokenize(text)
    frequency = nltk.FreqDist(tokens)
    word_count = [i for i in frequency.items()]
    print('There are total {} different words/expressions'.format(len(set(tokens))))
    return sorted(word_count, key=lambda x:x[1],reverse=True)

In [27]:
mw= most_used_word(tweet_text)
mw[:50]

There are total 133899 different words/expressions


[('@', 90792),
 ('!', 57931),
 ('.', 50679),
 ('I', 43542),
 (',', 32571),
 ('to', 29176),
 ('the', 28236),
 ('you', 26695),
 ('?', 25284),
 ('a', 21411),
 ('it', 19214),
 ('i', 19074),
 ('...', 18480),
 (';', 16178),
 ('and', 14967),
 ('&', 14440),
 ('my', 12535),
 ('for', 12312),
 ('is', 12038),
 ('that', 11939),
 ("'s", 11825),
 ("n't", 11710),
 ('in', 11504),
 ('of', 10407),
 ('me', 10393),
 ('have', 9666),
 ('on', 9385),
 ('quot', 9153),
 ("'m", 8447),
 ('so', 8020),
 (':', 7736),
 ('but', 7587),
 ('#', 7435),
 ('do', 7397),
 ('was', 7381),
 ('be', 7276),
 ('not', 6528),
 ('your', 6056),
 ('are', 5993),
 ('just', 5897),
 ('with', 5406),
 ('like', 5322),
 ('-', 5118),
 ('at', 5056),
 ('too', 4934),
 ('get', 4916),
 ('good', 4807),
 ('u', 4629),
 ('up', 4473),
 ('know', 4458)]

### 2. Approach

#### Stopwords

Stopwords are not useful in finding 'Positive' or 'Negative' emotions, so its better to remove them

In [45]:
stop_punc = list(list(stopwords.words('english')) + list(string.punctuation))

In [46]:
most_word = []

for w in mw:
    if len(most_word)==50:
        break
    if w[0].lower() in stop_punc:
        continue
    else:
        most_word.append(w)

In [47]:
most_word

[('...', 18480),
 ("'s", 11825),
 ("n't", 11710),
 ('quot', 9153),
 ("'m", 8447),
 ('like', 5322),
 ('get', 4916),
 ('good', 4807),
 ('u', 4629),
 ('know', 4458),
 ('love', 3760),
 ('one', 3440),
 ('lol', 3386),
 ('go', 3358),
 ("'ll", 3202),
 ('got', 3196),
 ('amp', 3176),
 ('day', 3089),
 ('http', 3084),
 ('see', 3047),
 ("'re", 2979),
 ('time', 2878),
 ('think', 2793),
 ('going', 2563),
 ('really', 2536),
 ('work', 2503),
 ('well', 2494),
 ('would', 2451),
 ('thanks', 2318),
 ('back', 2259),
 ('im', 2256),
 ('haha', 2239),
 ('want', 2210),
 ('ca', 2142),
 ('na', 2141),
 ('much', 2129),
 ('still', 2128),
 ('today', 2117),
 ("'ve", 1991),
 ('2', 1961),
 ('need', 1936),
 ('hope', 1925),
 ('miss', 1901),
 ('sorry', 1891),
 ('great', 1883),
 ('could', 1780),
 ('right', 1716),
 ('Thanks', 1703),
 ('though', 1690),
 ('oh', 1629)]

### Stemming

In [48]:
def Lemmatize_tokenizer(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in word_tokenize(text)]

## 3. Text Preprocessing Function

### 3.1 Create Text Preprocessing class

In [71]:
# We need to do some preprocessing of the tweets.
# We will delete useless strings (like @, # ...)
# because we think that they will not help
# in determining if the person is Happy/Sad

class TextPreProc(BaseEstimator,TransformerMixin):
    
    def __init__(self,use_people_mention = False):
        self.use_people_mention = use_people_mention
        
    def fit(self,X,y=None):
        return self
    
    def transform(self,X,y=None):
        
        if self.use_people_mention:                      #to represent tagged account
            X = X.str.replace(r"@[a-zA-Z0-9_]* ", " @tags ")
            
        else:
            X = X.str.replace(r"@[a-zA-Z0-9_]* ", "")
            
            
        # Removing Hashtag symbol and keeping only text after that
        
        X = X.str.replace("#", "")
        X = X.str.replace(r"[-\.\n]", "")
        
        # Removing HTML 
        X = X.str.replace(r"&\w+;", "")
        
        # Removing links
        X = X.str.replace(r"https?://\S*", "")
        
        # replace repeated letters with only two occurences
        # heeeelllloooo => heelloo
        X = X.str.replace(r"(.)\1+", r"\1\1")
        
        # mark emoticons as happy or sad
        X = X.str.replace(HAPPY_EMO, " happy happy ")   ## We created happy and sad emitocons remember
        X = X.str.replace(SAD_EMO, " sad sad ")
        X = X.str.lower()
        return X
        

### 3.2 Pipeline

In [72]:
# This is the pipeline that will transform our tweets to something eatable.
# You can see that we are using our previously defined stemmer, it will
# take care of the stemming process.
# For stop words, we let the inverse document frequency do the job

from sklearn.cross_validation import train_test_split

Sentiments = train_data['Sentiment']
tweets = train_data['SentimentText']

vectorizer = TfidfVectorizer(tokenizer= Lemmatize_tokenizer,ngram_range=(1,2),stop_words=stop_punc)
pipeline = Pipeline([
    ('text_Pre_Processing',TextPreProc(use_people_mention=True)),
    ('TfidfVect',vectorizer)
])


Train_tweets,Test_tweets,train_sentiment,test_sentiment = train_test_split(tweets,Sentiments,test_size=0.3
                                                                           ,random_state=101)

Learning_data = pipeline.fit_transform(Train_tweets)

In [79]:
Learning_data.shape

(69992, 350243)

Basicaly Learning Data is sparse matrix which stores tfidf score

In [80]:
print(Learning_data[0])

  (0, 282065)	0.025973519585985774
  (0, 259427)	0.16719373080803926
  (0, 191687)	0.14975053935520266
  (0, 51062)	0.15431072035350127
  (0, 117129)	0.17280310494298406
  (0, 246772)	0.1763334560123435
  (0, 64772)	0.25175077013781216
  (0, 206966)	0.1144886552684716
  (0, 196343)	0.1824166467767404
  (0, 340389)	0.2143110492765299
  (0, 342214)	0.18337166790863543
  (0, 288888)	0.23652785416911098
  (0, 259460)	0.25175077013781216
  (0, 191710)	0.25175077013781216
  (0, 51102)	0.23652785416911098
  (0, 118215)	0.23652785416911098
  (0, 246785)	0.25175077013781216
  (0, 64773)	0.25175077013781216
  (0, 207253)	0.25175077013781216
  (0, 196377)	0.25175077013781216
  (0, 340397)	0.25175077013781216
  (0, 342226)	0.25175077013781216


## 4. Model selection

In [81]:
lr = LogisticRegression()
bnb = BernoulliNB()
mnb = MultinomialNB()

In [82]:
models = {
    'Logistic Regression' : lr,
    'BernoulliNB' : bnb,
    'MultinomialNB':mnb
}

In [83]:
for model in models:
    
    scores = cross_val_score(estimator= models[model], X=Learning_data, y= train_sentiment,scoring='f1',cv=5)
    
    print('----',model,'---')
    print('Scores :' , scores)
    print('Average f1 score :', scores.mean())
    models[model].fit(Learning_data,train_sentiment)
    
    print('Accuracy of {} is : {}'.format(model, metrics.accuracy_score(train_sentiment,models[model].predict(Learning_data))))
    
    print('\n')

---- Logistic Regression ---
Scores : [0.79680851 0.79286177 0.80018817 0.79837283 0.79365831]
Average f1 score : 0.7963779178121799
Accuracy of Logistic Regression is : 0.872556863641559


---- BernoulliNB ---
Scores : [0.77224622 0.77301544 0.77406084 0.77883829 0.77128384]
Average f1 score : 0.7738889263908065
Accuracy of BernoulliNB is : 0.9002457423705567


---- MultinomialNB ---
Scores : [0.79614219 0.79249642 0.79465849 0.79634723 0.7938241 ]
Average f1 score : 0.7946936872037742
Accuracy of MultinomialNB is : 0.9234626814493085




Well, from above it can infered that multinomialNB is best model, So we move further with multinomialNB

## 5. Hyper-Parameter Tuning  of the MultinomialNB

In [84]:
grid_search_pipeline = Pipeline([
    ('Text_Preprocessing',TextPreProc()),
    ('TfidfVectorizer',TfidfVectorizer(tokenizer=Lemmatize_tokenizer)),
    ('Model',MultinomialNB())
])

param = [{
    'Text_Preprocessing__use_people_mention':[True,False],
    'TfidfVectorizer__max_features':[5000,20000,30000],
    'TfidfVectorizer__ngram_range':[(1,2)]
}]

In [85]:
grid_search = GridSearchCV(grid_search_pipeline, param, cv=2, scoring='f1')
grid_search.fit(Train_tweets,train_sentiment)

GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('Text_Preprocessing', TextPreProc(use_people_mention=False)), ('TfidfVectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
   ..._idf=True, vocabulary=None)), ('Model', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'Text_Preprocessing__use_people_mention': [True, False], 'TfidfVectorizer__max_features': [5000, 20000, 30000], 'TfidfVectorizer__ngram_range': [(1, 2)]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [86]:
grid_search.best_params_

{'Text_Preprocessing__use_people_mention': True,
 'TfidfVectorizer__max_features': 30000,
 'TfidfVectorizer__ngram_range': (1, 2)}

In [87]:
grid_search.best_estimator_

Pipeline(memory=None,
     steps=[('Text_Preprocessing', TextPreProc(use_people_mention=True)), ('TfidfVectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
   ..._idf=True, vocabulary=None)), ('Model', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

## 6. Model Evaluation with Test Data

In [93]:
train_pred= grid_search.predict(Train_tweets)

In [94]:
test_pred = grid_search.predict(Test_tweets)

In [98]:
metrics.confusion_matrix(test_sentiment,test_pred),metrics.accuracy_score(test_sentiment,test_pred)

(array([[ 9301,  3850],
        [ 2851, 13995]]), 0.7766109944327766)

In [99]:
metrics.confusion_matrix(train_sentiment,train_pred),metrics.accuracy_score(train_sentiment,train_pred)

(array([[23326,  7055],
        [ 4713, 34898]]), 0.8318664990284604)

### 7. How to improve the accuracy & Future Scope

1.I think I have to work on how Happy and sad Emoticons where defined, in my opinion they are influencial factors<br>
***
2.I also think I need to implement other classification models as well, may be they can be successful in learning more from the sparse matrix