# Twitter Sentiment Analysis

In [153]:
import pandas as pd
import numpy as np
import re
import time
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import  LogisticRegression
from sklearn.naive_bayes import BernoulliNB,MultinomialNB
from sklearn.metrics import confusion_matrix,roc_auc_score,classification_report
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
#nltk.download('stopwords')
#nltk.download('wordnet')
wnlem=WordNetLemmatizer()
df=pd.read_csv("twitter_analysis.csv",encoding="ISO-8859-1")

In [4]:
df.columns=['class','id','Date-Time','Query','User','Tweet']

In [5]:
df['class']=np.where(df['class']==4,1,0)
df.head()

Unnamed: 0,class,id,Date-Time,Query,User,Tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [6]:
stopwords=np.array(stopwords.words('english'))
stopwords

array(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
       "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
       'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
       'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
       'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
       'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are',
       'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
       'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and',
       'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
       'by', 'for', 'with', 'about', 'against', 'between', 'into',
       'through', 'during', 'before', 'after', 'above', 'below', 'to',
       'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
       'again', 'further', 'then', 'once', 'here', 'there', 'when',
       'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'm

In [7]:
text=list(df.Tweet)
print(df['Tweet'].loc[136])
print(df['Tweet'].loc[27])
text[30:40]

@BridgetsBeaches Thank you for letting people know, but now I'm sad that the direct message I got wasn't actually from Bridget 
ooooh.... LOL  that leslie.... and ok I won't do it again so leslie won't  get mad again 


['@alielayus I want to go to promote GEAR AND GROOVE but unfornately no ride there  I may b going to the one in Anaheim in May though',
 'thought sleeping in was an option tomorrow but realizing that it now is not. evaluations in the morning and work in the afternoon! ',
 '@julieebaby awe i love you too!!!! 1 am here  i miss you',
 '@HumpNinja I cry my asian eyes to sleep at night ',
 "ok I'm sick and spent an hour sitting in the shower cause I was too sick to stand and held back the puke like a champ. BED now ",
 '@cocomix04 ill tell ya the story later  not a good day and ill be workin for like three more hours...',
 '@MissXu sorry! bed time came here (GMT+1)   http://is.gd/fNge',
 "@fleurylis I don't either. Its depressing. I don't think I even want to know about the kids in suitcases. ",
 "Bed. Class 8-12. Work 12-3. Gym 3-5 or 6. Then class 6-10. Another day that's gonna fly by. I miss my girlfriend ",
 "really don't feel like getting up today... but got to study to for tomorrows p

In [8]:
user=r'@[^\s]+'
url=r'((http://)[^ ]*|(https://)[^ ]*|(www.)[^ ]*)'
pattern=r'([^a-zA-z0-9 ]*)'
numbers=r'(\d+)'

In [9]:
time1=time.time()

In [10]:
processed_text=[]
for tweet in text:
    tweet=tweet.lower()
    tweet=re.sub(url,"url",tweet)
    tweet=re.sub(user,"",tweet)
    tweet=re.sub(pattern,"",tweet)
    tweet=re.sub(numbers,"",tweet)
    lemm_tweet=""
    for word in tweet.split( ):
        if word not in stopwords:
            word=wnlem.lemmatize(word)
            lemm_tweet=lemm_tweet+word+"  "
        else:
            lemm_tweet=lemm_tweet+word+" "
    processed_text.append(lemm_tweet)

In [11]:
processed_text[30:40]

['i want  to go  to promote  gear  and groove  but unfornately  no ride  there i may  b  going  to the one  in anaheim  in may  though  ',
 'thought  sleeping  in was an option  tomorrow  but realizing  that it now is not evaluation  in the morning  and work  in the afternoon  ',
 'awe  i love  you too am here i miss  you ',
 'i cry  my asian  eye  to sleep  at night  ',
 'ok  im  sick  and spent  an hour  sitting  in the shower  cause  i was too sick  to stand  and held  back  the puke  like  a champ  bed  now ',
 'ill  tell  ya  the story  later  not a good  day  and ill  be workin  for like  three  more hour  ',
 'sorry  bed  time  came  here gmt  url  ',
 'i dont  either  its depressing  i dont  think  i even  want  to know  about the kid  in suitcase  ',
 'bed  class  work  gym  or then class  another  day  thats  gonna  fly  by i miss  my girlfriend  ',
 'really  dont  feel  like  getting  up today  but got  to study  to for tomorrow  practical  exam  ']

In [12]:
time2=time.time()
print("Time Taken :",round(time2-time1)/60,"minutes")

Time Taken : 7.4 minutes


In [13]:
df['processed_text']=processed_text
df.drop(columns=['Tweet'],inplace=True)
df.head()

Unnamed: 0,class,id,Date-Time,Query,User,processed_text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he cant update his facebook ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,i dived many time for the ball managed to...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feel itchy and like its on ...
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,no its not behaving at all im mad why am i ...
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,not the whole crew


In [14]:
df['processed_text']

0          is upset  that he cant  update  his facebook  ...
1          i dived  many  time  for the ball  managed  to...
2          my whole  body  feel  itchy  and like  its on ...
3          no its not behaving  at all im  mad  why am i ...
4                                      not the whole  crew  
                                 ...                        
1599994    just woke  up having no school  is the best  f...
1599995    thewdbcom  very cool  to hear  old  walt  inte...
1599996    are you ready  for your mojo  makeover  ask  m...
1599997    happy  th  birthday  to my boo  of alll  time ...
1599998                              happy  charitytuesday  
Name: processed_text, Length: 1599999, dtype: object

In [15]:
cols=['processed_text']
X=df[cols]
y=df['class']
X_train,X_test,y_train,y_test=train_test_split(df['processed_text'],df['class'],random_state=0,test_size=0.05)

In [16]:
X_train.head()

429405     im  free  now and welcome  for graduation  no ...
399143     will be in congo  during political  election  ...
1221496        omg  my shower  was too refreshing  i think  
881682             on my way  back  home  tonight  was fun  
1161833     on sunsetttt  after a fun  night  at ucla  with 
Name: processed_text, dtype: object

In [17]:
print(X_train.head(),"\n")
print(y_train.head())
X_train.shape
y_train.shape


429405     im  free  now and welcome  for graduation  no ...
399143     will be in congo  during political  election  ...
1221496        omg  my shower  was too refreshing  i think  
881682             on my way  back  home  tonight  was fun  
1161833     on sunsetttt  after a fun  night  at ucla  with 
Name: processed_text, dtype: object 

429405     0
399143     0
1221496    1
881682     1
1161833    1
Name: class, dtype: int32


(1519999,)

In [18]:
print("Positive count ",len(df[df['class']==1]))
print("Negative count ",len(df[df['class']==0]))

Positive count  800000
Negative count  799999


## Tfidf Vectorizer

In [123]:
time5=time.time()
tf=TfidfVectorizer(ngram_range=(1,3)).fit(X_train)
x_train_transform=tf.transform(X_train)
x_test_transform=tf.transform(X_test)
vocab=tf.vocabulary_.keys()
#vocab
time6=time.time()
print("Time Taken ",round((time6-time5)/60)," minutes")

Time Taken  3  minutes


In [124]:
features=np.array(tf.get_feature_names())
len(features)

12256866

## Linear SVC

In [132]:
time3=time.time()
lsvc=LinearSVC()
lsvc.fit(x_train_transform,y_train)
acc_score=lsvc.score(x_test_transform,y_test)
print("Accuracy Score ",acc_score)
time4=time.time()
print("Time Taken ",round((time4-time3)/60)," minutes")

Accuracy Score  0.8261125
Time Taken  16  minutes




In [178]:
y_pred_lsvc=lsvc.predict(x_test_transform)
cm_lsvc=confusion_matrix(y_test,y_pred_lsvc)
roc_score_lsvc=roc_auc_score(y_test,y_pred_lsvc)
print(cm_lsvc)
roc_score_lsvc


[[33629  6357]
 [ 7554 32460]]


0.8261177155744202

In [176]:
print(classification_report(y_test,y_pred_lsvc))

              precision    recall  f1-score   support

           0       0.82      0.84      0.83     39986
           1       0.84      0.81      0.82     40014

    accuracy                           0.83     80000
   macro avg       0.83      0.83      0.83     80000
weighted avg       0.83      0.83      0.83     80000



In [177]:
coef=np.array(lsvc.coef_[0].argsort())
negative_=features[coef[:10]]
positive_=features[coef[-11:-1][::-1]]
print("Positive features :",positive_)
print("Negative features :",negative_)

Positive features : ['dont have to' 'not bad' 'no problem' 'not so bad' 'not sad'
 'doesnt have to' 'cannot wait' 'doesnt hurt' 'no worry' 'excited']
Negative features : ['sad' 'miss' 'poor' 'cant' 'missing' 'rip' 'sick' 'sadly' 'unfortunately'
 'died']


## Logistic Regression

In [136]:
time5=time.time()
lg=LogisticRegression(C=1,random_state=0,max_iter=10000)
lg.fit(x_train_transform,y_train)
acc_score=lg.score(x_test_transform,y_test)
print("Accuracy Score ",acc_score,"\n")
time6=time.time()
print("Time Taken ",round((time6-time5)/60)," minutes")

Accuracy Score  0.8241625 

Time Taken  15  minutes


In [180]:
y_pred_lg=lg.predict(x_test_transform)
cm_lg=confusion_matrix(y_test,y_pred_lg)
roc_score_lg=roc_auc_score(y_test,y_pred_lg)
print(cm_lg)
roc_score_lg

[[33462  6524]
 [ 7543 32471]]


0.8241669365854497

In [181]:
print(classification_report(y_test,y_pred_lg))

              precision    recall  f1-score   support

           0       0.82      0.84      0.83     39986
           1       0.83      0.81      0.82     40014

    accuracy                           0.82     80000
   macro avg       0.82      0.82      0.82     80000
weighted avg       0.82      0.82      0.82     80000



In [139]:
coef=np.array(lg.coef_[0].argsort())
negative_=features[coef[:10]]
positive_=features[coef[-11:-1]]
print("Positive features :",positive_)
print("Negative features :",negative_)

Positive features : ['yay' 'smile' 'love' 'good' 'excited' 'awesome' 'happy' 'dont have to'
 'not bad' 'no problem']
Negative features : ['sad' 'miss' 'poor' 'cant' 'sick' 'missing' 'hate' 'rip' 'suck' 'sadly']


## Bernoulli Naive Bayes

In [144]:
time5=time.time()
bnb=BernoulliNB(alpha=2)
bnb.fit(x_train_transform,y_train)
acc_score=bnb.score(x_test_transform,y_test)
print("Accuracy Score ",acc_score,"\n")
time6=time.time()
print("Time Taken ",round((time6-time5)/60)," minutes")

Accuracy Score  0.7983875 

Time Taken  0  minutes


## Multinomial Naive Bayes

In [183]:
time5=time.time()
mnb=MultinomialNB(alpha=2)
mnb.fit(x_train_transform,y_train)
acc_score=mnb.score(x_test_transform,y_test)
print("Accuracy Score ",acc_score,"\n")
time6=time.time()
print("Time Taken ",round((time6-time5)/60)," minutes")

Accuracy Score  0.8059875 

Time Taken  0  minutes


In [184]:
y_pred=bnb.predict(x_test_transform)
cm_bnb=confusion_matrix(y_test,y_pred)
cm_bnb

array([[30428,  9558],
       [ 6571, 33443]], dtype=int64)

### Gradient boosting Classifier

In [24]:
time7=time.time()
gbc=GradientBoostingClassifier()
gbc.fit(x_train_transform,y_train)
acc_score_gbc=gbc.score(x_test_transform,y_test)
print("Accuracy Score ",acc_score_gbc,"\n")
time8=time.time()
print("Time Taken ",round((time8-time7)/60)," minutes")

Accuracy Score  0.6768375 

Time Taken  31  minutes
