In [1]:
#Import the libraries.
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 


In [2]:
#Read the data 
df=pd.read_csv('sentiment.csv')

In [3]:
#Check first 5 rows
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
#size of data 
df.shape

(31962, 3)

In [5]:
#Total no. of classes in target value
df['label'].value_counts()

label
0    29720
1     2242
Name: count, dtype: int64

In [8]:
#Understanding the sentiments in the data
df[df['label']==0].head(10) 

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...
7,8,0,the next school year is the year for exams.ð...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...
9,10,0,@user @user welcome here ! i'm it's so #gr...


In [9]:
df[df['label']==1].head(10)

Unnamed: 0,id,label,tweet
13,14,1,@user #cnn calls #michigan middle school 'buil...
14,15,1,no comment! in #australia #opkillingbay #se...
17,18,1,retweet if you agree!
23,24,1,@user @user lumpy says i am a . prove it lumpy.
34,35,1,it's unbelievable that in the 21st century we'...
56,57,1,@user lets fight against #love #peace
68,69,1,ð©the white establishment can't have blk fol...
77,78,1,"@user hey, white people: you can call people '..."
82,83,1,how the #altright uses &amp; insecurity to lu...
111,112,1,@user i'm not interested in a #linguistics tha...


In [9]:
#Lets classify the tweets into positive or negative sentiment.

In [10]:
df.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

In [11]:
#ID is not helpful so remove it.
df.drop(['id'],axis=1,inplace=True)
df.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [12]:
#Data preprocessing

import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [13]:
#function to remove punctuation

def clean(text):
    remv_pun=[char for char in text.lower() if char not in string.punctuation]
    remv_punc_join = ''.join(remv_pun)
    return remv_punc_join

In [14]:
clean(' @ Great beginning,,, takes! time,,,.   #run')

'  great beginning takes time   run'

In [15]:
tweets_df_clean = df['tweet'].apply(clean)

In [16]:
tweets_df_clean[6]


' user camping tomorrow user user user user user user user dannyâ\x80¦'

In [17]:
tweets_df_clean.head()

0     user when a father is dysfunctional and is so...
1    user user thanks for lyft credit i cant use ca...
2                                  bihday your majesty
3    model   i love u take with u all the time in u...
4                 factsguide society now    motivation
Name: tweet, dtype: object

In [18]:
#Install a popular nlp library called nltk
!pip install nltk




[notice] A new release of pip is available: 23.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
#Remove stopwords.
import nltk 
stopwords=nltk.corpus.stopwords.words('english')
print(stopwords[:5])


['i', 'me', 'my', 'myself', 'we']


In [20]:
tweets_df = pd.DataFrame(tweets_df_clean)
tweets_df.columns

Index(['tweet'], dtype='object')

In [21]:
# stopwords treatment and converting the data into lower case 
def stop(text):
    remv_stop = [a for a in text.split() if a.lower() not in stopwords]
    remv_stop_join = ' '.join(remv_stop)
    return remv_stop_join

In [22]:
stop(tweets_df['tweet'][0])


'user father dysfunctional selfish drags kids dysfunction run'

In [23]:
tweets_df_stopwords = tweets_df['tweet'].apply(stop)


In [24]:
tweets_df_stopwords[:2]


0    user father dysfunctional selfish drags kids d...
1    user user thanks lyft credit cant use cause do...
Name: tweet, dtype: object

In [25]:
tweets_df_stopwords = pd.DataFrame(tweets_df_stopwords)
tweets_df_stopwords

Unnamed: 0,tweet
0,user father dysfunctional selfish drags kids d...
1,user user thanks lyft credit cant use cause do...
2,bihday majesty
3,model love u take u time urð± ðððð...
4,factsguide society motivation
...,...
31957,ate user isz youuuððððððð...
31958,see nina turner airwaves trying wrap mantle ge...
31959,listening sad songs monday morning otw work sad
31960,user sikh temple vandalised calgary wso condem...


In [26]:
from nltk.stem import PorterStemmer
st = PorterStemmer()

def steming(text):
    ste = [st.stem(word) for word in text.split()]
    ste_join = ' '.join(ste)
    return ste_join

In [27]:
tweets_df_stem = tweets_df_stopwords['tweet'].apply(steming)

tweets_df_stem[:2]

# The dataset has been stemmed to its root word

0    user father dysfunct selfish drag kid dysfunct...
1    user user thank lyft credit cant use caus dont...
Name: tweet, dtype: object

In [28]:
tweets_df_stopwords['tweet'][0]

'user father dysfunctional selfish drags kids dysfunction run'

In [29]:
#Applying Lemmatization
from nltk.stem import WordNetLemmatizer

wl = WordNetLemmatizer()

In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...


In [None]:
def lematize(text):
    ste = [wl.lemmatize(word) for word in text.split()]
    ste_join = ' '.join(ste)
    return ste_join

In [32]:
lematize('Dog keepss on barkings')


'Dog keep on barkings'

In [33]:
tweets_df_stopwords.iloc[:2]

Unnamed: 0,tweet
0,user father dysfunctional selfish drags kids d...
1,user user thanks lyft credit cant use cause do...


In [34]:
tweets_df_stem = pd.DataFrame(tweets_df_stem)
tweets_df_stem.head()

Unnamed: 0,tweet
0,user father dysfunct selfish drag kid dysfunct...
1,user user thank lyft credit cant use caus dont...
2,bihday majesti
3,model love u take u time urð± ðððð...
4,factsguid societi motiv


In [35]:
# Applying the Count Vectorizer 

from sklearn.feature_extraction.text import CountVectorizer 

cv = CountVectorizer(max_features=5000)

sen = tweets_df_stem['tweet'].tolist()
len(sen)

31962

In [36]:
from pandas import DataFrame

In [37]:
def document_matrix(text, vectorizer):
    mat = vectorizer.fit_transform(text)
    return DataFrame(mat.toarray())

In [38]:
m = document_matrix(sen,cv)
m.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer 


tfidf_vec = TfidfVectorizer(max_features=2500)

In [40]:
#Splitting the data into dependent and independent variable

In [41]:
y= df['label']
y.head()

0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64

In [42]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(m,y,test_size=0.33,random_state=25)

In [43]:
x_train.shape
y_train.shape

(21414,)

In [44]:
from sklearn.naive_bayes import MultinomialNB
NaiveBclassifier = MultinomialNB()
NaiveBclassifier.fit(x_train,y_train)

MultinomialNB()

In [45]:
# Predicting train cases
y_pred_train = NaiveBclassifier.predict(x_train)

In [46]:
from sklearn.metrics import accuracy_score
#Accuracy Score 

acc = accuracy_score(y_train, y_pred_train)
acc

0.9614738021854862