In [1]:
# --- Mount Google Drive (for Colab) ---
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

base_dir = '/content/drive/MyDrive/NLP'
os.makedirs(base_dir, exist_ok=True)
train_path = os.path.join(base_dir, 'Dataset/train.csv')
test_path  = os.path.join(base_dir, 'Dataset/test.csv')

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly
import sklearn
import tensorflow

In [4]:
df_train = pd.read_csv(train_path)
df_test  = pd.read_csv(test_path)

In [5]:
df_train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [6]:
df_test.head()


Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


In [7]:
print(df_train.shape)
df_test.shape

(27481, 4)


(3534, 3)

In [8]:
import cufflinks as cf
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
init_notebook_mode(connected=True)
cf.go_offline()

In [38]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
from nltk.corpus import stopwords

In [11]:
print(df_train.columns)
print(df_test.columns)

Index(['textID', 'text', 'selected_text', 'sentiment'], dtype='object')
Index(['textID', 'text', 'sentiment'], dtype='object')


In [12]:
print(df_train.info())
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3534 entries, 0 to 3533
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   textID     3534 non-null   object
 1   text       3534 non-null   object
 2   sentiment  3534 non-null   object
dtypes: object(3)
memory usage: 83.0+ KB


In [13]:

print(df_train.isna().sum())

# only 2 null values out of 24000 total values.
# let's drop them.


df_test.isna().sum()

# No null values in test dataset.

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64


Unnamed: 0,0
textID,0
text,0
sentiment,0


In [14]:
df_train.dropna(inplace=True)


In [15]:
df_train.isna().sum()

# No null values left.

Unnamed: 0,0
textID,0
text,0
selected_text,0
sentiment,0


In [16]:
import string


In [17]:
print(df_train['text'][4])
df_train['selected_text'][4].split()

 Sons of ****, why couldn`t they put them on the releases we already bought


['Sons', 'of', '****,']

In [18]:

def sel_tex(i):
    split_text = i.split()
    return split_text

In [19]:
df_train['selected_text2'] = df_train['selected_text'].apply(sel_tex)


In [20]:
df_train.head()

Unnamed: 0,textID,text,selected_text,sentiment,selected_text2
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"[I`d, have, responded,, if, I, were, going]"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,"[Sooo, SAD]"
2,088c60f138,my boss is bullying me...,bullying me,negative,"[bullying, me]"
3,9642c003ef,what interview! leave me alone,leave me alone,negative,"[leave, me, alone]"
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"[Sons, of, ****,]"


In [21]:
# selected_text column of test dataset will bo on the basis of selected_text of Train dataset to
#    predict better for types of messages.


select_text = pd.Series(df_train['selected_text'])


list1 = ' '.join(select_text)


list2 = list1.split()

In [22]:
def test_select(i):
    l  = [ ]
    for w in i.split():
        if w in list2:
            l.append(w)
    return(l)

In [23]:
df_test['selected_text'] = df_test['text'].apply(test_select)


In [24]:
df_test.head(6)


Unnamed: 0,textID,text,sentiment,selected_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,"[Last, session, of, the, day]"
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,"[is, also, really, exciting, --, Good, tweeps,..."
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,"[hit, she, has, to, quit, her, company,, such, a]"
3,01082688c6,happy bday!,positive,"[happy, bday!]"
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,"[-, I, like, it!!]"
5,726e501993,that`s great!! weee!! visitors!,positive,"[that`s, great!!]"


In [25]:
df_train.head(1)


Unnamed: 0,textID,text,selected_text,sentiment,selected_text2
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"[I`d, have, responded,, if, I, were, going]"


In [26]:
from sklearn.feature_extraction.text import CountVectorizer


# Fitting and Training the Model

In [27]:
bag_of_words = CountVectorizer(analyzer=test_select).fit(df_test['text'])

In [28]:
df_test_bow_trans = bag_of_words.transform(df_test['text'])

In [29]:
df_test_bow_trans


<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 37379 stored elements and shape (3534, 6117)>

In [30]:
from sklearn.feature_extraction.text import TfidfTransformer

In [31]:
tfidf = TfidfTransformer().fit(df_test_bow_trans)

In [32]:
df_test_tfidf = tfidf.transform(df_test_bow_trans)

In [33]:
df_test_tfidf.shape

(3534, 6117)

In [34]:
from sklearn.naive_bayes import MultinomialNB

In [35]:
df_test['text_length'] = df_test['text'].apply(lambda x : len(x))


In [36]:
def test_select(i):
    list_text = [text for text in i if text not in string.punctuation]
    join_test_text = ''.join(list_text)
    clean_test_text = [ text for text in join_test_text.split() if text.lower() not in stopwords.words('english')]
    return clean_test_text

In [39]:
df_test['selected_text'] = df_test['text'].apply(test_select)

In [40]:
df_test.head()


Unnamed: 0,textID,text,sentiment,selected_text,text_length
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,"[Last, session, day, httptwitpiccom67ezh]",49
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,"[Shanghai, also, really, exciting, precisely, ...",103
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,"[Recession, hit, Veronique, Branquinho, quit, ...",78
3,01082688c6,happy bday!,positive,"[happy, bday]",12
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,"[httptwitpiccom4w75p, like]",39


In [41]:
bag_of_words = CountVectorizer(analyzer=test_select).fit(df_test['text'])


df_test_bow_trans = bag_of_words.transform(df_test['text'])


tfidf = TfidfTransformer().fit(df_test_bow_trans)


df_test_tfidf = tfidf.transform(df_test_bow_trans)


sentiment_detect_model = MultinomialNB().fit(df_test_tfidf,df_test['sentiment'])


all_sentiments_predictions = sentiment_detect_model.predict(df_test_tfidf)

In [42]:
from sklearn.metrics import confusion_matrix,classification_report

In [43]:
print(confusion_matrix(all_sentiments_predictions,df_test['sentiment']))


[[ 814    9    3]
 [ 169 1400  111]
 [  18   21  989]]


In [44]:
print(classification_report(all_sentiments_predictions,df_test['sentiment']))
# ACCURACY = 91 %

              precision    recall  f1-score   support

    negative       0.81      0.99      0.89       826
     neutral       0.98      0.83      0.90      1680
    positive       0.90      0.96      0.93      1028

    accuracy                           0.91      3534
   macro avg       0.90      0.93      0.91      3534
weighted avg       0.92      0.91      0.91      3534

