## Overview:
1. Import the dataset.
2. Data cleaning- Removal of punctuations and common words.
3. Conversion of string to array of frequencies.
4. Training of the model.
5. Obtain the results.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import classification_report, confusion_matrix

from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report, confusion_matrix,accuracy_score, recall_score, precision_score, f1_score

### 1. Import the dataset.

In [2]:
df=pd.read_csv('reviews.csv')
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [4]:
df.describe()

Unnamed: 0,id,label
count,31962.0,31962.0
mean,15981.5,0.070146
std,9226.778988,0.255397
min,1.0,0.0
25%,7991.25,0.0
50%,15981.5,0.0
75%,23971.75,0.0
max,31962.0,1.0


In [5]:
df.drop(['id'],axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [7]:
df['lengths'] = df['tweet'].apply(len)

In [8]:
df.head()

Unnamed: 0,label,tweet,lengths
0,0,@user when a father is dysfunctional and is s...,102
1,0,@user @user thanks for #lyft credit i can't us...,122
2,0,bihday your majesty,21
3,0,#model i love u take with u all the time in ...,86
4,0,factsguide: society now #motivation,39


#### segregation of data into postives and negatives

In [9]:
positive = df[df['label']==0]
positive

Unnamed: 0,label,tweet,lengths
0,0,@user when a father is dysfunctional and is s...,102
1,0,@user @user thanks for #lyft credit i can't us...,122
2,0,bihday your majesty,21
3,0,#model i love u take with u all the time in ...,86
4,0,factsguide: society now #motivation,39
...,...,...,...
31956,0,off fishing tomorrow @user carnt wait first ti...,61
31957,0,ate @user isz that youuu?ðððððð...,68
31958,0,to see nina turner on the airwaves trying to...,131
31959,0,listening to sad songs on a monday morning otw...,63


In [10]:
negative = df[df['label']==1]
negative

Unnamed: 0,label,tweet,lengths
13,1,@user #cnn calls #michigan middle school 'buil...,74
14,1,no comment! in #australia #opkillingbay #se...,101
17,1,retweet if you agree!,22
23,1,@user @user lumpy says i am a . prove it lumpy.,47
34,1,it's unbelievable that in the 21st century we'...,104
...,...,...,...
31934,1,lady banned from kentucky mall. @user #jcpenn...,59
31946,1,@user omfg i'm offended! i'm a mailbox and i'...,82
31947,1,@user @user you don't have the balls to hashta...,112
31948,1,"makes you ask yourself, who am i? then am i a...",87


### 2. Data cleaning- Removal of punctuations and common words.

In [11]:
nltk.download('stopwords')
stopwords.words('english')
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\soura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
def text_cleaning(text):
    remv_punc = [char for char in text.lower() if char not in string.punctuation]
    remv_punc_join = ''.join(remv_punc)
    remv_punc_clean = [word for word in remv_punc_join.split() if word.lower() not in stopwords.words('english')]
    return remv_punc_clean

In [13]:
tweets_df_clean = df['tweet'].apply(text_cleaning)

In [14]:
print("Original Tweet:")
print(df['tweet'][0])

print("Converted Tweet:")
print(tweets_df_clean[0])
print(tweets_df_clean[1])

Original Tweet:
 @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run
Converted Tweet:
['user', 'father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', 'run']
['user', 'user', 'thanks', 'lyft', 'credit', 'cant', 'use', 'cause', 'dont', 'offer', 'wheelchair', 'vans', 'pdx', 'disapointed', 'getthanked']


### 3. Conversion of string to array of frequencies.

In [15]:
# This will take unique words utilized in text as features, and then count that how many time each word is 
# utilized in that sentence.
vectorizer_analyzer = CountVectorizer(analyzer = text_cleaning)
countvectorizer_tweets = CountVectorizer(analyzer= text_cleaning, dtype= 'uint8').fit_transform(df['tweet']).toarray()

#### For example: 

['This is first method.', 'This method is the second method.', 'This new one is the third one.' ]

['first', 'is', 'method', 'new', 'one', 'second', 'the', 'third', 'this']

[[1 1 1 0 0 0 0 0 1]
 [0 1 2 0 0 1 1 0 1]
 [0 1 0 1 2 0 1 1 1]]
 

In [16]:
countvectorizer_tweets[1]

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

### 4. Training of the model.

In [17]:
X=countvectorizer_tweets
y=df['label']

sm = SMOTE()
X, y = sm.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10, stratify=y)

X_train.shape, y_train.shape

((47552, 47385), (47552,))

In [18]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train,y_train)

MultinomialNB()

In [19]:
pred = nb.predict(X_test)

### 5. Obtain the results.

In [20]:
cm = confusion_matrix(y_test, pred)
print(cm)

[[5819  125]
 [ 141 5803]]


In [21]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      5944
           1       0.98      0.98      0.98      5944

    accuracy                           0.98     11888
   macro avg       0.98      0.98      0.98     11888
weighted avg       0.98      0.98      0.98     11888



In [22]:
cm = confusion_matrix(y_test,pred)
print(cm)
print('Accuracy {:.2f} \nRecall: {:.2f} \nPrecision: {:.2f} \nF1-Score: {:.2f}'.format((accuracy_score(y_test,pred)*100),
                                                                                       (recall_score(y_test,pred)*100),
                                                                                       (precision_score(y_test,pred)*100),
                                                                                       (f1_score(y_test,pred)*100)))


[[5819  125]
 [ 141 5803]]
Accuracy 97.76 
Recall: 97.63 
Precision: 97.89 
F1-Score: 97.76
