In [1]:
%reload_ext autoreload

In [2]:
import numpy as np 
import pandas as pd
import os
import random

import joblib

import matplotlib.pyplot as plt 

from sklearn.pipeline import Pipeline 
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from utils import clean_text, CleanTextTransformer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/wizardcalidad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/wizardcalidad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
tweet = pd.read_csv('tweets.csv')

In [4]:
tweet.head(3)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative


In [5]:
tweet.shape

(27481, 4)

In [6]:
tweet.isna().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

In [7]:
df = tweet.dropna()

In [8]:
df.shape

(27480, 4)

In [9]:
indexNames = df[ df['sentiment'] == 'neutral' ].index
# Delete these row indexes from dataFrame
df.drop(indexNames , inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [10]:
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive


In [11]:
df.shape

(16363, 4)

In [12]:
df = df.drop(columns=['textID'])

In [13]:
df.tail(3)

Unnamed: 0,text,selected_text,sentiment
27477,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,But it was worth it ****.,But it was worth it ****.,positive


In [14]:
X = df["selected_text"]
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [15]:
cv_NB = Pipeline([
      ('clean_text', CleanTextTransformer()),
      ('bow', CountVectorizer(stop_words="english")),
      ('classifier', ComplementNB())
])

In [16]:
cv_NB.fit(X_train, y_train)

Pipeline(steps=[('clean_text', CleanTextTransformer()),
                ('bow', CountVectorizer(stop_words='english')),
                ('classifier', ComplementNB())])

In [17]:
joblib.dump(cv_NB, "tweets_complement_with_count_vectorizer.joblib")

['tweets_complement_with_count_vectorizer.joblib']

In [18]:
y_pred = cv_NB.predict(X_test) #predict testing data

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.87      0.91      0.89      2398
    positive       0.91      0.87      0.89      2511

    accuracy                           0.89      4909
   macro avg       0.89      0.89      0.89      4909
weighted avg       0.89      0.89      0.89      4909



In [19]:
accuracy = []

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=100) 
skf.get_n_splits(X, y) 

for train_index, test_index in skf.split(X, y):
    x_train_fold, x_test_fold = X.iloc[train_index], X.iloc[test_index] 
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    cv_NB.fit(x_train_fold, y_train_fold)
    result = cv_NB.score(x_test_fold, y_test_fold)
    accuracy.append(result)

accuracy = np.array(accuracy)

# Print the output
print('List of first 10 possible accuracy:')
for index, acc in enumerate(accuracy[:10]):
    print(f"{index+1:3d}. {acc:.4f}")

print('\nMetrics that were obtained from this model:')
print(f' Maximum Accuracy:   {accuracy.max()*100:.2f}%') 
print(f' Minimum Accuracy:   {accuracy.min()*100:.2f}%') 
print(f' Mean Accuracy:   {accuracy.mean()*100:.2f}%') 
print(f' Standard Deviation: {accuracy.std():.4f}')

List of first 10 possible accuracy:
  1. 0.9004
  2. 0.8943
  3. 0.8937
  4. 0.8936
  5. 0.9071
  6. 0.9022
  7. 0.8912
  8. 0.9101
  9. 0.8998
 10. 0.9016

Metrics that were obtained from this model:
 Maximum Accuracy:   91.01%
 Minimum Accuracy:   89.12%
 Mean Accuracy:   89.94%
 Standard Deviation: 0.0059


In [20]:
tfidf_NB = Pipeline([
      ('clean_text', CleanTextTransformer()),
      ('bow', TfidfVectorizer(stop_words="english")),
      ('classifier', ComplementNB())
])

In [21]:
tfidf_NB.fit(X_train, y_train)

Pipeline(steps=[('clean_text', CleanTextTransformer()),
                ('bow', TfidfVectorizer(stop_words='english')),
                ('classifier', ComplementNB())])

In [22]:
joblib.dump(tfidf_NB, "tweets_complement_with_tfidf_vectorizer.joblib")

['tweets_complement_with_tfidf_vectorizer.joblib']

In [23]:
y_pred = tfidf_NB.predict(X_test) #predict testing data

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.88      0.91      0.89      2398
    positive       0.91      0.88      0.89      2511

    accuracy                           0.89      4909
   macro avg       0.89      0.89      0.89      4909
weighted avg       0.90      0.89      0.89      4909



In [24]:
accuracy = []

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=100) 
skf.get_n_splits(X, y) 

for train_index, test_index in skf.split(X, y):
    x_train_fold, x_test_fold = X.iloc[train_index], X.iloc[test_index] 
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    cv_NB.fit(x_train_fold, y_train_fold)
    result = tfidf_NB.score(x_test_fold, y_test_fold)
    accuracy.append(result)

accuracy = np.array(accuracy)

# Print the output
print('List of first 10 possible accuracy:')
for index, acc in enumerate(accuracy[:10]):
    print(f"{index+1:3d}. {acc:.4f}")

print('\nMetrics that were obtained from this model:')
print(f' Maximum Accuracy:   {accuracy.max()*100:.2f}%') 
print(f' Minimum Accuracy:   {accuracy.min()*100:.2f}%') 
print(f' Mean Accuracy:   {accuracy.mean()*100:.2f}%') 
print(f' Standard Deviation: {accuracy.std():.4f}')

List of first 10 possible accuracy:
  1. 0.9279
  2. 0.9371
  3. 0.9365
  4. 0.9285
  5. 0.9364
  6. 0.9346
  7. 0.9364
  8. 0.9383
  9. 0.9383
 10. 0.9370

Metrics that were obtained from this model:
 Maximum Accuracy:   93.83%
 Minimum Accuracy:   92.79%
 Mean Accuracy:   93.51%
 Standard Deviation: 0.0036
