In [1]:
%reload_ext autoreload

In [2]:
import numpy as np 
import pandas as pd
import os
import random

import joblib

import matplotlib.pyplot as plt 

from sklearn.pipeline import Pipeline 

from sklearn.metrics import ( classification_report, confusion_matrix, f1_score as calculate_f1_score, accuracy_score as calculate_accuracy_score)
from sklearn.model_selection import StratifiedKFold 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from utils import clean_text, CleanTextTransformer, DenseTransformer, CategoricalBatchNB

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/wizardcalidad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/wizardcalidad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
tweet = pd.read_csv('tweets.csv')

In [4]:
tweet.head(3)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative


In [5]:
tweet.isna().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

In [6]:
df = tweet.dropna()

In [7]:
df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [8]:
indexNames = df[ df['sentiment'] == 'neutral' ].index
# Delete these row indexes from dataFrame
df.drop(indexNames , inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [9]:
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive


In [10]:
df.shape

(16363, 4)

In [11]:
df = df.drop(columns=['textID'])

In [12]:
df.tail(3)

Unnamed: 0,text,selected_text,sentiment
27477,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,But it was worth it ****.,But it was worth it ****.,positive


In [13]:
X = df["selected_text"]
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [20]:
cv_NB = Pipeline([
      ('clean_text', CleanTextTransformer()),
      ('bow', CountVectorizer(stop_words="english", binary=True)),
      ('classifier', CategoricalBatchNB(batch_size=1000, classes=[0,1]))
])

In [15]:
cv_NB.fit(X_train, y_train)

Pipeline(steps=[('clean_text', CleanTextTransformer()),
                ('bow', CountVectorizer(binary=True, stop_words='english')),
                ('classifier', CategoricalBatchNB())])

In [16]:
joblib.dump(cv_NB, "tweets_categorical_with_count_vectorizer.joblib")

['tweets_categorical_with_count_vectorizer.joblib']

In [17]:
y_pred = cv_NB.predict(X_test) #predict testing data

print(classification_report(y_test, y_pred))

TypeError: '<' not supported between instances of 'int' and 'str'

In [18]:
y_test


5057     positive
19195    positive
17479    positive
9873     negative
8976     negative
           ...   
5234     positive
20567    positive
8049     positive
4444     negative
18888    positive
Name: sentiment, Length: 4909, dtype: object