# Naive Bayes project

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
import pickle

In [2]:
# Load the Play Store reviews dataset directly from its public GitHub URL

DATA_URL = 'https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews_dataset.csv'
df_raw = pd.read_csv(DATA_URL)

In [3]:
# Summarize the columns, dtypes and non-null counts of the raw frame
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [4]:
# Peek at five random rows; the fixed seed keeps the displayed sample
# identical on every Restart & Run All
df_raw.sample(5, random_state=25)

Unnamed: 0,package_name,review,polarity
816,com.hamropatro,hamro keyboar needs update n new emojis thnkz...,1
92,com.linkedin.android,good app but......... why do post come up in ...,0
459,com.whatsapp,wrong details of last seen after upgrading to...,0
273,com.android.chrome,dear google hate how tabs work on samsung gal...,0
131,com.king.candycrushsaga,lost power ups switched phones and lost my po...,0


In [5]:
# Count the rows per polarity value to check how balanced the classes are
df_raw['polarity'].value_counts()

0    584
1    307
Name: polarity, dtype: int64

**1. Transform dataframe**

In [6]:
# Work on a copy so the transformations below leave df_raw untouched
df_transf = df_raw.copy()

In [7]:
# Remove the package_name column from the working frame

df_transf = df_transf.drop(columns=['package_name'])

In [9]:
# Normalize the review text in one pass:
# trim leading/trailing whitespace, then convert to lower case

df_transf['review'] = df_transf['review'].str.strip().str.lower()

In [10]:
# Snapshot the transformed frame under the name used by the split below
df = df_transf.copy()

**2. Split data frame**

In [11]:
# Features are the review texts; the target is the polarity column
X, y = df['review'], df['polarity']

# Stratify on y because the dataset is unbalanced
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=25,
    stratify=y,
)

**3. Three options of pipelines depending on the preprocessing steps**

3.1 One preprocessing step (CountVectorizer) and one model step

In [12]:
# Pipeline 1: CountVectorizer followed by MultinomialNB
clf_1 = Pipeline(steps=[
    ('cont_vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# Fit on the training split, then predict the held-out test split
pred_1 = clf_1.fit(X_train, y_train).predict(X_test)

3.2 One preprocessing step (TfidfVectorizer) and one model step

In [13]:
# Pipeline 2: TfidfVectorizer followed by MultinomialNB
clf_2 = Pipeline(steps=[
    ('tfidf_vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Fit on the training split, then predict the held-out test split
pred_2 = clf_2.fit(X_train, y_train).predict(X_test)

3.3 Two preprocessing steps (CountVectorizer and TfidfTransformer) and one model step

In [14]:
# Pipeline 3: CountVectorizer, then TfidfTransformer, then MultinomialNB
clf_3 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Fit on the training split, then predict the held-out test split
pred_3 = clf_3.fit(X_train, y_train).predict(X_test)

**4. Check results**

In [15]:
# Print one classification report per baseline pipeline, each under its heading
for heading, predictions in (
    ('CountVectorizer', pred_1),
    ('TfidfVectorizer', pred_2),
    ('CountVectorizer and TfidfTransformer', pred_3),
):
    print(heading)
    print(classification_report(y_test, predictions))

CountVectorizer
              precision    recall  f1-score   support

           0       0.82      0.95      0.88       146
           1       0.87      0.61      0.72        77

    accuracy                           0.83       223
   macro avg       0.85      0.78      0.80       223
weighted avg       0.84      0.83      0.83       223

TfidfVectorizer
              precision    recall  f1-score   support

           0       0.69      0.99      0.81       146
           1       0.92      0.14      0.25        77

    accuracy                           0.70       223
   macro avg       0.80      0.57      0.53       223
weighted avg       0.77      0.70      0.62       223

CountVectorizer and TfidfTransformer
              precision    recall  f1-score   support

           0       0.69      0.99      0.81       146
           1       0.92      0.14      0.25        77

    accuracy                           0.70       223
   macro avg       0.80      0.57      0.53       223
weigh

In [16]:
# Report the test-set accuracy of each baseline pipeline
for label, predictions in (
    ('clf_1 Test Accuracy = ', pred_1),
    ('clf_2 Test Accuracy = ', pred_2),
    ('clf_3 Test Accuracy = ', pred_3),
):
    print(label, metrics.accuracy_score(y_test, predictions))

clf_1 Test Accuracy =  0.8340807174887892
clf_2 Test Accuracy =  0.6995515695067265
clf_3 Test Accuracy =  0.6995515695067265


- The model with the highest accuracy on the test set (without hyperparameter tuning) is the Multinomial Naive Bayes with CountVectorizer as the preprocessing step

**5. Randomized search to select hyperparameters**

In [22]:
# Tune the CountVectorizer pipeline with a randomized hyperparameter search.
# The space below holds only 2 x 2 = 4 combinations, so n_iter is set to 4:
# asking for more makes scikit-learn emit a warning and run 4 iterations anyway.
n_iter_search = 4
parameters = {'cont_vect__ngram_range': [(1, 1), (1, 2)], 'clf__alpha': (1e-2, 1e-3)}

# random_state pins the candidate sampling so the search is reproducible
gs_clf_1 = RandomizedSearchCV(clf_1, parameters, n_iter=n_iter_search, random_state=25)
gs_clf_1.fit(X_train, y_train)

# Predict the test split with the search object
pred_1_grid = gs_clf_1.predict(X_test)



In [25]:
# Show the hyperparameter combination the search selected for pipeline 1
gs_clf_1.best_params_

{'cont_vect__ngram_range': (1, 1), 'clf__alpha': 0.01}

In [23]:
# Tune the TfidfVectorizer pipeline with a randomized hyperparameter search.
# Only two alpha values are offered, so n_iter is set to 2: asking for more
# makes scikit-learn emit a warning and run 2 iterations anyway.
n_iter_search = 2
parameters = {'clf__alpha': (1e-2, 1e-3)}

# random_state pins the candidate sampling so the search is reproducible
gs_clf_2 = RandomizedSearchCV(clf_2, parameters, n_iter=n_iter_search, random_state=25)
gs_clf_2.fit(X_train, y_train)

# Predict the test split with the search object
pred_2_grid = gs_clf_2.predict(X_test)



In [26]:
# Show the hyperparameter combination the search selected for pipeline 2
gs_clf_2.best_params_

{'clf__alpha': 0.01}

In [24]:
# Tune the CountVectorizer + TfidfTransformer pipeline with a randomized search.
# The space holds 2 x 2 x 2 = 8 combinations and only 5 are sampled, so the
# outcome depends on which candidates are drawn.
n_iter_search = 5
parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}

# random_state pins the 5 sampled candidates; without it the selected
# hyperparameters (and the model saved later) can change from run to run
gs_clf_3 = RandomizedSearchCV(clf_3, parameters, n_iter=n_iter_search, random_state=25)
gs_clf_3.fit(X_train, y_train)

# Predict the test split with the search object
pred_3_grid = gs_clf_3.predict(X_test)

In [27]:
# Show the hyperparameter combination the search selected for pipeline 3
gs_clf_3.best_params_

{'vect__ngram_range': (1, 2), 'tfidf__use_idf': False, 'clf__alpha': 0.01}

In [29]:
# Print one classification report per tuned search, each under its name
for heading, predictions in (
    ('gs_clf_1', pred_1_grid),
    ('gs_clf_2', pred_2_grid),
    ('gs_clf_3', pred_3_grid),
):
    print(heading)
    print(classification_report(y_test, predictions))

gs_clf_1
              precision    recall  f1-score   support

           0       0.82      0.94      0.88       146
           1       0.84      0.61      0.71        77

    accuracy                           0.83       223
   macro avg       0.83      0.77      0.79       223
weighted avg       0.83      0.83      0.82       223

gs_clf_2
              precision    recall  f1-score   support

           0       0.80      0.95      0.87       146
           1       0.86      0.55      0.67        77

    accuracy                           0.81       223
   macro avg       0.83      0.75      0.77       223
weighted avg       0.82      0.81      0.80       223

gs_clf_3
              precision    recall  f1-score   support

           0       0.84      0.97      0.90       146
           1       0.91      0.66      0.77        77

    accuracy                           0.86       223
   macro avg       0.88      0.81      0.83       223
weighted avg       0.87      0.86      0.85    

In [30]:
# Keep the best pipeline found by the third search, the one with the highest
# test accuracy among the tuned reports printed in this notebook.
# NOTE(review): the winner is picked using test-set scores, so that test
# accuracy is an optimistic estimate - consider a separate validation split.
best_model = gs_clf_3.best_estimator_

In [31]:
# Print the selected pipeline, including its tuned step parameters
print('The model with highest accuracy in the dataset, after hyperparameter tuning, is:', best_model)

The model with highest accuracy in the dataset, after hyperparameter tuning, is: Pipeline(steps=[('vect', CountVectorizer(ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer(use_idf=False)),
                ('clf', MultinomialNB(alpha=0.01))])


This model is saved to disk so that it can be reused later on new data:

In [32]:
# Save the selected pipeline to disk. The context manager closes the file
# handle even if pickling raises, instead of leaving it open.
with open('../models/best_model.pickle', 'wb') as model_file:
    pickle.dump(best_model, model_file)

# To reuse the model later (only unpickle files from a trusted source,
# since pickle.load can execute arbitrary code):
# with open('../models/best_model.pickle', 'rb') as model_file:
#     model = pickle.load(model_file)
# model.predict(X_test)  # use it to predict with new data

**Extra: step by step (instead of a pipeline) to make sure the pipeline is OK**

In [22]:
# vect = CountVectorizer() # count vectorizer
# text_vec = vect.fit_transform(X_train)

In [24]:
# vect.get_feature_names_out()

array(['000', '10', '100', ..., 'žŕľ', 'ˇŕ', 'ˇŕľ'], dtype=object)

In [43]:
# text_vec.toarray()[0]

In [44]:
# vect_tfidf = TfidfVectorizer()

# text_vec_tfidf = vect_tfidf.fit_transform(X_train)

In [33]:
# np.set_printoptions(threshold=sys.maxsize)
# text_vec_tfidf.toarray()[0]

In [38]:
# text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer())])
# text_norm = text_clf.fit_transform(X_train)

In [41]:
# text_norm.toarray()[0]

In [45]:
# clf_1 = MultinomialNB()

# clf_1.fit(text_vec, y_train)

In [46]:
# clf_2 = MultinomialNB()

# clf_2.fit(text_vec_tfidf, y_train)

In [47]:
# clf_3 = MultinomialNB()

# clf_3.fit(text_norm, y_train)

In [34]:
# pred_1 = clf_1.predict(vect.transform(X_test))
# pred_2 = clf_2.predict(vect_tfidf.transform(X_test))
# pred_3 = clf_3.predict(text_clf.transform(X_test))

# print(classification_report(y_test, pred_1))
# print(classification_report(y_test, pred_2))
# print(classification_report(y_test, pred_3))