## Sentiment analysis
Naive Bayes models are very useful for sentiment analysis, for classifying texts into topics, and for recommendation tasks, because the characteristics of these problems fit the theoretical and methodological assumptions of the model very well.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Step 1: Loading the dataset

In [2]:
# Location of the raw Play Store reviews CSV loaded in the next cell.
URL = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"

In [3]:
# Download the CSV at URL and parse it into a DataFrame.
data = pd.read_csv(URL)

In [10]:
# Preview the first five rows to check the columns and sample content.
data.head(n=5)

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offlin...,0
1,com.facebook.katana,"messenger issues ever since the last update, i...",0
2,com.facebook.katana,profile any time my wife or anybody has more t...,0
3,com.facebook.katana,the new features suck for those of us who don'...,0
4,com.facebook.katana,forced reload on uploading pic on replying com...,0


In [4]:
# Summarize the frame: row count, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


### Step 2: Study of variables and their content

In [5]:
# Normalize the review text: trim leading/trailing whitespace, then lowercase it.
data["review"] = (
    data["review"]
    .str.strip()
    .str.lower()
)

In [6]:
#Divide the dataset into train and test:
from sklearn.model_selection import train_test_split

In [7]:
# Features are the review strings; the target is the polarity column.
X = data["review"]
y = data["polarity"]

# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [8]:
# Convert the review text into a word-count matrix so the classifier receives
# numerical features. The vocabulary is learned from the training split only
# and then applied, unchanged, to the test split.
# NOTE(review): this cell overwrites X_train / X_test (text -> count arrays), so
# it cannot be re-run without first re-running the train/test split cell.
from sklearn.feature_extraction.text import CountVectorizer

vec_model = CountVectorizer(stop_words="english")
vec_model.fit(X_train)

X_train = vec_model.transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()

In [9]:
# Display the dense training count matrix (NumPy abbreviates the repr).
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### Step 3: Build a naive bayes model

In [11]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes with default hyperparameters, fitted on the training
# count matrix. Per the scikit-learn docs the default `binarize=0.0` thresholds
# the input, so counts are effectively treated as word presence/absence.
clf = BernoulliNB()
clf.fit(X=X_train, y=y_train)

In [13]:
# Predict a polarity label for every held-out review, then display the labels.
y_pred = clf.predict(X=X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0])

In [14]:
from sklearn.metrics import accuracy_score

# Fraction of held-out reviews whose predicted polarity matches the true label.
# Keyword arguments make the (truth, prediction) order explicit.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.770949720670391

### Step 4: Save the model

In [16]:
from pickle import dump

# Persist the fitted classifier to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling raises; passing a bare
# open(...) to dump() left the handle open.
# NOTE(review): only `clf` is saved. Scoring new text later also needs the
# fitted `vec_model` vocabulary — consider persisting it alongside the model.
# Only unpickle this file from a trusted location (pickle can execute code).
with open("naive_bayes_default.sav", "wb") as model_file:
    dump(clf, model_file)