# Solution Guide

**Import necessary libraries**

In [13]:
import pickle

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

**Load your data**

In [14]:
# Pull the Play Store reviews CSV directly from the tutorial's GitHub repository.
DATASET_URL = 'https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews_dataset.csv'
data = pd.read_csv(DATASET_URL)

**Check the first rows of your dataset**

In [15]:
# Show the first five rows as a quick sanity check of the loaded columns.
data.head(n=5)

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


**Create a preprocessing function that drops the package name column, strips surrounding whitespace from the reviews, and converts them to lowercase**

In [16]:
def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the reviews frame, ready for vectorization.

    The ``package_name`` column is removed when present, and every entry in
    ``review`` has its surrounding whitespace stripped and is lowercased.
    The input frame is left untouched.

    Dropping uses ``errors='ignore'`` so the function is idempotent: calling
    it on an already-preprocessed frame (for example when a notebook cell of
    the form ``data = preprocess_data(data)`` is re-run) does not raise
    ``KeyError``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a text ``review`` column and, optionally, a
        ``package_name`` column.

    Returns
    -------
    pandas.DataFrame
        New frame without ``package_name`` and with normalized ``review`` text.
    """
    # The package name is not used as a feature; tolerate it already being gone.
    cleaned = data.drop(columns='package_name', errors='ignore')

    # .assign builds a new frame, so the caller's object is never mutated.
    return cleaned.assign(review=cleaned['review'].str.strip().str.lower())

**Call your function**

In [17]:
# Replace the loaded frame with its preprocessed version.
data = data.pipe(preprocess_data)

**Separate your data**

In [18]:
# Predictor: the review text. Target: the polarity label.
X, y = data['review'], data['polarity']

**Split the data**

In [19]:
# Hold out 25% of the reviews for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

**Vectorize text reviews to numbers.**

We need to convert each review into a numerical representation; this process is called vectorization.

In [20]:
# Learn the vocabulary (minus English stop words) from the training reviews
# only, then encode both splits as dense word-count arrays over that vocabulary.
vec = CountVectorizer(stop_words='english').fit(X_train)
X_train, X_test = (vec.transform(split).toarray() for split in (X_train, X_test))

**Fit your model**

In [21]:
# Fit a multinomial Naive Bayes classifier on the word-count features of the
# training reviews and their polarity labels.
model = MultinomialNB()
model.fit(X_train, y_train)

**Get your score**

In [22]:
# Mean accuracy of the fitted model on the held-out test split.
model.score(X=X_test, y=y_test)

0.8565022421524664

**Make a prediction**

In [23]:
# Classify one unseen review, encoding it with the same fitted vectorizer.
sample_review = ['Love this app simply awesome!']
model.predict(vec.transform(sample_review))

array([1])

**Save your model**

In [None]:
# Persist the fitted model to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling raises, instead of leaving an
# anonymous open() handle for the garbage collector.
filename = 'nb_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)