# Predict a Sentiment Label From an Amazon Review
Train a model that predicts a sentiment label from the text of a review.

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
from read_data import get_output_amazon_data
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from read_ml_models import save_model

## Read in the Amazon dataset

In [2]:
# Load the Amazon review data through the project helper imported from read_data.
# Later cells index the result by column name ('review_body',
# 'review_body_sentiment_label'), so it is presumably a pandas DataFrame — confirm.
amazon_reviews = get_output_amazon_data()

## Use Naive Bayes since it has the highest score

In [3]:
def modelNB(Xt,Yt):
    """Fit a count-vectorizer + multinomial Naive Bayes pipeline.

    Xt: training documents handed to CountVectorizer.
    Yt: training labels aligned with Xt.

    Returns the fitted pipeline, which accepts raw documents directly in
    predict()/score() because vectorization is part of the pipeline.
    """
    vectorizer = CountVectorizer()
    classifier = MultinomialNB()
    pipeline = make_pipeline(vectorizer, classifier)
    # fit() trains in place; the same pipeline object is then returned.
    pipeline.fit(Xt, Yt)
    return pipeline

## Train the model on the X and Y dataset.
We use `train_test_split` to hold out 20% of the data for testing, then train a Naive Bayes model on the remaining 80%.

In [11]:
def modeler(X, Y, test_size=0.2, random_state=42):
    """Split the data, fit a Naive Bayes pipeline on the training part, and
    return the model together with the held-out part.

    Parameters
    ----------
    X : documents to classify (passed to CountVectorizer inside modelNB).
    Y : labels aligned with X.
    test_size : forwarded to train_test_split; the default of 0.2 keeps the
        previous hard-coded 80/20 split.
    random_state : seed forwarded to train_test_split so the split is
        reproducible; the default of 42 keeps the previous hard-coded seed.

    Returns
    -------
    (model, X_test, Y_test) : the fitted pipeline and the test split, so the
    caller can evaluate the model on data it was not trained on.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    # Only the training split is used for fitting; the test split is returned
    # untouched for evaluation.
    modelnb = modelNB(X_train, Y_train)
    return modelnb, X_test, Y_test

In [12]:
# Xv: the 'review_body' column, used as model input (presumably the review text — confirm).
# Yv: the 'review_body_sentiment_label' column, used as the prediction target.
Xv = amazon_reviews['review_body']
Yv = amazon_reviews['review_body_sentiment_label']

In [13]:
# modeler returns the fitted pipeline plus the held-out TEST split, so Xt/Yt
# here are test data — unlike modelNB's parameters of the same name, which
# receive the training data.
model, Xt, Yt = modeler(Xv, Yv)

In [14]:
# Evaluate on the held-out test split returned by modeler
# (for scikit-learn classifiers, score() reports mean accuracy).
model.score(Xt,Yt)

0.8338970594745836

In [17]:
# Sanity check: two hand-written reviews, one clearly negative and one clearly
# positive, to eyeball the model's predictions.
fake_reviews = [
    "this headset is terrible. the sound quality is horrible and I can't justify paying 50 bucks for this!",
    "amazing game with a great storyline. i loved completing all the quests and collecting rewards!"    
]
# The pipeline vectorizes internally, so raw strings can be passed to predict().
sentiment_label_predictions = model.predict(fake_reviews)

# Pair each review with its predicted label for display.
sentiment_predictions_table = pd.DataFrame({
    "review_body": fake_reviews,
    "sentiment_label_prediction": sentiment_label_predictions
})

sentiment_predictions_table

Unnamed: 0,review_body,sentiment_label_prediction
0,this headset is terrible. the sound quality is...,negative
1,amazing game with a great storyline. i loved c...,positive


In [18]:
# Persist the fitted pipeline through the project helper imported from read_ml_models.
# Note: this is the model fit on the training split only (see modeler); it is
# not refit on the full dataset before saving.
save_model(model,"sentiment_label_bayes.joblib.gz")