## Imports

In [5]:
# import keras/tenserflow packages/classes
import tensorflow as tf
from keras import callbacks
from keras.models import Sequential
from keras.preprocessing import text, sequence
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras import initializers, regularizers, constraints, optimizers, layers

# import sklearn packages
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

# import nltk packages/classes
import nltk
from nltk.tokenize import word_tokenize

# import utility libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# set display preferences
%matplotlib inline

## Data Processing

In [3]:
cols = ['marketplace',
        'customer_id',
        'review_id',
        'product_id',
        'product_parent',
        'product_title',
        'product_category',
        'star_rating',
        'helpful_votes',
        'total_votes',
        'vine',
        'verified_purchase',
        'review_headline',
        'review_body',
        'review_date']

df = pd.read_csv('amazon_reviews_us_Luggage_v1_00.tsv',
                 sep='\t',
                 usecols = cols)

df.dropna(inplace=True)

# drop reviews with < 10 votes or vine
df = df.loc[(df.helpful_votes > 10) & (df.vine == 'N')]

# drop duplicate reviews
df.drop_duplicates(subset=['review_body'], inplace=True)

# reset index
df.reset_index(inplace=True, drop=True)

### Target Dataframes

In [None]:
"""uncomment to generate target dataframes"""
# # helpfulness ratio
# df['helpful_score'] = df.helpful_votes / df.total_votes

# # helpfulness > global median
# df['helpful_1'] = np.where(df.helpful_score > df.helpful_score.median(), 1, 0)

# # product median helpful votes
# df['median_helpful_votes'] = df.product_id.apply(lambda x: df.groupby('product_id').helpful_votes.median()[x])

# # helpful votes > product median helpful votes
# df['helpful_2'] = np.where(df.helpful_votes > df.median_helpful_votes, 1, 0)

# # unhelpful votes
# df['unhelpful_votes'] = df.total_votes - df.helpful_votes

# # product median unhelpful votes
# df['median_unhelpful_votes'] = df.product_id.apply(lambda x: df.groupby('product_id').unhelpful_votes.median()[x])

# # unhelpful votes > review median helpful votes
# df['unhelpful'] = np.where(df.unhelpful_votes > df.median_unhelpful_votes, 1, 0)

# df_helpful_1 = df[['helpful_1', 'review_body']]
# df_helpful_2 = df[['helpful_2', 'review_body']]
# df_unhelpful = df[['unhelpful', 'review_body']]

# df_helpful_1.columns = ['helpful', 'text']
# df_helpful_2.columns = ['helpful', 'text']
# df_unhelpful.columns = ['unhelpful', 'text']

# df_helpful_1.to_csv('helpful_1.csv')
# df_helpful_2.to_csv('helpful_2.csv')
# df_unhelpful.to_csv('unhelpful.csv')

In [None]:
df_helpful_1 = pd.read_csv('helpful_1.csv')
df_helpful_2 = pd.read_csv('helpful_2.csv')
df_unhelpful = pd.read_csv('unhelpful.csv')

## NLTK

In [10]:
from nltk.corpus import stopwords
import string

stopwords_list = stopwords.words('english')
stopwords_list += list(string.punctuation)

In [13]:
from nltk import word_tokenize

df['tokens'] = df.review_body.apply(word_tokenize)
df['stopped_tokens'] = df.tokens.apply(lambda x: [word.lower() for word in x if word not in stopwords_list])

In [17]:
from  nltk import FreqDist
df['freqdist'] = df.stopped_tokens.apply(FreqDist)

In [21]:
df['most_common'] = df.freqdist.apply(lambda x: x.most_common(10))

In [29]:
from nltk.stem.wordnet import WordNetLemmatizer
df['lemmas'] = df.stopped_tokens.apply(lambda x: [WordNetLemmatizer().lemmatize(word) for word in x])

In [34]:
WordNetLemmatizer().lemmatize('oversized')

'oversized'

In [30]:
df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,tokens,stopped_tokens,freqdist,most_common,lemmas
0,US,20761040,R11IBSD5E6HPSD,B002B3FWXY,677901073,Travelon Anti-Theft Classic Messenger Bag,Luggage,1.0,29.0,31.0,N,Y,This bag was on my shoulder and it just fell t...,The strap broke!!! It was supposed to be anti...,2015-08-31,"[The, strap, broke, !, !, !, It, was, supposed...","[the, strap, broke, it, supposed, anti-theft, ...","{'the': 1, 'strap': 2, 'broke': 2, 'it': 2, 's...","[(i, 5), (n't, 3), (strap, 2), (broke, 2), (it...","[the, strap, broke, it, supposed, anti-theft, ..."
1,US,23857312,R3NPROA23JJRFF,B00V6FKB5M,909535974,MOIERG Vintage Trolley Luggage 2tone TSA,Luggage,5.0,11.0,15.0,N,Y,This product is absolutely BEAUTIFUL. I ordere...,This product is absolutely BEAUTIFUL. I order...,2015-08-31,"[This, product, is, absolutely, BEAUTIFUL, ., ...","[this, product, absolutely, beautiful, i, orde...","{'this': 1, 'product': 1, 'absolutely': 1, 'be...","[(i, 5), (august, 2), (this, 1), (product, 1),...","[this, product, absolutely, beautiful, i, orde..."
2,US,12318409,R2KVWAYBPWK1OV,B011KEPZG8,919734058,Iblue Canvas Leather Weekend Shoulder Duffels ...,Luggage,5.0,20.0,22.0,N,N,My boyfriend wouldn't be without this for travel!,This review is for the Iblue Oversized Leather...,2015-08-31,"[This, review, is, for, the, Iblue, Oversized,...","[this, review, iblue, oversized, leather, canv...","{'this': 4, 'review': 2, 'iblue': 2, 'oversize...","[(br, 12), (bag, 7), (leather, 5), (the, 5), (...","[this, review, iblue, oversized, leather, canv..."
3,US,25513669,R1FLOE9E4ODIGR,B00VBDT55G,995661027,Hynes Eagle Travel Backpack 40L Flight Approve...,Luggage,5.0,34.0,38.0,N,Y,It's perfect!,"I'm just packing for my trip to Europe, and th...",2015-08-31,"[I, 'm, just, packing, for, my, trip, to, Euro...","[i, 'm, packing, trip, europe, luggage, i, 'm,...","{'i': 6, ''m': 2, 'packing': 1, 'trip': 1, 'eu...","[(i, 6), ('m, 2), (bag, 2), (packing, 1), (tri...","[i, 'm, packing, trip, europe, luggage, i, 'm,..."
4,US,11235780,R6XTEZCSCUJ4J,B00SXKUEIC,43152132,AmazonBasics 4 Piece Small Packing Cube Set,Luggage,5.0,20.0,23.0,N,Y,I'm very pleased; they seem to be well made wi...,My husband and I are travelling to NYC in the ...,2015-08-31,"[My, husband, and, I, are, travelling, to, NYC...","[my, husband, i, travelling, nyc, near, future...","{'my': 1, 'husband': 1, 'i': 7, 'travelling': ...","[(i, 7), (well, 3), (future, 2), ('m, 2), (br,...","[my, husband, i, travelling, nyc, near, future..."


## Feature Engineering

In [43]:
df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,...,verified_purchase,review_headline,review_body,review_date,review_length,num_exclamation,num_question,median_star_rating,relative_star_rating,popularity
0,US,20761040,R11IBSD5E6HPSD,B002B3FWXY,677901073,Travelon Anti-Theft Classic Messenger Bag,Luggage,1.0,29.0,31.0,...,Y,This bag was on my shoulder and it just fell t...,The strap broke!!! It was supposed to be anti...,2015-08-31,318,3,3,4.0,0.25,15
1,US,23857312,R3NPROA23JJRFF,B00V6FKB5M,909535974,MOIERG Vintage Trolley Luggage 2tone TSA,Luggage,5.0,11.0,15.0,...,Y,This product is absolutely BEAUTIFUL. I ordere...,This product is absolutely BEAUTIFUL. I order...,2015-08-31,437,2,2,5.0,1.0,1
2,US,12318409,R2KVWAYBPWK1OV,B011KEPZG8,919734058,Iblue Canvas Leather Weekend Shoulder Duffels ...,Luggage,5.0,20.0,22.0,...,N,My boyfriend wouldn't be without this for travel!,This review is for the Iblue Oversized Leather...,2015-08-31,1951,9,9,5.0,1.0,1
3,US,25513669,R1FLOE9E4ODIGR,B00VBDT55G,995661027,Hynes Eagle Travel Backpack 40L Flight Approve...,Luggage,5.0,34.0,38.0,...,Y,It's perfect!,"I'm just packing for my trip to Europe, and th...",2015-08-31,415,1,1,5.0,1.0,3
4,US,11235780,R6XTEZCSCUJ4J,B00SXKUEIC,43152132,AmazonBasics 4 Piece Small Packing Cube Set,Luggage,5.0,20.0,23.0,...,Y,I'm very pleased; they seem to be well made wi...,My husband and I are travelling to NYC in the ...,2015-08-31,707,1,1,5.0,1.0,1


In [19]:
# structural features
df['review_length'] = df.review_body.apply(lambda x: len(x))
df['num_exclamation_pts'] = df.review_body.str.count('!')

In [None]:
# lexical features


In [None]:
# syntactic features


In [None]:
# semantic features


In [40]:
# meta-data features
df['prod_median_star_rating'] = df.product_id.apply(lambda x: df.groupby('product_id').star_rating.median()[x])
df['review_relative_star_rating'] = df.star_rating / df.prod_median_star_rating
df['num_reviews'] = df.product_id.apply(lambda x: df.groupby('product_id').size()[x])

## Models

In [None]:
# train-val-test split
y = df_unhelpful['unhelpful']
X = df_unhelpful['text']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.1, random_state=0)

### Train LSTM with embedding layer

In [None]:
# Create sequences of tokens of uniform length for all reviews (~ 1 min.)
tokenizer = text.Tokenizer(num_words=20000)
tokenizer.fit_on_texts(list(X_train))
X_train_tok = tokenizer.texts_to_sequences(X_train)
X_train_tok_pad = sequence.pad_sequences(X_train_tok, maxlen=100)

X_val_tok = tokenizer.texts_to_sequences(X_val)
X_val_tok_pad = sequence.pad_sequences(X_val_tok, maxlen=100)

X_test_tok = tokenizer.texts_to_sequences(X_test)
X_test_tok_pad = sequence.pad_sequences(X_test_tok, maxlen=100)

In [None]:
# Build and train neural network (~ 80 sec./epoch)
checkpoint = callbacks.ModelCheckpoint('lstm_model_1.h5', 
                                       save_best_only=True)

embedding_size = 128
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(20000, embedding_size, 
                                    input_shape=(100,)))
model.add(tf.keras.layers.LSTM(25, return_sequences=True))
model.add(tf.keras.layers.LSTM(25, return_sequences=True))
model.add(tf.keras.layers.GlobalMaxPool1D())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(50, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(50, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1, activation='relu'))

model.compile(loss='binary_crossentropy', optimizer='adam', 
              metrics=['accuracy'])
history = model.fit(X_train_tok_pad, y_train, epochs=5, batch_size=1024, 
                    validation_data=(X_val_tok_pad, y_val),
                    callbacks=[checkpoint])

In [None]:
# Define a function to visualize ROC curve and AUC
def roc_it(y_true, y_pred, model_name, figsize=(12,10)):
    '''Plot ROC curve with AUC value overlaid on plot.
    
    Keyword arguments:
    y_true: ground-truth labels
    y_pred: predicted labels
    model_name: name to print in the plot title
    
    Dependencies:
    pandas aliased as pd
    sklearn.metrics.roc_curve
    sklearn.metrics.auc
    matplotlib.pyplot aliased as plt
    '''
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    auc_value = round(auc(fpr, tpr), 2)

    plt.figure(figsize=figsize)
    plt.plot(fpr, tpr)
    plt.plot([0.0,1.0], [0.0,1.0], linestyle='--')
    plt.text(x=0.0, y=0.95, s='AUC: {}'.format(auc_value))
    plt.title('ROC curve for {}'.format(model_name))
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show();

In [None]:
# Load model and get predictions
model = tf.keras.models.load_model('lstm_model_1.h5')
y_pred = model.predict_classes(X_test_tok_pad, batch_size=2048)

In [None]:
# Plot the ROC curve
pred_classes = [int(n) for n in y_pred]
roc_it(y_test, pred_classes, 'LSTM model 1')

### Train GRU with same architecture

In [None]:
# Build and train neural network (~ 70 sec./epoch)
checkpoint = callbacks.ModelCheckpoint('gru_model_1.h5', 
                                       save_best_only=True)

embedding_size = 128
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(20000, embedding_size, input_shape=(100,)))
model.add(tf.keras.layers.GRU(25, return_sequences=True, input_shape=(100,)))
model.add(tf.keras.layers.GRU(25, return_sequences=True, input_shape=(100,)))
model.add(tf.keras.layers.GlobalMaxPool1D())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(50, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(50, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1, activation='relu'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train_tok_pad, y_train, epochs=5, batch_size=1024, 
                    validation_data=(X_val_tok_pad, y_val),
                    callbacks=[checkpoint])

In [None]:
# Load model and get predictions
model = tf.keras.models.load_model('gru_model_1.h5')
y_pred = model.predict_classes(X_test_tok_pad, batch_size=2048)

In [None]:
# Plot the ROC curve
pred_classes = [int(n) for n in y_pred]
roc_it(y_test, pred_classes, 'GRU model 1')

### Train GRU with bigger embeddings

In [None]:
# Tokenize and pad text vectors (~ 1 min.)
tokenizer = text.Tokenizer(num_words=200000)
tokenizer.fit_on_texts(list(X_train))
X_train_tok = tokenizer.texts_to_sequences(X_train)
X_train_tok_pad = sequence.pad_sequences(X_train_tok, maxlen=200)

X_val_tok = tokenizer.texts_to_sequences(X_val)
X_val_tok_pad = sequence.pad_sequences(X_val_tok, maxlen=200)

X_test_tok = tokenizer.texts_to_sequences(X_test)
X_test_tok_pad = sequence.pad_sequences(X_test_tok, maxlen=200)

In [None]:
# Build and train neural network (~ 2.3 min./epoch)
checkpoint = callbacks.ModelCheckpoint('gru_model_2.h5', 
                                       save_best_only=True)

embedding_size = 128
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(200000, embedding_size, input_shape=(200,)))
model.add(tf.keras.layers.GRU(25, return_sequences=True, input_shape=(200,)))
model.add(tf.keras.layers.GRU(25, return_sequences=True, input_shape=(200,)))
model.add(tf.keras.layers.GlobalMaxPool1D())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(50, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(50, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1, activation='relu'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train_tok_pad, y_train, epochs=5, batch_size=1024, 
                    validation_data=(X_val_tok_pad, y_val),
                    callbacks=[checkpoint])

In [None]:
# Load model and get predictions
model = tf.keras.models.load_model('gru_model_2.h5')
y_pred = model.predict_classes(X_test_tok_pad, batch_size=2048)

In [None]:
# Plot the ROC curve
pred_classes = [int(n) for n in y_pred]
roc_it(y_test, pred_classes, 'GRU model 2')

In [None]:
# View model summary
model.summary()

### Train GRU with more epochs

In [None]:
# Create sequences of tokens uniform in length for all reviews (~ 1 min.)
tokenizer = text.Tokenizer(num_words=20000)
tokenizer.fit_on_texts(list(X_train))
X_train_tok = tokenizer.texts_to_sequences(X_train)
X_train_tok_pad = sequence.pad_sequences(X_train_tok, maxlen=100)

X_val_tok = tokenizer.texts_to_sequences(X_val)
X_val_tok_pad = sequence.pad_sequences(X_val_tok, maxlen=100)

X_test_tok = tokenizer.texts_to_sequences(X_test)
X_test_tok_pad = sequence.pad_sequences(X_test_tok, maxlen=100)

In [None]:
# Try again with more epochs, callbacks
checkpoint = callbacks.ModelCheckpoint('gru_model_3.h5',
                                       monitor='val_acc', 
                                       save_best_only=True)
# early_stop = callbacks.EarlyStopping(monitor='val_loss', 
#                                      min_delta=0.001, 
#                                      patience=5) 

embedding_size = 128
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(200000, embedding_size, 
                                    input_shape=(100,)))
model.add(tf.keras.layers.GRU(25, return_sequences=True, input_shape=(100,)))
model.add(tf.keras.layers.GRU(25, return_sequences=True, input_shape=(100,)))
model.add(tf.keras.layers.GlobalMaxPool1D())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(50, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(50, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1, activation='relu'))

model.compile(loss='binary_crossentropy', optimizer='adam', 
              metrics=['accuracy'])
history = model.fit(X_train_tok_pad, y_train, epochs=50, batch_size=2048, 
                    validation_data=(X_val_tok_pad, y_val),
                    callbacks=[checkpoint])

In [None]:
# Visualize loss and accuracy over training epochs
x = [i for i in range(1, 51)]

plt.figure(figsize=(12, 10))
plt.plot(x, history.history['accuracy'], label='Train Accuracy')
plt.plot(x, history.history['loss'], label='Train Loss')
plt.plot(x, history.history['val_accuracy'], label='Val. Accuracy')
plt.plot(x, history.history['val_loss'], label='Val. Loss')
plt.title('Model performance over 50 training epochs')
plt.xlabel('Epochs')
plt.legend()
plt.show();

In [None]:
# Load model and get predictions
model = tf.keras.models.load_model('gru_model_2.h5')
y_pred = model.predict_classes(X_test_tok_pad, batch_size=2048)

In [None]:
# Plot the ROC curve
pred_classes = [int(n) for n in y_pred]
roc_it(y_test, pred_classes, 'GRU model 3')

### Evaluate best model

In [None]:
# Load model
import tensorflow as tf

model = tf.keras.models.load_model('gru_model_2.h5')

In [None]:
# View model summary
model.summary()

In [None]:
# Evaluate the best model using holdout data
model.evaluate(X_test_tok_pad, y_test, batch_size=2048)

In [None]:
# Examine classification report
from sklearn.metrics import classification_report
y_pred_classes = model.predict_classes(X_test_tok_pad, batch_size=2048)
print(classification_report(y_test, y_pred_classes))

In [None]:
# Examine confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Define a function to plot a color-coded confusion matrix
def pretty_confusion(y_true, y_pred, model_name):
    '''Display normalized confusion matrix with color scale.
    
    Keyword arguments:
    y_true: ground-truth labels
    y_pred: predicted labels
    model_name: name to print in the plot title
    
    Dependencies:
    numpy aliased as np
    sklearn.metrics.confusion_matrix
    matplotlib.pyplot aliased as plt
    seaborn aliased as sns
    '''
    
    # Calculate the confusion matrix
    matrix = confusion_matrix(y_true, y_pred)
    matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]
    
    # Build the plot
    plt.figure(figsize=(16,7))
    sns.set(font_scale=1.4)
    sns.heatmap(matrix, annot=True, annot_kws={'size':10},
                cmap=plt.cm.Greens, linewidths=0.2)
    
    # Add labels to the plot
    class_names = ['Unhelpful', 'Helpful']
    tick_marks = np.arange(len(class_names))
    tick_marks2 = tick_marks + 0.5
    plt.xticks(tick_marks, class_names, rotation=25)
    plt.yticks(tick_marks2, class_names, rotation=0)
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title('Confusion Matrix for {}'.format(model_name)) 
    plt.tight_layout()

# Define a function to save a confusion matrix visualization    
def save_conf_matrix(y_true, y_pred, model_name):
    '''Save normalized confusion matrix with color scale as .png file.
    
    Note that in Jupyter Notebook, the plot will also be displayed
    automatically on calling this function. This function saves .png files
    with 300 dpi and 0.5-in. padding.
    
    Keyword arguments:
    y_true: ground-truth labels
    y_pred: predicted labels
    model_name: name to print in the plot title
    
    Dependencies:
    sklearn.metrics.confusion_matrix
    matplotlib.pyplot aliased as plt
    seaborn aliased as sns
    '''
    fig = pretty_confusion(y_true, y_pred, model_name)
    path = '/'
    filename = path + '_'.join(model_name.split()) + '_confmatrix.png'
    plt.savefig(filename, pad_inches=0.5, dpi=300)

# Plot and save the confusion matrix
save_conf_matrix(y_test, y_pred_classes, 'Final Model')

In [None]:
# Plot the ROC curve
pred_classes = [int(n) for n in y_pred_classes]
roc_it(y_test, pred_classes, 'final model')

### Inspect sample predictions

In [None]:
# Generate a few sample predictions (1 = "helpful", 0 = not helpful)
sample = X_train_tok_pad[:10]
actual = y_train[:10]

predictions = model.predict(sample)

In [None]:
# Create DataFrame of predicted v. actual values for samples
sample_results = pd.concat([pd.Series([float(p) for p in predictions]), pd.Series(actual).reset_index()], axis=1)
sample_results.columns = ['Probability_of_helpfulness', 'Original_index', 'Actual_helpfulness']
sample_results.set_index(keys='Original_index', inplace=True)
sample_results

### Visualize predictions

In [None]:

# Plot predicted v. actual values
import seaborn as sns
sns.set_style('darkgrid')

x = [i for i in range(len(sample_results))]
y = [0.5 for i in range(len(sample_results))]
colors = ['r', 'r', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g']
incorrect = sample_results.Probability_of_helpfulness[:2]
correct = sample_results.Probability_of_helpfulness[2:]

plt.figure(figsize=(12,6))
plt.scatter(x[:2], incorrect, label='Incorrect prediction', color='r', s=80)
plt.scatter(x[2:], correct, label='Correct prediction', color='g', s=80)
plt.scatter(x, sample_results.Actual_helpfulness, label='Actual', color='gray', s=80)
plt.plot(x, y, linestyle='--', color='k', label='Decision boundary')
plt.title('Predicted v. actual helpfulness ratings for sample reviews')
plt.xlabel('Indices of sample reviews')
plt.xticks(x, sample_results.index)
plt.ylabel('Probability')
plt.legend()
plt.savefig('sample_results.png', dpi=300, padding=0.5)
plt.show();