<span style="font-family: Arial; font-size:3em;color:red;"> <b> Yelp <br> Ratings <br> Presentation <br> Notebook </b> </span>
<br><br> Let's see who's better at classifying Yelp reviews -- you (humans) or our model?

<img style="float:left" src="https://uproxx.files.wordpress.com/2015/10/south-park-yelp.png?w=650" /> 

In [None]:
import nltk
import unicodedata
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import re
from nltk.corpus import wordnet
from nltk.tokenize import RegexpTokenizer
wnl = nltk.WordNetLemmatizer()
nltk.download('averaged_perceptron_tagger')
from collections import Counter
from sklearn.metrics import confusion_matrix

## Load data from survey

In [None]:
# Layout of the survey export as it is read below:
#   * columns 3-7: the first row is taken as the text of a premade review and
#     every later row as one respondent's star guess for that review;
#   * columns 1-2: the review and star rating each respondent wrote themselves
#     (first row skipped).
csv_path = '../data/new_review_examples.csv'

# Text of the five premade reviews (first row only).
review_1, review_2, review_3, review_4, review_5 = (
    pd.read_csv(csv_path, usecols=[column], names=['reviews'], nrows=1)
    for column in range(3, 8)
)

# Respondents' star guesses for those five reviews (all rows after the first).
(review_1_stars,
 review_2_stars,
 review_3_stars,
 review_4_stars,
 review_5_stars) = (
    pd.read_csv(csv_path, usecols=[column], names=['stars'], skiprows=1)
    for column in range(3, 8)
)

# One row per premade review, paired with the star rating it really received.
premade_reviews = pd.concat(
    [review_1, review_2, review_3, review_4, review_5],
    ignore_index=True,
).assign(actual_stars=[2, 4, 1, 3, 5])

# NOTE(review): only this read passes an explicit encoding; the reads above
# rely on the pandas default -- confirm that the difference is intentional.
submitted_reviews = pd.read_csv(
    csv_path,
    usecols=[1, 2],
    names=['reviews', 'stars'],
    skiprows=1,
    encoding='ISO-8859-1',
)

In [None]:
# For each premade review, count how many guesses landed on each star value.
# Every result is a five-element list ordered from 1 star to 5 stars, with 0
# for star values nobody picked.
(review_1_counts,
 review_2_counts,
 review_3_counts,
 review_4_counts,
 review_5_counts) = (
    [tally[star] for star in range(1, 6)]
    for tally in (
        Counter(guesses.stars.tolist())
        for guesses in (
            review_1_stars,
            review_2_stars,
            review_3_stars,
            review_4_stars,
            review_5_stars,
        )
    )
)

## Visualize survey data

<img style="float:left" src="https://img.memecdn.com/bad-yelp-rebiew_c_4185847.webp" /> 

In [None]:
# Grouped bar chart: one group per star value (1-5) and, inside each group,
# one bar per premade review showing how many respondents guessed that star.
plt.figure(figsize=(14,6))
barWidth = 0.2

bars1 = review_1_counts
bars2 = review_2_counts
bars3 = review_3_counts
bars4 = review_4_counts
bars5 = review_5_counts

# x positions: the bars of review k+1 sit k bar widths to the right of r1.
r1 = np.arange(len(bars1))
r2, r3, r4, r5 = ([x + k * barWidth for x in r1] for k in range(1, 5))

bar_series = [
    (r1, bars1, '#101357', 'review_1'),
    (r2, bars2, '#fea49f', 'review_2'),
    (r3, bars3, '#fbaf08', 'review_3'),
    (r4, bars4, '#00a0a0', 'review_4'),
    (r5, bars5, '#007f4f', 'review_5'),
]
for positions, heights, colour, series_label in bar_series:
    plt.bar(positions, heights, color=colour, width=barWidth,
            edgecolor='white', label=series_label)

plt.xlabel('Star', fontsize=22)
plt.ylabel('Count', fontsize=22)
# Each group is five bars wide, so its centre is the third bar (two widths
# right of r1); placing the tick there keeps the star label under the middle
# of its group.
plt.xticks([r + 2 * barWidth for r in range(len(bars1))], ['1', '2', '3', '4', '5'], fontsize=20)
axes = plt.gca()
axes.set_ylim([0,max(max(bars1), max(bars2), max(bars3), max(bars4), max(bars5))])
plt.title('Star Count per Review', fontweight='bold', fontsize=25)
plt.legend(fontsize=12)
plt.show()

Now let's look at how your guesses measured up to the actual star ratings given to these reviews

In [None]:
# Show each premade review together with the star rating it actually received.
print('The five reviews for which we solicited guesses from you all:\n')
review_template = 'Review {}:\n{}\n\nActual Star Rating: {}\n\n'
for review_number in range(1, premade_reviews.shape[0] + 1):
    row_label = review_number - 1
    review_text = premade_reviews.loc[row_label, 'reviews']
    true_stars = premade_reviews.loc[row_label, 'actual_stars']
    print(review_template.format(review_number, review_text, true_stars))

Let's better visualize your accuracy in guessing reviews' star ratings with pie charts

In [None]:
def make_pie(review_counts):
    """Draw a pie chart of how the guesses for one review were distributed.

    Parameters
    ----------
    review_counts : sequence of int
        Number of guesses per star value, ordered from 1 star upward
        (``review_counts[0]`` is the 1-star count).

    Star values with a zero count get no wedge.  The chart is shown
    immediately with ``plt.show()``; nothing is returned.
    """
    palette = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue', 'pink']

    # Star values that received at least one guess.
    labels = [star for star, count in enumerate(review_counts, start=1) if count != 0]
    sizes = [review_counts[star - 1] for star in labels]
    # Pick the colour by star value rather than by wedge position so that a
    # given star keeps the same colour in every pie, even when lower star
    # values are absent from this one.
    colors = [palette[(star - 1) % len(palette)] for star in labels]

    plt.pie(sizes, labels=labels, colors=colors,  shadow=True, startangle=140, autopct='%1.1f%%')

    # Equal aspect ratio so the pie is drawn as a circle.
    plt.axis('equal')
    plt.show()

In [None]:
# One pie chart per premade review, preceded by the review's true rating.
guess_counts_by_review = [
    review_1_counts,
    review_2_counts,
    review_3_counts,
    review_4_counts,
    review_5_counts,
]
for review_number, counts in enumerate(guess_counts_by_review, start=1):
    true_stars = premade_reviews.loc[review_number - 1, 'actual_stars']
    print('Review {}'.format(review_number))
    print('Actual star rating: {}'.format(true_stars))
    make_pie(counts)

Now let's take a look at some of the reviews you guys submitted in our circulated survey

In [None]:
# Report how many reviews respondents wrote, then display the whole table
# (the bare expression on the last line is what the notebook renders).
submitted_total = len(submitted_reviews)
print('Total number of submitted reviews: {}'.format(submitted_total))
print('Submitted reviews dataframe:')
submitted_reviews

In [None]:
# Print the first three submitted reviews with the rating their author gave.
example_template = '{}\n\nStar Rating: {}\n\n'
for row_label in range(3):
    review_text = submitted_reviews.loc[row_label, 'reviews']
    given_stars = submitted_reviews.loc[row_label, 'stars']
    print(example_template.format(review_text, given_stars))

Below we can see the distribution of the star ratings you assigned to the reviews you wrote

In [None]:
# Count of submitted reviews per star value, ordered from 1 star to 5 stars.
star_tally = Counter(submitted_reviews.stars.tolist())
submitted_reviews_counts = [star_tally[star] for star in range(1, 6)]
tallest_bar = max(submitted_reviews_counts)

# Unit-wide bins whose edges fall on the half-integers, so each bar is
# centred on an integer star value.
bins = np.arange(7) - 0.5
plt.hist(submitted_reviews.stars, bins, edgecolor='black')

plt.xticks(range(6))
plt.xlim([0, 6])
# One y tick per review, with one unit of headroom above the tallest bar.
plt.yticks(range(tallest_bar + 1))
plt.ylim([0, tallest_bar + 1])

plt.title('Star Counts for Submitted Reviews', fontsize=15, fontweight='bold')
plt.xlabel('Star', fontsize=13)
plt.ylabel('Count', fontsize=13)
plt.show();

## Clean reviews for modeling ##

Before we can compare your results to our model's results, we need to clean the reviews and vectorize them for input into the model.

Cleaning/preprocessing steps:

- Lowercase<br>
- Remove non-ASCII characters<br>
- Tokenize on the pattern `\w+'?\w+` (keeps internal apostrophes; a token must be at least two characters long)<br>
- Replace non-alphabetic characters (e.g., digits, %, *, &, $) with ''<br>
- Remove stopwords, using a customized list that leaves negation words (e.g., not, won't, no) in the text<br>
- Lemmatize (canonical/dictionary form of a word... babies --> baby)<br>
- Remove rare tokens (threshold of 30: tokens that appear 30 or fewer times in the corpus are removed)<br>

<img style="float:left" src="https://pics.me.me/if-beauty-and-the-beast-happened-today-be-our-guest-11644791.png" />

In [None]:
# Negation-bearing words (plus 'but').  create_stopword_list() subtracts these
# from the assembled stopword set when remove_negs is true, so they are NOT
# treated as stopwords and stay in the cleaned review text.
neg_stops = ['no',
    'nor',
    'not',
    'don',
    "don't",
    'ain',
    'aren',
    "aren't",
    'couldn',
    "couldn't",
    'didn',
    "didn't",
    'doesn',
    "doesn't",
    'hadn',
    "hadn't",
    'hasn',
    "hasn't",
    'haven',
    "haven't",
    'isn',
    "isn't",
    'mightn',
    "mightn't",
    'mustn',
    "mustn't",
    'needn',
    "needn't",
    'shan',
    "shan't",
    'shouldn',
    "shouldn't",
    'wasn',
    "wasn't",
    'weren',
    "weren't",
    "won'",
    "won't",
    'wouldn',
    "wouldn't",
    'but',
    "don'",
    "ain't"]

# Contractions that carry no negation.  create_stopword_list() appends them to
# the stopword list when its `contractions` flag is true.
common_nonneg_contr = ["could've",
    "he'd",
    "he'd've",
    "he'll",
    "he's",
    "how'd",
    "how'll",
    "how's",
    "i'd",
    "i'd've",
    "i'll",
    "i'm",
    "i've",
    "it'd",
    "it'd've",
    "it'll",
    "it's",
    "let's",
    "ma'am",
    "might've",
    "must've",
    "o'clock",
    "'ow's'at",
    "she'd",
    "she'd've",
    "she'll",
    "she's",
    "should've",
    "somebody'd",
    "somebody'd've",
    "somebody'll",
    "somebody's",
    "someone'd",
    "someone'd've",
    "someone'll",
    "someone's",
    "something'd",
    "something'd've",
    "something'll",
    "something's",
    "that'll",
    "that's",
    "there'd",
    "there'd've",
    "there're",
    "there's",
    "they'd",
    "they'd've",
    "they'll",
    "they're",
    "they've",
    "'twas",
    "we'd",
    "we'd've",
    "we'll",
    "we're",
    "we've",
    "what'll",
    "what're",
    "what's",
    "what've",
    "when's",
    "where'd",
    "where's",
    "where've",
    "who'd",
    "who'd've",
    "who'll",
    "who're",
    "who's",
    "who've",
    "why'll",
    "why're",
    "why's",
    "would've",
    "y'all",
    "y'all'll",
    "y'all'd've",
    "you'd",
    "you'd've",
    "you'll",
    "you're",
    "you've"]

# The 26 lowercase letters 'a'..'z', each as a one-character string; added to
# the stopword list when create_stopword_list() is called with
# single_letters=True.
letters = [chr(code_point) for code_point in range(ord('a'), ord('z') + 1)]

# Ordinal suffixes; added to the stopword list when rank_suffixes=True.
# Presumably these are what is left of tokens such as "1st" once the cleaning
# step strips the digits -- confirm.
ranks = ['st', 'nd', 'rd', 'th']

In [None]:
def create_stopword_list(nltk_english=True, contractions=True, single_letters=True, rank_suffixes=True, remove_negs=True):
    """Assemble the stopword list used when cleaning review text.

    Parameters
    ----------
    nltk_english : bool
        Include NLTK's English stopword corpus.  The corpus is downloaded on
        demand if it is not installed.
    contractions : bool
        Include the entries of ``common_nonneg_contr``.
    single_letters : bool
        Include the entries of ``letters``.
    rank_suffixes : bool
        Include the entries of ``ranks``.
    remove_negs : bool
        Remove every entry of ``neg_stops`` from the result, so that those
        words are kept in the text.  This step goes through a ``set`` and
        therefore also drops duplicates and leaves the order unspecified.

    Returns
    -------
    list of str
        The stopwords.  The empty string, ``'us'`` and a lone apostrophe are
        always included.
    """
    stops = []
    if nltk_english:
        # NLTK corpora load lazily: a missing corpus only surfaces as a
        # LookupError when the corpus is first really accessed, so probe with
        # the actual read and download on failure.
        try:
            english_stops = nltk.corpus.stopwords.words('english')
        except LookupError:
            nltk.download('stopwords')
            english_stops = nltk.corpus.stopwords.words('english')
        stops += english_stops
    if contractions:
        stops += common_nonneg_contr
    if single_letters:
        stops += letters
    if rank_suffixes:
        stops += ranks
    # Left-overs of the cleaning step that should never count as tokens.
    stops += ['', 'us', "'"]

    # Remove all negative stopwords and any duplicates
    if remove_negs:
        stops = list(set(stops) - set(neg_stops))

    return stops

In [None]:
def _process_review(df):
    """Add a ``review_tokens`` column holding the cleaned tokens of each review.

    Every value of ``df['reviews']`` is lower-cased, reduced to ASCII,
    tokenized, stripped of non-letter characters, filtered against the
    stopword list, lemmatized, and filtered against the stopword list again.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``reviews`` column of review text.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place, with ``review_tokens``
        (one list of str per row) added.
    """
    # Built once per call instead of once per review; a set makes each of the
    # two stopword filters below a constant-time lookup per token.
    stops = set(create_stopword_list())
    # Raw string: the pattern is unchanged, but '\w' in a plain string literal
    # is an invalid escape sequence that newer Python versions warn about.
    tokenizer = nltk.RegexpTokenizer(r"\w+'?\w+")

    def get_wordnet_pos(word):
        """Map the POS tag of ``word`` to a WordNet POS constant (noun if unmapped)."""
        tag = nltk.pos_tag([word])[0][1][0].lower()
        # NOTE(review): Penn Treebank adjective tags start with "J", so the
        # "a" key below can never match and adjectives fall back to NOUN.
        # Left untouched on purpose: the pickled vectorizer/model were
        # presumably fitted on tokens produced with this exact mapping, and
        # changing it here would skew inference -- confirm before fixing.
        tag_dict = {"a": wordnet.ADJ,
                    "n": wordnet.NOUN,
                    "v": wordnet.VERB,
                    "r": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)

    def _clean_review(text):
        """Return the list of cleaned, lemmatized tokens for one review."""
        # Non-string cells (e.g. NaN for a blank answer) have nothing to
        # tokenize; return no tokens instead of failing on ``.lower()``.
        if not isinstance(text, str):
            return []
        text = text.lower()
        # Decompose accented characters, then drop whatever is not ASCII.
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf8', 'ignore')
        # Strip everything except letters, whitespace and apostrophes.
        filtered_tokens = [re.sub(r"[^A-Za-z\s']", '', token) for token in tokenizer.tokenize(text)]
        tokens = [token for token in filtered_tokens if token not in stops]
        # Drop possessive "'s" and discard tokens that become empty.
        stripped_tokens = (re.sub("'s", '', token) for token in tokens)
        tokens = [token for token in stripped_tokens if token != '']
        tokens = [wnl.lemmatize(token, pos=get_wordnet_pos(token)) for token in tokens]
        # Lemmatizing can turn a token into a stopword, so filter once more.
        return [token for token in tokens if token not in stops]

    df['review_tokens'] = df['reviews'].apply(_clean_review)
    return df

Let's now clean/preprocess the reviews you submitted in our survey...

In [None]:
# Adds a `review_tokens` column (a list of cleaned tokens per review) to the
# reviews written by survey respondents.
submitted_reviews = _process_review(submitted_reviews)

Below are some examples of what the reviews look like in tokenized form after preprocessing

In [None]:
for i in range(0,3):  
    print('Full review:\n{}'.format(submitted_reviews.loc[i, 'reviews']))
    print('\nTokenized review: \n{}\n\n'.format(submitted_reviews.loc[i, 'review_tokens']))

Now that we have our reviews in tokenized form, let's remove rare tokens using our previously pickled list of rare tokens, built from our corpus of over 6.6 million Yelp reviews. The list contains 583,595 rare tokens.

In [None]:
# Collection of tokens treated as "rare" (threshold 30, per the file name).
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# produced by this project.
# The context manager closes the file handle once the list is loaded.
with open('../data/rare_tokens_threshold30_copy.pkl', 'rb') as rare_tokens_file:
    rare_tokens_30 = pickle.load(rare_tokens_file)

In [None]:
def _remove_rare_tokens(df, rare_tokens=None):
    """Drop rare tokens from every token list in ``df['review_tokens']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``review_tokens`` column whose values are lists of
        tokens.
    rare_tokens : iterable of str, optional
        Tokens to remove.  Defaults to the module-level ``rare_tokens_30``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.  Token order and repeated
        tokens are preserved; only rare tokens are dropped.
    """
    if rare_tokens is None:
        rare_tokens = rare_tokens_30
    # Build the lookup set once for the whole frame rather than once per row;
    # with a large rare-token list the per-row conversion dominates the cost.
    if isinstance(rare_tokens, (set, frozenset)):
        rare_lookup = rare_tokens
    else:
        rare_lookup = set(rare_tokens)

    def _filter_rare_tokens(tokens):
        return [token for token in tokens if token not in rare_lookup]

    df['review_tokens'] = df['review_tokens'].apply(_filter_rare_tokens)
    return df

In [None]:
# Drop every token listed in rare_tokens_30 from the submitted reviews'
# token lists.
submitted_reviews = _remove_rare_tokens(submitted_reviews)

Now that we've fully preprocessed the reviews you submitted, let's do the same for the five reviews for which we had you guess star ratings.

In [None]:
# Same two preprocessing steps as for the submitted reviews: tokenize/clean,
# then drop rare tokens.
premade_reviews = _process_review(premade_reviews)
premade_reviews = _remove_rare_tokens(premade_reviews)

## Use model to predict ratings

Below we will load in our pickled TF-IDF vectorizer fit to our data and our pickled stochastic gradient descent classifier (SGDC) model. We will first vectorize the reviews and then feed them (both preexisting and submitted by you) into the model to compare model predictions with human guesses.

<img style="float:left" src="https://img.wonderhowto.com/img/17/27/63452217416656/0/human-vs-computer-scrabble-showdown.w1456.jpg" />

In [None]:
# Load the pickled vectorizer and classifier.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# produced by this project.
# The context managers close each file handle once its object is loaded.
with open('../data/pickled_TfidfVectorizer.pkl', 'rb') as vectorizer_file:
    tfidfer = pickle.load(vectorizer_file)
with open('../data/SGDC.pkl', 'rb') as classifier_file:
    SGDC = pickle.load(classifier_file)

In [None]:
# The vectorizer is given one string per review, so collapse each token list
# into a space-separated string.  Values that are already strings are left
# alone: without that guard, running this cell a second time would join the
# *characters* of each string ("good" -> "g o o d") and silently corrupt the
# model input.
for review_frame in (premade_reviews, submitted_reviews):
    review_frame['review_tokens'] = review_frame['review_tokens'].apply(
        lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
    )

tfidf_premade_reviews = tfidfer.transform(premade_reviews['review_tokens'])
tfidf_submitted_reviews = tfidfer.transform(submitted_reviews['review_tokens'])

Now let's feed our vectorized reviews into the model for prediction

In [None]:
# Run the loaded classifier on both sets of vectorized reviews; the results
# are compared against the star ratings in the confusion matrices below.
submitted_reviews_predictions = SGDC.predict(tfidf_submitted_reviews)
premade_reviews_predictions = SGDC.predict(tfidf_premade_reviews)

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """Print and plot the confusion matrix of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels.
    classes : sequence
        Tick labels for both axes.  When every label observed in ``y_true``
        and ``y_pred`` is itself a member of ``classes``, the matrix is built
        over exactly these classes, in this order, so classes absent from the
        data still get a (zero) row and column.  Otherwise ``classes`` is
        used for display only and the matrix covers the observed labels.
    normalize : bool
        If true, divide each row by its total so a cell is the fraction of
        that true class.  Rows without any true samples are shown as zeros.
    title : str, optional
        Plot title; a default depending on ``normalize`` is used when empty.
    cmap : matplotlib colormap
        Colormap for the heat map.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix.  Pin its rows/columns to `classes` whenever
    # the data labels are drawn from `classes`; otherwise a class missing from
    # the data shrinks the matrix and the tick labels no longer line up with
    # (or even fit) its rows and columns.
    observed = set(np.unique(np.asarray(y_true)).tolist())
    observed |= set(np.unique(np.asarray(y_pred)).tolist())
    if observed <= set(classes):
        cm = confusion_matrix(y_true, y_pred, labels=list(classes))
    else:
        cm = confusion_matrix(y_true, y_pred)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Leave rows with no true samples at 0 instead of dividing by zero.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations; use white text
    # on the darker (above half of the maximum) cells for contrast.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

Now let's visualize how the classifier performed in classifying the reviews you submitted. 

In [None]:
# Row-normalized confusion matrix: stars the respondents gave their own
# reviews (true labels) vs. the classifier's predictions.  The trailing
# semicolon suppresses the notebook's echo of the returned Axes.
plot_confusion_matrix(submitted_reviews.stars, submitted_reviews_predictions, [1,2,3,4,5],
                          title='Confusion matrix for submitted reviews',
                          cmap=plt.cm.Blues, normalize=True);

Based on the output above, how did the model do in classifying the reviews you all submitted?

Finally, let's look at who outperformed whom in classifying the five premade reviews -- humans or the model?

In [None]:
# Row-normalized confusion matrix: actual star ratings of the five premade
# reviews (true labels) vs. the classifier's predictions.  The trailing
# semicolon suppresses the notebook's echo of the returned Axes.
plot_confusion_matrix(premade_reviews.actual_stars, premade_reviews_predictions, [1,2,3,4,5],
                          title='Confusion matrix for premade reviews',
                          cmap=plt.cm.Blues, normalize=True);

Let's compare the output above with your own performance in classifying the reviews by revisiting the pie charts generated earlier in the notebook:

In [None]:
# Redraw the human-guess pie charts next to each review's true rating so they
# can be compared with the model's confusion matrix above.
human_guess_counts = (
    review_1_counts,
    review_2_counts,
    review_3_counts,
    review_4_counts,
    review_5_counts,
)
for row_label, counts in zip(range(len(human_guess_counts)), human_guess_counts):
    print('Review {}'.format(row_label + 1))
    print('Actual star rating: {}'.format(premade_reviews.loc[row_label, 'actual_stars']))
    make_pie(counts)

So what do you think? Would you trust yourselves more than the model for this classification task?