In [None]:
import csv

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

## Import

In [None]:
yelp_df = pd.read_csv('data/all_reviews_cleaned.csv')

In [None]:
yelp_df.describe()

## General summary

### Categorical

In [None]:
# Show only the non-numeric columns by hiding the rating, zip and vote-count columns.
numeric_like_cols = ['business_star_rating', 'business_zip', 'review_raiting',
                     'useful', 'funny', 'cool']
yelp_df.drop(columns=numeric_like_cols)

### Numeric

In [None]:
yelp_df[['business_star_rating', 'review_raiting', 'useful', 'funny', 'cool']].describe()

### Missing

In [None]:
# Every observation must be labelled "local" or "remote"; list the rows that are not.
valid_labels = ['local', 'remote']
yelp_df[~yelp_df['label'].isin(valid_labels)]

In [None]:
# Reviews must not end in whitespace: a trailing blank would suggest that the
# full review text was not scraped.
import re

TRAILING_WHITESPACE = re.compile(r'[ \f\t\v]+$')

# Missing reviews (NaN) are not strings; count them as "no match" instead of
# letting re.search raise a TypeError.
end_on_whitespace = yelp_df['review_text'].apply(
    lambda text: isinstance(text, str)
    and TRAILING_WHITESPACE.search(text) is not None).astype(bool)

# Show the offending reviews together with the business they belong to.
yelp_df.loc[end_on_whitespace, ['review_text', 'business_url']].assign(end_on_whitespace=True)

In [None]:
yelp_df.isna().sum(0)

## Label Analysis

#### Count of labels per state

In [None]:
# Number of non-null business URLs for every (state, label) pair.
yelp_df.groupby(['business_state', 'label'])[['business_url']].count()

#### Percentage of labels per state

In [None]:
# Group the state/label columns by the (state, label) pair.
group_keys = [yelp_df['business_state'], yelp_df['label']]
biz_label_group = yelp_df[['business_state', 'label']].groupby(group_keys)

# NOTE(review): each count is divided by the column total over *all* states, so
# the values are shares of the whole dataset, not shares within a state —
# confirm this is the intended reading of "per state".
biz_label_counts = biz_label_group.agg('count')
100 * biz_label_counts / biz_label_counts.sum().astype(float)

In [None]:
yelp_df.groupby(yelp_df['label']).agg('count')

#### Ratio of review rating to business rating

In [None]:
# Per-label averages of the two ratings and of review rating / business rating.
rating_cols = ['business_star_rating', 'review_raiting', 'label']
ratings = yelp_df[rating_cols].copy()
ratings['review_biz_ratio'] = yelp_df['review_raiting'] / yelp_df['business_star_rating']

# sum / count is the mean over the non-missing values of each column.
ratings.groupby('label').agg(lambda col: col.sum() / col.count())

# Define Corpus

In [None]:
import pickle

### Add tagged reviews to data frame

In [None]:
# Load the pickled tagged reviews (presumably one list of (token, tag) pairs per
# review, in the same row order as yelp_df — TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# dumps that were produced locally by a trusted step.
with open('dump/scraped_reviews_tagged.p', 'rb') as f_reviews_tagged:
    reviews_tagged = pickle.load(f_reviews_tagged)

In [None]:
yelp_df['reviews_tagged'] = reviews_tagged

In [None]:
# Build a sample that is balanced first across states and then across labels.
# Every random draw is seeded so the sample (and every downstream number) is
# reproducible after Restart & Run All.
SAMPLE_SEED = 42
SAMPLED_STATES = ['NY', 'NV', 'CA', 'FL', 'IL']

yelp_df = yelp_df.rename(columns={'cool': 'cool_', 'label': 'label_', 'funny': 'funny_', 'useful': 'useful_'})

# Keep only the first row seen for each business URL and drop rows without a URL.
yelp_df = (
    yelp_df[yelp_df['business_url'].notnull()]
    .drop_duplicates(subset='business_url', keep='first')
)

# Size of the smallest state (NJ is excluded, as before). Row counts are used
# rather than the non-null count of an arbitrary first column.
state_min = int(yelp_df.query('business_state != "NJ"').groupby('business_state').size().min())

# Draw the same number of rows from every sampled state.
state_samples = [
    yelp_df[yelp_df['business_state'] == state].sample(n=state_min, random_state=SAMPLE_SEED)
    for state in SAMPLED_STATES
]
sample = pd.concat(state_samples).reset_index(drop=True)

# Then draw the same number of rows from each label.
sample_min = int(sample.groupby('label_').size().min())

local_sample = sample.query('label_ == "local"').sample(n=sample_min, random_state=SAMPLE_SEED)
remote_sample = sample.query('label_ == "remote"').sample(n=sample_min, random_state=SAMPLE_SEED)

yelp_df = pd.concat([local_sample, remote_sample]).reset_index(drop=True)

print("Sample size: ", len(yelp_df))

In [None]:
yelp_df.drop('reviews_tagged', axis= 1).astype(object).describe()

In [None]:
yelp_df[['review_length', 'label_']].groupby('label_').describe()

# Plots

In [None]:
from matplotlib import pyplot as plt

from yellowbrick.text import FreqDistVisualizer
from sklearn.feature_extraction.text import CountVectorizer

from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize
from nltk import FreqDist

In [None]:
# Bag-of-words counts over the lower-cased review text.
vectorizer = CountVectorizer()
docs       = vectorizer.fit_transform(yelp_df['review_text'].str.lower())

# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever this installation provides.
features   = list(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names())

## Word histogram

In [None]:
# Horizontal bar chart of the 40 most frequent tokens.
visualizer = FreqDistVisualizer(n= 40, fontsize=25, features=features, orient='h', size=(1200, 800))
visualizer.fit(docs)
visualizer.ax.legend(loc=4)
visualizer.set_title()

# Create the vocab, count, and hapaxes labels
infolabel = "vocab: {:,}\nword tokens: {:,}\nhapax: {:,}".format(
    visualizer.vocab_, visualizer.words_, visualizer.hapaxes_
)

# The box is anchored at (0.75, 0.1) in axes coordinates. The original call
# passed other x/y values that its own `position=` argument then overrode.
visualizer.ax.text(0.75, 0.1, infolabel, transform=visualizer.ax.transAxes,
             fontsize=20, verticalalignment='bottom',
             bbox={'boxstyle':'round', 'facecolor':'white', 'alpha':.8})

# Set the title and tick sizes
plt.title('Frequency Distribution of Top {} tokens'.format(visualizer.N), fontsize=30)
plt.yticks(size=15)
plt.xticks(size=15)
plt.rcParams.update({'font.size': 22})

# plt.show() takes no figure/visualizer argument; it renders the current figure.
plt.show()


## Zipf's Law

Zipf's law states that, in a natural-language corpus of utterances, the frequency of any word type is inversely proportional to its rank in the frequency table.

So the frequency of the word with rank n is proportional to 1/n. In other words, the top-ranked word is around twice as common as the second-ranked word, and a thousand times more common than the word with rank 1,000.

We can check Zipf's Law for the scraped corpus of Yelp reviews by plotting the frequencies of the word types in rank order on a log-log graph.

In [None]:
tokenizer = RegexpTokenizer(r'\w+')
corpus_retokenized = tokenizer.tokenize(' '.join(yelp_df['review_text']).lower())

In [None]:
word_freq = FreqDist(corpus_retokenized)

In [None]:
# Token counts ordered from most to least frequent; row i holds the count of
# the word type with rank i + 1.
# `word_freq` is deliberately left untouched (it is no longer rebound to a list),
# so this cell can be re-run, and no `operator` import is needed.
ranked_frequencies = sorted(word_freq.values(), reverse=True)

refreq_df = pd.DataFrame({'frequency': ranked_frequencies})
refreq_df['count'] = refreq_df.index + 1

In [None]:
# Rank/frequency scatter on log-log axes; Zipf's law predicts a roughly
# straight line.
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(refreq_df['count'], refreq_df['frequency'])

# Switch both axes to log scale directly. `loglog(True)` also plotted a
# spurious data point just to obtain the same scaling.
ax.set_xscale('log')
ax.set_yscale('log')

ax.set_title('Word Frequency by Frequency Rank', fontsize=25)
ax.set_ylabel('Token Frequency', fontsize=25)
ax.set_xlabel('Word Type Rank (by Frequency)', fontsize=25);

# Part of Speech Features

## POS

In [None]:
class pos_counter():
    """Groups of part-of-speech tags plus a helper that counts them in a review.

    Every group lists each tag twice: plain and with a trailing carriage
    return (presumably because some tags were read from CRLF-terminated
    files — TODO confirm).
    """

    adverbs = [u'RB', u'RBR', u'RBS', u'RBS\r', u'RB\r', u'RBR\r']
    simple_past = [u'VBD', u'VBD\r']
    # Was u'VPZ\r', which is not a tag; the carriage-return twin of VBZ is meant.
    simple_present = [u'VBP', u'VBZ', u'VBP\r', u'VBZ\r']
    past_participle = [u'VBN', u'VBN\r']
    modal = [u'MD', u'MD\r']
    pn = [u'NNP', u'NNPS', u'NNP\r', u'NNPS\r']
    prep = [u'IN', u'IN\r']
    nn = [u'NN', u'NN\r']
    adj = [u'JJ', u'JJ\r']
    dt = [u'DT', u'DT\r']

    @staticmethod
    def count_pos(tagged_reviews, pos_list):
        """Count the tagged items whose tag belongs to ``pos_list``.

        Parameters
        ----------
        tagged_reviews : iterable
            Items of one tagged review; the tag is read from position 1 of
            each item (e.g. ``(token, tag)`` pairs).
        pos_list : list
            Tags to count, e.g. ``pos_counter.adverbs``.

        Returns
        -------
        float
            Number of matching items.

        Items that have no position 1 (too short, or not indexable) are
        skipped, as before, but other errors are no longer swallowed.
        """
        count = 0
        for review in tagged_reviews:
            try:
                tag = review[1]
            except (TypeError, IndexError, KeyError):
                continue
            if tag in pos_list:
                count += 1
        return float(count)

In [None]:
# Per-review counts for each POS tag group. The pairwise comparison plot that
# follows reads these columns, so they must be computed for Restart & Run All
# to succeed. Column names (including the 'porper name' spelling) match what
# that cell expects.
pos_feature_groups = {
    "adv count": pos_counter.adverbs,
    "past prog": pos_counter.past_participle,
    "simple future": pos_counter.modal,
    "simple past": pos_counter.simple_past,
    "simple present": pos_counter.simple_present,
    'porper name': pos_counter.pn,
    'prep count': pos_counter.prep,
    'nn count': pos_counter.nn,
    'adj count': pos_counter.adj,
    'det count': pos_counter.dt,
}

for column_name, tag_group in pos_feature_groups.items():
    yelp_df[column_name] = np.array([
        pos_counter.count_pos(review, tag_group)
        for review in yelp_df['reviews_tagged']])

### Pairwise POS comparison

In [None]:
# Normalised difference between the two labels for each POS group:
# (first label - second label) / (first label + second label).
pos_count_cols = ['det count', 'adj count', 'nn count', 'prep count', 'porper name',
                  "simple present", "simple past", "simple future", "past prog", 'adv count']

label_totals = yelp_df.loc[:, pos_count_cols + ['label_']].groupby('label_').agg('sum')

# groupby sorts the label rows, so .iloc[0] / .iloc[1] are the first and second
# label in sorted order. Explicit positional access avoids the deprecated
# integer lookup on a label-indexed Series.
# Sort once here so the bars and their tick labels stay in the same order.
# Previously the bars were sorted but the labels were not.
pltme = label_totals.apply(
    lambda x: (x.iloc[0] - x.iloc[1]) / (x.iloc[0] + x.iloc[1])).sort_values()

xticks = pltme.index
ins = np.arange(len(xticks))
plt.figure(figsize=(20,10))
plt.title('Pairwise comparison of tags (across labels)', fontsize=25)
plt.bar(ins, pltme)
plt.xticks(ins, xticks, rotation=45, fontsize=20)

plt.yticks(size=20);

In [None]:
# tags:     every distinct tag, in order of first appearance.
# pos_list: one {tag: count} dict per review (counts stored as floats).
tags = []
pos_list = []
for tagged_review in yelp_df['reviews_tagged']:
    tag_counts = {}
    for token in tagged_review:
        # Only exact tuples are counted; anything else is ignored.
        if type(token) != tuple:
            continue
        tag = token[1]
        tag_counts[tag] = tag_counts.get(tag, 0.0) + 1.0
        if tag not in tags:
            tags.append(tag)
    pos_list.append(tag_counts)


In [None]:
# Total occurrences of every tag per label (a tag missing from a review counts as 0).
tag_counts = pd.DataFrame.from_dict(pos_list)
label_totals = (
    yelp_df[['label_']]
    .join(tag_counts, how='left')
    .fillna(0)
    .groupby('label_')
    .agg('sum')
)

# Normalised difference between the first and second label (sorted order).
# .iloc replaces the deprecated positional x[0] / x[1] on a label-indexed Series.
contrast = label_totals.apply(
    lambda x: (x.iloc[0] - x.iloc[1]) / (x.iloc[0] + x.iloc[1])).sort_values()

# Standardise to z-scores so tags can be compared on one scale.
pltme = (contrast - contrast.mean()) / contrast.std()

#pltme = pltme[pltme.between(-5, -.3) | pltme.between(.3, 5)]

xticks = pltme.index
ins = np.arange(len(xticks))
plt.figure(figsize=(20,10))
plt.title('Pairwise comparison of tags (across labels)', fontsize=25)
plt.bar(ins, pltme)
plt.xticks(ins, xticks, rotation=45, fontsize=20)
plt.yticks(size=20);

In [None]:
# For every tag, count per label the reviews in which it occurs more than once.
# NOTE(review): the threshold is `> 1`, while the unigram salience table uses
# `> 0` — confirm which one is intended here.
tag_counts = pd.DataFrame.from_dict(pos_list)
labelled_counts = yelp_df[['label_']].join(
    tag_counts, on=None, how='left', lsuffix='', rsuffix='', sort=False)
reviews_per_label = labelled_counts.groupby('label_').agg(lambda col: (col > 1).sum())

# Salience = 1 - (smaller count / larger count); 0 when the tag never qualifies.
pltme = reviews_per_label.apply(
    lambda col: 1 - (min(col) / max(col)) if max(col) != 0 else 0
).sort_values()#.transform(lambda x:((x-x.mean())/x.std()))

xticks = pltme.index
ins = np.arange(len(xticks))
plt.figure(figsize=(20,10))
plt.title('Pairwise comparison of tags (across labels)', fontsize=25)
plt.bar(ins, pltme.sort_values())
plt.xticks(ins, xticks, rotation=45, fontsize=20)
plt.yticks(size=20)

In [None]:
# Add the counts of a hand-picked set of tags as feature columns.
# 'EX' was listed twice, which produced two identically named columns.
selected_tags = ['#', 'RBS', 'RBR', 'VBZ', 'VBD', 'VBP', 'EX', 'JJR', 'LS', 'MD',
                 'VBN', '$']

# reindex (rather than [...]) keeps the cell working when a tag never occurs
# in the sample: that column is simply all-NaN.
selected_tag_counts = pd.DataFrame.from_dict(pos_list).reindex(columns=selected_tags)

# Drop any copy of these columns left by an earlier run so the cell can be re-run.
yelp_df = yelp_df.drop(columns=selected_tags, errors='ignore').join(
    selected_tag_counts, how='left')

In [None]:
# Add a count column for every tag that is not already a column of yelp_df.
# Joining all tags unconditionally raises "columns overlap but no suffix
# specified" as soon as some tag columns have been joined before.
all_tag_counts = pd.DataFrame.from_dict(pos_list)
new_tag_cols = [col for col in all_tag_counts.columns if col not in yelp_df.columns]

yelp_df = yelp_df.join(all_tag_counts[new_tag_cols], how='left')

# Basic Features

In [None]:
import time

#### review length

In [None]:
yelp_df= yelp_df.assign(review_length =
    yelp_df['review_text'].apply(lambda x: len(x)))

#### week of year

In [None]:
yelp_df = yelp_df.assign(week_of_year =
    yelp_df['review_date'].apply(lambda x: time.strptime(x, "%m/%d/%Y").tm_yday // 7))

#### day of week

In [None]:
yelp_df = yelp_df.assign(day_of_week =
    yelp_df['review_date'].apply(lambda x: time.strptime(x, "%m/%d/%Y").tm_wday))

#### city mentioned

In [None]:
yelp_df = yelp_df.assign(city_mentioned = 0)

In [None]:
# Element-wise, case-insensitive "a is a substring of b".
# otypes fixes the result dtype, so the call also works on empty input
# (np.vectorize cannot infer the output type from zero elements).
vecIn = np.vectorize(lambda a, b: a.lower() in b.lower(), otypes=[bool])

In [None]:
# 1 when the business city name occurs (case-insensitively) in the review text, else 0.
cities = yelp_df['business_city'].values
review_texts = yelp_df['review_text'].values
city_in_review = vecIn(cities, review_texts)

yelp_df = yelp_df.assign(city_mentioned=np.where(city_in_review, 1, 0))

#### reviewer state

In [None]:
# yelp_df = yelp_df.assign(reviewer_state = yelp_df['reviewer_location'].astype(str).apply(lambda x: x[-2:]))

## Salience

### Salience keep-words function

In [None]:
def saliance(words, local_words, remote_words, theta=.50):
    """Return the column indices of the terms whose salience exceeds ``theta``.

    For every column (term) the share of its stored entries that falls in the
    local documents and in the remote documents is computed, and

        salience = 1 - min(share) / max(share)

    A term is kept when ``theta < salience`` and ``salience != 1`` (terms seen
    in only one of the two groups are excluded).

    Parameters
    ----------
    words : scipy sparse matrix
        Document-term matrix for all documents.
    local_words, remote_words : scipy sparse matrix
        Document-term matrices with the same columns as ``words``.
    theta : float, default 0.5
        Salience threshold (exclusive).

    Returns
    -------
    list of int
        Indices of the columns to keep, in ascending order.

    Columns with no stored entry in ``words`` get salience 0 instead of
    raising ZeroDivisionError.
    """
    def _column_nnz(matrix):
        # Stored entries per column, read off the CSC index pointer. This
        # replaces slicing the matrix once per column inside a Python loop.
        return np.diff(matrix.tocsc().indptr).astype(float)

    normalizer = _column_nnz(words)
    has_entries = normalizer > 0

    # Shares are still divided by the normalizer, so the floating-point values
    # match the per-column computation exactly.
    l_prob_sum = np.divide(_column_nnz(local_words), normalizer,
                           out=np.zeros_like(normalizer), where=has_entries)
    r_prob_sum = np.divide(_column_nnz(remote_words), normalizer,
                           out=np.zeros_like(normalizer), where=has_entries)

    min_ = np.minimum(l_prob_sum, r_prob_sum)
    max_ = np.maximum(l_prob_sum, r_prob_sum)

    ratio = np.divide(min_, max_, out=np.zeros_like(normalizer), where=max_ != 0)
    salience = np.where(max_ != 0, 1 - ratio, 0.0)

    keep_mask = (salience > theta) & (salience != 1)
    return np.flatnonzero(keep_mask).tolist()

## Salience Table

In [None]:
# TF-IDF over the 1,000 most frequent non-stop-word unigrams.
# The vectorizer was assigned to `unigram_vec` but used as `unigram_vect`,
# which fails on a fresh kernel (or silently uses a stale object).
unigram_vect = TfidfVectorizer(
    analyzer="word",
    tokenizer=None,
    stop_words = 'english',
    ngram_range=(1, 1),
    #min_df = .001,
    #max_df = .999,
    preprocessor=None,
    max_features=1000)

unigram_fit = unigram_vect.fit_transform(yelp_df['review_text'])

# get_feature_names() was removed from newer scikit-learn releases.
unigram_names = (
    unigram_vect.get_feature_names_out()
    if hasattr(unigram_vect, 'get_feature_names_out')
    else unigram_vect.get_feature_names())

# .toarray() replaces the `.A` shortcut, which newer SciPy no longer provides.
unigrams = pd.DataFrame(unigram_fit.toarray(), columns=unigram_names)

unigrams = pd.concat([
    unigrams.reset_index(drop=True),
    yelp_df[['label_']]], axis=1)

# Number of reviews per label in which each unigram occurs (computed once).
doc_counts = unigrams.groupby('label_').agg(lambda col: (col > 0).sum())

# Salience = 1 - (smaller count / larger count); 0 for unigrams that never occur.
sali = doc_counts.apply(
    lambda col: 1 - (col.min() / col.max()) if col.max() != 0 else 0)

# The label in which the unigram occurs in more reviews (ties go to 'remote').
# Rows are looked up by label rather than by position.
sali_type = doc_counts.apply(
    lambda col: 'local' if col['local'] > col['remote'] else 'remote')

In [None]:
# Keep unigrams seen under both labels (salience below 1) and list them from
# most to least salient, together with the label they favour.
below_one = sali < 1
sali_table = sali[below_one].to_frame()
sali_table['label'] = sali_type[below_one]
sali_table.sort_values(by=0, ascending=False)

### Format categorical variables

# Modeling

In [None]:
import features_basic
from features_pos import pos_counter, get_pos_pickle

import numpy as np
import pandas as pd

from scipy.sparse.csr import csr_matrix
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
# from sklearn.svm import LinearSVC

import statsmodels.discrete.discrete_model as sm

import itertools

In [None]:
# Feature matrix: everything except identifiers, raw text and the label itself.
X = yelp_df.drop([
    'business_city', 'business_state', 'business_name',
    'reviewer_location', 'business_url', 'review_date',
    'reviewer_id', 'reviews_tagged', 'review_text', 'label_', 'funny_', 'cool_'], axis=1).fillna(0)

# Encode the string labels as integers.
le = LabelEncoder()
y = le.fit_transform(yelp_df['label_'])

# Pass the DataFrame itself (as floats) so the fitted model keeps the feature
# names; np.asarray(X) discarded them.
logit = sm.Logit(y, X.astype(float))

# Keep the fitted result instead of throwing it away, and show its summary.
logit_result = logit.fit_regularized()
logit_result.summary()

### Split Training and Test

### Ngram model

In [None]:
X.to_csv('data/X.csv')

In [None]:
# drop colinear variables and variables that make dataset biased
# X = yelp_df.drop([
#     'business_city', 'business_zip', 'business_state', 'business_name',
#     'reviewer_location', 'business_url', 'review_date',
#     'reviewer_id', 'reviews_tagged', 'funny_', 'cool_', ],   
#     axis=1)

# Only language model (drop all non-POS features then add N-gram Model)

# Seed for the train/test split so model scores are reproducible.
MODEL_SEED = 42

X = yelp_df.drop([
    'business_city', 'business_zip', 'business_state', 'business_name',
    'reviewer_location', 'business_url', 'review_date', 'reviewer_id',
    'reviews_tagged', 'funny_', 'cool_', 'useful_', 'review_length',
    'week_of_year', 'day_of_week', 'business_star_rating',
    'review_raiting'],
    axis=1)

print("Model feature space includes:", ', '.join(X.columns))

y = yelp_df[['label_']].astype('category')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, test_size=0.25, random_state=MODEL_SEED)

# TF-IDF over unigrams and bigrams; the vocabulary is fitted on the training split only.
unigram_vect = TfidfVectorizer(
    analyzer="word",
    tokenizer=None,
    stop_words = None,
    ngram_range=(1, 2),
    #min_df = .001,
    #max_df = .999,
    preprocessor=None,
    max_features=10000)

unigram_fit = unigram_vect.fit_transform(X_train['review_text'])
unigram_transform = unigram_vect.transform(X_test['review_text'])

# Keep only the n-grams whose salience between the two labels exceeds theta.
local_words = unigram_vect.transform(
    X_train.query('label_ == "local"')['review_text'])
remote_words = unigram_vect.transform(
    X_train.query('label_ == "remote"')['review_text'])
keep_index = saliance(unigram_fit, local_words, remote_words, theta=.5)

unigram_transform = csr_matrix(unigram_transform[:, keep_index])
unigram_fit = csr_matrix(unigram_fit[:, keep_index])

# get_feature_names() was removed from newer scikit-learn releases.
ngram_names = (
    unigram_vect.get_feature_names_out()
    if hasattr(unigram_vect, 'get_feature_names_out')
    else unigram_vect.get_feature_names())
keep_words = np.array(ngram_names)[keep_index]

# The n-gram frames must carry the index of the split they were built from.
# train_test_split shuffles the rows, and DataFrame.join aligns on the index,
# so a fresh 0..n-1 index would attach n-gram rows to the wrong reviews (or to
# none at all, leaving zeros after fillna).
unigram_train = pd.DataFrame(
    unigram_fit.toarray(), index=X_train.index, columns=keep_words)
unigram_test = pd.DataFrame(
    unigram_transform.toarray(), index=X_test.index, columns=keep_words)

print(unigram_train.shape[1], " n-grams in model")
print(unigram_train.columns)

X_train = X_train.drop(['review_text', 'label_'], axis='columns')
X_train = X_train.join(unigram_train, how='left').fillna(0)

X_test = X_test.drop(['review_text', 'label_'], axis='columns')
X_test = X_test.join(unigram_test, how='left').fillna(0)


In [None]:
# Standardise every feature, but keep the non-linguistic columns in their raw
# form when the three gate columns are part of the model.
non_ling = {'business_zip', 'day_of_week', 'week_of_year'}

# 'city_mentioned' is kept raw together with the gate columns, but only when
# it is present. It used to be read unconditionally inside the gate.
unscaled_cols = []
if non_ling <= set(X_train.columns):
    unscaled_cols = [
        col for col in ('business_zip', 'day_of_week', 'week_of_year', 'city_mentioned')
        if col in X_train.columns]

# Save the raw values before scaling overwrites them.
raw_train = X_train[unscaled_cols].copy()
raw_test = X_test[unscaled_cols].copy()
if 'business_zip' in unscaled_cols:
    raw_train['business_zip'] = raw_train['business_zip'].fillna(0).astype(int)
    raw_test['business_zip'] = raw_test['business_zip'].fillna(0).astype(int)

# Fit the scaler on the training split only, then apply it to both splits.
X_std = StandardScaler()
train_values = X_std.fit_transform(X_train.values)
test_values = X_std.transform(X_test.values)

X_train = pd.DataFrame(train_values, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(test_values, index=X_test.index, columns=X_test.columns)

# Put the raw non-linguistic columns back.
if unscaled_cols:
    X_train[unscaled_cols] = raw_train
    X_test[unscaled_cols] = raw_test

In [None]:
from sklearn import naive_bayes

In [None]:
# MultinomialNB rejects negative feature values, and the features were
# standardised in the scaling cell, so they are rescaled to [0, 1] for this
# model only. Test values outside the training range are clipped at 0.
mnb_scaler = MinMaxScaler()
X_train_nonneg = mnb_scaler.fit_transform(X_train)
X_test_nonneg = np.clip(mnb_scaler.transform(X_test), 0, None)

nb = naive_bayes.MultinomialNB()
nb_fit = nb.fit(X_train_nonneg, y_train.values.ravel())

# Results get their own names; they used to be stored in score_nb / y_pred_nb /
# cnf_nb, which the Gaussian NB cell overwrites immediately.
score_mnb = nb.score(X_test_nonneg, y_test.values.ravel())
y_pred_mnb = nb_fit.predict(X_test_nonneg)
cnf_mnb = confusion_matrix(y_test, y_pred_mnb)

In [None]:
# Gaussian naive Bayes: fit on the training split, then predict and score on the test split.
gnb = GaussianNB()
train_labels = y_train.values.ravel()
gnb_fit = gnb.fit(X_train, train_labels)

y_pred_nb = gnb_fit.predict(X_test)
cnf_nb = confusion_matrix(y_test, y_pred_nb)
score_nb = gnb.score(X_test, y_test)

In [None]:
# Logistic regression (liblinear solver): fit on the training split, then
# predict and score on the test split.
logistic = LogisticRegression(solver='liblinear')
train_labels = y_train.values.ravel()
logistic_fit = logistic.fit(X_train, train_labels)

y_pred_lr = logistic_fit.predict(X_test)
cnf_lr = confusion_matrix(y_test, y_pred_lr)
score_lr = logistic.score(X_test, y_test)

In [None]:
print("LR: ", score_lr, "\n",
      "NB: ", score_nb)

In [None]:
# One row per feature with its logistic-regression coefficient, largest first.
coefficients = logistic.coef_.ravel()
d = pd.DataFrame({'feature': X_train.columns, 'coef': coefficients})
d.sort_values(by='coef', ascending=False)

## Plot Decision Graph

In [None]:
def get_decision(y_label, y_pred):
    """Running accuracy after each of the first ``n - 1`` decisions.

    Parameters
    ----------
    y_label : array-like
        True labels, in prediction order.
    y_pred : array-like
        Predicted labels, same length as ``y_label``.

    Returns
    -------
    list of (float, float)
        For ``i = 1 .. n - 1``: the fraction of decisions made so far
        (``i / n``) and the accuracy over the first ``i`` decisions.
        The final point (``i = n``) is not included, as before.

    Labels are compared by position. The running accuracy comes from one
    cumulative sum instead of re-summing the prefix for every ``i``.
    """
    correct = np.asarray(y_label) == np.asarray(y_pred)
    correct = np.atleast_1d(correct)
    num = len(y_pred)

    decisions = np.arange(1, num)
    hits_so_far = np.cumsum(correct[:max(num - 1, 0)])

    fraction_decided = decisions / len(correct) if num > 1 else decisions.astype(float)
    running_accuracy = hits_so_far / decisions
    return list(zip(fraction_decided.tolist(), running_accuracy.tolist()))
dec_lr = get_decision(y_test['label_'], y_pred_lr)
dec_nb = get_decision(y_test['label_'], y_pred_nb)

In [None]:
# Running accuracy of both classifiers as more test decisions are made.
fig, ax = plt.subplots(figsize=(12, 8))
for decision_curve in (dec_lr, dec_nb):
    ax.plot(*zip(*decision_curve[0:-1]), dashes=[3, 3])

ax.set_title('Accuracy by decision', fontsize=25)
ax.legend(['Logistic Regression', 'Naive Bayes'], fontsize=15)
ax.tick_params(axis='both', labelsize=15)
ax.set_ylabel('Accuracy', fontsize=25)
ax.set_xlabel('Decision', fontsize=25)

In [None]:
print(sklearn.metrics.classification_report(y_test, y_pred_nb))

## Plot Confusion Matrix

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on a new figure.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix; rows are true labels, columns predicted labels.
    classes : sequence
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        When True, every row is divided by its total. Rows whose total is 0
        are shown as zeros rather than NaN.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap for the heat map.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.figure(figsize=(20,10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=25)
    plt.grid(False)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, fontsize=25)
    plt.yticks(tick_marks, classes, fontsize=25)

    # Integer format only fits integer matrices; any float matrix (normalised
    # or passed in as floats) is printed with two decimals.
    fmt = '.2f' if np.issubdtype(cm.dtype, np.floating) else 'd'

    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()


# Display order of the classes. It is passed to confusion_matrix as `labels`
# so the matrix rows/columns match the tick labels. Without `labels`,
# confusion_matrix uses sorted label order ('local', 'remote'), and the plot
# was annotated with the two classes swapped.
class_names = np.array(['remote', 'local'])

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred_nb, labels=class_names)
np.set_printoptions(precision=2)

# plot_confusion_matrix opens its own figure, so no plt.figure() call is
# needed here (the extra calls only produced empty figures).

# Plot non-normalized confusion matrix
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix (without normalization)')

# Plot normalized confusion matrix
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()