In [None]:
import numpy as np
import pandas as pd
import csv

## Cleanup

In [None]:
yelp_df = pd.read_csv('/home/gavagai/all_reviews.csv')
to_drop = yelp_df.query('label != "remote" and label != "local"').index
yelp_df = yelp_df.drop(to_drop)
yelp_df.reindex()

In [None]:
yelp_df.query('label != "remote" and label != "local"')

In [None]:
yelp_df.to_csv('all_reviews_cleaned.csv', index = False)

## Import

In [None]:
yelp_df = pd.read_csv('/home/gavagai/all_reviews_cleaned.csv')
#yelp_df = yelp_df.drop('Unnamed: 0', axis=1)
#yelp_df = yelp_df.drop_duplicates()
#yelp_df = yelp_df.reset_index(drop=True)

In [None]:
yelp_df

### Business data frame

In [None]:
biz_df = yelp_df.iloc[:,0:6].drop_duplicates().astype('object')
biz_df[['business_star_rating']] = biz_df[['business_star_rating']].apply(pd.to_numeric)

In [None]:
biz_df.drop('business_star_rating', axis = 1).describe()

In [None]:
biz_df.describe()

In [None]:
biz_df[['business_star_rating', 'business_url']].groupby('business_star_rating').agg('count')

## General summary

### Categorical

In [None]:
yelp_df.drop(['business_star_rating','business_zip','review_raiting', 'useful', 'funny', 'cool'], axis=1)

### Numeric

In [None]:
yelp_df[['business_star_rating', 'review_raiting', 'useful', 'funny', 'cool']].describe()

### Missing

In [None]:
yelp_df.isna().sum(0)

## Label Anaysis

In [None]:
yelp_df[['business_url']].groupby([yelp_df['business_state'], yelp_df['label'] ]).agg('count')

In [None]:
biz_label_group = yelp_df[['business_state','label']].groupby(
    [yelp_df['business_state'], yelp_df['label']])

biz_label_group.agg('count').apply(lambda x: 100 * x / float(x.sum()))

In [None]:
yelp_df.groupby(yelp_df['label']).agg('count')

In [None]:
test = yelp_df[['funny', 'label', 'business_url', 'review_text']].sort_values('funny', ascending=False).iloc[0,3]
re.search(r'[ \f\t\v]+$', test)

In [None]:
import re
yelp_df[['review_text', 'business_url']].assign(end_on_whitespace=yelp_df['review_text'].apply(lambda x: re.search(' +$', x) != None)).query('end_on_whitespace == True')
#re.findall('\s$', yelp_df['review_text'])

In [None]:
yelp_df.isna().sum(0)

In [None]:
yelp_df.sort_values('funny', ascending=False).drop_duplicates()

# Fretures

## Length of review

In [None]:
yelp_df[['review_text', 'label']].assign(review_length =
    yelp_df['review_text'].apply(lambda x: len(x))).query('label == "local"')

In [None]:
yelp_df[['review_text', 'label']].assign(review_length =
    yelp_df['review_text'].apply(lambda x: len(x))).query('label == "remote"').describe()

#### Ratio of business rating to review rating

In [None]:
yelp_df[['business_star_rating', 'review_raiting']].assign(review_biz_ratio= (yelp_df['review_raiting']/yelp_df['business_star_rating'])).sort_values('review_biz_ratio')

# Plots

In [None]:
from matplotlib import pyplot as plt

from yellowbrick.text import FreqDistVisualizer
from sklearn.feature_extraction.text import CountVectorizer

from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize
from nltk import FreqDist

In [None]:
vectorizer = CountVectorizer()
docs       = vectorizer.fit_transform(yelp_df['review_text'].apply(lambda x: x.lower()))
features   = vectorizer.get_feature_names()

In [None]:
visualizer = FreqDistVisualizer(n= 40, fontsize=25, features=features, orient='h', size=(1200, 800))
visualizer.fit(docs)
visualizer.ax.legend(loc=4)
visualizer.set_title()
# Set the title
# Create the vocab, count, and hapaxes labels
infolabel = "vocab: {:,}\nword tokens: {:,}\nhapax: {:,}".format(
    visualizer.vocab_, visualizer.words_, visualizer.hapaxes_
)

visualizer.ax.text(0.68, 0.97, infolabel, position=(.75,.1), transform=visualizer.ax.transAxes,
             fontsize=20, verticalalignment='bottom',
             bbox={'boxstyle':'round', 'facecolor':'white', 'alpha':.8})

# Set the legend and the grid
plt.title('Frequency Distribution of Top {} tokens'.format(visualizer.N), fontsize=30)
plt.yticks(size=15)
plt.xticks(size=15)
plt.rcParams.update({'font.size': 22})
plt.show(visualizer)


In [None]:
tokenizer = RegexpTokenizer(r'\w+')
corpus_retokenized = tokenizer.tokenize(' '.join(yelp_df['review_text']).lower())

In [None]:
reword_freq = FreqDist(corpus_retokenized)

In [None]:
import operator
refreq = reword_freq
#freq = dict(sorted(freq.items(), reverse=True, key=lambda kv: kv[1]))
refreq = sorted(refreq.items(), reverse=True, key=operator.itemgetter(1))
for i in range(len(refreq)):
    refreq[i] = (i, refreq[i][1])

In [None]:
refreq_df = pd.DataFrame.from_dict(dict(refreq), orient='index')
refreq_df = refreq_df.rename(columns={0: 'frequency'})
refreq_df['word type rank (by frequency)'] = refreq_df.index + 1

In [None]:
plt.figure(figsize=(20,10))
s = plt.scatter(refreq_df['count'], refreq_df['frequency'])
s.axes.loglog(True)
plt.title('Word Frequency by Frequency Rank', fontsize=25)
plt.ylabel('Token Frequency', fontsize=25)
plt.xlabel('Word Type Rank (by Frequency)', fontsize=25)
#refreq_df.plot(kind='scatter', loglog=True,  x='count', y='frequency')

Zipf's law states natural language corpus of utterances, the frequency of any word type is inversely proportional to its rank in the frequency table.

So frequency of the word with rank n is proportional to 1/n. In other words, the most ranked word is around twice as common as the second ranked word, and a thousand times more common than the word with rank 100,000.)

We can check Zipf's Law for the scraped corpus of Yelp reviews by plotting the frequencies of the word types in rank order on a log-log graph.

## Add confusion matrx

## Add decision Plot

# Modeling

## Import

In [1]:
import numpy as np
import pandas as pd
import pickle
import time
import sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [2]:
yelp_df = pd.read_csv('all_reviews_cleaned.csv')

In [24]:
yelp_df.groupby('business_state').agg('count')

Unnamed: 0_level_0,business_city,business_name,business_star_rating,business_url,business_zip,cool,funny,label,review_date,review_raiting,review_text,reviewer_id,reviewer_location,useful
business_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
CA,6859,6859,6859,6859,6851,2833,2262,6859,6859,6859,6859,6859,6859,3912
FL,12015,12015,12015,12015,12015,4264,3444,12015,12015,12015,12015,12015,12015,5865
IL,15796,15796,15796,15796,15796,5570,4415,15796,15796,15796,15796,15796,15796,8390
NJ,300,300,300,300,300,97,69,300,300,300,300,300,300,155
NV,9725,9725,9725,9725,9725,3593,2592,9725,9725,9725,9725,9725,9725,4866
NY,8662,8662,8662,8662,8662,3310,2271,8662,8662,8662,8662,8662,8662,4746


## Add features

#### review length

In [None]:
yelp_df= yelp_df.assign(review_length =
    yelp_df['review_text'].apply(lambda x: len(x)))

#### week of year

In [None]:
yelp_df = yelp_df.assign(week_of_year =
    yelp_df['review_date'].apply(lambda x: time.strptime(x, "%m/%d/%Y").tm_yday // 7))

#### day of week

In [None]:
yelp_df = yelp_df.assign(day_of_week =
    yelp_df['review_date'].apply(lambda x: time.strptime(x, "%m/%d/%Y").tm_wday))

#### city mentioned

In [3]:
yelp_df = yelp_df.assign(city_mentioned = 0)

In [11]:
vecIn = np.vectorize(lambda a, b: a.lower() in b.lower() )

In [12]:
yelp_df = yelp_df.assign(city_mentioned = np.where(vecIn(yelp_df['business_city'].values, yelp_df['review_text'].values), 1, 0))

In [10]:
yelp_df['business_city'].values in yelp_df['review_text'].values

False

In [13]:
yelp_df[['city_mentioned']]

Unnamed: 0,city_mentioned
0,0
1,1
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


#### reviewer state

In [None]:
yelp_df = yelp_df.assign(reviewer_state = yelp_df['reviewer_location'].astype(str).apply(lambda x: x[-2:]))

#### POS Tagging

In [None]:
from nltk.tag.stanford import StanfordPOSTagger
from nltk import word_tokenize
from os import environ
import pickle
import nltk

def review_tokenize(reviews):
    return map(lambda review: word_tokenize(review), reviews)

def review_tager(tokenized_reviews):    
    st_model_path = r'SPOST/models/english-bidirectional-distsim.tagger'
    st = StanfordPOSTagger(st_model_path,
                           r'SPOST/stanford-postagger.jar')
    results = []
    errors = []
    count = 0

#     return map(lambda review: st.tag(review), tokenized_reviews)
    for review in tokenized_reviews:
        try:
            results.append(st.tag(review))
            count += 1
        except:
            print(count)
            errors.append(count)
            results.append(review)
            count += 1
    print('errors for the following indexes\n', errors)
    return results  # [st.tag(review) for review in tokenized_reviews]


In [None]:
# tokenized_reviews = review_tokenize(yelp_df['review_text'])
# reviews_tagged = review_tager(tokenized_reviews)

In [None]:
with open('reviews_tagged.p', 'rb') as f_reviews_tagged:
    reviews_tagged = pickle.load(f_reviews_tagged)

In [None]:
yelp_df['reviews_tagged'] = reviews_tagged

In [None]:
class pos_counter():

    adverbs = [u'RB', u'RBR', u'RBS', u'RBS\r', u'RB\r', u'RBR\r']
    simple_past = [u'VBD', u'VBD\r']
    simple_present = [u'VBP', u'VPZ', u'VBP\r', u'VPZ\r']
    past_participle = [u'VBN', u'VBN\r']
    modal = [u'MD', u'MD\r']
    pn = [u'NNP', u'NNPS', u'NNP\r', u'NNPS\r']
    prep = [u'IN', u'IN\r']
    nn = [u'NN', u'NN\r']
    adj = [u'JJ', u'JJ\r']
    dt = [u'DT', u'DT\r']

    def count_pos(tagged_reviews, pos_list):
        count = 0
        for review in tagged_reviews:
            try:
                if review[1] in pos_list:
                    count += 1
            except:
                pass
        return float(count)

## POS Features

In [None]:
yelp_df["adv count"] = [
        pos_counter.count_pos(review, pos_counter.adverbs)
        for review in reviews_tagged]

yelp_df["past prog"] = [
    pos_counter.count_pos(review, pos_counter.past_participle)
    for review in reviews_tagged]

yelp_df["simple future"] = [
    pos_counter.count_pos(review, pos_counter.modal)
    for review in reviews_tagged]

yelp_df["simple past"] = [
    pos_counter.count_pos(review, pos_counter.simple_past)
    for review in reviews_tagged]

yelp_df["simple present"] = [
    pos_counter.count_pos(review, pos_counter.simple_present)
    for review in reviews_tagged]

yelp_df['porper name'] = [
    pos_counter.count_pos(review, pos_counter.pn)
    for review in reviews_tagged]

yelp_df['prep count'] = [
    pos_counter.count_pos(review, pos_counter.prep)
    for review in reviews_tagged]

yelp_df['nn count'] = [
    pos_counter.count_pos(review, pos_counter.nn)
    for review in reviews_tagged]

yelp_df['adj count'] = [
    pos_counter.count_pos(review, pos_counter.adj)
    for review in reviews_tagged]

yelp_df['det count'] = [
    pos_counter.count_pos(review, pos_counter.dt)
    for review in reviews_tagged]

### Saliance function

## Saliance

In [None]:
def saliance(unigrams, unigram_labels, theta=.50):
    """ saliance(data) takes a dataframe and returns a list of dropable variables
    that do not meet a salience theta
    """
    unigrams = pd.concat([unigrams.reset_index(drop=True), unigram_labels], axis=1)
    unigrams_l = unigrams.query('label == "local"')
    unigrams_r = unigrams.query('label == "remote"')

    drop_words = []
    
    for word in unigrams.drop('label', axis = 1):
        normalizer = len([x for x in unigrams[word] if x > 0])
        l_prob_sum = len([x for x in unigrams_l[word] if x > 0]) / normalizer
        r_prob_sum =  len([x for x in unigrams_r[word] if x > 0]) / normalizer
        min_ = min(r_prob_sum, l_prob_sum)
        max_ = max(r_prob_sum, l_prob_sum)
        if max_ != 0:
            salience = (1 - (min_/max_))
        else:
            salience = 0
        if salience == 1 or salience < theta:
            drop_words.append(word)
    return drop_words

In [None]:
# unigram_vect = sklearn.feature_extraction.text.CountVectorizer(
#     analyzer="word",
#     tokenizer=None,
#     preprocessor=None,
#     stop_words=None,
#     max_features=1000)

# unigram_fit = unigram_vect.fit_transform(yelp_df['review_text'])

# unigrams = pd.DataFrame(
#      unigram_fit.A, columns=unigram_vect.get_feature_names())


# unigrams = pd.concat([unigrams.reset_index(drop=True), yelp_df['label']], axis=1)
# unigrams_l = unigrams.query('label == "local"')
# unigrams_r = unigrams.query('label == "remote"')

# def map_saliance(word, theta = .6):
#     """ saliance(data) takes a dataframe and returns a list of dropable variables
#     that do not meet a salience theta
#     """
#     normalizer = len([x for x in unigrams[word] if x > 0])
#     l_prob_sum = len([x for x in unigrams_l[word] if x > 0]) / normalizer
#     r_prob_sum =  len([x for x in unigrams_r[word] if x > 0]) / normalizer
#     min_ = min(r_prob_sum, l_prob_sum)
#     max_ = max(r_prob_sum, l_prob_sum)
#     if max_ != 0:
#         salience = (1 - (min_/max_))
#     else:
#         salience = 0
#     if salience > theta and max(r_prob_sum, l_prob_sum) == l_prob_sum:
#             return (word, salience, 'local')
#     elif salience > theta:
#         (word, salience, 'remote')
# map_saliance('10')

In [3]:
unigram_vect = sklearn.feature_extraction.text.CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=30000)

yelp_df = yelp_df.rename(columns={'cool': 'cool_', 'label': 'label_', 'funny': 'funny_', 'useful': 'useful_'})

unigram_fit = unigram_vect.fit_transform(yelp_df['review_text'])

unigrams = pd.DataFrame(
     unigram_fit.A, columns=unigram_vect.get_feature_names())

unigrams = pd.concat([
    unigrams.reset_index(drop=True), 
    yelp_df[['label_']]], axis=1)

sali = unigrams.groupby('label_').agg(
    lambda x: sum(x > 1)).apply(
    lambda x: 1 - (min(x)/max(x)) if max(x) != 0 else 0)


MemoryError: 

In [None]:
sali_type = unigrams.groupby('label_').agg(
    lambda x: sum(x > 1)).apply(
    lambda x: 'local' if x[0] > x[1] else 'remote')

In [None]:
l[l > .50][l != 1].sort_values()

In [None]:
# def saliance(words, local_words, remote_words, theta=.50):
#     drop_words = []
#     for i in range(words.shape[1]):
#         normalizer = words[:, i].sum()
#         l_prob_sum = local_words[:, i].sum() / normalizer
#         r_prob_sum = remote_words[:, i].sum() / normalizer

#         min_ = min(r_prob_sum, l_prob_sum)
#         max_ = max(r_prob_sum, l_prob_sum)
#         if max_ != 0:
#                 salience = (1 - (min_/max_))
#         else:
#                 salience = 0
#         if salience < theta:
#             drop_words.append(i)
#     return drop_words

In [None]:
import time

t0 = time.time()

unigram_vect = sklearn.feature_extraction.text.CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=50000)

unigram_fit = unigram_vect.fit_transform(yelp_df['review_text'])

unigram_model = pd.DataFrame(
     unigram_fit.A, columns=unigram_vect.get_feature_names())


def rank_saliance(unigrams, unigram_labels, theta=.50):

    saliance_rank = {}

    unigrams = pd.concat([unigrams.reset_index(drop=True), unigram_labels], axis=1)
    unigrams_l = unigrams.query('label == "local"')
    unigrams_r = unigrams.query('label == "remote"')

    for word in unigrams.drop('label', axis = 1):
        #normalizer = sum(unigrams[word])                 
        normalizer = len([x for x in unigrams[word] if x > 0])
        l_prob_sum = len([x for x in unigrams_l[word] if x > 0]) / normalizer
        r_prob_sum =  len([x for x in unigrams_r[word] if x > 0]) / normalizer
        min_ = min(r_prob_sum, l_prob_sum)
        max_ = max(r_prob_sum, l_prob_sum)
        if max_ != 0:
            salience = (1 - (min_/max_))
        else:
            salience = 0
        if salience > theta and max(r_prob_sum, l_prob_sum) == l_prob_sum:
            saliance_rank[word] = (word, salience, 'local')
        elif salience > theta:
            saliance_rank[word] = (salience, 'remote')
    return saliance_rank
r = rank_saliance(unigram_model, yelp_df['label'], theta=.65)
print(dict(sorted(r.items(), reverse=True, key=lambda kv: kv[1])))
t1 = time.time()
print(t1-t0)

## Sample from Date Frame

In [79]:
yelp_df = pd.read_csv('all_reviews_cleaned.csv')

In [82]:
yelp_df = yelp_df.rename(columns={'cool': 'cool_', 'label': 'label_', 'funny': 'funny_', 'useful': 'useful_'})

state_min = min(yelp_df.query('business_state != "NJ"').groupby('business_state').agg('count').iloc[:, 0 ])

print(state_min)

sample_ny = yelp_df.query('business_state == "NY"').sample(n=state_min)
sample_nv = yelp_df.query('business_state == "NV"').sample(n=state_min)
sample_ca = yelp_df.query('business_state == "CA"').sample(n=state_min)
sample_fl = yelp_df.query('business_state == "FL"').sample(n=state_min)
sample_il = yelp_df.query('business_state == "IL"').sample(n=state_min)

sample = pd.concat([sample_ny, sample_nv, sample_ca, sample_fl, sample_il]).reset_index(drop = True)

sample_min = min(sample.groupby('label_').agg('count').iloc[:, 0])

print(sample_min)

local_sample = sample.query('label_ == "local"').sample(n=sample_min)
remote_sample = sample.query('label_ == "remote"').sample(n=sample_min)

yelp_df = pd.concat([local_sample, remote_sample]).reset_index(drop = True)

6859
16484


## Clean Data frame

In [None]:
from sklearn.preprocessing import LabelEncoder
le_state = LabelEncoder()
yelp_df[['business_city']] = le_state.fit_transform(yelp_df['business_city'])
le_state = LabelEncoder()
yelp_df[['business_state']] = le_state.fit_transform(yelp_df['business_state'])
le_zip = LabelEncoder()
yelp_df[['business_zip']] = le_zip.fit_transform(yelp_df['business_zip'])
le_loc = LabelEncoder()
yelp_df[['reviewer_location']] = le_loc.fit_transform(yelp_df['reviewer_location'])
le_rstate = LabelEncoder()
yelp_df[['reviewer_state']] = le_rstate.fit_transform(yelp_df['reviewer_state'])

## Train

In [None]:
yelp_df.drop(['business_city', 'business_state', 'reviews_tagged','business_name','business_url','review_date','reviewer_id'], axis = 1).to_csv('X.csv')

In [14]:
X = yelp_df.drop(['business_city', 'business_state', 'business_name','business_url',
                  'review_date','reviewer_id'], axis = 1)

# X = yelp_df[['business_star_rating', 'business_zip', 'cool_', 
#          'funny_', 'review_raiting', 'useful_', 'review_length', 
#          'week_of_year', 'city_mentioned', 'adv count', 'past prog', 
#          'simple past', 'porper name', 'prep count', 'review_text', 'label' ]]

y = yelp_df[['label']].astype('category')

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, train_size=0.75, test_size=0.25)

unigram_vect = sklearn.feature_extraction.text.CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=1000)

unigram_fit = unigram_vect.fit_transform(X_train['review_text'])
unigram_transform = unigram_vect.transform(X_test['review_text'])


# def saliance(words, local_words, remote_words, theta=.50):
#     drop_words = []
#     for i in range(words.shape[1]):
#         normalizer = words[:, i].sum()
#         l_prob_sum = local_words[:, i].sum() / normalizer
#         r_prob_sum = remote_words[:, i].sum() / normalizer

#         min_ = min(r_prob_sum, l_prob_sum)
#         max_ = max(r_prob_sum, l_prob_sum)
#         if max_ != 0:
#                 salience = (1 - (min_/max_))
#         else:
#                 salience = 0
#         if salience < theta:
#             drop_words.append(i)
#     return drop_words


print(np.array(unigram_vect.get_feature_names())[[100,4,900]])


# unigram_train = pd.DataFrame(
#      unigram_fit.A, columns=unigram_vect.get_feature_names())

# unigram_test = pd.DataFrame(
#      unigram_transform.A, columns=unigram_vect.get_feature_names())

# local_words = unigram_vect.transform(X_train.query('label == "local"')['review_text'])
# remote_words = unigram_vect.transform(X_train.query('label == "remote"')['review_text'])
# drop_index = saliance(unigram_fit, local_words, remote_words, theta=.65)
# drop_words = unigram_train.columns[drop_index]

# unigram_labels = y_train
# drop_words = saliance(unigram_train, unigram_labels, theta=.65)

# unigram_train.drop(drop_words, axis=1, inplace=True)
# unigram_test.drop(drop_words, axis=1, inplace=True)

# print(unigram_train.shape[1], " n-grams in model")

# X_train = X_train.drop(['review_text', 'label'], axis='columns')
# X_train = X_train.join(unigram_train,
#     on=None, how='left', lsuffix='', rsuffix='', sort=False)

# X_test = X_test.drop(['review_text', 'label'], axis='columns')
# X_test = X_test.join(unigram_test,
#     on=None, how='left', lsuffix='', rsuffix='', sort=False)

# X_train = X_train.fillna(0)
# X_test = X_test.fillna(0)

['blue' '15' 'try']


In [20]:
import scipy
scipy.sparse.csr.csr_matrix(unigram_fit)[:,[1,2,3]].A

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       ...,
       [1, 0, 1],
       [0, 0, 0],
       [0, 0, 0]], dtype=int64)

In [None]:
#LassoLarsIC

In [None]:
logistic = sklearn.linear_model.LogisticRegression()
logistic_fit = logistic.fit(X_train, y_train)
y_pred = logistic_fit.predict(X_test)
cnf_matrix = confusion_matrix(y_test, y_pred)

logistic.score(X_test, y_test)

In [None]:
dec = []
num = len(y_pred)
correct = np.array(y_test['label'] == y_pred)
for i in range(1,num):
    dec.append((i/len(correct), sum(correct[:i]/i)))

In [None]:
plt.figure(figsize=(15,10))
plt.plot(*zip(*dec[0:-1:100]), dashes=[3, 3])
plt.title('Word Frequency by Frequency Rank', fontsize=25)
plt.ylabel('Accuracy', fontsize=25)
plt.xlabel('Decision', fontsize=25)

In [None]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnb.score(X_test, y_test)

In [None]:
svm = sklearn.svm.LinearSVC()
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

In [None]:
import itertools
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.figure(figsize=(20,10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=25)
    plt.grid(False)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, fontsize=25)
    plt.yticks(tick_marks, classes, fontsize=25)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()


# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
class_names = np.array(['remote', 'local'])

plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix (without normalization)')

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()