In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

# Yelp

In [49]:
def conv(x):
    try:
        return x.astype(np.int64)
    except:
        return 99

df_yelp = pd.read_csv('yelp_reviews_labelled_mini.csv')
df_yelp.dropna(inplace=True)
df_yelp['region'] = df_yelp['region'].astype(np.int64)
#df_yelp = df_yelp[df_yelp['region']<10]
categories = ['Phoenix', 'Las Vegas', 'Toronto', 'Charlotte', 'Cleveland', 'Pittsburgh', 
              'Montreal', 'Calgary', 'Madison', 'Champaign']

df_counts = df_yelp.groupby('region').count()
top_category_num = max(df_counts['text'])
top_category_name = categories[df_counts[df_counts['text']==max(df_counts['text'])].index[0]]

print("Baseline accuracy:  If we just guessed '{}' every time we would have accuracy of {:.2f}%"
      .format(top_category_name, (top_category_num/df_yelp.shape[0])*100))

Baseline accuracy:  If we just guessed 'Las Vegas' every time we would have accuracy of 34.37%


In [None]:
X = df_yelp['text'].tolist()
y = df_yelp['region'].tolist()
y = list(map(int, y))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print("Training set has {} examples in {} categories, test set has {} examples".format(len(X_train), np.unique(y_train), len(X_test)))


In [3]:
vec = CountVectorizer()
train_vocab = vec.fit_transform(X_train)
test_vocab = vec.transform(X_test)
print("There are {:,} unique words in the vocabulary set, averaging {:.0f} words per example."
      .format(train_vocab.shape[1], train_vocab.nnz/train_vocab.shape[0]))
print("   {:.4f} of the entries in the matrix are non-zero."
     .format(train_vocab.nnz/(train_vocab.shape[1]*train_vocab.shape[0])))

There are 24,298 unique words in the vocabulary set, averaging 72 words per example.
   0.0030 of the entries in the matrix are non-zero.


## Multinomial Naive Bayes

### Unigram Model

In [4]:
alpha_values = [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 3.0, 5.0, 10.0]
amax = [0, 0]
# Fit a MNB model for each value of alpha
for a in alpha_values:
    mnb = MultinomialNB(alpha=a)
    mnb.fit(train_vocab, y_train)
    mnb_predicted_labels = mnb.predict(test_vocab)
    mnb_f1 = metrics.f1_score(y_test, mnb_predicted_labels, average='weighted', labels=np.unique(mnb_predicted_labels))
    mnb_acc = metrics.accuracy_score(y_test, mnb_predicted_labels)

    # Print out the accuracy score for each alpha level
    print("F1 score for alpha={}: {:.4f}, accuracy: {:.2f}% ".format(a, mnb_f1, mnb_acc*100))
    # Keep track of which alpha value results in the highest accuracy
    if mnb_f1 > amax[1]:
        amax = [a, mnb_f1]    
# Print the optimal alpha value
print("Best alpha parameter found in test results: {}, returns an f1 score of {:.4f} "
      .format(amax[0], amax[1]))

F1 score for alpha=0.001: 0.4561, accuracy: 47.66% 
F1 score for alpha=0.01: 0.4678, accuracy: 48.26% 
F1 score for alpha=0.1: 0.4769, accuracy: 48.94% 
F1 score for alpha=0.5: 0.4831, accuracy: 51.30% 
F1 score for alpha=1.0: 0.4463, accuracy: 49.26% 
F1 score for alpha=2.0: 0.4113, accuracy: 46.01% 
F1 score for alpha=3.0: 0.3830, accuracy: 44.33% 
F1 score for alpha=5.0: 0.3890, accuracy: 42.44% 
F1 score for alpha=10.0: 0.3626, accuracy: 38.92% 
Best alpha parameter found in test results: 0.5, returns an f1 score of 0.4831 


### Bigram Model

In [5]:
# Define a bigram vocabulary
vec_bigram = CountVectorizer(ngram_range=(2,2))
train_vocab_b = vec_bigram.fit_transform(X_train)
test_vocab_b = vec_bigram.transform(X_test)

# Fit vocabulary to a Multinomial Naive Bayes classifier
mnb = MultinomialNB(alpha=amax[0])
mnb.fit(train_vocab_b, y_train)
mnb_predicted_labels = mnb.predict(test_vocab_b)
mnb_f1 = metrics.f1_score(y_test, mnb_predicted_labels, average='weighted', labels=np.unique(mnb_predicted_labels))
mnb_acc = metrics.accuracy_score(y_test, mnb_predicted_labels)

# Print out the accuracy score for each alpha level
print("F1 score for alpha={}: {:.4f}, accuracy: {:.2f}% ".format(amax[0], mnb_f1, mnb_acc*100))


F1 score for alpha=0.5: 0.3882, accuracy: 44.09% 


## Logistic Regression

In [6]:
cmax = [0, 0]
c_values = [0.010, 0.1000, 0.3000, 0.5000, 1.000, 2.000]

# Fit a LR model for each value of C
for c in c_values:
    log = LogisticRegression(C=c, penalty='l2', random_state=42, solver='lbfgs', max_iter=3000, multi_class='multinomial')
    log.fit(train_vocab, y_train)
    log_predicted_labels = log.predict(test_vocab)
    log_f1 = metrics.f1_score(y_test, log_predicted_labels, average='weighted', labels=np.unique(log_predicted_labels))
    log_acc = metrics.accuracy_score(y_test, log_predicted_labels)

    # Print out the accuracy score for each value of C
    print("F1 score for C={}: {:.4f}, accuracy: {:.2f}% ".format(c, log_f1, log_acc*100))
    # Keep track of which C value results in the highest accuracy
    if log_f1 > cmax[1]:
        cmax = [c, log_f1]  

# Print the optimal C value
print("Best C parameter found in test results: {}, returns an f1 score of {:.4f} "
      .format(cmax[0], cmax[1]))   

F1 score for C=0.01: 0.4132, accuracy: 46.41% 
F1 score for C=0.1: 0.4459, accuracy: 47.33% 
F1 score for C=0.3: 0.4450, accuracy: 46.45% 
F1 score for C=0.5: 0.4444, accuracy: 46.09% 
F1 score for C=1.0: 0.4465, accuracy: 46.01% 
F1 score for C=2.0: 0.4417, accuracy: 45.29% 
Best C parameter found in test results: 1.0, returns an f1 score of 0.4465 


### Bigram model

In [7]:
log = LogisticRegression(C=cmax[0], penalty='l2', random_state=42, solver='lbfgs', max_iter=1000, multi_class='multinomial')
log.fit(train_vocab_b, y_train)
log_predicted_labels = log.predict(test_vocab_b)
log_f1 = metrics.f1_score(y_test, log_predicted_labels, average='weighted', labels=np.unique(log_predicted_labels))
log_acc = metrics.accuracy_score(y_test, log_predicted_labels)

# Print out the accuracy score for each value of C
print("F1 score for C={}: {:.4f}, accuracy: {:.2f}% ".format(cmax[0], log_f1, log_acc*100))

F1 score for C=1.0: 0.3725, accuracy: 41.16% 


### Preprocessing

In [8]:
def empty_preprocessor(s):
    return s

def better_preprocessor(s):
    rs = s.lower()
    # Replace some separators with spaces
    rs = re.sub('\n|-|/|\.', ' ', rs)
    # Eliminate everything else that isn't a letter or number
    rs = re.sub('[^0-9a-z ]+', '', rs)
    # Eliminate extraneous spaces
    rs = re.sub('\s{2,}', ' ', rs)
    prs = []
    # Drop some low-value words
    dumbwords = ['is', 'it', 'to', 'the', 'and', 'not', 'no', 'on', 'of', 'for', 'as', 'by', 'in', 'by', 'am', 'etc', \
                 'was', 'that', 'has', 'at', 'or', 'we', 'be', 'had']
    for word in rs.split():
        # Eliminate the -ing and -ly suffices
        word = word[:-3] if word[-3:]=='ing' and len(word) > 5 else word
        word = word[:-2] if word[-2:]=='ly' and len(word) > 5 else word
        # Trim words to 9 characters
        word = word[:9] if len(word) > 9 else word
        # Eliminate single-character words
        if len(word) > 1 and word not in dumbwords:
            prs.append(word)
    
    return " ".join(prs)

proc_train_data = [better_preprocessor(x) for x in X_train]
proc_test_data = [better_preprocessor(x) for x in X_test]

In [9]:
# Make a baseline vectorizer
vec = CountVectorizer()
vocab = vec.fit_transform(X_train)
test_vocab = vec.transform(X_test)
# Make a preprocessed vectorizer
vec_proc = CountVectorizer(preprocessor=better_preprocessor)
vocab_proc = vec_proc.fit_transform(X_train)
test_vocab_proc = vec_proc.transform(X_test)

# Fit and predict the baseline
log = LogisticRegression(C=cmax[0], penalty='l2', solver='lbfgs', max_iter=4000, multi_class='multinomial')
log.fit(vocab, y_train)
log_predicted_labels = log.predict(test_vocab)
log_score = metrics.accuracy_score(y_test, log_predicted_labels)

# Fit and predict the pre-processed set
log_proc = LogisticRegression(C=cmax[0], penalty='l2', solver='lbfgs', max_iter=4000, multi_class='multinomial')
log_proc.fit(vocab_proc, y_train)
log_proc_predicted_labels = log_proc.predict(test_vocab_proc)
log_proc_score = metrics.accuracy_score(y_test, log_proc_predicted_labels)

# Print the results
print("Unprocessed: {:,} words with accuracy: {:.4f}\nPre-processed: {:,} words with accuracy: {:.4f}"
      .format(vocab.shape[1], log_score, vocab_proc.shape[1], log_proc_score))
print("Improvement: {:.4f}".format(log_proc_score-log_score))

# Find a wrong answer and print it out for better analysis
wrong = np.random.choice(np.where(y_test != log_proc_predicted_labels)[0].ravel())

print("\nSample wrong answer from the preprocessed set, post #{}:".format(wrong))
print("unprocessed prediction: {} ({})".format(categories[log_predicted_labels[wrong]], log_predicted_labels[wrong]))
print("preprocessed prediction: {} ({})".format(categories[log_proc_predicted_labels[wrong]], log_proc_predicted_labels[wrong]))
print("true label: {} ({})".format(categories[y_test[wrong]], y_test[wrong]))
print("true data: ",proc_test_data[wrong])
print()

Unprocessed: 24,298 words with accuracy: 0.4601
Pre-processed: 22,994 words with accuracy: 0.4629
Improvement: 0.0028

Sample wrong answer from the preprocessed set, post #1785:
unprocessed prediction: Las Vegas (1)
preprocessed prediction: Las Vegas (1)
true label: Phoenix (0)
true data:  eddie great help very accommoda however this location suffers particula from lack organizat honesty multiple occasions have been here only have technicia oblivious lie straight my face claim needed unnecessa repairs my most recent visit 10 17 took them hours fulfill basic oil change three half hours all while other customers were com out per usual after hours employee grunt advised me stand outside wait them they would finished short stand outside observed absence sense urgency technicia monotonou grudge around slow carry out their orders until every order finished except mine observed group technicia most stand around semi circle loud curs profanity racial slurs careful they will purpose dodge try s

### TFIDF

In [10]:
# Make a TFIDF Vectorizer and fit the training and dev vocabularies
vec = TfidfVectorizer()
vocab = vec.fit_transform(X_train)
test_vocab = vec.transform(X_test)
# Inverse vocabulary dictionary for word lookup
inv_vocab = {v: k for k, v in vec.vocabulary_.items()}
# Fit and predict a logistic regression based on the results
lr = LogisticRegression(C=100, solver='lbfgs', max_iter=4000, multi_class='multinomial')
lr.fit(vocab, y_train)
lr_predicted_labels = lr.predict(test_vocab)
# Calculate and print the resulting score
lr_score = metrics.f1_score(y_test, lr_predicted_labels, average='weighted')
lr_acc = metrics.accuracy_score(y_test, lr_predicted_labels)

print("Baseline F1 for TfidfVectorizer: {:.2f}, accuracy {:.2f}%\n".format(lr_score, lr_acc*100))

# Get the probabilities for each class prediction
probs = lr.predict_proba(test_vocab)
R = []
# Run through the probabilities and calculate the R ratio as defined in the prompt, saving the value in the R list
for x in range(0, len(probs)):
    num = np.max(probs[x])
    den = probs[x][np.unique(y_test).tolist().index(y_test[x])]
    R.append(num/den)
# Get the highest x number of R values
top = np.argsort(np.array(R))[len(R)-3:]

# Print the top misidentified documents as well as their TFIDF score and coefficients by class
print("TOP {} MISIDENTIFIED DOCUMENTS:".format(3))
c = 1
for i in top:
    print("DOCUMENT #{}".format(c))
    print("Predicted label: {}[{}] (P{:.1f}%), True label: {}[{}] (P{:.1f}%)"
          .format(categories[lr_predicted_labels[i]], lr_predicted_labels[i], np.max(probs[i])*100, \
                            categories[y_test[i]], y_test[i], probs[i][y_test[i]]*100))
    print("R ratio: {:.2f}".format(R[i]))
    print(X_test[i])
    '''
    print("\n{:10} {:>10} {:>15} {:>15} {:>15} {:>22} ".format("word", "Tfidf", categories[0], categories[1], \
                                                               categories[2], categories[3]))
    for w in np.nonzero(dev_vocab[i])[1]:
        coefs = np.round(lr.coef_[:,w], 2).flat
        print("{:10} {:10.3f} {:>15} {:>15} {:>15} {:>22}".format(inv_vocab[w], dev_vocab[i][0,w], \
                                                                  coefs[0], coefs[1], coefs[2], coefs[3])
    '''
    print("----\n")
    c += 1

Baseline F1 for TfidfVectorizer: 0.46, accuracy 47.37%

TOP 3 MISIDENTIFIED DOCUMENTS:
DOCUMENT #1
Predicted label: Las Vegas[1] (P100.0%), True label: Phoenix[0] (P0.0%)
R ratio: 37951.19
Been here twice and both times I thought both times the value for the money was great.  The happy hour and lunch specials will keep me coming back.  Awesome Vegas roll.  I judge all sushi restaurants by the Vegas roll! We also had sashimi and spicy tuna roll which were great!
----

DOCUMENT #2
Predicted label: Phoenix[0] (P100.0%), True label: Las Vegas[1] (P0.0%)
R ratio: 45201.00
Friendly and excellent guest services. Thank you Arizona our Waitness and Phi the manager. We will be back.
----

DOCUMENT #3
Predicted label: Phoenix[0] (P100.0%), True label: Las Vegas[1] (P0.0%)
R ratio: 38261497.63
Just as good the one in Scottsdale, AZ. Great bartenders, lots of TV and lots of great beer on tap.
----



### Word Analysis

In [11]:
top = 20
vec = CountVectorizer()
train_vocab = vec.fit_transform(X_train)
# Make an inverse vocabulary to look up words by index
inv_vocab = {v: k for k, v in vec.vocabulary_.items()}
log = LogisticRegression(C=cmax[0], penalty='l2', solver='lbfgs', max_iter=4000, multi_class='multinomial')
log.fit(train_vocab, y_train)
# Get the words with the highest coefficients from each class
topwords = np.argsort(log.coef_, 1)[:, train_vocab.shape[1]-top:]
df_topwords = pd.DataFrame()

for x in range(topwords.shape[0]):
    wordlist = [inv_vocab[x] for x in topwords[x]]
    df_topwords[categories[x]] = wordlist

df_topwords

Unnamed: 0,Phoenix,Las Vegas,Toronto,Charlotte,Cleveland,Pittsburgh,Montreal
0,true,sucked,village,arrive,spots,remember,only
1,tostadas,baby,uncomfortable,terrific,sarah,starbucks,around
2,wonton,tempura,westerns,interested,window,break,peanut
3,adequate,wynn,waffles,blvd,favorites,filling,sometimes
4,ray,lv,canada,lady,fall,florida,cheese
5,orange,nevada,rooftop,concord,beers,laws,regulars
6,cooks,following,multiple,flight,deliver,hoagies,session
7,flies,banana,court,inventory,american,mt,conversation
8,brews,tooth,upload,smelled,margarita,burgh,dish
9,chandler,noodle,yonge,gnocchi,le,milkshakes,rad
