In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

# Yelp

In [54]:
def conv(x):
    try:
        return x.astype(np.int64)
    except:
        return 99

df_yelp = pd.read_csv('yelp_reviews_labelled_mini.csv')
df_yelp.dropna(inplace=True)
df_counts = df_yelp.groupby('region').count()

top_category_num = max(df_counts['text'])
top_category_name = df_counts[df_counts['text']==max(df_counts['text'])].index[0]

categories = df_counts.index.tolist()
df_counts

Unnamed: 0_level_0,text
region,Unnamed: 1_level_1
Calgary,156
Champaign,73
Charlotte,639
Cleveland,460
Las Vegas,3459
Madison,189
MontrÃ©al,302
Phoenix,3160
Pittsburgh,387
Toronto,1162


In [55]:
print("Baseline accuracy:  If we just guessed '{}' every time we would have accuracy of {:.2f}%"
      .format(top_category_name, (top_category_num/df_yelp.shape[0])*100))

Baseline accuracy:  If we just guessed 'Las Vegas' every time we would have accuracy of 34.59%


In [57]:
X = df_yelp['text'].tolist()
y = df_yelp['region'].tolist()
#y = list(map(int, y))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print("Training set has {} examples in {} categories, test set has {} examples".format(len(X_train), np.unique(y_train), len(X_test)))


Training set has 7500 examples in ['Calgary' 'Champaign' 'Charlotte' 'Cleveland' 'Las Vegas' 'Madison'
 'MontrÃ©al' 'Phoenix' 'Pittsburgh' 'Toronto' 'other'] categories, test set has 2500 examples


In [58]:
vec = CountVectorizer()
train_vocab = vec.fit_transform(X_train)
test_vocab = vec.transform(X_test)
print("There are {:,} unique words in the vocabulary set, averaging {:.0f} words per example."
      .format(train_vocab.shape[1], train_vocab.nnz/train_vocab.shape[0]))
print("   {:.4f} of the entries in the matrix are non-zero."
     .format(train_vocab.nnz/(train_vocab.shape[1]*train_vocab.shape[0])))

There are 23,959 unique words in the vocabulary set, averaging 71 words per example.
   0.0030 of the entries in the matrix are non-zero.


## Multinomial Naive Bayes

### Unigram Model

In [59]:
alpha_values = [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 3.0, 5.0, 10.0]
amax = [0, 0]
# Fit a MNB model for each value of alpha
for a in alpha_values:
    mnb = MultinomialNB(alpha=a)
    mnb.fit(train_vocab, y_train)
    mnb_predicted_labels = mnb.predict(test_vocab)
    mnb_f1 = metrics.f1_score(y_test, mnb_predicted_labels, average='weighted', labels=np.unique(mnb_predicted_labels))
    mnb_acc = metrics.accuracy_score(y_test, mnb_predicted_labels)

    # Print out the accuracy score for each alpha level
    print("F1 score for alpha={}: {:.4f}, accuracy: {:.2f}% ".format(a, mnb_f1, mnb_acc*100))
    # Keep track of which alpha value results in the highest accuracy
    if mnb_f1 > amax[1]:
        amax = [a, mnb_f1]    
# Print the optimal alpha value
print("Best alpha parameter found in test results: {}, returns an f1 score of {:.4f} "
      .format(amax[0], amax[1]))

F1 score for alpha=0.001: 0.4117, accuracy: 43.12% 
F1 score for alpha=0.01: 0.4164, accuracy: 43.64% 
F1 score for alpha=0.1: 0.4283, accuracy: 44.80% 
F1 score for alpha=0.5: 0.4425, accuracy: 47.04% 
F1 score for alpha=1.0: 0.4139, accuracy: 45.72% 
F1 score for alpha=2.0: 0.4397, accuracy: 43.68% 
F1 score for alpha=3.0: 0.4177, accuracy: 42.44% 
F1 score for alpha=5.0: 0.4539, accuracy: 40.16% 
F1 score for alpha=10.0: 0.3782, accuracy: 37.32% 
Best alpha parameter found in test results: 5.0, returns an f1 score of 0.4539 


### Bigram Model

In [60]:
# Define a bigram vocabulary
vec_bigram = CountVectorizer(ngram_range=(2,2))
train_vocab_b = vec_bigram.fit_transform(X_train)
test_vocab_b = vec_bigram.transform(X_test)

# Fit vocabulary to a Multinomial Naive Bayes classifier
mnb = MultinomialNB(alpha=amax[0])
mnb.fit(train_vocab_b, y_train)
mnb_predicted_labels = mnb.predict(test_vocab_b)
mnb_f1 = metrics.f1_score(y_test, mnb_predicted_labels, average='weighted', labels=np.unique(mnb_predicted_labels))
mnb_acc = metrics.accuracy_score(y_test, mnb_predicted_labels)

# Print out the accuracy score for each alpha level
print("F1 score for alpha={}: {:.4f}, accuracy: {:.2f}% ".format(amax[0], mnb_f1, mnb_acc*100))


F1 score for alpha=5.0: 0.3877, accuracy: 37.24% 


## Logistic Regression

In [61]:
cmax = [0, 0]
c_values = [0.010, 0.1000, 0.3000, 0.5000, 1.000, 2.000]

# Fit a LR model for each value of C
for c in c_values:
    log = LogisticRegression(C=c, penalty='l2', random_state=42, solver='lbfgs', max_iter=3000, multi_class='multinomial')
    log.fit(train_vocab, y_train)
    log_predicted_labels = log.predict(test_vocab)
    log_f1 = metrics.f1_score(y_test, log_predicted_labels, average='weighted', labels=np.unique(log_predicted_labels))
    log_acc = metrics.accuracy_score(y_test, log_predicted_labels)

    # Print out the accuracy score for each value of C
    print("F1 score for C={}: {:.4f}, accuracy: {:.2f}% ".format(c, log_f1, log_acc*100))
    # Keep track of which C value results in the highest accuracy
    if log_f1 > cmax[1]:
        cmax = [c, log_f1]  

# Print the optimal C value
print("Best C parameter found in test results: {}, returns an f1 score of {:.4f} "
      .format(cmax[0], cmax[1]))   

F1 score for C=0.01: 0.3814, accuracy: 44.52% 
F1 score for C=0.1: 0.4183, accuracy: 45.72% 
F1 score for C=0.3: 0.4148, accuracy: 44.64% 
F1 score for C=0.5: 0.4144, accuracy: 44.36% 
F1 score for C=1.0: 0.4163, accuracy: 44.20% 
F1 score for C=2.0: 0.4119, accuracy: 43.52% 
Best C parameter found in test results: 0.1, returns an f1 score of 0.4183 


### Bigram model

In [62]:
log = LogisticRegression(C=cmax[0], penalty='l2', random_state=42, solver='lbfgs', max_iter=1000, multi_class='multinomial')
log.fit(train_vocab_b, y_train)
log_predicted_labels = log.predict(test_vocab_b)
log_f1 = metrics.f1_score(y_test, log_predicted_labels, average='weighted', labels=np.unique(log_predicted_labels))
log_acc = metrics.accuracy_score(y_test, log_predicted_labels)

# Print out the accuracy score for each value of C
print("F1 score for C={}: {:.4f}, accuracy: {:.2f}% ".format(cmax[0], log_f1, log_acc*100))

F1 score for C=0.1: 0.3749, accuracy: 41.64% 


### Preprocessing

In [63]:
def empty_preprocessor(s):
    return s

def better_preprocessor(s):
    rs = s.lower()
    # Replace some separators with spaces
    rs = re.sub('\n|-|/|\.', ' ', rs)
    # Eliminate everything else that isn't a letter or number
    rs = re.sub('[^0-9a-z ]+', '', rs)
    # Eliminate extraneous spaces
    rs = re.sub('\s{2,}', ' ', rs)
    prs = []
    # Drop some low-value words
    dumbwords = ['is', 'it', 'to', 'the', 'and', 'not', 'no', 'on', 'of', 'for', 'as', 'by', 'in', 'by', 'am', 'etc', \
                 'was', 'that', 'has', 'at', 'or', 'we', 'be', 'had']
    for word in rs.split():
        # Eliminate the -ing and -ly suffices
        word = word[:-3] if word[-3:]=='ing' and len(word) > 5 else word
        word = word[:-2] if word[-2:]=='ly' and len(word) > 5 else word
        # Trim words to 9 characters
        word = word[:9] if len(word) > 9 else word
        # Eliminate single-character words
        if len(word) > 1 and word not in dumbwords:
            prs.append(word)
    
    return " ".join(prs)

proc_train_data = [better_preprocessor(x) for x in X_train]
proc_test_data = [better_preprocessor(x) for x in X_test]

In [68]:
# Make a baseline vectorizer
vec = CountVectorizer()
vocab = vec.fit_transform(X_train)
test_vocab = vec.transform(X_test)
# Make a preprocessed vectorizer
vec_proc = CountVectorizer(preprocessor=better_preprocessor)
vocab_proc = vec_proc.fit_transform(X_train)
test_vocab_proc = vec_proc.transform(X_test)

# Fit and predict the baseline
log = LogisticRegression(C=cmax[0], penalty='l2', solver='lbfgs', max_iter=4000, multi_class='multinomial')
log.fit(vocab, y_train)
log_predicted_labels = log.predict(test_vocab)
log_score = metrics.accuracy_score(y_test, log_predicted_labels)

# Fit and predict the pre-processed set
log_proc = LogisticRegression(C=cmax[0], penalty='l2', solver='lbfgs', max_iter=4000, multi_class='multinomial')
log_proc.fit(vocab_proc, y_train)
log_proc_predicted_labels = log_proc.predict(test_vocab_proc)
log_proc_score = metrics.accuracy_score(y_test, log_proc_predicted_labels)

# Print the results
print("Unprocessed: {:,} words with accuracy: {:.4f}\nPre-processed: {:,} words with accuracy: {:.4f}"
      .format(vocab.shape[1], log_score, vocab_proc.shape[1], log_proc_score))
print("Improvement: {:.4f}".format(log_proc_score-log_score))

# Find a wrong answer and print it out for better analysis
wrong = np.random.choice(np.where(y_test != log_proc_predicted_labels)[0].ravel())

print("\nSample wrong answer from the preprocessed set, post #{}:".format(wrong))
print("unprocessed prediction: {}".format(log_predicted_labels[wrong]))
print("preprocessed prediction: {}".format(log_proc_predicted_labels[wrong]))
print("true label: {}".format(y_test[wrong]))
print("true data: ",proc_test_data[wrong])
print()

Unprocessed: 23,959 words with accuracy: 0.4572
Pre-processed: 22,641 words with accuracy: 0.4576
Improvement: 0.0004

Sample wrong answer from the preprocessed set, post #558:
unprocessed prediction: Toronto
preprocessed prediction: Toronto
true label: Las Vegas
true data:  this mesa grill does live up one ny city ive eaten here two occasions dinner brunch very disappoin both times almost all dishes here were drawn with sauces think sauce should enhance dish over power only dish would recommend spicy chicken sweet potato hash their brunch menu 16 50with avocado tomato maytag blue cheese buttermil serrano dress dish very nice presented also hearty fill other good thing here their complimen mini blue corn breads sincere feel chefs this location should step up plate real refine their dishes cook skills inorder meet standard one nyc



### TFIDF

In [74]:
# Make a TFIDF Vectorizer and fit the training and dev vocabularies
vec = TfidfVectorizer()
vocab = vec.fit_transform(X_train)
test_vocab = vec.transform(X_test)
# Inverse vocabulary dictionary for word lookup
inv_vocab = {v: k for k, v in vec.vocabulary_.items()}
# Fit and predict a logistic regression based on the results
lr = LogisticRegression(C=100, solver='lbfgs', max_iter=4000, multi_class='multinomial')
lr.fit(vocab, y_train)
lr_predicted_labels = lr.predict(test_vocab)
# Calculate and print the resulting score
lr_score = metrics.f1_score(y_test, lr_predicted_labels, average='weighted', labels=np.unique(lr_predicted_labels))
lr_acc = metrics.accuracy_score(y_test, lr_predicted_labels)

print("Baseline F1 for TfidfVectorizer: {:.2f}, accuracy {:.2f}%\n".format(lr_score, lr_acc*100))

# Get the probabilities for each class prediction
probs = lr.predict_proba(test_vocab)
R = []
# Run through the probabilities and calculate the R ratio as defined in the prompt, saving the value in the R list
for x in range(0, len(probs)):
    num = np.max(probs[x])
    den = probs[x][np.unique(y_test).tolist().index(y_test[x])]
    R.append(num/den)
# Get the highest x number of R values
top = np.argsort(np.array(R))[len(R)-3:]

# Print the top misidentified documents as well as their TFIDF score and coefficients by class
print("TOP {} MISIDENTIFIED DOCUMENTS:".format(3))
c = 1
for i in top:
    print("DOCUMENT #{}".format(c))
    print("Predicted label: {} (P{:.1f}%), True label: {} (P{:.1f}%)"
          .format(lr_predicted_labels[i], np.max(probs[i])*100, y_test[i], probs[1][categories.index(y_test[1])]*100))
    print("R ratio: {:.2f}".format(R[i]))
    print(X_test[i])
    '''
    print("\n{:10} {:>10} {:>15} {:>15} {:>15} {:>22} ".format("word", "Tfidf", categories[0], categories[1], \
                                                               categories[2], categories[3]))
    for w in np.nonzero(dev_vocab[i])[1]:
        coefs = np.round(lr.coef_[:,w], 2).flat
        print("{:10} {:10.3f} {:>15} {:>15} {:>15} {:>22}".format(inv_vocab[w], dev_vocab[i][0,w], \
                                                                  coefs[0], coefs[1], coefs[2], coefs[3])
    '''
    print("----\n")
    c += 1

Baseline F1 for TfidfVectorizer: 0.44, accuracy 45.96%

TOP 3 MISIDENTIFIED DOCUMENTS:
DOCUMENT #1
Predicted label: Phoenix (P99.5%), True label: Charlotte (P20.6%)
R ratio: 29957.27
Starbucks in Terminal C.

$3.70 for a small (500ml) Fiji water. That's insane and I'm an idiot for paying it.
----

DOCUMENT #2
Predicted label: Phoenix (P99.4%), True label: Champaign (P20.6%)
R ratio: 55504.26
I came here for lunch on Friday.  It was pretty quiet- only a couple other people eating.  I had the chicken quesadilla and what was supposed to be a carne asada taco with no onions.  Prices were fairly low and the food came rather quickly.  First off, they messed up my order as the taco wasn't carne asada.  Then when I sent it back and it returned, the meat was carne asada, but was covered in onions.  By this point, I was over it, so I didn't bother to send it back a third time- just didn't eat it.  

I was expecting more from this place because of all of the rave reviews.  To be honest, my quesad

In [83]:
topwords = np.argsort(log.coef_, 1)[:, train_vocab.shape[1]-top:]
inv_vocab[topwords[0][::-1][0]]

'calgary'

### Word Analysis

In [84]:
top = 20
vec = CountVectorizer()
train_vocab = vec.fit_transform(X_train)
# Make an inverse vocabulary to look up words by index
inv_vocab = {v: k for k, v in vec.vocabulary_.items()}
log = LogisticRegression(C=cmax[0], penalty='l2', solver='lbfgs', max_iter=4000, multi_class='multinomial')
log.fit(train_vocab, y_train)
# Get the words with the highest coefficients from each class
topwords = np.argsort(log.coef_, 1)[:, train_vocab.shape[1]-top:]
df_topwords = pd.DataFrame()

for x in range(topwords.shape[0]):
    wordlist = [inv_vocab[x] for x in topwords[x][::-1]]
    df_topwords[categories[x]] = wordlist

df_topwords

Unnamed: 0,Calgary,Champaign,Charlotte,Cleveland,Las Vegas,Madison,MontrÃ©al,Phoenix,Pittsburgh,Toronto,other
0,calgary,champaign,charlotte,cleveland,vegas,madison,montreal,phoenix,pittsburgh,toronto,il
1,pub,ever,uptown,fantastic,strip,old,poutine,scottsdale,upstairs,favourite,again
2,found,too,craft,burgers,casino,always,le,arizona,times,flavour,poutine
3,service,pizza,matthews,drive,henderson,order,est,valley,recommend,rude,haddock
4,free,selection,wax,ohio,summerlin,way,terrace,az,addition,roti,das
5,massage,vet,ready,parking,show,helpful,cheap,tempe,brunch,latte,uns
6,wine,any,center,days,las,junk,de,pool,party,soft,had
7,kind,things,puppies,appetizer,greeted,things,favourite,patio,down,yonge,vineyard
8,list,chili,waited,meatballs,located,tree,floor,choose,waffles,canada,die
9,own,shop,hibachi,egg,buffet,find,meat,thank,applebee,winter,molto
