In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

# Twitter

In [14]:
def conv(x):
    """Cast ``x`` to 64-bit integers, falling back to the sentinel ``99``.

    Parameters
    ----------
    x : object
        Any value exposing a NumPy-style ``astype`` method (for example a
        NumPy scalar or array).

    Returns
    -------
    object
        ``x.astype(np.int64)`` when the cast succeeds, otherwise ``99``.
    """
    try:
        return x.astype(np.int64)
    except Exception:
        # Only ordinary failures (no ``astype`` attribute, values that cannot
        # be cast, ...) fall back to the sentinel.  Unlike a bare ``except``,
        # this lets KeyboardInterrupt and SystemExit propagate.
        return 99

# Load the labelled tweets and discard every row that has a missing value.
df_twitter = pd.read_csv('labelled_tweet_locations.csv').dropna()

# Per-region count of non-null values in each remaining column.
df_counts = df_twitter.groupby('region').count()

# Size and name of the largest region (the first one listed wins a tie).
text_counts = df_counts['text']
top_category_num = text_counts.max()
top_category_name = text_counts.idxmax()

# Region names in the order produced by the groupby.
categories = df_counts.index.tolist()
df_counts

Unnamed: 0_level_0,text
region,Unnamed: 1_level_1
albuquerque,259
billings,95
calgary,16
charlotte,1126
chicago,1030
cincinnati,797
denver,316
houston,1887
kansas city,465
las vegas,335


In [3]:
# Accuracy, in percent, of always predicting the most frequent region.
baseline_pct = (top_category_num / df_twitter.shape[0]) * 100
baseline_msg = "Baseline accuracy:  If we just guessed '{}' every time we would have accuracy of {:.2f}%"
print(baseline_msg.format(top_category_name, baseline_pct))

Baseline accuracy:  If we just guessed 'new york' every time we would have accuracy of 12.63%


In [4]:
# The tweet text is the feature and the region name is the target.
X, y = (df_twitter[column].tolist() for column in ('text', 'region'))

# Hold out a quarter of the examples; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

n_train_categories = len(np.unique(y_train))
print("Training set has {} examples in {} categories, test set has {} examples".format(
    len(X_train), n_train_categories, len(X_test)))


Training set has 14059 examples in 23 categories, test set has 4687 examples


In [5]:
# Bag-of-words counts: the vocabulary is learned from the training text only,
# and the test text is encoded with that same vocabulary.
vec = CountVectorizer()
train_vocab = vec.fit_transform(X_train)
test_vocab = vec.transform(X_test)

# nnz is the number of stored entries in the sparse document-term matrix.
n_examples, n_terms = train_vocab.shape
n_stored = train_vocab.nnz
print("There are {:,} unique words in the vocabulary set, averaging {:.0f} words per example."
      .format(n_terms, n_stored/n_examples))
print("   {:.4f} of the entries in the matrix are non-zero."
     .format(n_stored/(n_terms*n_examples)))

There are 22,828 unique words in the vocabulary set, averaging 13 words per example.
   0.0006 of the entries in the matrix are non-zero.


## Multinomial Naive Bayes

### Unigram Model

In [6]:
alpha_values = [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 3.0, 5.0, 10.0]
# amax holds [best alpha seen so far, its weighted F1 score].
amax = [0, 0]

# Train one unigram Naive Bayes model per smoothing value and score it.
# NOTE(review): the score comes from test_vocab / y_test, so alpha is being
# selected on the same split that is reported as the test result.
for a in alpha_values:
    mnb = MultinomialNB(alpha=a).fit(train_vocab, y_train)
    mnb_predicted_labels = mnb.predict(test_vocab)

    # Weighted F1 computed only over the labels the model actually predicted.
    predicted_classes = np.unique(mnb_predicted_labels)
    mnb_f1 = metrics.f1_score(y_test, mnb_predicted_labels,
                              labels=predicted_classes, average='weighted')
    mnb_acc = metrics.accuracy_score(y_test, mnb_predicted_labels)

    # Report F1 and accuracy for this alpha
    print("F1 score for alpha={}: {:.4f}, accuracy: {:.2f}% ".format(a, mnb_f1, mnb_acc*100))

    # Strict comparison: the first alpha to reach the best F1 is the one kept.
    if amax[1] < mnb_f1:
        amax = [a, mnb_f1]

# Report the alpha with the highest F1
print("Best alpha parameter found in test results: {}, returns an f1 score of {:.4f} "
      .format(*amax))

F1 score for alpha=0.001: 0.1550, accuracy: 16.02% 
F1 score for alpha=0.01: 0.1572, accuracy: 16.17% 
F1 score for alpha=0.1: 0.1657, accuracy: 17.43% 
F1 score for alpha=0.5: 0.1597, accuracy: 18.41% 
F1 score for alpha=1.0: 0.1381, accuracy: 17.28% 
F1 score for alpha=2.0: 0.1100, accuracy: 15.38% 
F1 score for alpha=3.0: 0.0992, accuracy: 14.70% 
F1 score for alpha=5.0: 0.0795, accuracy: 13.74% 
F1 score for alpha=10.0: 0.0641, accuracy: 12.93% 
Best alpha parameter found in test results: 0.1, returns an f1 score of 0.1657 


### Bigram Model

In [7]:
# Vocabulary made of adjacent word pairs only (ngram_range=(2, 2)).
vec_bigram = CountVectorizer(ngram_range=(2, 2))
train_vocab_b = vec_bigram.fit_transform(X_train)
test_vocab_b = vec_bigram.transform(X_test)

# Naive Bayes on the bigram counts, reusing the alpha stored in amax[0].
mnb = MultinomialNB(alpha=amax[0]).fit(train_vocab_b, y_train)
mnb_predicted_labels = mnb.predict(test_vocab_b)

# Weighted F1 computed only over the labels the model actually predicted.
predicted_classes = np.unique(mnb_predicted_labels)
mnb_f1 = metrics.f1_score(y_test, mnb_predicted_labels,
                          labels=predicted_classes, average='weighted')
mnb_acc = metrics.accuracy_score(y_test, mnb_predicted_labels)

# Report F1 and accuracy for the bigram model
print("F1 score for alpha={}: {:.4f}, accuracy: {:.2f}% ".format(amax[0], mnb_f1, mnb_acc*100))


F1 score for alpha=0.1: 0.1306, accuracy: 14.08% 


## Logistic Regression

In [8]:
# cmax holds [best C seen so far, its weighted F1 score].
cmax = [0, 0]
c_values = [0.010, 0.1000, 0.3000, 0.5000, 1.000, 2.000, 3.0]

# Train one L2-regularised multinomial logistic regression per value of C.
# NOTE(review): the score comes from test_vocab / y_test, so C is being
# selected on the same split that is reported as the test result.
for c in c_values:
    log = LogisticRegression(
        C=c,
        penalty='l2',
        solver='lbfgs',
        multi_class='multinomial',
        max_iter=3000,
        random_state=42,
    ).fit(train_vocab, y_train)
    log_predicted_labels = log.predict(test_vocab)

    # Weighted F1 computed only over the labels the model actually predicted.
    predicted_classes = np.unique(log_predicted_labels)
    log_f1 = metrics.f1_score(y_test, log_predicted_labels,
                              labels=predicted_classes, average='weighted')
    log_acc = metrics.accuracy_score(y_test, log_predicted_labels)

    # Report F1 and accuracy for this C
    print("F1 score for C={}: {:.4f}, accuracy: {:.2f}% ".format(c, log_f1, log_acc*100))

    # Strict comparison: the first C to reach the best F1 is the one kept.
    if cmax[1] < log_f1:
        cmax = [c, log_f1]

# Report the C with the highest F1
print("Best C parameter found in test results: {}, returns an f1 score of {:.4f} "
      .format(*cmax))

F1 score for C=0.01: 0.0933, accuracy: 14.02% 
F1 score for C=0.1: 0.1645, accuracy: 18.99% 
F1 score for C=0.3: 0.1858, accuracy: 19.93% 
F1 score for C=0.5: 0.1859, accuracy: 19.54% 
F1 score for C=1.0: 0.1818, accuracy: 18.90% 
F1 score for C=2.0: 0.1802, accuracy: 18.54% 
F1 score for C=3.0: 0.1775, accuracy: 18.22% 
Best C parameter found in test results: 0.5, returns an f1 score of 0.1859 


### Bigram Model

In [9]:
# Logistic regression on the bigram counts, reusing the C stored in cmax[0].
log = LogisticRegression(
    C=cmax[0],
    penalty='l2',
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=1000,
    random_state=42,
).fit(train_vocab_b, y_train)
log_predicted_labels = log.predict(test_vocab_b)

# Weighted F1 computed only over the labels the model actually predicted.
predicted_classes = np.unique(log_predicted_labels)
log_f1 = metrics.f1_score(y_test, log_predicted_labels,
                          labels=predicted_classes, average='weighted')
log_acc = metrics.accuracy_score(y_test, log_predicted_labels)

# Report F1 and accuracy for the bigram model
print("F1 score for C={}: {:.4f}, accuracy: {:.2f}% ".format(cmax[0], log_f1, log_acc*100))

F1 score for C=0.5: 0.1357, accuracy: 16.19% 


### Preprocessing

In [10]:
def empty_preprocessor(s):
    """Identity preprocessor: hand the input back without modifying it."""
    return s

def better_preprocessor(s):
    """Normalise a raw text string into a space-separated string of tokens.

    Steps, in order: lower-case the text; turn newlines, hyphens, slashes and
    periods into spaces; delete every remaining character that is not
    ``0-9``, ``a-z`` or a space; then, for each whitespace-separated word,
    strip a trailing ``ing`` and then a trailing ``ly`` (each only while the
    word is longer than 5 characters), truncate to 9 characters, and drop the
    word if it is a single character or one of the low-value words.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        The surviving words joined by single spaces (``""`` if none survive).
    """
    # Low-value words that are dropped outright.  A frozenset gives
    # constant-time membership tests and removes the duplicated entry.
    dumbwords = frozenset((
        'is', 'it', 'to', 'the', 'and', 'not', 'no', 'on', 'of', 'for', 'as',
        'by', 'in', 'am', 'etc', 'was', 'that', 'has', 'at', 'or', 'we', 'be',
        'had',
    ))

    rs = s.lower()
    # Raw strings keep the regex escapes (\. and \s) out of Python's own
    # string-escape handling, which warns about unrecognised sequences.
    # Replace some separators with spaces
    rs = re.sub(r'\n|-|/|\.', ' ', rs)
    # Eliminate everything else that isn't a letter, digit or space.  Other
    # whitespace such as tabs is deleted here, not converted to a space.
    rs = re.sub(r'[^0-9a-z ]+', '', rs)
    # Eliminate extraneous spaces
    rs = re.sub(r'\s{2,}', ' ', rs)

    prs = []
    for word in rs.split():
        # Strip the -ing and -ly suffixes from words longer than 5 characters;
        # the -ly check sees the word after any -ing removal.
        if len(word) > 5 and word.endswith('ing'):
            word = word[:-3]
        if len(word) > 5 and word.endswith('ly'):
            word = word[:-2]
        # Trim words to 9 characters
        word = word[:9]
        # Drop single-character words and low-value words
        if len(word) > 1 and word not in dumbwords:
            prs.append(word)

    return " ".join(prs)

# Apply better_preprocessor to every document of each split.
proc_train_data = list(map(better_preprocessor, X_train))
proc_test_data = list(map(better_preprocessor, X_test))

In [23]:
# Make a baseline vectorizer (raw text, default CountVectorizer settings)
vec = CountVectorizer()
vocab = vec.fit_transform(X_train)
test_vocab = vec.transform(X_test)
# Make a preprocessed vectorizer: every document goes through
# better_preprocessor before it is tokenised
vec_proc = CountVectorizer(preprocessor=better_preprocessor)
vocab_proc = vec_proc.fit_transform(X_train)
test_vocab_proc = vec_proc.transform(X_test)

# Fit and predict the baseline
log = LogisticRegression(C=cmax[0], penalty='l2', solver='lbfgs', max_iter=4000, multi_class='multinomial')
log.fit(vocab, y_train)
log_predicted_labels = log.predict(test_vocab)
log_score = metrics.accuracy_score(y_test, log_predicted_labels)

# Fit and predict the pre-processed set
log_proc = LogisticRegression(C=cmax[0], penalty='l2', solver='lbfgs', max_iter=4000, multi_class='multinomial')
log_proc.fit(vocab_proc, y_train)
log_proc_predicted_labels = log_proc.predict(test_vocab_proc)
log_proc_score = metrics.accuracy_score(y_test, log_proc_predicted_labels)

# Print the results
print("Unprocessed: {:,} words with accuracy: {:.4f}\nPre-processed: {:,} words with accuracy: {:.4f}"
      .format(vocab.shape[1], log_score, vocab_proc.shape[1], log_proc_score))
print("Improvement: {:.4f}".format(log_proc_score-log_score))

# Find a wrong answer and print it out for better analysis.
# y_test is a plain list, so convert it to an array to make the element-wise
# comparison with the predictions explicit.
wrong_indices = np.flatnonzero(np.asarray(y_test) != log_proc_predicted_labels)

if wrong_indices.size:
    # A seeded generator shows the same sample on every re-run of the cell.
    wrong = np.random.RandomState(42).choice(wrong_indices)

    print("\nSample wrong answer from the preprocessed set, post #{}:".format(wrong))
    print("unprocessed prediction: {}".format(log_predicted_labels[wrong]))
    print("preprocessed prediction: {}".format(log_proc_predicted_labels[wrong]))
    print("true label: {}".format(y_test[wrong]))
    # The text shown is the preprocessed version of the document.
    print("true data: ",proc_test_data[wrong])
else:
    # Sampling from an empty index set would raise, so report it instead.
    print("\nThe preprocessed model made no wrong predictions on the test set.")
print()

Unprocessed: 22,828 words with accuracy: 0.1954
Pre-processed: 21,719 words with accuracy: 0.1931
Improvement: -0.0023

Sample wrong answer from the preprocessed set, post #187:
unprocessed prediction: tampa
preprocessed prediction: tampa
true label: chicago
true data:  id like know more about heard he hid out southern state



### TFIDF

In [28]:
# Number of worst-ranked test documents to display
top_n = 3

# Make a TFIDF Vectorizer and fit the training and test vocabularies
vec = TfidfVectorizer()
vocab = vec.fit_transform(X_train)
test_vocab = vec.transform(X_test)
# Inverse vocabulary dictionary for word lookup
inv_vocab = {v: k for k, v in vec.vocabulary_.items()}
# Fit and predict a logistic regression based on the results
lr = LogisticRegression(C=100, solver='lbfgs', max_iter=4000, multi_class='multinomial')
lr.fit(vocab, y_train)
lr_predicted_labels = lr.predict(test_vocab)
# Calculate and print the resulting score
lr_score = metrics.f1_score(y_test, lr_predicted_labels, average='weighted')
lr_acc = metrics.accuracy_score(y_test, lr_predicted_labels)

print("Baseline F1 for TfidfVectorizer: {:.2f}, accuracy {:.2f}%\n".format(lr_score, lr_acc*100))

# Get the probabilities for each class prediction
probs = lr.predict_proba(test_vocab)
# The columns of probs follow lr.classes_, so the column of each true label is
# looked up there (not in the labels that happen to occur in y_test).
# A test label the model never saw in training raises KeyError here.
class_column = {label: col for col, label in enumerate(lr.classes_)}
true_probs = np.array([probs[row, class_column[label]] for row, label in enumerate(y_test)])
# R ratio: highest predicted probability divided by the probability the model
# assigned to the true label
R = (probs.max(axis=1) / true_probs).tolist()
# Indices of the top_n largest R values, in ascending order of R
top = np.argsort(np.array(R))[max(len(R) - top_n, 0):]

# Print the top misidentified documents (the last one printed has the largest R)
print("TOP {} MISIDENTIFIED DOCUMENTS:".format(top_n))
for c, i in enumerate(top, start=1):
    print("DOCUMENT #{}".format(c))
    # Both probabilities are taken from row i, the document being printed
    print("Predicted label: {} (P{:.1f}%), True label: {} (P{:.1f}%)"
          .format(lr_predicted_labels[i], np.max(probs[i])*100, y_test[i], true_probs[i]*100))
    print("R ratio: {:.2f}".format(R[i]))
    print(X_test[i])
    print("----\n")

Baseline F1 for TfidfVectorizer: 0.17, accuracy 17.26%

TOP 3 MISIDENTIFIED DOCUMENTS:
DOCUMENT #1
Predicted label: new york (P99.7%), True label: nashville (P0.0%)
R ratio: 285295.66
Looking for a good checking alternative to  . Suggestions?
----

DOCUMENT #2
Predicted label: los angeles (P98.9%), True label: seattle (P0.0%)
R ratio: 341757.33
Send some to California
----

DOCUMENT #3
Predicted label: charlotte (P99.5%), True label: houston (P0.0%)
R ratio: 897102.93
This guy is taking full responsibility of his Dreambox journey! He is very consistent on seeing how many standards he is meeting each week!   😍 
----



### Word Analysis

In [29]:
# Number of highest-weighted words to list per class
top = 20
vec = CountVectorizer()
train_vocab = vec.fit_transform(X_train)
# Make an inverse vocabulary to look up words by index
inv_vocab = {v: k for k, v in vec.vocabulary_.items()}
log = LogisticRegression(C=cmax[0], penalty='l2', solver='lbfgs', max_iter=4000, multi_class='multinomial')
log.fit(train_vocab, y_train)
# For each class (row of coef_), the column indices of the `top` largest
# coefficients, in ascending order of coefficient
topwords = np.argsort(log.coef_, 1)[:, train_vocab.shape[1]-top:]

# The rows of coef_ follow log.classes_, so the columns are labelled from the
# fitted model.  Each column lists that class's words from the largest
# coefficient downwards, and the frame is built in a single step.
df_topwords = pd.DataFrame({
    label: [inv_vocab[word_idx] for word_idx in class_top[::-1]]
    for label, class_top in zip(log.classes_, topwords)
})

df_topwords

Unnamed: 0,albuquerque,billings,calgary,charlotte,chicago,cincinnati,denver,houston,kansas city,las vegas,...,nashville,new york,oklahoma city,phoenix,pittsburgh,san francisco,seattle,tampa,toronto,washington
0,nm,mt,id,nc,il,ohio,co,tx,mo,vegas,...,al,ct,dallas,arizona,cleveland,ca,wa,fl,toronto,va
1,tx,id,accounting,sc,wi,ky,colorado,houston,ne,ut,...,tn,nj,tx,az,pittsburgh,nv,portland,florida,ny,md
2,progress,wy,with,carolina,chicago,louisville,denver,la,ia,nv,...,atlanta,ny,ar,phoenix,pa,california,id,miami,mi,maryland
3,does,great,mt,ga,milwaukee,indianapolis,trade,winniesun,ks,nut,...,ms,ma,ok,native,oh,oakland,seattle,orlando,ontario,dc
4,mexico,needs,hiring,va,michigan,marion,salt,austin,nebraska,utah,...,ga,boston,texas,americans,number,sf,vancouver,beach,hamilton,virginia
5,number,too,summer,charlotte,mi,comes,beautiful,texas,desmoines,mountain,...,nashville,york,landscapechat,fine,wv,relax,exciting,disney,daughter,baltimore
6,ruthless,careerarc,godaddy,ncat,wisconsin,cincinnati,wy,sanantonio,iowa,booth,...,tennessee,brooklyn,weeks,costume,watch,sacramento,election,single,lmfao,washington
7,beto,idaho,ssl,congrats,illinois,kickback,decisions,fw,disappointed,turned,...,mississippi,ri,oklahoma,november,bill,slap,nampa,kingdom,cry,happened
8,alright,set,certificate,argue,lafayette,dang,spell,weeks,kinda,hotel,...,ky,nyc,tulsa,forward,akron,memories,test,easy,buffalo,episode
9,away,wow,cad,cousin,cedarrapids,proud,coloradosprings,growing,somebody,sema2018,...,memphis,pa,laughing,morning,philly,anime,oregon,tampa,union,de
