In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

# Twitter

In [18]:
def conv(x):
    """Cast ``x`` to 64-bit integers, falling back to the sentinel ``99``.

    Returns the result of ``x.astype(np.int64)``. If that call fails for an
    ordinary reason (for example ``x`` has no ``astype`` method, or it holds
    values that cannot be converted), ``99`` is returned instead.
    """
    try:
        return x.astype(np.int64)
    except Exception:
        # Only ordinary errors are treated as "not convertible"; a bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        return 99

# Read the first 100,000 labelled tweets and discard rows with missing values.
# (The 'region' column keeps whatever dtype read_csv inferred; no cast is applied.)
df_twitter = pd.read_csv('tweets_labelled_balanced.csv', nrows=100000).dropna()

# Number of non-null entries per region, for every remaining column.
df_counts = df_twitter.groupby('region').count()

# Size and label of the most frequent region (first one wins on ties).
tweets_per_region = df_counts['text']
top_category_num = tweets_per_region.max()
top_category_name = tweets_per_region.idxmax()

# Region labels in the order they appear in the grouped index.
categories = df_counts.index.tolist()
df_counts

Unnamed: 0_level_0,text
region,Unnamed: 1_level_1
3,7723
4,7745
5,7809
7,7655
10,7633
13,7760
14,7485
15,7766
18,7653
19,7767


In [13]:
# Percentage of tweets that belong to the single most frequent region.
baseline_pct = (top_category_num/len(df_twitter))*100
print("Baseline accuracy:  If we just guessed '{}' every time we would have accuracy of {:.2f}%"
      .format(top_category_name, baseline_pct))

Baseline accuracy:  If we just guessed '5' every time we would have accuracy of 7.81%


In [12]:
# Tweet text is the model input; the region label is the prediction target.
X = df_twitter['text'].tolist()
y = df_twitter['region'].tolist()

# Hold out 25% of the examples for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

n_train_classes = np.unique(y_train).size
print("Training set has {} examples in {} categories, test set has {} examples".format(len(X_train), n_train_classes, len(X_test)))


Training set has 75000 examples in 13 categories, test set has 25000 examples


In [14]:
# Learn the vocabulary on the training text, then encode the test text with it.
vec = CountVectorizer()
train_vocab = vec.fit_transform(X_train)
test_vocab = vec.transform(X_test)

# Summarise the size and sparsity of the training document-term matrix.
n_docs, n_terms = train_vocab.shape
n_nonzero = train_vocab.nnz
print("There are {:,} unique words in the vocabulary set, averaging {:.0f} words per example."
      .format(n_terms, n_nonzero/n_docs))
print("   {:.4f} of the entries in the matrix are non-zero."
     .format(n_nonzero/(n_terms*n_docs)))

There are 61,334 unique words in the vocabulary set, averaging 13 words per example.
   0.0002 of the entries in the matrix are non-zero.


## Multinomial Naive Bayes

### Unigram Model

In [19]:
# Candidate values for the MultinomialNB `alpha` parameter.
alpha_values = [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 3.0, 5.0, 10.0]
# amax holds [best alpha seen so far, its weighted F1 score].
amax = [0, 0]
for a in alpha_values:
    # Train on the unigram counts and predict the held-out test set.
    mnb = MultinomialNB(alpha=a)
    mnb.fit(train_vocab, y_train)
    mnb_predicted_labels = mnb.predict(test_vocab)

    # Weighted F1 is computed only over labels that were actually predicted.
    predicted_classes = np.unique(mnb_predicted_labels)
    mnb_f1 = metrics.f1_score(y_test, mnb_predicted_labels, labels=predicted_classes, average='weighted')
    mnb_acc = metrics.accuracy_score(y_test, mnb_predicted_labels)

    # Report both metrics for this alpha.
    print("F1 score for alpha={}: {:.4f}, accuracy: {:.2f}% ".format(a, mnb_f1, mnb_acc*100))
    # Remember the alpha with the highest F1 (ties keep the earlier value).
    if amax[1] < mnb_f1:
        amax = [a, mnb_f1]
# Report the alpha that scored best on the test set.
print("Best alpha parameter found in test results: {}, returns an f1 score of {:.4f} "
      .format(amax[0], amax[1]))

F1 score for alpha=0.001: 0.1744, accuracy: 17.35% 
F1 score for alpha=0.01: 0.1782, accuracy: 17.72% 
F1 score for alpha=0.1: 0.1834, accuracy: 18.22% 
F1 score for alpha=0.5: 0.1856, accuracy: 18.51% 
F1 score for alpha=1.0: 0.1825, accuracy: 18.35% 
F1 score for alpha=2.0: 0.1783, accuracy: 18.16% 
F1 score for alpha=3.0: 0.1734, accuracy: 17.80% 
F1 score for alpha=5.0: 0.1674, accuracy: 17.35% 
F1 score for alpha=10.0: 0.1560, accuracy: 16.46% 
Best alpha parameter found in test results: 0.5, returns an f1 score of 0.1856 


### Bigram Model

In [7]:
# Build a vocabulary made of word pairs only (no single words).
vec_bigram = CountVectorizer(ngram_range=(2,2))
train_vocab_b = vec_bigram.fit_transform(X_train)
test_vocab_b = vec_bigram.transform(X_test)

# Train Multinomial Naive Bayes on the bigram counts, reusing the alpha stored in amax.
best_alpha = amax[0]
mnb = MultinomialNB(alpha=best_alpha)
mnb.fit(train_vocab_b, y_train)
mnb_predicted_labels = mnb.predict(test_vocab_b)

# Weighted F1 is computed only over labels that were actually predicted.
predicted_classes = np.unique(mnb_predicted_labels)
mnb_f1 = metrics.f1_score(y_test, mnb_predicted_labels, labels=predicted_classes, average='weighted')
mnb_acc = metrics.accuracy_score(y_test, mnb_predicted_labels)

# Report the bigram model's scores for that single alpha.
print("F1 score for alpha={}: {:.4f}, accuracy: {:.2f}% ".format(best_alpha, mnb_f1, mnb_acc*100))


F1 score for alpha=0.1: 0.1309, accuracy: 13.05% 


## Logistic Regression

In [20]:
# Candidate inverse-regularisation strengths; cmax holds [best C so far, its weighted F1].
c_values = [0.010, 0.1000, 0.3000, 0.5000, 1.000, 2.000, 3.0]
cmax = [0, 0]

for c in c_values:
    # Train an L2-penalised multinomial logistic regression on the unigram counts.
    log = LogisticRegression(
        C=c,
        penalty='l2',
        random_state=42,
        solver='lbfgs',
        max_iter=3000,
        multi_class='multinomial',
    )
    log.fit(train_vocab, y_train)
    log_predicted_labels = log.predict(test_vocab)

    # Weighted F1 is computed only over labels that were actually predicted.
    predicted_classes = np.unique(log_predicted_labels)
    log_f1 = metrics.f1_score(y_test, log_predicted_labels, labels=predicted_classes, average='weighted')
    log_acc = metrics.accuracy_score(y_test, log_predicted_labels)

    # Report both metrics for this C.
    print("F1 score for C={}: {:.4f}, accuracy: {:.2f}% ".format(c, log_f1, log_acc*100))
    # Remember the C with the highest F1 (ties keep the earlier value).
    if cmax[1] < log_f1:
        cmax = [c, log_f1]

# Report the C that scored best on the test set.
print("Best C parameter found in test results: {}, returns an f1 score of {:.4f} "
      .format(cmax[0], cmax[1]))

F1 score for C=0.01: 0.1705, accuracy: 17.17% 
F1 score for C=0.1: 0.1828, accuracy: 18.22% 
F1 score for C=0.3: 0.1851, accuracy: 18.44% 
F1 score for C=0.5: 0.1871, accuracy: 18.62% 
F1 score for C=1.0: 0.1859, accuracy: 18.52% 
F1 score for C=2.0: 0.1829, accuracy: 18.21% 
F1 score for C=3.0: 0.1815, accuracy: 18.08% 
Best C parameter found in test results: 0.5, returns an f1 score of 0.1871 


### Bigram Model

In [9]:
# Train logistic regression on the bigram counts, reusing the C stored in cmax.
best_c = cmax[0]
log = LogisticRegression(
    C=best_c,
    penalty='l2',
    random_state=42,
    solver='lbfgs',
    max_iter=1000,
    multi_class='multinomial',
)
log.fit(train_vocab_b, y_train)
log_predicted_labels = log.predict(test_vocab_b)

# Weighted F1 is computed only over labels that were actually predicted.
predicted_classes = np.unique(log_predicted_labels)
log_f1 = metrics.f1_score(y_test, log_predicted_labels, labels=predicted_classes, average='weighted')
log_acc = metrics.accuracy_score(y_test, log_predicted_labels)

# Report the bigram model's scores for that single C.
print("F1 score for C={}: {:.4f}, accuracy: {:.2f}% ".format(best_c, log_f1, log_acc*100))

F1 score for C=0.5: 0.1041, accuracy: 13.04% 


### Preprocessing

In [10]:
def empty_preprocessor(s):
    """Identity preprocessor: hand back ``s`` exactly as it was received."""
    unchanged = s
    return unchanged

# Low-value words removed by better_preprocessor. Built once (not on every
# call) and stored as a frozenset so membership tests are constant time.
_DUMBWORDS = frozenset([
    'is', 'it', 'to', 'the', 'and', 'not', 'no', 'on', 'of', 'for', 'as', 'by', 'in', 'am', 'etc',
    'was', 'that', 'has', 'at', 'or', 'we', 'be', 'had',
])


def better_preprocessor(s):
    """Normalise a document into a space-separated string of reduced tokens.

    Steps, in order:
      1. lower-case the text;
      2. turn newlines, hyphens, slashes and periods into spaces;
      3. delete every character that is not ``0-9``, ``a-z`` or a space;
      4. for each remaining word: strip a trailing ``ing`` (checked first) and
         then a trailing ``ly``, each only when the word is longer than 5
         characters at that point; truncate to 9 characters; drop the word if
         it is a single character or one of the low-value words.

    Parameters
    ----------
    s : str
        Raw document text.

    Returns
    -------
    str
        The kept words joined by single spaces (``""`` if nothing is kept).
    """
    rs = s.lower()
    # Raw strings keep the regex escapes (\n, \.) from being parsed as
    # (invalid) Python string escapes.
    rs = re.sub(r'\n|-|/|\.', ' ', rs)
    # Eliminate everything else that isn't a letter, number or space.
    rs = re.sub(r'[^0-9a-z ]+', '', rs)

    prs = []
    # split() with no argument already treats any run of spaces as a single
    # separator, so no separate space-collapsing pass is needed.
    for word in rs.split():
        # Eliminate the -ing and -ly suffixes from longer words.
        if len(word) > 5 and word.endswith('ing'):
            word = word[:-3]
        if len(word) > 5 and word.endswith('ly'):
            word = word[:-2]
        # Trim words to 9 characters (a no-op for shorter words).
        word = word[:9]
        # Eliminate single-character words and low-value words.
        if len(word) > 1 and word not in _DUMBWORDS:
            prs.append(word)

    return " ".join(prs)

# Run the preprocessor over every training and test document up front.
proc_train_data = list(map(better_preprocessor, X_train))
proc_test_data = list(map(better_preprocessor, X_test))

In [11]:
# Compare logistic regression on raw text against the same model on text run
# through better_preprocessor, then show one misclassified example.

# Make a baseline vectorizer
vec = CountVectorizer()
vocab = vec.fit_transform(X_train)
test_vocab = vec.transform(X_test)
# Make a preprocessed vectorizer
vec_proc = CountVectorizer(preprocessor=better_preprocessor)
vocab_proc = vec_proc.fit_transform(X_train)
test_vocab_proc = vec_proc.transform(X_test)

# Fit and predict the baseline
log = LogisticRegression(C=cmax[0], penalty='l2', solver='lbfgs', max_iter=4000, multi_class='multinomial')
log.fit(vocab, y_train)
log_predicted_labels = log.predict(test_vocab)
log_score = metrics.accuracy_score(y_test, log_predicted_labels)

# Fit and predict the pre-processed set
log_proc = LogisticRegression(C=cmax[0], penalty='l2', solver='lbfgs', max_iter=4000, multi_class='multinomial')
log_proc.fit(vocab_proc, y_train)
log_proc_predicted_labels = log_proc.predict(test_vocab_proc)
log_proc_score = metrics.accuracy_score(y_test, log_proc_predicted_labels)

# Print the results
print("Unprocessed: {:,} words with accuracy: {:.4f}\nPre-processed: {:,} words with accuracy: {:.4f}"
      .format(vocab.shape[1], log_score, vocab_proc.shape[1], log_proc_score))
print("Improvement: {:.4f}".format(log_proc_score-log_score))

# Find a wrong answer and print it out for better analysis.
# y_test is a plain list, so convert it explicitly before the element-wise
# comparison with the predicted-label array.
wrong_candidates = np.flatnonzero(np.asarray(y_test) != log_proc_predicted_labels)

if wrong_candidates.size:
    # A fixed seed makes the sampled example the same on every run.
    wrong = np.random.RandomState(42).choice(wrong_candidates)

    print("\nSample wrong answer from the preprocessed set, post #{}:".format(wrong))
    print("unprocessed prediction: {}".format(log_predicted_labels[wrong]))
    print("preprocessed prediction: {}".format(log_proc_predicted_labels[wrong]))
    print("true label: {}".format(y_test[wrong]))
    # Note: this prints the preprocessed form of the test document.
    print("true data: ",proc_test_data[wrong])
else:
    # Nothing to sample from; np.random.choice would raise on an empty array.
    print("\nNo wrong answers in the preprocessed set.")
print()

Unprocessed: 20,676 words with accuracy: 0.1500
Pre-processed: 19,704 words with accuracy: 0.1504
Improvement: 0.0004

Sample wrong answer from the preprocessed set, post #1165:
unprocessed prediction: tampa
preprocessed prediction: tampa
true label: new york
true data:  nobody above law rally response djt forc sessions resign his place matt whitaker trump crony who plans dismantle muellers investiga into russian collusion read here



### TFIDF

In [9]:
# Fit a TF-IDF + logistic regression model, then list the test documents
# whose true label the model considered least likely relative to its own
# prediction.

# Make a TFIDF Vectorizer and fit the training and test vocabularies
vec = TfidfVectorizer()
vocab = vec.fit_transform(X_train)
test_vocab = vec.transform(X_test)
# Inverse vocabulary dictionary for word lookup
inv_vocab = {v: k for k, v in vec.vocabulary_.items()}
# Fit and predict a logistic regression based on the results
lr = LogisticRegression(C=100, solver='lbfgs', max_iter=4000, multi_class='multinomial')
lr.fit(vocab, y_train)
lr_predicted_labels = lr.predict(test_vocab)
# Calculate and print the resulting score
lr_score = metrics.f1_score(y_test, lr_predicted_labels, average='weighted')
lr_acc = metrics.accuracy_score(y_test, lr_predicted_labels)

print("Baseline F1 for TfidfVectorizer: {:.2f}, accuracy {:.2f}%\n".format(lr_score, lr_acc*100))

# Get the probabilities for each class prediction
probs = lr.predict_proba(test_vocab)

# The columns of `probs` follow lr.classes_, so map each true label to its
# column through that array (built once, not per document). A test label the
# model never saw in training raises KeyError here.
class_column = {label: col for col, label in enumerate(lr.classes_)}
true_cols = np.array([class_column[label] for label in y_test], dtype=int)
# Probability the model assigned to each document's true label.
true_probs = probs[np.arange(len(y_test)), true_cols]

# R ratio per document: highest predicted probability divided by the
# probability given to the true label (1.0 means a correct prediction).
R = (probs.max(axis=1) / true_probs).tolist()

# Indices of the documents with the highest R values, in ascending R order
n_top = 3
top = np.argsort(np.array(R))[-n_top:]

# Print the top misidentified documents with their predicted/true probabilities
print("TOP {} MISIDENTIFIED DOCUMENTS:".format(n_top))
for c, i in enumerate(top, start=1):
    print("DOCUMENT #{}".format(c))
    print("Predicted label: {} (P{:.1f}%), True label: {} (P{:.1f}%)"
          .format(lr_predicted_labels[i], np.max(probs[i])*100, y_test[i], true_probs[i]*100))
    print("R ratio: {:.2f}".format(R[i]))
    print(X_test[i])
    # (A per-word TF-IDF / coefficient table used to be sketched here inside a
    # string literal; it referenced an undefined `dev_vocab` and never ran.)
    print("----\n")

Baseline F1 for TfidfVectorizer: 0.16, accuracy 15.51%

TOP 3 MISIDENTIFIED DOCUMENTS:
DOCUMENT #1
Predicted label: 18 (P95.5%), True label: 14 (P0.0%)
R ratio: 9178644.85
My review of Death House                  
----

DOCUMENT #2
Predicted label: 20 (P100.0%), True label: 15 (P0.0%)
R ratio: 12970851.07
I HATE LOOSE SHEETS
----

DOCUMENT #3
Predicted label: 21 (P100.0%), True label: 15 (P0.0%)
R ratio: 45209742.04
JFC how does anyone afford to live in Toronto?
----



### Word Analysis

In [10]:
# For each class, list the words with the largest logistic-regression
# coefficients and save the table to CSV.
top = 20
vec = CountVectorizer()
train_vocab = vec.fit_transform(X_train)
# Make an inverse vocabulary to look up words by index
inv_vocab = {v: k for k, v in vec.vocabulary_.items()}
log = LogisticRegression(C=cmax[0], penalty='l2', solver='lbfgs', max_iter=4000, multi_class='multinomial')
log.fit(train_vocab, y_train)

# Column indices of the `top` largest coefficients in each row of coef_
# (ascending within a row, so the strongest word is last).
topwords = np.argsort(log.coef_, 1)[:, -top:]

# One column per class, strongest word first. Columns are labelled from
# log.classes_ so that each label matches its own row of coef_. The table is
# built in one step rather than inserting columns one at a time.
df_topwords = pd.DataFrame({
    label: [inv_vocab[word_idx] for word_idx in class_row[::-1]]
    for label, class_row in zip(log.classes_, topwords)
})

df_topwords.to_csv('twitter_top20words.csv', index=False, mode='w', header=True)
df_topwords

Unnamed: 0,3,4,5,7,10,13,14,15,18,19,20,21,22
0,nc,chicago,columbus,tx,ca,nashville,nj,tx,ca,seattle,fl,toronto,va
1,sc,il,indianapolis,houston,california,atlanta,ny,ks,nv,wa,florida,sabres,md
2,carolina,wi,ky,texas,la,memphis,incident,texas,california,vancouver,orlando,ontario,virginia
3,charlotte,ia,cincinnati,austin,losangeles,al,boston,ou,fresno,portland,miami,mi,dc
4,georgia,mi,indiana,la,los,tennessee,ct,dallas,sanfrancisco,oregon,ucf,ny,maryland
5,ga,illinois,colts,astroworld,angeles,tn,nyc,tulsa,san,gardnerminshew,tampa,detroit,baltimore
6,va,wisconsin,ohio,louisiana,hollywood,georgia,york,oklahomacity,oakland,id,fsu,mississauga,de
7,savannah,michigan,louisville,shreveport,sandiego,ms,ma,ar,campfire,washington,sfltraffic,canadian,22
8,atl,snow,kentucky,neworleans,ew,ga,massachusetts,oklahoma,sf,gocougs,disney,buffalo,spent
9,ways,milwaukee,equipment,nola,disneyland,preds,pa,dallascowboys,hella,pdx,flake,leafs,harbor


## Topic Analysis

In [14]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Load the labelled review sample and drop rows with missing values.
# NOTE(review): the frame is named df_twitter_sample but the file read is a
# Yelp reviews CSV - confirm which dataset is intended here.
df_twitter_sample = pd.read_csv('yelp_reviews_labelled_mini.csv').dropna()
#df_twitter_sample = df_tweets500.sample(n=50000)
data = df_twitter_sample['text'].tolist()

NUM_TOPICS = 10

# Count tokens of 3+ letters/hyphens, lower-cased, skipping English stop words,
# terms seen in fewer than 5 documents, and terms seen in over 90% of them.
# The pattern is a raw string so that `\-` is not parsed as an (invalid)
# Python string escape.
vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words='english', lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)

# Build a Latent Dirichlet Allocation Model.
# random_state is fixed so the fitted topics are the same on every run.
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',
                                      random_state=42)
lda_Z = lda_model.fit_transform(data_vectorized)

# Topic distribution of a single document; the printed sum shows it is normalised.
text = data[22]
x = lda_model.transform(vectorizer.transform([text]))[0]
print(x, x.sum())
 

[0.02995487 0.00138905 0.48210042 0.03152504 0.00138899 0.00138917
 0.05447826 0.00138904 0.39499619 0.00138898] 1.0


In [15]:
# Display the raw text of document 22, the same entry of `data` whose topic
# distribution was computed from `text = data[22]`.
data[22]

'The staff was so incredibly rude that I will not ever attempt to go there again.  My girlfriend and I are locals and were heading inside after waiting patiently in line.  I asked the ID check staff about the local discount and he said I needed to go through the exit door and back to front desk for a ticket.  As we were walking toward the exit, a security guy shoved me backward.  I still do not understand why because it was not an alarmed door and others were leaving that direction.  He offered no apology and no explanation when I asked why he had shoved me.  We left and walked back to the end of the line to wait to talk to the people at "the desk."  They would not walk over to talk with us so I stepped onto the red carpet to talk with them.  They then informed me that I was "nobody" and escorted me out.  We were not cutting in front of anyone or being beligerent.  I found their treatment absolutely way out of line.  I am sure they get tired of dealing with drunks, but that is no excus

In [16]:
# Build a pyLDAvis panel for the fitted LDA model.
import pyLDAvis.sklearn
 
# NOTE(review): presumably switches pyLDAvis to inline notebook rendering - confirm against the pyLDAvis docs.
pyLDAvis.enable_notebook()
# Prepare the visualisation from the fitted model, the document-term matrix
# and the vectorizer that produced it; mds='tsne' is passed through to
# pyLDAvis (presumably selecting t-SNE for the topic layout - confirm).
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
# Last expression of the cell, so the panel is displayed as the cell output.
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
