In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib.colors import ListedColormap
from matplotlib.colors import LogNorm
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from subprocess import check_output
#from wordcloud import WordCloud, STOPWORDS

#ML
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GMM
from sklearn.mixture import GaussianMixture

from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from sklearn.pipeline import make_pipeline

%matplotlib inline
plt.style.use('bmh')

In [2]:
#1. Train Data
with open('../data/train.json') as fin:
   trainjson = json.load(fin)
train = pd.io.json.json_normalize(trainjson)
#2. Test Data
with open('../data/test.json') as fin:
   testjson = json.load(fin)
test = pd.io.json.json_normalize(testjson)

print("Train Shape:", train.shape)
print("Test Shape:", test.shape)

train_labels_master = train[['requester_received_pizza']]
train_data_master = train[test.columns & train.columns]
train_only_data_master = train[train.columns[~train.columns.isin(test.columns)]].drop(['requester_received_pizza'], axis = 1)

#Apply train_test_split twice to get train, test, and dev set
x_train, x_test, y_train, y_test = train_test_split(
   train_data_master,
   train_labels_master.values.ravel(), test_size=0.2, random_state=0)

x_test, x_dev, y_test, y_dev = train_test_split(
   x_test,
   y_test, test_size=0.2, random_state=0)
print(x_train.shape)
print(x_test.shape)
print(x_dev.shape)
print(y_train.shape)
print(y_test.shape)
print(y_dev.shape)

Train Shape: (4040, 32)
Test Shape: (1631, 17)
(3232, 17)
(646, 17)
(162, 17)
(3232,)
(646,)
(162,)


In [23]:
# Isolate the text column for the training and dev dataframes
import re
from sklearn.grid_search import GridSearchCV

x_train_text = x_train.request_text_edit_aware
x_test_text = x_test.request_text_edit_aware

### Run text cleaning preprocessing on the training dataset to remove a lot of the junk.

# Define custom text preprocessor
def text_cleaner(s):
    # Establish a compiled regex that finds words shorter than 3 characters
    shortword = re.compile(r'\W*\b\w{1,3}\b')
    
    # Convert all text to lowercase
    text = s.lower()
    
    # Remove newlines and punctuation marks
    text = re.sub(r'\n', ' ', text)
    text = re.sub('[,?]',' ',text)
    text = re.sub('\. ',' ',text)
    text = re.sub(' \.',' ',text)
    text = re.sub('\.{2,}',' ',text)
    text = re.sub(r'/',' ',text)
    text = re.sub('-','',text)
    text = re.sub('"','',text)
    text = re.sub('[<>()]',' ',text)

    # Convert sequences of numbers to zero
    text = re.sub('\d+', '0', text)
    
    # Remove short words
    text = shortword.sub('', text)
    
    # Remove extra whitespace
    text = re.sub(' +',' ',text)

    return text

# Set up count vectorizer to use custom preprocessor
# Using bigrams in the vectorizer gains about a percentage point of accuracy, but it appears that using
# trigrams or larger n-grams doesn't provide any further gains. 
vectotron = CountVectorizer(preprocessor=text_cleaner, analyzer='word',ngram_range=(2,2)) 
x_train_vect = vectotron.fit_transform(x_train_text)
x_test_vect = vectotron.transform(x_test_text)
# print(vectotron.vocabulary_)


# Fit a Bernoulli Naive Bayes model using the vectorized text and use GridSearch to optimize params
model_TextNB = BernoulliNB()
alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0, 20.0, 50.0, 100.0]}
BernNB_clf = GridSearchCV(model_TextNB,param_grid=alphas)
BernNB_clf.fit(x_train_vect,y_train)
print('Optimized score for BernoulliNB (alpha=',BernNB_clf.best_params_['alpha'],'): ',BernNB_clf.best_score_,'\n',sep='')
alpha_optimal = BernNB_clf.best_params_['alpha']

# Predict and check accuracy
model_TextNB = BernoulliNB(alpha=alpha_optimal)
model_TextNB.fit(x_train_vect,y_train)
predict_NB = model_TextNB.predict(x_test_vect)
test_accNB = metrics.accuracy_score(y_test, predict_NB)
print(test_accNB)

score_NB = model_TextNB.score(x_test_vect, y_test)
print(score_NB)


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Optimized score for BernoulliNB (alpha=10.0): 0.7561881188118812

0.738390092879
0.738390092879


In [24]:
# Try a Tfid vectorizer instead of counts, still using bigrams
vectimus_prime = TfidfVectorizer(preprocessor=text_cleaner, analyzer='word',ngram_range=(2,2)) 
x_train_vect = vectimus_prime.fit_transform(x_train_text)
x_test_vect = vectimus_prime.transform(x_test_text)

# Try Logistic Regression instead of Naive Bayes
model_logR = LogisticRegression()
model_logR.fit(x_train_vect,y_train)
predict_logR = model_logR.predict(x_test_vect)
test_acclogR = metrics.accuracy_score(y_test,predict_logR)
print(test_acclogR)

score_logR = model_logR.score(x_test_vect, y_test)
print(score_logR)

# Accuracy score is exactly the same as the BernoulliNB model. That seems weird, right?

0.736842105263
0.736842105263


In [25]:
# Try reducing the vocabulary to eliminate meaningless words.

vectorsaurus = CountVectorizer(preprocessor=text_cleaner, analyzer='word', ngram_range=(2,2)) 
x_train_vect = vectorsaurus.fit_transform(x_train_text)
x_test_vect = vectorsaurus.transform(x_test_text)

# Determine weights with Logistic Regression and L1 regularization
model_logR2 = LogisticRegression(penalty='l1')
model_logR2_fit = model_logR2.fit(x_train_vect, y_train)

# Create list of vocabulary words and their associated weights, then filter out everything with weight of zero.
word_weights = dict(zip(vectorsaurus.vocabulary_.keys(),model_logR2_fit.coef_[0]))
word_weights = dict((k, v) for k, v in word_weights.items() if v != 0)

# Create new vocabulary without zero-weight features
# new_vocab = { key: vectorsaurus.vocabulary_[key] for key in word_weights.keys() }
new_vocab = list(word_weights.keys())

# Re-run the vectorization, and run the models again using the new data
vectorsaurus_rex = CountVectorizer(preprocessor=text_cleaner, vocabulary=new_vocab)
x_train_vect = vectorsaurus_rex.fit_transform(x_train_text)
x_test_vect = vectorsaurus_rex.transform(x_test_text)

model_logR3 = LogisticRegression()
model_logR3_fit = model_logR3.fit(x_train_vect, y_train)
predict_logR3 = model_logR3_fit.predict(x_test_vect)
test_acclogR3 = metrics.accuracy_score(y_test,predict_logR3)

print(test_acclogR3)

score_logR3 = model_logR3_fit.score(x_test_vect, y_test)
print(score_logR3)
# Ok, this is getting weird.


0.738390092879
0.738390092879


In [26]:
# try reducing vocab down to disease words

# load in disease vocab
import csv
disease_list = []

#with open('../data/diseases.csv', mode='r') as fin:
#    reader = csv.reader(fin)
#    diseases = {}
#    for row in reader:
#        diseases[row[0].lower()] = 0
    
disease_list = list(diseases.keys())
disease_list.append("sick")
disease_list.append("doctor")
disease_list.append("doctors")
disease_list.append("dying")
disease_list.append("died")
disease_list.append("hospice")
disease_list.append("pain")
disease_list.append("medical")
disease_list.append("insurance")

# Set up count vectorizer to use custom preprocessor
vectotron = CountVectorizer(vocabulary=disease_list, preprocessor=text_cleaner, analyzer='word',ngram_range=(2,2)) 
x_train_vect = vectotron.fit_transform(x_train_text)
x_test_vect = vectotron.transform(x_test_text)

# Fit a Bernoulli Naive Bayes model using the vectorized text and use GridSearch to optimize params
model_TextNB = BernoulliNB()
alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0, 20.0, 50.0, 100.0]}
BernNB_clf = GridSearchCV(model_TextNB,param_grid=alphas)
BernNB_clf_fit = BernNB_clf.fit(x_train_vect,y_train)
# print('Optimized score for BernoulliNB (alpha=',BernNB_clf.best_params_['alpha'],'): ',BernNB_clf.best_score_,'\n',sep='')
alpha_optimal = BernNB_clf_fit.best_params_['alpha']

# Predict and check accuracy
model_TextNB = BernoulliNB(alpha=alpha_optimal)
model_TextNB_fit = model_TextNB.fit(x_train_vect,y_train)
predict_NB = model_TextNB_fit.predict(x_test_vect)
test_accNB = metrics.accuracy_score(y_test,predict_NB)

print(test_accNB)


NameError: name 'diseases' is not defined