In [2]:
from spacy.en import English
parser = English()

In [3]:
import foodbornenyc.models.models as models
from foodbornenyc.models.businesses import Business, business_category_table
from foodbornenyc.models.documents import YelpReview, Tweet, Document
from foodbornenyc.models.locations import Location
from foodbornenyc.models.metadata import metadata

In [5]:
from sklearn.externals import joblib
from foodbornenyc.settings import yelp_classify_config as config

#old classifier
sick = joblib.load("../"+config['model_file'])
sick.steps

Pipeline(steps=[('count', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.95, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        ...alty='l2', random_state=57, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))])

In [66]:
import numpy as np
from sklearn.metrics import roc_auc_score
def analyze(reviews, classifier):
    predictions_new = classifier.predict_proba(reviews['X'])
    pred = np.array([pred[1] for pred in predictions_new])
    print "ROC_AUC SCORE ::", roc_auc_score(reviews['y'], pred, average='micro')
    # determine true/false positive/negative rates
    tp_rate = 0.0
    fp_rate = 0.0
    tn_rate = 0.0
    fn_rate = 0.0

    for review, pred in zip(reviews['y'], predictions_new):
        if review == 1.0 and pred[1] > 0.5: tp_rate += 1
        elif review == 1.0 and pred[1] < 0.5: fn_rate += 1
        elif review == 0.0 and pred[1] > 0.5: fp_rate += 1
        elif review == 0.0 and pred[1] < 0.5: tn_rate += 1
    tp_rate /= len(reviews['y'])
    fn_rate /= len(reviews['y'])
    fp_rate /= len(reviews['y'])
    tn_rate /= len(reviews['y'])
    print "True positive ::", tp_rate
    print "False negative ::", fn_rate
    print "False positive ::", fp_rate
    print "True negative ::", tn_rate
    print "FP / TP ::", fp_rate / tp_rate
    print "FN / TN ::", fn_rate / tn_rate

In [67]:
#importing and reading data files
import xlrd

sheet1 = xlrd.open_workbook('data/yelp_sick_classifier_data.xlsx').sheet_by_index(0)
reviews = {'X':np.array([cell.value for cell in sheet1.col(1)][1:]), 'y':np.array([cell.value for cell in sheet1.col(2)][1:])}

sheet2 = xlrd.open_workbook('data/sick_test_preds.xlsx').sheet_by_index(0)
reviews2 = {'X':np.array([cell.value for cell in sheet2.col(1)][1:]), 'y':np.array([cell.value for cell in sheet2.col(2)][1:])}

In [12]:
# key words to watch out for: poisoning, sick, 
# tokens that perform strictly negation: not, n't, no, none, nobody, neither, 
# if negation word's head == key word's head, prepend key word with "not" and remove negation word
# only potential issue is double negative, e.g. "no one didn't get food poisoning", but this is a first step
from spacy import attrs
example = u"I hope none of us gets sick tonight. I didn't order food poisoning. I do not think you should come here because I got food poisoning."

# uncomment the following to see an example of a dependency parse
# parsedEx = parser(example)
# for token in parsedEx:
#     print token.orth_, token.dep_, token.head, [t.orth_ for t in token.children]

I nsubj hope  []
hope ROOT hope  [u'I', u'gets', u'.']
none nsubj gets  [u'of']
of prep none  [u'us']
us pobj of  []
gets ccomp hope  [u'none', u'sick', u'tonight']
sick acomp gets  []
tonight npadvmod gets  []
. punct hope  []
I nsubj order  []
did aux order  []
n't neg order  []
order ROOT order  [u'I', u'did', u"n't", u'poisoning', u'.']
food compound poisoning []
poisoning dobj order  [u'food']
. punct order  []
I nsubj think  []
do aux think  []
not neg think  []
think ROOT think  [u'I', u'do', u'not', u'come', u'.']
you nsubj come  []
should aux come  []
come ccomp think  [u'you', u'should', u'here', u'got']
here advmod come  []
because mark got  []
I nsubj got  []
got advcl come  [u'because', u'I', u'poisoning']
food compound poisoning []
poisoning dobj got  [u'food']
. punct think  []


In [13]:
def transform_doc_1(doc): 
    """if root of sentence had negation and sentence contained kw"""
    kw = ['poisoning', 'sick']
    neg = ['not', "n't", 'no', 'none', 'nobody', 'neither']
    parsedDoc = parser(doc.lower())
    tokens = [[t.orth_ for t in s] for s in parsedDoc.sents] #this will be modified
    sents = list(parsedDoc.sents)
    for i in range(len(sents)):
        # each span has only one root
        if not any([c.orth_ in neg for c in sents[i].root.children]): continue #if there's no negation
        neg_i = [j for j in range(len(sents[i])) if sents[i][j].orth_ in neg][0]
        kw_list = [j for j in range(len(sents[i])) if sents[i][j].orth_ in kw]
        
        if len(kw_list) == 0: continue
        
        kw_i = kw_list[0]
        
        #now modify
        tokens[i].insert(kw_i, "not")
        tokens[i].pop(neg_i)
    #now we join everything with spaces
    out = []
    for sent in tokens:
        out.append(" ".join(sent))
    return " ".join(out)

print transform_doc_1(example)

i hope none of us gets sick tonight . i did order food not poisoning . i do think you should come here because i got food not poisoning .


In [14]:
def transform_doc_2(doc): 
    """if negation and kw share a head"""
    kw = ['poisoning', 'sick']
    neg = ['not', "n't", 'no', 'none', 'nobody', 'neither']
    parsedDoc = parser(doc.lower())
    tokens = [[t.orth_ for t in s] for s in parsedDoc.sents] #this will be modified
    sents = list(parsedDoc.sents)
    for i in range(len(sents)):
        if not any([c.orth_ in neg for c in sents[i]]): continue #if there's no negation
        neg_list = [j for j in range(len(sents[i])) if sents[i][j].orth_ in neg]
        kw_list = [j for j in range(len(sents[i])) if sents[i][j].orth_ in kw]
        
        if len(kw_list) == 0: continue
        
        # attempt at handling double negatives
        double_negative = True
        kw_i = -1
        neg_i = -1
        
        for j in neg_list:
            for k in kw_list:
                if sents[i][j].head == sents[i][k].head and double_negative:
                    neg_i = j
                    kw_i = k
                    double_negative = False
                elif sents[i][j].head == sents[i][k].head and not double_negative:
                    double_negative = True
        
        if double_negative: continue
        
        #now modify
        tokens[i].insert(kw_i, "not")
        tokens[i].pop(neg_i)
    #now we join everything with spaces
    out = []
    for sent in tokens:
        out.append(" ".join(sent))
    return " ".join(out)

print transform_doc_2(example)

i hope of us gets not sick tonight . i did order food not poisoning . i do not think you should come here because i got food poisoning .


In [105]:
from sklearn.base import TransformerMixin

class NegationTransformer(TransformerMixin):
    """ Brings negation words closer to relevant key terms to make it detectable with n-gram detector """
    
    def __init__(self, transform_doc=transform_doc_1):
        self.transform_doc = transform_doc
    
    def transform(self, X, **transform_params):
        return np.array([self.transform_doc(doc) for doc in X])
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def get_params(self, deep=True):
        return {'transform_doc' : self.transform_doc}


In [106]:
#from yelp classifier training notebook
from sklearn import metrics
from sklearn.grid_search import GridSearchCV
from sklearn import cross_validation
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def my_roc_auc(ground_truth, predictions):
    ground_truth = np.array(ground_truth)
    predictions = np.array(predictions)
    return metrics.roc_auc_score(ground_truth, predictions, average='micro')

my_roc_auc_scorer = metrics.make_scorer(my_roc_auc, needs_threshold=True, greater_is_better=True)

# param_grid = {
#     'count__ngram_range':[(1,1),(1,2),(1,3)],
#     'tfidf__norm':['l1', 'l2'],
#     'tfidf__use_idf':[True, False],
#     'tfidf__sublinear_tf':[True,False],
#     'logreg__C':[.001, .01, .1]
# }
param_grid = {
   # 'negTransformer__transform_doc': [transform_doc_1, transform_doc_2],
    'count__ngram_range': [(1, 3)],
    'count__max_df' : [ .95],
    'count__stop_words': [None],
    'count__lowercase' : [True],
    'count__max_features': [None],
    'count__strip_accents': ['unicode'],
    'tfidf__use_idf' : [True],
    'tfidf__norm': [('l2')],
    'logreg__C': [100],
    'logreg__dual' : [True],
    'logreg__fit_intercept': [True],
    'logreg__penalty': ['l2'],
    'logreg__intercept_scaling':[.01],
    'logreg__random_state': [57],
    'logreg__solver': ['liblinear']
}

In [107]:
pipe1 = Pipeline([
    ('negTransformer', NegationTransformer(transform_doc_1)),
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('logreg', LogisticRegression())
    ])
pipe2 = Pipeline([
    ('negTransformer', NegationTransformer(transform_doc_2)),
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('logreg', LogisticRegression())
    ])


grid_search1 = GridSearchCV(pipe1, 
                           param_grid,
                           scoring=my_roc_auc_scorer,
                           n_jobs=-1, verbose=1)
grid_search2 = GridSearchCV(pipe2, 
                           param_grid,
                           scoring=my_roc_auc_scorer,
                           n_jobs=-1, verbose=1)
#does it make a difference if it's put in as a preprocessor?

In [109]:
def split_dev_test(data, test_size=.2):
    train_data = {}
    test_data = {}
    for train, test in cross_validation.StratifiedShuffleSplit(data['y'], n_iter=1, test_size=test_size, random_state=0):
        train_data['X'] = data['X'][train]
        train_data['y'] = data['y'][train]
        test_data['X'] = data['X'][test]
        test_data['y'] = data['y'][test]
        
    print "Training/Dev data shape: ", train_data['X'].shape, train_data['y'].shape
    print "Test data shape: ",test_data['X'].shape, test_data['y'].shape
    return train_data, test_data

train_data, test_data = split_dev_test(reviews)

Training/Dev data shape:  (1113,) (1113,)
Test data shape:  (279,) (279,)


In [110]:
def split_and_test(data, grid_search, test_size=.2, n_iter=1):
    train_data = {}
    test_data = {}
    for train, test in cross_validation.StratifiedShuffleSplit(data['y'], n_iter=n_iter, test_size=test_size, random_state=0):
        train_data['X'] = data['X'][train]
        train_data['y'] = data['y'][train]
        test_data['X'] = data['X'][test]
        test_data['y'] = data['y'][test]
        %time grid_search.fit(train_data['X'], train_data['y'])
        print("Best score: %0.3f" % grid_search.best_score_)
        %time analyze(test_data, grid_search.best_estimator_)

split_and_test(reviews, grid_search1)
print
split_and_test(reviews, grid_search2)
        

Fitting 3 folds for each of 1 candidates, totalling 3 fits
CPU times: user 18.2 s, sys: 375 ms, total: 18.6 s
Wall time: 42.6 s
Best score: 0.889
ROC_AUC SCORE :: 0.885131646877
True positive :: 0.444444444444
False negative :: 0.089605734767
False positive :: 0.114695340502
True negative :: 0.351254480287
FP / TP :: 0.258064516129
FN / TN :: 0.255102040816
CPU times: user 4.38 s, sys: 55.8 ms, total: 4.43 s
Wall time: 4.57 s

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   24.0s finished


CPU times: user 18.8 s, sys: 482 ms, total: 19.3 s
Wall time: 45.4 s
Best score: 0.891
ROC_AUC SCORE :: 0.886525554982
True positive :: 0.448028673835
False negative :: 0.0860215053763
False positive :: 0.111111111111
True negative :: 0.354838709677
FP / TP :: 0.248
FN / TN :: 0.242424242424
CPU times: user 4.43 s, sys: 49 ms, total: 4.48 s
Wall time: 4.71 s


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   25.4s finished


In [112]:
param_grid_temp = {
    'count__ngram_range': [(1, 3)],
    'count__max_df' : [ .95],
    'count__stop_words': [None],
    'count__lowercase' : [True],
    'count__max_features': [None],
    'tfidf__use_idf' : [True],
    'tfidf__norm': [('l2')],
    'log__C': [100],
    'log__dual' : [True],
    'log__fit_intercept': [True],
    'log__penalty': ['l2'],
    'log__intercept_scaling':[.01],
    'log__random_state': [57],
    'log__solver': ['liblinear']
}
split_and_test(reviews, GridSearchCV(sick, 
                           param_grid_temp,
                           scoring=my_roc_auc_scorer,
                           n_jobs=1, verbose=1))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
CPU times: user 7.07 s, sys: 203 ms, total: 7.27 s
Wall time: 7.41 s
Best score: 0.890
ROC_AUC SCORE :: 0.887816210635
True positive :: 0.451612903226
False negative :: 0.0824372759857
False positive :: 0.114695340502
True negative :: 0.351254480287
FP / TP :: 0.253968253968
FN / TN :: 0.234693877551
CPU times: user 196 ms, sys: 4.44 ms, total: 201 ms
Wall time: 208 ms


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.2s finished


In [113]:
def reveal_fp(reviews, classifier):
    prediction = classifier.predict_proba([a['text'] for a in reviews])
    for review, pred in zip(reviews, prediction):
        if review['label'] == 0.0 and pred[1] > 0.5:
            print pred[1]
            print review['text']
            print

reveal_fp(reviews2, grid_search2.best_estimator_)

0.984934589869
After posting my original review, the NYC Health Dept contacted me and urged me to call 311 to report the food poisoning incident at Atlas Cafe.  I would like to clarify I did not contract food poisoning from Atlas Cafe.  I merely likened the urgency to post my one star review of Atlas to my urgency to use the bathroom when I did contract food poisoning from a wedding in Jersey.

0.987344037757
After reading all the rave reviews about Schnitzi, I finally decided to try it out. What a mistake!!!
The place is filthy, the menu prices are too high and the food sucks. I tried their double burger. They barely put tomatoes or pickles in my sandwich. The sauce wasn&#39;t that great, but the worst part was the meat. The burgers were two pieces of rubber that tasted horrible.
All night I was burping nasty burps that felt like I ate a whole rotten cow. The next day I was still sick from this crappy burger and all my clothes smelled of filthy food.
I don&#39;t recommend this place t