In [27]:
from spacy.en import English
parser = English()

In [28]:
import foodbornenyc.models.models as models
from foodbornenyc.models.businesses import Business, business_category_table
from foodbornenyc.models.documents import YelpReview, Tweet, Document
from foodbornenyc.models.locations import Location
from foodbornenyc.models.metadata import metadata

In [29]:
import xlrd
# Open the review workbook; only its first sheet is read (columns 1 and 2 are
# later used as review text and label respectively).
f = xlrd.open_workbook('data/yelp_sick_classifier_data.xlsx')
sheet1 = f.sheet_by_index(0)

In [30]:
from sklearn.externals import joblib
from foodbornenyc.settings import yelp_classify_config as config

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
# The model path is resolved relative to the parent directory ("../"), whereas
# the spreadsheets in this notebook are opened relative to the current one.
sick = joblib.load("../"+config['model_file'])
# Bare expression so the notebook displays the loaded pipeline's repr.
sick

Pipeline(steps=[('count', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.95, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        ...alty='l2', random_state=57, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))])

In [31]:
import numpy as np
from sklearn.metrics import roc_auc_score
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator


def analyze(reviews, classifier, threshold=0.5):
    """Print ROC AUC and confusion-matrix fractions of ``classifier`` on ``reviews``.

    Parameters
    ----------
    reviews : sequence of dict
        Each item needs a ``'text'`` entry (fed to the classifier) and a
        ``'label'`` entry. Labels equal to 1.0 count as positive, labels equal
        to 0.0 as negative; any other label is left out of the four buckets.
    classifier : object
        Must expose ``predict_proba(list_of_texts)``; element ``[1]`` of each
        returned row is used as the positive-class score.
    threshold : float, optional
        Scores strictly greater than this are treated as a positive
        prediction, everything else as negative. Defaults to 0.5.

    Returns
    -------
    None
        Results are printed only.

    Raises
    ------
    ValueError
        If ``reviews`` is empty.

    Notes
    -----
    The four printed "True/False positive/negative" numbers are fractions of
    *all* reviews (they sum to 1 when every label is 0 or 1), not per-class
    rates. The two ratio lines print ``nan`` instead of raising when their
    denominator bucket is empty.
    """
    texts = [review["text"] for review in reviews]
    total = len(texts)
    if total == 0:
        raise ValueError("analyze() needs at least one review")

    probabilities = classifier.predict_proba(texts)
    labels = np.array([review['label'] for review in reviews])
    scores = np.array([row[1] for row in probabilities])
    # Single-argument print(...) renders identically under the Python 2 print
    # statement and the Python 3 print function.
    print("ROC_AUC SCORE :: %s" % roc_auc_score(labels, scores, average='micro'))

    # determine true/false positive/negative counts
    tp = fn = fp = tn = 0
    for review, row in zip(reviews, probabilities):
        # A score exactly at the threshold is a negative prediction, so every
        # 0/1-labelled review lands in exactly one bucket.
        predicted_positive = row[1] > threshold
        if review['label'] == 1.0:
            if predicted_positive:
                tp += 1
            else:
                fn += 1
        elif review['label'] == 0.0:
            if predicted_positive:
                fp += 1
            else:
                tn += 1

    denominator = float(total)
    tp_rate = tp / denominator
    fn_rate = fn / denominator
    fp_rate = fp / denominator
    tn_rate = tn / denominator

    print("True positive :: %s" % tp_rate)
    print("False negative :: %s" % fn_rate)
    print("False positive :: %s" % fp_rate)
    print("True negative :: %s" % tn_rate)
    print("FP / TP :: %s" % _safe_ratio(fp_rate, tp_rate))
    print("FN / TN :: %s" % _safe_ratio(fn_rate, tn_rate))

In [32]:
# Pair each text cell (column 1) with its label cell (column 2), dropping
# row 0 of the sheet (presumably the header row).
reviews = [
    {"text": rev.value, "label": label.value}
    for rev, label in zip(sheet1.col(1)[1:], sheet1.col(2)[1:])
]

In [33]:
analyze(reviews, sick)

ROC_AUC SCORE :: 0.996294837238
True positive :: 0.523706896552
False negative :: 0.00933908045977
False positive :: 0.00646551724138
True negative :: 0.460488505747
FP / TP :: 0.0123456790123
FN / TN :: 0.0202808112324


In [34]:
# Second workbook: text lives in column 0 and the label in column 3.
sheet2 = xlrd.open_workbook('data/sick_test_preds.xlsx').sheet_by_index(0)
# Drop row 0 of the sheet (presumably the header row) before pairing cells.
reviews2 = [
    {"text": rev.value, "label": label.value}
    for rev, label in zip(sheet2.col(0)[1:], sheet2.col(3)[1:])
]


In [35]:
analyze(reviews2, sick)

ROC_AUC SCORE :: 0.999638616417
True positive :: 0.530465949821
False negative :: 0.00358422939068
False positive :: 0.0143369175627
True negative :: 0.451612903226
FP / TP :: 0.027027027027
FN / TN :: 0.00793650793651


In [36]:
sick.steps

[('count',
  CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
          dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
          lowercase=True, max_df=0.95, max_features=None, min_df=1,
          ngram_range=(1, 3), preprocessor=None, stop_words=None,
          strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
          tokenizer=None, vocabulary=None)),
 ('tfidf',
  TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
 ('log',
  LogisticRegression(C=100, class_weight=None, dual=True, fit_intercept=True,
            intercept_scaling=0.01, max_iter=100, multi_class='ovr',
            n_jobs=1, penalty='l2', random_state=57, solver='liblinear',
            tol=0.0001, verbose=0, warm_start=False))]

In [37]:
# Keywords to watch for: "poisoning", "sick".
# Tokens treated as pure negation: not, n't, no, none, nobody, neither.
# Idea: when a negation word and a keyword share the same syntactic head,
# put "not" directly in front of the keyword and drop the original negation
# word, so that an n-gram model can see the negation next to the keyword.
# Known limitation: double negatives such as "no one didn't get food
# poisoning" are not handled correctly; this is only a first step.
from spacy import attrs
example = u"I hope none of us gets sick tonight. I didn't order food poisoning. I do not think you should come here because I got food poisoning."
parsedEx = parser(example)
# Dump each token with its dependency label, its head and its children so the
# parse structure of the example can be inspected by eye.
for token in parsedEx:
    print token.orth_, token.dep_, token.head, [t.orth_ for t in token.children]

I nsubj hope  []
hope ROOT hope  [u'I', u'gets', u'.']
none nsubj gets  [u'of']
of prep none  [u'us']
us pobj of  []
gets ccomp hope  [u'none', u'sick', u'tonight']
sick acomp gets  []
tonight npadvmod gets  []
. punct hope  []
I nsubj order  []
did aux order  []
n't neg order  []
order ROOT order  [u'I', u'did', u"n't", u'poisoning', u'.']
food compound poisoning []
poisoning dobj order  [u'food']
. punct order  []
I nsubj think  []
do aux think  []
not neg think  []
think ROOT think  [u'I', u'do', u'not', u'come', u'.']
you nsubj come  []
should aux come  []
come ccomp think  [u'you', u'should', u'here', u'got']
here advmod come  []
because mark got  []
I nsubj got  []
got advcl come  [u'because', u'I', u'poisoning']
food compound poisoning []
poisoning dobj got  [u'food']
. punct think  []


In [38]:
def transform_doc_1(doc):
    """Move a sentence's negation word directly in front of its first keyword.

    The document is lower-cased and parsed with the module-level ``parser``.
    A sentence is rewritten only if its root token has a negation word among
    its children and the sentence contains a keyword ('poisoning' or 'sick').
    In that case the first negation word of the sentence is removed and the
    token "not" is placed immediately before the first keyword.

    Parameters
    ----------
    doc : unicode
        Raw document text.

    Returns
    -------
    unicode
        All tokens of all sentences joined by single spaces (punctuation
        therefore ends up space-separated).

    Notes
    -----
    NOTE(review): the negation that gets removed is the first negation word
    anywhere in the sentence, which is not necessarily the one attached to
    the root -- confirm this is intended.
    """
    kw = ['poisoning', 'sick']
    neg = ['not', "n't", 'no', 'none', 'nobody', 'neither']
    parsedDoc = parser(doc.lower())
    sents = list(parsedDoc.sents)
    tokens = [[t.orth_ for t in s] for s in sents]  # per-sentence words, edited below

    for sent, sent_tokens in zip(sents, tokens):
        # each span has only one root; skip unless the root itself is negated
        if not any(c.orth_ in neg for c in sent.root.children):
            continue

        neg_positions = [j for j, word in enumerate(sent_tokens) if word in neg]
        kw_positions = [j for j, word in enumerate(sent_tokens) if word in kw]
        if not neg_positions or not kw_positions:
            continue

        neg_i = neg_positions[0]
        kw_i = kw_positions[0]

        # Remove the negation first, then insert "not". Doing it the other way
        # round shifts the negation by one whenever it follows the keyword, so
        # the wrong token would be removed.
        sent_tokens.pop(neg_i)
        if neg_i < kw_i:
            kw_i -= 1  # the keyword moved left when the negation was removed
        sent_tokens.insert(kw_i, "not")

    # join words within each sentence, then the sentences, with single spaces
    return " ".join(" ".join(sent_tokens) for sent_tokens in tokens)

print transform_doc_1(example)

i hope none of us gets sick tonight . i did order food not poisoning . i do think you should come here because i got food not poisoning .


In [39]:
def transform_doc_2(doc):
    """Move a negation word in front of a keyword that shares its syntactic head.

    The document is lower-cased and parsed with the module-level ``parser``.
    For every sentence containing both a negation word and a keyword
    ('poisoning' or 'sick'), all (negation, keyword) pairs whose tokens have
    the same head are examined. If a pair is selected, that negation word is
    removed and "not" is placed immediately before that keyword.

    Parameters
    ----------
    doc : unicode
        Raw document text.

    Returns
    -------
    unicode
        All tokens of all sentences joined by single spaces.

    Notes
    -----
    Double-negative handling is a parity toggle over matching pairs: the flag
    flips on every pair that shares a head, and the sentence is rewritten
    only when an odd number of pairs matched (using the last pair that
    switched the flag off).
    NOTE(review): because pairs rather than negation words are counted, one
    negation sharing a head with two keywords is also treated as a double
    negative -- confirm whether that is intended.
    """
    kw = ['poisoning', 'sick']
    neg = ['not', "n't", 'no', 'none', 'nobody', 'neither']
    parsedDoc = parser(doc.lower())
    sents = list(parsedDoc.sents)
    tokens = [[t.orth_ for t in s] for s in sents]  # per-sentence words, edited below

    for sent, sent_tokens in zip(sents, tokens):
        neg_list = [j for j, word in enumerate(sent_tokens) if word in neg]
        kw_list = [j for j, word in enumerate(sent_tokens) if word in kw]
        if not neg_list or not kw_list:
            continue

        # attempt at handling double negatives (see Notes in the docstring)
        double_negative = True
        kw_i = -1
        neg_i = -1
        for j in neg_list:
            for k in kw_list:
                if not (sent[j].head == sent[k].head):
                    continue
                if double_negative:
                    neg_i = j
                    kw_i = k
                    double_negative = False
                else:
                    double_negative = True

        if double_negative:
            continue

        # Remove the negation first, then insert "not". Doing it the other way
        # round shifts the negation by one whenever it follows the keyword, so
        # the wrong token would be removed.
        sent_tokens.pop(neg_i)
        if neg_i < kw_i:
            kw_i -= 1  # the keyword moved left when the negation was removed
        sent_tokens.insert(kw_i, "not")

    # join words within each sentence, then the sentences, with single spaces
    return " ".join(" ".join(sent_tokens) for sent_tokens in tokens)

print transform_doc_2(example)

i hope of us gets not sick tonight . i did order food not poisoning . i do not think you should come here because i got food poisoning .


In [40]:
from sklearn.base import BaseEstimator, TransformerMixin
class NegationTransformer(TransformerMixin, BaseEstimator):
    """Brings negation words closer to relevant key terms so an n-gram model can detect them.

    Stateless scikit-learn transformer: ``fit`` learns nothing and
    ``transform`` applies ``transform_doc_2`` to every document.

    Inheriting from ``BaseEstimator`` supplies ``get_params``/``set_params``,
    which scikit-learn needs in order to clone a pipeline containing this step
    (for example inside ``GridSearchCV`` or cross-validation helpers).
    """

    def fit(self, X, y=None, **fit_params):
        """No-op fit; returns ``self`` so the step can be chained in a Pipeline."""
        return self

    def transform(self, X, **transform_params):
        """Return a numpy array holding the negation-rewritten form of each document in ``X``."""
        return np.array([transform_doc_2(doc) for doc in X])
    

In [41]:
from sklearn.pipeline import Pipeline
# Put the negation rewrite in front of the loaded `sick` pipeline. The same
# `sick` object is reused here (not copied), so `pipe` and `sick` share state.
pipe = Pipeline([('negTransformer', NegationTransformer()), ('oldPipe', sick)])
# Bare expression so the notebook displays the combined pipeline's repr.
pipe

Pipeline(steps=[('negTransformer', <__main__.NegationTransformer object at 0x175ce7150>), ('oldPipe', Pipeline(steps=[('count', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.95,...y='l2', random_state=57, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))]))])

In [44]:
%time analyze(reviews, pipe)

ROC_AUC SCORE :: 0.996213974705
True positive :: 0.524425287356
False negative :: 0.00862068965517
False positive :: 0.00718390804598
True negative :: 0.459770114943
FP / TP :: 0.013698630137
FN / TN :: 0.01875
CPU times: user 20.6 s, sys: 1.48 s, total: 22 s
Wall time: 23.5 s


In [45]:
%time analyze(reviews2, pipe)

ROC_AUC SCORE :: 0.999638616417
True positive :: 0.530465949821
False negative :: 0.00358422939068
False positive :: 0.0179211469534
True negative :: 0.448028673835
FP / TP :: 0.0337837837838
FN / TN :: 0.008
CPU times: user 4.1 s, sys: 45.7 ms, total: 4.15 s
Wall time: 4.19 s


In [46]:
#try refitting the pipeline?
import copy

# Each pipeline gets its own deep copy of the loaded model. Wrapping the same
# `sick` object in both would make a refit of either pipeline overwrite the
# loaded model and every other pipeline that references it.
pipe1 = Pipeline([('negTransformer', NegationTransformer()), ('oldPipe', copy.deepcopy(sick))])
pipe2 = Pipeline([('negTransformer', NegationTransformer()), ('oldPipe', copy.deepcopy(sick))])

In [47]:
#fitting against reviews from scratch
from sklearn import cross_validation
# Texts and labels of the first review set, kept in parallel lists.
data = {
    'X': [review['text'] for review in reviews],
    'y': [review['label'] for review in reviews],
}
# Three shuffled, label-stratified folds with a fixed seed.
folds = cross_validation.StratifiedKFold(
    data['y'],
    n_folds=3,
    shuffle=True,
    random_state=0,
)

In [48]:
#from yelp classifier training notebook
from sklearn import metrics
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def my_roc_auc(ground_truth, predictions):
    """Micro-averaged ROC AUC of ``predictions`` measured against ``ground_truth``.

    Both arguments are converted to numpy arrays before being scored.
    """
    y_true, y_score = np.array(ground_truth), np.array(predictions)
    return metrics.roc_auc_score(y_true, y_score, average='micro')

# Wrap my_roc_auc as a scorer; needs_threshold=True asks scikit-learn to score
# continuous decision values rather than hard class predictions.
my_roc_auc_scorer = metrics.make_scorer(my_roc_auc, needs_threshold=True, greater_is_better=True)
# Feature Extractors
# Word-level counts; ngram_range here is only a starting value because the
# grid below searches over it.
cv = CountVectorizer(
        input=u'content', 
        encoding=u'utf-8', 
        decode_error=u'strict', 
        strip_accents='unicode', 
        lowercase=True,
        analyzer=u'word', 
        preprocessor=None, 
        tokenizer=None, 
        stop_words='english', 
        #token_pattern=u'(?u)\\b\w\w+\b', # one alphanumeric is a token
        ngram_range=(1, 2), 
        max_df=.9, 
        min_df=2, 
        max_features=None, 
        vocabulary=None, 
        binary=False, 
        #dtype=type 'numpy.int64'>
        )
from sklearn.feature_extraction.text import TfidfTransformer
# TF-IDF re-weighting; norm, use_idf and sublinear_tf are all searched below.
tf = TfidfTransformer(
        norm='l2',
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=False
)

# Final Classifier
# NOTE(review): MultinomialNB is imported but not used in this cell.
from sklearn.naive_bayes import MultinomialNB
# C here is only a starting value; the grid below searches over it.
lr = LogisticRegression(C=.05,
                        fit_intercept=True,
                        random_state=0,
                        class_weight='balanced',
                        n_jobs=-1)

# Count -> TF-IDF -> logistic regression. This pipeline does not include the
# NegationTransformer step.
pipeline = Pipeline([
    ('count', cv),
    ('tfidf', tf),
    ('logreg', lr)
    ])

# 3 * 2 * 2 * 2 * 3 = 72 parameter combinations.
param_grid = {
    'count__ngram_range':[(1,1),(1,2),(1,3)],
    'tfidf__norm':['l1', 'l2'],
    'tfidf__use_idf':[True, False],
    'tfidf__sublinear_tf':[True,False],
    'logreg__C':[.001, .01, .1]
}
# Exhaustive search over param_grid, evaluated on the precomputed `folds`
# with the ROC AUC scorer defined above.
grid_search = GridSearchCV(pipeline, 
                           param_grid,
                           cv = folds,
                           scoring=my_roc_auc_scorer,
                           n_jobs=-1, verbose=1)

In [49]:
# Fit every parameter combination on the precomputed folds.
grid_search.fit(np.array(data['X']), data['y'])
# Blank separator line. print("") is used instead of print() because under
# the Python 2 print statement a bare print() emits "()" rather than an empty
# line.
print("")

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
# Report only the parameters that were part of the search grid.
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed:  2.0min finished


()
Best score: 0.875
Best parameters set:
	count__ngram_range: (1, 3)
	logreg__C: 0.1
	tfidf__norm: 'l2'
	tfidf__sublinear_tf: True
	tfidf__use_idf: True


In [50]:
%time analyze(reviews, grid_search.best_estimator_)

ROC_AUC SCORE :: 0.962345013477
True positive :: 0.451149425287
False negative :: 0.0818965517241
False positive :: 0.0337643678161
True negative :: 0.433189655172
FP / TP :: 0.0748407643312
FN / TN :: 0.189054726368
CPU times: user 867 ms, sys: 23.4 ms, total: 890 ms
Wall time: 908 ms


In [51]:
%time analyze(reviews2, grid_search.best_estimator_)

ROC_AUC SCORE :: 0.960247805885
True positive :: 0.469534050179
False negative :: 0.0645161290323
False positive :: 0.0465949820789
True negative :: 0.41935483871
FP / TP :: 0.0992366412214
FN / TN :: 0.153846153846
CPU times: user 183 ms, sys: 13.1 ms, total: 197 ms
Wall time: 194 ms
