In [2]:
from spacy.en import English
parser = English()

In [3]:
import foodbornenyc.models.models as models
from foodbornenyc.models.businesses import Business, business_category_table
from foodbornenyc.models.documents import YelpReview, Tweet, Document
from foodbornenyc.models.locations import Location
from foodbornenyc.models.metadata import metadata

In [4]:
import xlrd
f = xlrd.open_workbook('data/yelp_sick_classifier_data.xlsx')
sheet1 = f.sheet_by_index(0)

In [5]:
from sklearn.externals import joblib
from foodbornenyc.settings import yelp_classify_config as config

sick = joblib.load("../"+config['model_file'])
sick

Pipeline(steps=[('count', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.95, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        ...alty='l2', random_state=57, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))])

In [6]:
import numpy as np
from sklearn.metrics import roc_auc_score
def analyze(reviews, classifier):
    textonly = [a["text"] for a in reviews]
    predictions_new = classifier.predict_proba(textonly)
    label = np.array([review['label'] for review in reviews])
    pred = np.array([pred[1] for pred in predictions_new])
    print "ROC_AUC SCORE ::", roc_auc_score(label, pred, average='micro')
    # determine true/false positive/negative rates
    tp_rate = 0.0
    fp_rate = 0.0
    tn_rate = 0.0
    fn_rate = 0.0

    for review, pred in zip(reviews, predictions_new):
        if review['label'] == 1.0 and pred[1] > 0.5: tp_rate += 1
        elif review['label'] == 1.0 and pred[1] < 0.5: fn_rate += 1
        elif review['label'] == 0.0 and pred[1] > 0.5: fp_rate += 1
        elif review['label'] == 0.0 and pred[1] < 0.5: tn_rate += 1
    tp_rate /= len(reviews)
    fn_rate /= len(reviews)
    fp_rate /= len(reviews)
    tn_rate /= len(reviews)
    print "True positive ::", tp_rate
    print "False negative ::", fn_rate
    print "False positive ::", fp_rate
    print "True negative ::", tn_rate
    print "FP / TP ::", fp_rate / tp_rate
    print "FN / TN ::", fn_rate / tn_rate

In [7]:
reviews = []
for i, (rev, label) in enumerate(zip(sheet1.col(1), sheet1.col(2))):
    if i == 0: continue
    reviews.append({"text":rev.value, "label":label.value})

In [8]:
analyze(reviews, sick)

ROC_AUC SCORE :: 0.996294837238
True positive :: 0.523706896552
False negative :: 0.00933908045977
False positive :: 0.00646551724138
True negative :: 0.460488505747
FP / TP :: 0.0123456790123
FN / TN :: 0.0202808112324


In [9]:
sheet2 = xlrd.open_workbook('data/sick_test_preds.xlsx').sheet_by_index(0)
reviews2 = []
for i, (rev, label) in enumerate(zip(sheet2.col(0), sheet2.col(3))):
    if i == 0: continue
    reviews2.append({"text":rev.value, "label":label.value})


In [10]:
analyze(reviews2, sick)

ROC_AUC SCORE :: 0.999638616417
True positive :: 0.530465949821
False negative :: 0.00358422939068
False positive :: 0.0143369175627
True negative :: 0.451612903226
FP / TP :: 0.027027027027
FN / TN :: 0.00793650793651


In [11]:
sick.steps

[('count',
  CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
          dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
          lowercase=True, max_df=0.95, max_features=None, min_df=1,
          ngram_range=(1, 3), preprocessor=None, stop_words=None,
          strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
          tokenizer=None, vocabulary=None)),
 ('tfidf',
  TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
 ('log',
  LogisticRegression(C=100, class_weight=None, dual=True, fit_intercept=True,
            intercept_scaling=0.01, max_iter=100, multi_class='ovr',
            n_jobs=1, penalty='l2', random_state=57, solver='liblinear',
            tol=0.0001, verbose=0, warm_start=False))]

In [12]:
# key words to watch out for: poisoning, sick, 
# tokens that perform strictly negation: not, n't, no, none, nobody, neither, 
# if negation word's head == key word's head, prepend key word with "not" and remove negation word
# only potential issue is double negative, e.g. "no one didn't get food poisoning", but this is a first step
from spacy import attrs
example = u"I hope none of us gets sick tonight. I didn't order food poisoning. I do not think you should come here because I got food poisoning."
parsedEx = parser(example)
for token in parsedEx:
    print token.orth_, token.dep_, token.head, [t.orth_ for t in token.children]

I nsubj hope  []
hope ROOT hope  [u'I', u'gets', u'.']
none nsubj gets  [u'of']
of prep none  [u'us']
us pobj of  []
gets ccomp hope  [u'none', u'sick', u'tonight']
sick acomp gets  []
tonight npadvmod gets  []
. punct hope  []
I nsubj order  []
did aux order  []
n't neg order  []
order ROOT order  [u'I', u'did', u"n't", u'poisoning', u'.']
food compound poisoning []
poisoning dobj order  [u'food']
. punct order  []
I nsubj think  []
do aux think  []
not neg think  []
think ROOT think  [u'I', u'do', u'not', u'come', u'.']
you nsubj come  []
should aux come  []
come ccomp think  [u'you', u'should', u'here', u'got']
here advmod come  []
because mark got  []
I nsubj got  []
got advcl come  [u'because', u'I', u'poisoning']
food compound poisoning []
poisoning dobj got  [u'food']
. punct think  []


In [13]:
def transform_doc_1(doc): 
    #if root of sentence had negation and sentence contained kw
    kw = ['poisoning', 'sick']
    neg = ['not', "n't", 'no', 'none', 'nobody', 'neither']
    parsedDoc = parser(doc.lower())
    tokens = [[t.orth_ for t in s] for s in parsedDoc.sents] #this will be modified
    sents = list(parsedDoc.sents)
    for i in range(len(sents)):
        # each span has only one root
        if not any([c.orth_ in neg for c in sents[i].root.children]): continue #if there's no negation
        neg_i = [j for j in range(len(sents[i])) if sents[i][j].orth_ in neg][0]
        kw_list = [j for j in range(len(sents[i])) if sents[i][j].orth_ in kw]
        
        if len(kw_list) == 0: continue
        
        kw_i = kw_list[0]
        
        #now modify
        tokens[i].insert(kw_i, "not")
        tokens[i].pop(neg_i)
    #now we join everything with spaces
    out = []
    for sent in tokens:
        out.append(" ".join(sent))
    return " ".join(out)

print transform_doc_1(example)

i hope none of us gets sick tonight . i did order food not poisoning . i do think you should come here because i got food not poisoning .


In [14]:
def transform_doc_2(doc): 
    #if negation and kw share a head
    kw = ['poisoning', 'sick']
    neg = ['not', "n't", 'no', 'none', 'nobody', 'neither']
    parsedDoc = parser(doc.lower())
    tokens = [[t.orth_ for t in s] for s in parsedDoc.sents] #this will be modified
    sents = list(parsedDoc.sents)
    for i in range(len(sents)):
        if not any([c.orth_ in neg for c in sents[i]]): continue #if there's no negation
        neg_list = [j for j in range(len(sents[i])) if sents[i][j].orth_ in neg]
        kw_list = [j for j in range(len(sents[i])) if sents[i][j].orth_ in kw]
        
        if len(kw_list) == 0: continue
        
        # attempt at handling double negatives
        double_negative = True
        kw_i = -1
        neg_i = -1
        
        for j in neg_list:
            for k in kw_list:
                if sents[i][j].head == sents[i][k].head and double_negative:
                    neg_i = j
                    kw_i = k
                    double_negative = False
                elif sents[i][j].head == sents[i][k].head and not double_negative:
                    double_negative = True
        
        if double_negative: continue
        
        #now modify
        tokens[i].insert(kw_i, "not")
        tokens[i].pop(neg_i)
    #now we join everything with spaces
    out = []
    for sent in tokens:
        out.append(" ".join(sent))
    return " ".join(out)

print transform_doc_2(example)

i hope of us gets not sick tonight . i did order food not poisoning . i do not think you should come here because i got food poisoning .


In [15]:
from sklearn.base import TransformerMixin
class NegationTransformer(TransformerMixin):
    """ Brings negation words closer to relevant key terms to make it detectable with n-gram detector """
    
    def transform(self, X, **transform_params):
        return np.array([transform_doc_2(doc) for doc in X])
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def get_params(self, deep=True):
        return {}
    

In [16]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([('negTransformer', NegationTransformer()), ('oldPipe', sick)])
pipe

Pipeline(steps=[('negTransformer', <__main__.NegationTransformer object at 0x1a0eac890>), ('oldPipe', Pipeline(steps=[('count', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.95,...y='l2', random_state=57, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))]))])

In [17]:
%time analyze(reviews, pipe)

ROC_AUC SCORE :: 0.996213974705
True positive :: 0.524425287356
False negative :: 0.00862068965517
False positive :: 0.00718390804598
True negative :: 0.459770114943
FP / TP :: 0.013698630137
FN / TN :: 0.01875
CPU times: user 22.3 s, sys: 342 ms, total: 22.6 s
Wall time: 23.5 s


In [18]:
%time analyze(reviews2, pipe)

ROC_AUC SCORE :: 0.999638616417
True positive :: 0.530465949821
False negative :: 0.00358422939068
False positive :: 0.0179211469534
True negative :: 0.448028673835
FP / TP :: 0.0337837837838
FN / TN :: 0.008
CPU times: user 4.27 s, sys: 70.6 ms, total: 4.34 s
Wall time: 4.4 s


In [19]:
#try refitting the pipeline?
pipe1 = Pipeline([('negTransformer', NegationTransformer()), ('oldPipe', sick)])
pipe2 = Pipeline([('negTransformer', NegationTransformer()), ('oldPipe', sick)])

In [20]:
#fitting against reviews from scratch
from sklearn import cross_validation
data = {}
data['X'] = [review['text'] for review in reviews]
data['y'] = [review['label'] for review in reviews]
#folds = cross_validation.StratifiedKFold(data['y'], n_folds=3, random_state=0, shuffle=True)

In [45]:
#from yelp classifier training notebook
from sklearn import metrics
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def my_roc_auc(ground_truth, predictions):
    ground_truth = np.array(ground_truth)
    predictions = np.array(predictions)
    return metrics.roc_auc_score(ground_truth, predictions, average='micro')

my_roc_auc_scorer = metrics.make_scorer(my_roc_auc, needs_threshold=True, greater_is_better=True)
# Feature Extractors
cv = CountVectorizer(
        input=u'content', 
        encoding=u'utf-8', 
        decode_error=u'strict', 
        strip_accents='unicode', 
        lowercase=True,
        analyzer=u'word', 
        preprocessor=None, 
        tokenizer=None, 
        stop_words='english', 
        #token_pattern=u'(?u)\\b\w\w+\b', # one alphanumeric is a token
        ngram_range=(1, 2), 
        max_df=.9, 
        min_df=2, 
        max_features=None, 
        vocabulary=None, 
        binary=False, 
        #dtype=type 'numpy.int64'>
        )
from sklearn.feature_extraction.text import TfidfTransformer
tf = TfidfTransformer(
        norm='l2',
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=False
)

# Final Classifier
from sklearn.naive_bayes import MultinomialNB
lr = LogisticRegression(C=.05,
                        fit_intercept=True,
                        random_state=0,
                        class_weight='balanced',
                        n_jobs=-1)

pipeline = Pipeline([
    ('negtransform', NegationTransformer()),
    ('count', cv),
    ('tfidf', tf),
    ('logreg', lr)
    ])

# param_grid = {
#     'count__ngram_range':[(1,1),(1,2),(1,3)],
#     'tfidf__norm':['l1', 'l2'],
#     'tfidf__use_idf':[True, False],
#     'tfidf__sublinear_tf':[True,False],
#     'logreg__C':[.001, .01, .1]
# }
param_grid = {
    'count__ngram_range': [(1, 3)],
    'count__max_df' : [ .95],
    'count__stop_words': [None],
    'count__lowercase' : [True],
    'count__max_features': [None],
    'tfidf__use_idf' : [True],
    'tfidf__norm': [('l2')],
    'logreg__C': [100],
    'logreg__dual' : [True],
    'logreg__fit_intercept': [True],
    'logreg__penalty': ['l2'],
    'logreg__intercept_scaling':[.01],
    'logreg__random_state': [57],
    'logreg__solver': ['liblinear']
}
grid_search = GridSearchCV(pipeline, 
                           param_grid,
                           #cv = folds,
                           scoring=my_roc_auc_scorer,
                           n_jobs=-1, verbose=1)

In [46]:
def split_dev_test(data, test_size=.2):
    train_data = {'X': [], 'y': [] }
    test_data = {'X': [], 'y': [] }
    for train, test in cross_validation.StratifiedShuffleSplit([a['label'] for a in reviews], n_iter=1, test_size=test_size, random_state=0):
        train_data['X'] = np.array([data[i]['text'] for i in train])
        train_data['y'] = np.array([data[i]['label'] for i in train])
        test_data['X'] = np.array([data[i]['text'] for i in test])
        test_data['y'] = np.array([data[i]['label'] for i in test])
        
    print "Training/Dev data shape: ", train_data['X'].shape, train_data['y'].shape
    print "Test data shape: ",test_data['X'].shape, test_data['y'].shape
    return train_data, test_data

train_data, test_data = split_dev_test(reviews)

Training/Dev data shape:  (1113,) (1113,)
Test data shape:  (279,) (279,)


In [47]:
%time grid_search.fit(train_data['X'], train_data['y'])
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
CPU times: user 19 s, sys: 601 ms, total: 19.6 s
Wall time: 50.5 s
()
Best score: 0.886
Best parameters set:
	count__lowercase: True
	count__max_df: 0.95
	count__max_features: None
	count__ngram_range: (1, 3)
	count__stop_words: None
	logreg__C: 100
	logreg__dual: True
	logreg__fit_intercept: True
	logreg__intercept_scaling: 0.01
	logreg__penalty: 'l2'
	logreg__random_state: 57
	logreg__solver: 'liblinear'
	tfidf__norm: 'l2'
	tfidf__use_idf: True


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   29.7s finished


In [55]:
def split_and_test(data, grid_search, test_size=.2):
    train_data = {'X': [], 'y': [] }
    test_data = {'X': [], 'y': [] }
    for train, test in cross_validation.StratifiedShuffleSplit([a['label'] for a in reviews], n_iter=1, test_size=test_size, random_state=0):
        train_data['X'] = np.array([data[i]['text'] for i in train])
        train_data['y'] = np.array([data[i]['label'] for i in train])
        test_data['X'] = np.array([data[i]['text'] for i in test])
        test_data['y'] = np.array([data[i]['label'] for i in test])
        %time grid_search.fit(train_data['X'], train_data['y'])
        print("Best score: %0.3f" % grid_search.best_score_)
        %time analyze([{'text': a, 'label': b} for (a,b) in zip(test_data['X'],test_data['y'])], grid_search.best_estimator_)

split_and_test(reviews, grid_search)
        

Fitting 3 folds for each of 1 candidates, totalling 3 fits
CPU times: user 20 s, sys: 576 ms, total: 20.6 s
Wall time: 48.6 s
Best score: 0.886
ROC_AUC SCORE :: 0.886112545173
True positive :: 0.433691756272
False negative :: 0.100358422939
False positive :: 0.10394265233
True negative :: 0.362007168459
FP / TP :: 0.239669421488
FN / TN :: 0.277227722772
CPU times: user 4.11 s, sys: 36 ms, total: 4.14 s
Wall time: 4.22 s


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   27.0s finished


In [59]:
param_grid_temp = {
    'count__ngram_range': [(1, 3)],
    'count__max_df' : [ .95],
    'count__stop_words': [None],
    'count__lowercase' : [True],
    'count__max_features': [None],
    'tfidf__use_idf' : [True],
    'tfidf__norm': [('l2')],
    'log__C': [100],
    'log__dual' : [True],
    'log__fit_intercept': [True],
    'log__penalty': ['l2'],
    'log__intercept_scaling':[.01],
    'log__random_state': [57],
    'log__solver': ['liblinear']
}
split_and_test(reviews, GridSearchCV(sick, 
                           param_grid_temp,
                           scoring=my_roc_auc_scorer,
                           n_jobs=1, verbose=1))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
CPU times: user 6.95 s, sys: 495 ms, total: 7.45 s
Wall time: 8.23 s
Best score: 0.890
ROC_AUC SCORE :: 0.887816210635
True positive :: 0.451612903226
False negative :: 0.0824372759857
False positive :: 0.114695340502
True negative :: 0.351254480287
FP / TP :: 0.253968253968
FN / TN :: 0.234693877551
CPU times: user 185 ms, sys: 4.49 ms, total: 189 ms
Wall time: 193 ms


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    6.0s finished


In [52]:
%time analyze([{'text': a, 'label': b} for (a,b) in zip(test_data['X'],test_data['y'])], grid_search.best_estimator_)
#TODO: format reviews in X and y arrays instead of individual objects...

ROC_AUC SCORE :: 0.886112545173
True positive :: 0.433691756272
False negative :: 0.100358422939
False positive :: 0.10394265233
True negative :: 0.362007168459
FP / TP :: 0.239669421488
FN / TN :: 0.277227722772
CPU times: user 4.17 s, sys: 50.7 ms, total: 4.22 s
Wall time: 4.31 s


In [53]:
%time analyze(reviews2, grid_search.best_estimator_)

ROC_AUC SCORE :: 0.886164171399
True positive :: 0.433691756272
False negative :: 0.100358422939
False positive :: 0.10394265233
True negative :: 0.362007168459
FP / TP :: 0.239669421488
FN / TN :: 0.277227722772
CPU times: user 4.26 s, sys: 45.2 ms, total: 4.31 s
Wall time: 4.4 s


In [54]:
def reveal_fp(reviews, classifier):
    prediction = classifier.predict_proba([a['text'] for a in reviews])
    for review, pred in zip(reviews, prediction):
        if review['label'] == 0.0 and pred[1] > 0.5:
            print pred[1]
            print review['text']
            print

reveal_fp(reviews2, grid_search.best_estimator_)

0.992694891784
After posting my original review, the NYC Health Dept contacted me and urged me to call 311 to report the food poisoning incident at Atlas Cafe.  I would like to clarify I did not contract food poisoning from Atlas Cafe.  I merely likened the urgency to post my one star review of Atlas to my urgency to use the bathroom when I did contract food poisoning from a wedding in Jersey.

0.996696994241
After reading all the rave reviews about Schnitzi, I finally decided to try it out. What a mistake!!!
The place is filthy, the menu prices are too high and the food sucks. I tried their double burger. They barely put tomatoes or pickles in my sandwich. The sauce wasn&#39;t that great, but the worst part was the meat. The burgers were two pieces of rubber that tasted horrible.
All night I was burping nasty burps that felt like I ate a whole rotten cow. The next day I was still sick from this crappy burger and all my clothes smelled of filthy food.
I don&#39;t recommend this place t

In [35]:
grid_search2 = GridSearchCV(sick, 
                           param_grid,
                           cv = folds,
                           scoring=my_roc_auc_scorer,
                           n_jobs=-1, verbose=1)
sick

Pipeline(steps=[('count', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.95, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        ...alty='l2', random_state=57, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))])

In [62]:
grid_search2.fit(np.array(data['X']), data['y'])
print()

print("Best score: %0.3f" % grid_search2.best_score_)
print("Best parameters set:")
best_parameters2 = grid_search2.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters2[param_name]))

Fitting 3 folds for each of 72 candidates, totalling 216 fits


JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py in _run_module_as_main(mod_name='ipykernel.__main__', alter_argv=1)
    157     pkg_name = mod_name.rpartition('.')[0]
    158     main_globals = sys.modules["__main__"].__dict__
    159     if alter_argv:
    160         sys.argv[0] = fname
    161     return _run_code(code, main_globals, None,
--> 162                      "__main__", fname, loader, pkg_name)
        fname = '/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/ipykernel/__main__.py'
        loader = <pkgutil.ImpLoader instance>
        pkg_name = 'ipykernel'
    163 
    164 def run_module(mod_name, init_globals=None,
    165                run_name=None, alter_sys=False):
    166     """Execute a module's code without importing it

...........................................................................
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py in _run_code(code=<code object <module> at 0x1028ef4b0, file "/Use...2.7/site-packages/ipykernel/__main__.py", line 1>, run_globals={'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/ipykernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'ipykernel', 'app': <module 'ipykernel.kernelapp' from '/Users/kevin...python2.7/site-packages/ipykernel/kernelapp.pyc'>}, init_globals=None, mod_name='__main__', mod_fname='/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/ipykernel/__main__.py', mod_loader=<pkgutil.ImpLoader instance>, pkg_name='ipykernel')
     67         run_globals.update(init_globals)
     68     run_globals.update(__name__ = mod_name,
     69                        __file__ = mod_fname,
     70                        __loader__ = mod_loader,
     71                        __package__ = pkg_name)
---> 72     exec code in run_globals
        code = <code object <module> at 0x1028ef4b0, file "/Use...2.7/site-packages/ipykernel/__main__.py", line 1>
        run_globals = {'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/ipykernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'ipykernel', 'app': <module 'ipykernel.kernelapp' from '/Users/kevin...python2.7/site-packages/ipykernel/kernelapp.pyc'>}
     73     return run_globals
     74 
     75 def _run_module_code(code, init_globals=None,
     76                     mod_name=None, mod_fname=None,

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/ipykernel/__main__.py in <module>()
      1 
      2 
----> 3 
      4 if __name__ == '__main__':
      5     from ipykernel import kernelapp as app
      6     app.launch_new_instance()
      7 
      8 
      9 
     10 

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    587         
    588         If a global instance already exists, this reinitializes and starts it
    589         """
    590         app = cls.instance(**kwargs)
    591         app.initialize(argv)
--> 592         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    593 
    594 #-----------------------------------------------------------------------------
    595 # utility functions, for convenience
    596 #-----------------------------------------------------------------------------

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    384     def start(self):
    385         if self.poller is not None:
    386             self.poller.start()
    387         self.kernel.start()
    388         try:
--> 389             ioloop.IOLoop.instance().start()
    390         except KeyboardInterrupt:
    391             pass
    392 
    393 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    146             PollIOLoop.configure(ZMQIOLoop)
    147         return PollIOLoop.instance()
    148     
    149     def start(self):
    150         try:
--> 151             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    152         except ZMQError as e:
    153             if e.errno == ETERM:
    154                 # quietly return on ETERM
    155                 pass

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    861                 self._events.update(event_pairs)
    862                 while self._events:
    863                     fd, events = self._events.popitem()
    864                     try:
    865                         fd_obj, handler_func = self._handlers[fd]
--> 866                         handler_func(fd_obj, events)
        handler_func = <function null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    867                     except (OSError, IOError) as e:
    868                         if errno_from_exception(e) == errno.EPIPE:
    869                             # Happens when the client closes the connection
    870                             pass

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    428             # dispatch events:
    429             if events & IOLoop.ERROR:
    430                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    431                 return
    432             if events & IOLoop.READ:
--> 433                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    434                 if not self.socket:
    435                     return
    436             if events & IOLoop.WRITE:
    437                 self._handle_send()

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    460                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    461         else:
    462             if self._recv_callback:
    463                 callback = self._recv_callback
    464                 # self._recv_callback = None
--> 465                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    466                 
    467         # self.update_state()
    468         
    469 

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    402         close our socket."""
    403         try:
    404             # Use a NullContext to ensure that all StackContexts are run
    405             # inside our blanket exception handler rather than outside.
    406             with stack_context.NullContext():
--> 407                 callback(*args, **kwargs)
        callback = <function null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    408         except:
    409             gen_log.error("Uncaught exception, closing connection.",
    410                           exc_info=True)
    411             # Close the socket on an uncaught exception from a user callback

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    247         if self.control_stream:
    248             self.control_stream.on_recv(self.dispatch_control, copy=False)
    249 
    250         def make_dispatcher(stream):
    251             def dispatcher(msg):
--> 252                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    253             return dispatcher
    254 
    255         for s in self.shell_streams:
    256             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {u'allow_stdin': True, u'code': u'grid_search2.fit(np.array(data[\'X\']), data[\...r" % (param_name, best_parameters2[param_name]))', u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {u'msg_id': u'1E98ACA7EBEF4F789B8396E4507419E2', u'msg_type': u'execute_request', u'session': u'15B18E86CB1040EAB3CDC14267BDB3E4', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'1E98ACA7EBEF4F789B8396E4507419E2', 'msg_type': u'execute_request', 'parent_header': {}})
    208         else:
    209             # ensure default_int_handler during handler call
    210             sig = signal(SIGINT, default_int_handler)
    211             self.log.debug("%s: %s", msg_type, msg)
    212             try:
--> 213                 handler(stream, idents, msg)
        handler = <bound method IPythonKernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = ['15B18E86CB1040EAB3CDC14267BDB3E4']
        msg = {'buffers': [], 'content': {u'allow_stdin': True, u'code': u'grid_search2.fit(np.array(data[\'X\']), data[\...r" % (param_name, best_parameters2[param_name]))', u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {u'msg_id': u'1E98ACA7EBEF4F789B8396E4507419E2', u'msg_type': u'execute_request', u'session': u'15B18E86CB1040EAB3CDC14267BDB3E4', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'1E98ACA7EBEF4F789B8396E4507419E2', 'msg_type': u'execute_request', 'parent_header': {}}
    214             except Exception:
    215                 self.log.error("Exception in message handler:", exc_info=True)
    216             finally:
    217                 signal(SIGINT, sig)

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=['15B18E86CB1040EAB3CDC14267BDB3E4'], parent={'buffers': [], 'content': {u'allow_stdin': True, u'code': u'grid_search2.fit(np.array(data[\'X\']), data[\...r" % (param_name, best_parameters2[param_name]))', u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {u'msg_id': u'1E98ACA7EBEF4F789B8396E4507419E2', u'msg_type': u'execute_request', u'session': u'15B18E86CB1040EAB3CDC14267BDB3E4', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'1E98ACA7EBEF4F789B8396E4507419E2', 'msg_type': u'execute_request', 'parent_header': {}})
    357         if not silent:
    358             self.execution_count += 1
    359             self._publish_execute_input(code, parent, self.execution_count)
    360 
    361         reply_content = self.do_execute(code, silent, store_history,
--> 362                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    363 
    364         # Flush output before sending the reply.
    365         sys.stdout.flush()
    366         sys.stderr.flush()

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code=u'grid_search2.fit(np.array(data[\'X\']), data[\...r" % (param_name, best_parameters2[param_name]))', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    170 
    171         reply_content = {}
    172         # FIXME: the shell calls the exception handler itself.
    173         shell._reply_content = None
    174         try:
--> 175             shell.run_cell(code, store_history=store_history, silent=silent)
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = u'grid_search2.fit(np.array(data[\'X\']), data[\...r" % (param_name, best_parameters2[param_name]))'
        store_history = True
        silent = False
    176         except:
    177             status = u'error'
    178             # FIXME: this code right now isn't being used yet by default,
    179             # because the run_cell() call above directly fires off exception

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell=u'grid_search2.fit(np.array(data[\'X\']), data[\...r" % (param_name, best_parameters2[param_name]))', store_history=True, silent=False, shell_futures=True)
   2897                 self.displayhook.exec_result = result
   2898 
   2899                 # Execute the user code
   2900                 interactivity = "none" if silent else self.ast_node_interactivity
   2901                 self.run_ast_nodes(code_ast.body, cell_name,
-> 2902                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler instance>
   2903 
   2904                 # Reset this so later displayed values do not modify the
   2905                 # ExecutionResult
   2906                 self.displayhook.exec_result = None

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Expr object>, <_ast.Print object>, <_ast.Print object>, <_ast.Print object>, <_ast.Assign object>, <_ast.For object>], cell_name='<ipython-input-62-d7cb5029435f>', interactivity='none', compiler=<IPython.core.compilerop.CachingCompiler instance>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   3001 
   3002         try:
   3003             for i, node in enumerate(to_run_exec):
   3004                 mod = ast.Module([node])
   3005                 code = compiler(mod, cell_name, "exec")
-> 3006                 if self.run_code(code, result):
        self.run_code = <bound method ZMQInteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x1a14e2db0, file "<ipython-input-62-d7cb5029435f>", line 1>
        result = <IPython.core.interactiveshell.ExecutionResult object>
   3007                     return True
   3008 
   3009             for i, node in enumerate(to_run_interactive):
   3010                 mod = ast.Interactive([node])

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x1a14e2db0, file "<ipython-input-62-d7cb5029435f>", line 1>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   3061         outflag = 1  # happens in more places, so it's easier as default
   3062         try:
   3063             try:
   3064                 self.hooks.pre_run_code_hook()
   3065                 #rprint('Running code', repr(code_obj)) # dbg
-> 3066                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x1a14e2db0, file "<ipython-input-62-d7cb5029435f>", line 1>
        self.user_global_ns = {'Business': <class 'foodbornenyc.models.businesses.Business'>, 'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'Document': <class 'foodbornenyc.models.documents.Document'>, 'English': <class 'spacy.en.English'>, 'GridSearchCV': <class 'sklearn.grid_search.GridSearchCV'>, 'In': ['', u"#from yelp classifier training notebook\nfrom ...                           n_jobs=-1, verbose=1)", u'from spacy.en import English\nparser = English()', u'import foodbornenyc.models.models as models\nf...rom foodbornenyc.models.metadata import metadata', u"import xlrd\nf = xlrd.open_workbook('data/yelp...sifier_data.xlsx')\nsheet1 = f.sheet_by_index(0)", u'from sklearn.externals import joblib\nfrom foo... joblib.load("../"+config[\'model_file\'])\nsick', u'import numpy as np\nfrom sklearn.metrics impor...t fp_rate / tp_rate\n    print fn_rate / tn_rate', u'reviews = []\nfor i, (rev, label) in enumerate...:rev.value, "label":label.value})\nprint reviews', u'analyze(reviews, sick)', u'sheet2 = xlrd.open_workbook(\'data/sick_test_p...rev.value, "label":label.value})\nprint reviews2', u'analyze(reviews2, sick)', u'sick.steps', u'# key words to watch out for: poisoning, sick,..._, token.head, [t.orth_ for t in token.children]', u'def transform_doc_1(doc): \n    #if root of se... " ".join(out)\n\nprint transform_doc_1(example)', u'def transform_doc_2(doc): \n    #if negation a... " ".join(out)\n\nprint transform_doc_2(example)', u'from sklearn.base import TransformerMixin\ncla...=None, **fit_params):\n        return self\n    ', u"from sklearn.pipeline import Pipeline\npipe = ...egationTransformer()), ('oldPipe', sick)])\npipe", u'analyze(reviews, pipe)', u'analyze(reviews2, pipe)', u"#try refitting the pipeline?\npipe1 = Pipeline...er', NegationTransformer()), ('oldPipe', sick)])", ...], 'Location': <class 'foodbornenyc.models.locations.Location'>, 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'MultinomialNB': <class 'sklearn.naive_bayes.MultinomialNB'>, 'NegationTransformer': <class '__main__.NegationTransformer'>, ...}
        self.user_ns = {'Business': <class 'foodbornenyc.models.businesses.Business'>, 'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'Document': <class 'foodbornenyc.models.documents.Document'>, 'English': <class 'spacy.en.English'>, 'GridSearchCV': <class 'sklearn.grid_search.GridSearchCV'>, 'In': ['', u"#from yelp classifier training notebook\nfrom ...                           n_jobs=-1, verbose=1)", u'from spacy.en import English\nparser = English()', u'import foodbornenyc.models.models as models\nf...rom foodbornenyc.models.metadata import metadata', u"import xlrd\nf = xlrd.open_workbook('data/yelp...sifier_data.xlsx')\nsheet1 = f.sheet_by_index(0)", u'from sklearn.externals import joblib\nfrom foo... joblib.load("../"+config[\'model_file\'])\nsick', u'import numpy as np\nfrom sklearn.metrics impor...t fp_rate / tp_rate\n    print fn_rate / tn_rate', u'reviews = []\nfor i, (rev, label) in enumerate...:rev.value, "label":label.value})\nprint reviews', u'analyze(reviews, sick)', u'sheet2 = xlrd.open_workbook(\'data/sick_test_p...rev.value, "label":label.value})\nprint reviews2', u'analyze(reviews2, sick)', u'sick.steps', u'# key words to watch out for: poisoning, sick,..._, token.head, [t.orth_ for t in token.children]', u'def transform_doc_1(doc): \n    #if root of se... " ".join(out)\n\nprint transform_doc_1(example)', u'def transform_doc_2(doc): \n    #if negation a... " ".join(out)\n\nprint transform_doc_2(example)', u'from sklearn.base import TransformerMixin\ncla...=None, **fit_params):\n        return self\n    ', u"from sklearn.pipeline import Pipeline\npipe = ...egationTransformer()), ('oldPipe', sick)])\npipe", u'analyze(reviews, pipe)', u'analyze(reviews2, pipe)', u"#try refitting the pipeline?\npipe1 = Pipeline...er', NegationTransformer()), ('oldPipe', sick)])", ...], 'Location': <class 'foodbornenyc.models.locations.Location'>, 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'MultinomialNB': <class 'sklearn.naive_bayes.MultinomialNB'>, 'NegationTransformer': <class '__main__.NegationTransformer'>, ...}
   3067             finally:
   3068                 # Reset our crash handler in place
   3069                 sys.excepthook = old_excepthook
   3070         except SystemExit as e:

...........................................................................
/Users/kevinzeng/FoodborneNYC/notebooks/<ipython-input-62-d7cb5029435f> in <module>()
----> 1 
      2 
      3 
      4 
      5 
      6 grid_search2.fit(np.array(data['X']), data['y'])
      7 print()
      8 
      9 print("Best score: %0.3f" % grid_search2.best_score_)
     10 print("Best parameters set:")
     11 best_parameters2 = grid_search2.best_estimator_.get_params()
     12 for param_name in sorted(param_grid.keys()):
     13     print("\t%s: %r" % (param_name, best_parameters2[param_name]))

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/sklearn/grid_search.py in fit(self=GridSearchCV(cv=sklearn.cross_validation.Stratif...rer(my_roc_auc, needs_threshold=True), verbose=1), X=array([ u'My friends and I ordered 3 burgers and...#worldcup #Newyorkbars.'], 
      dtype='<U4952'), y=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...])
    799         y : array-like, shape = [n_samples] or [n_samples, n_output], optional
    800             Target relative to X for classification or regression;
    801             None for unsupervised learning.
    802 
    803         """
--> 804         return self._fit(X, y, ParameterGrid(self.param_grid))
        self._fit = <bound method GridSearchCV._fit of GridSearchCV(...er(my_roc_auc, needs_threshold=True), verbose=1)>
        X = array([ u'My friends and I ordered 3 burgers and...#worldcup #Newyorkbars.'], 
      dtype='<U4952')
        y = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...]
        self.param_grid = {'count__ngram_range': [(1, 1), (1, 2), (1, 3)], 'logreg__C': [0.001, 0.01, 0.1], 'tfidf__norm': ['l1', 'l2'], 'tfidf__sublinear_tf': [True, False], 'tfidf__use_idf': [True, False]}
    805 
    806 
    807 class RandomizedSearchCV(BaseSearchCV):
    808     """Randomized search on hyper parameters.

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/sklearn/grid_search.py in _fit(self=GridSearchCV(cv=sklearn.cross_validation.Stratif...rer(my_roc_auc, needs_threshold=True), verbose=1), X=array([ u'My friends and I ordered 3 burgers and...#worldcup #Newyorkbars.'], 
      dtype='<U4952'), y=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...], parameter_iterable=<sklearn.grid_search.ParameterGrid object>)
    548         )(
    549             delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
    550                                     train, test, self.verbose, parameters,
    551                                     self.fit_params, return_parameters=True,
    552                                     error_score=self.error_score)
--> 553                 for parameters in parameter_iterable
        parameters = undefined
        parameter_iterable = <sklearn.grid_search.ParameterGrid object>
    554                 for train, test in cv)
    555 
    556         # Out is a list of triplet: score, estimator, n_test_samples
    557         n_fits = len(out)

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object <genexpr>>)
    805             if pre_dispatch == "all" or n_jobs == 1:
    806                 # The iterable was consumed all at once by the above for loop.
    807                 # No need to wait for async callbacks to trigger to
    808                 # consumption.
    809                 self._iterating = False
--> 810             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    811             # Make sure that we get a last message telling us we are done
    812             elapsed_time = time.time() - self._start_time
    813             self._print('Done %3i out of %3i | elapsed: %s finished',
    814                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Thu May  5 21:20:01 2016
PID: 49031     Python 2.7.9: /Users/kevinzeng/.virtualenvs/fbnyc/bin/python
...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
     67     def __init__(self, iterator_slice):
     68         self.items = list(iterator_slice)
     69         self._size = len(self.items)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (Pipeline(steps=[('count', CountVectorizer(analyz...      tol=0.0001, verbose=0, warm_start=False))]), memmap([ u'My friends and I ordered 3 burgers an...#worldcup #Newyorkbars.'], 
      dtype='<U4952'), [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...], make_scorer(my_roc_auc, needs_threshold=True), array([   0,    3,    6,    7,    8,   10,   11,...1383, 1384, 1386, 1388,
       1389, 1390, 1391]), array([   1,    2,    4,    5,    9,   12,   14,...1373, 1376, 1378, 1379,
       1380, 1385, 1387]), 1, {'count__ngram_range': (1, 1), 'logreg__C': 0.001, 'tfidf__norm': 'l1', 'tfidf__sublinear_tf': True, 'tfidf__use_idf': True}, {})
        kwargs = {'error_score': 'raise', 'return_parameters': True}
        self.items = [(<function _fit_and_score>, (Pipeline(steps=[('count', CountVectorizer(analyz...      tol=0.0001, verbose=0, warm_start=False))]), memmap([ u'My friends and I ordered 3 burgers an...#worldcup #Newyorkbars.'], 
      dtype='<U4952'), [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...], make_scorer(my_roc_auc, needs_threshold=True), array([   0,    3,    6,    7,    8,   10,   11,...1383, 1384, 1386, 1388,
       1389, 1390, 1391]), array([   1,    2,    4,    5,    9,   12,   14,...1373, 1376, 1378, 1379,
       1380, 1385, 1387]), 1, {'count__ngram_range': (1, 1), 'logreg__C': 0.001, 'tfidf__norm': 'l1', 'tfidf__sublinear_tf': True, 'tfidf__use_idf': True}, {}), {'error_score': 'raise', 'return_parameters': True})]
     73 
     74     def __len__(self):
     75         return self._size
     76 

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/sklearn/cross_validation.py in _fit_and_score(estimator=Pipeline(steps=[('count', CountVectorizer(analyz...      tol=0.0001, verbose=0, warm_start=False))]), X=memmap([ u'My friends and I ordered 3 burgers an...#worldcup #Newyorkbars.'], 
      dtype='<U4952'), y=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...], scorer=make_scorer(my_roc_auc, needs_threshold=True), train=array([   0,    3,    6,    7,    8,   10,   11,...1383, 1384, 1386, 1388,
       1389, 1390, 1391]), test=array([   1,    2,    4,    5,    9,   12,   14,...1373, 1376, 1378, 1379,
       1380, 1385, 1387]), verbose=1, parameters={'count__ngram_range': (1, 1), 'logreg__C': 0.001, 'tfidf__norm': 'l1', 'tfidf__sublinear_tf': True, 'tfidf__use_idf': True}, fit_params={}, return_train_score=False, return_parameters=True, error_score='raise')
   1515     fit_params = fit_params if fit_params is not None else {}
   1516     fit_params = dict([(k, _index_param_value(X, v, train))
   1517                       for k, v in fit_params.items()])
   1518 
   1519     if parameters is not None:
-> 1520         estimator.set_params(**parameters)
        estimator.set_params = <bound method Pipeline.set_params of Pipeline(st...     tol=0.0001, verbose=0, warm_start=False))])>
        parameters = {'count__ngram_range': (1, 1), 'logreg__C': 0.001, 'tfidf__norm': 'l1', 'tfidf__sublinear_tf': True, 'tfidf__use_idf': True}
   1521 
   1522     start_time = time.time()
   1523 
   1524     X_train, y_train = _safe_split(estimator, X, y, train)

...........................................................................
/Users/kevinzeng/.virtualenvs/fbnyc/lib/python2.7/site-packages/sklearn/base.py in set_params(self=Pipeline(steps=[('count', CountVectorizer(analyz...      tol=0.0001, verbose=0, warm_start=False))]), **params={'count__ngram_range': (1, 1), 'logreg__C': 0.001, 'tfidf__norm': 'l1', 'tfidf__sublinear_tf': True, 'tfidf__use_idf': True})
    256                 name, sub_name = split
    257                 if name not in valid_params:
    258                     raise ValueError('Invalid parameter %s for estimator %s. '
    259                                      'Check the list of available parameters '
    260                                      'with `estimator.get_params().keys()`.' %
--> 261                                      (name, self))
        name = 'logreg'
        self = Pipeline(steps=[('count', CountVectorizer(analyz...      tol=0.0001, verbose=0, warm_start=False))])
    262                 sub_object = valid_params[name]
    263                 sub_object.set_params(**{sub_name: value})
    264             else:
    265                 # simple objects case

ValueError: Invalid parameter logreg for estimator Pipeline(steps=[('count', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.95, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        ...alty='l2', random_state=57, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))]). Check the list of available parameters with `estimator.get_params().keys()`.
___________________________________________________________________________