In [5]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import twokenize.twokenize as tokenizer

from pipelines.helpers import ItemGetter

def make_classifier():
    """Build a new, unfitted text-classification pipeline.

    The pipeline has three steps, in order:

    * ``"getter"`` -- ``ItemGetter("text")``
    * ``"tfidf"``  -- ``TfidfVectorizer`` using ``tokenizer.tokenize``,
      n-grams from 1 to 3 tokens, capped at 200000 features
    * ``"clf"``    -- ``LogisticRegression`` with an L2 penalty, ``C=200``,
      primal formulation, 100 iterations max, one-vs-rest

    Returns:
        The configured ``Pipeline``; every call creates an independent one.
    """
    vectorizer_settings = dict(
        tokenizer=tokenizer.tokenize,
        ngram_range=(1, 3),
        max_features=200000,
    )
    model_settings = dict(
        C=200,
        dual=False,
        max_iter=100,
        multi_class='ovr',
        penalty='l2',
    )

    pipeline = Pipeline([
        ("getter", ItemGetter("text")),
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression()),
    ])

    # Address each option to its step with the "<step>__<param>" convention.
    routed = {}
    for step_name, settings in (("clf", model_settings),
                                ("tfidf", vectorizer_settings)):
        for option, value in settings.items():
            routed["{}__{}".format(step_name, option)] = value

    pipeline.set_params(**routed)
    return pipeline

In [9]:
# One independent, unfitted pipeline per task (created in the order
# alcohol, first-person, first-person-label).
clf_alc, clf_fpa, clf_fpl = (make_classifier() for _ in range(3))

In [27]:
# Replace the two first-person classifiers with fresh, unfitted pipelines.
clf_fpa, clf_fpl = make_classifier(), make_classifier()

In [8]:
%%time

from data import DataAccess, LabelGetter

# Load the full dataset (presumably a pandas DataFrame, going by the method
# name) and wrap it in a LabelGetter, which the later cells query for the
# per-task (X, y) pairs.
# NOTE: `X` is rebound by later cells to the per-task features.
X = DataAccess.get_as_dataframe()
L = LabelGetter(X)

Wall time: 29.6 s


In [11]:
%%time
# Fit the alcohol classifier on everything L.get_alcohol() returns, i.e.
# without holding anything out.
# NOTE(review): a later cell refits clf_alc on a training split only, which
# supersedes this fit.
clf_alc.fit(*L.get_alcohol())

Pipeline(steps=[('getter', ItemGetter(key='text')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=200000, min_df=1,
        ngram_range=(1, 3), norm='l2',...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [14]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20. Prefer its replacement and fall back only on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [12]:
# Inputs and labels for the alcohol task.
# NOTE: this rebinds `X`, which until now held the frame returned by
# DataAccess.get_as_dataframe().
X, y = L.get_alcohol()

In [15]:
# Hold out 33% of the alcohol data for testing; the seed is fixed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=23,
)

In [16]:
# Refit the alcohol classifier on the training split only.
clf_alc.fit(X_train, y_train)

Pipeline(steps=[('getter', ItemGetter(key='text')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=200000, min_df=1,
        ngram_range=(1, 3), norm='l2',...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [18]:
from classification.reporting import ClassificationReporting

In [21]:
# Bundle the fitted alcohol classifier with its train/test split for reporting.
# NOTE(review): the trailing 2 is presumably the number of classes (the
# three-class cell passes 3) -- confirm against ClassificationReporting.
reporting = ClassificationReporting(clf_alc, X_train, X_test, y_train, y_test, 2)

In [25]:
# Name the run, tag it with the 'alc' level and build the report; the
# training/testing metrics are printed below the cell.
# NOTE(review): the meaning of the argument 1 is not visible here -- confirm.
report = reporting.set_name("Test Classifier").set_level('alc').create_report(1)



Training Results
~~~~~~~~~~~~~~~~
confusion_matrix
[[3632, 0], [0, 6853]]


classification_report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      3632
          1       1.00      1.00      1.00      6853

avg / total       1.00      1.00      1.00     10485



f1_score
1.0


accuracy_score
1.0




Testing Results Results
~~~~~~~~~~~~~~~~~~~~~~~
confusion_matrix
[[888, 835], [161, 3281]]


classification_report
             precision    recall  f1-score   support

          0       0.85      0.52      0.64      1723
          1       0.80      0.95      0.87      3442

avg / total       0.81      0.81      0.79      5165



f1_score
0.8682191055834877


accuracy_score
0.8071636011616651






In [29]:
# First-person task: fetch the data, split, fit, report.
X, y = L.get_first_person()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=24, test_size=0.33
)
clf_fpa.fit(X_train, y_train)

reporting = ClassificationReporting(
    clf_fpa, X_train, X_test, y_train, y_test, 2
)
report = (
    reporting
    .set_name("Test Classifier")
    .set_level('fpa')
    .create_report(1)
)



Training Results
~~~~~~~~~~~~~~~~
confusion_matrix
[[2338, 3], [0, 4235]]


classification_report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      2341
          1       1.00      1.00      1.00      4235

avg / total       1.00      1.00      1.00      6576



f1_score
0.9996459341437508


accuracy_score
0.9995437956204379




Testing Results Results
~~~~~~~~~~~~~~~~~~~~~~~
confusion_matrix
[[448, 670], [313, 1809]]


classification_report
             precision    recall  f1-score   support

          0       0.59      0.40      0.48      1118
          1       0.73      0.85      0.79      2122

avg / total       0.68      0.70      0.68      3240



f1_score
0.786350793305803


accuracy_score
0.696604938271605






In [30]:
# First-person-label task (three classes): fetch the data, split, fit, report.
X, y = L.get_first_person_label()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=25, test_size=0.33
)
clf_fpl.fit(X_train, y_train)

reporting = ClassificationReporting(
    clf_fpl, X_train, X_test, y_train, y_test, 3
)
report = (
    reporting
    .set_name("Test Classifier")
    .set_level('fpl')
    .create_report(1)
)

Training Results
~~~~~~~~~~~~~~~~
confusion_matrix
[[2166, 0, 0], [0, 1131, 0], [0, 0, 962]]


classification_report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      2166
          1       1.00      1.00      1.00      1131
          2       1.00      1.00      1.00       962

avg / total       1.00      1.00      1.00      4259



f1_score
1.0


accuracy_score
1.0




Testing Results Results
~~~~~~~~~~~~~~~~~~~~~~~
confusion_matrix
[[800, 172, 149], [139, 343, 63], [149, 62, 221]]


classification_report
             precision    recall  f1-score   support

          0       0.74      0.71      0.72      1121
          1       0.59      0.63      0.61       545
          2       0.51      0.51      0.51       432

avg / total       0.65      0.65      0.65      2098



f1_score
0.6510548553743395


accuracy_score
0.6501429933269781






In [11]:
import pickle
import pandas as pd

In [33]:
folder = 'simple classifiers/'

# Persist each fitted pipeline. Using `with` guarantees every file is flushed
# and closed, even if pickling raises part-way through; the bare open() calls
# used before left that to garbage collection.
with open(folder + 'clf_alc_simple.p', 'wb') as out_file:
    pickle.dump(clf_alc, out_file)
with open(folder + 'clf_fpa_simple.p', 'wb') as out_file:
    pickle.dump(clf_fpa, out_file)
with open(folder + 'clf_fpl_simple.p', 'wb') as out_file:
    pickle.dump(clf_fpl, out_file)

In [7]:
folder = 'simple classifiers/'

# Restore the three pipelines saved by this notebook. Each file is opened in
# a `with` block so its handle is closed as soon as it has been read.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(folder + 'clf_alc_simple.p', 'rb') as in_file:
    clf_alc = pickle.load(in_file)
with open(folder + 'clf_fpa_simple.p', 'rb') as in_file:
    clf_fpa = pickle.load(in_file)
with open(folder + 'clf_fpl_simple.p', 'rb') as in_file:
    clf_fpl = pickle.load(in_file)

In [8]:
# Display the reloaded alcohol pipeline's configuration as a sanity check.
clf_alc

Pipeline(steps=[('getter', ItemGetter(key='text')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=200000, min_df=1,
        ngram_range=(1, 3), norm='l2',...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [1]:
from classification.prediction import PredictionTransformer

In [9]:
# Combine the three classifiers into one callable, which a later cell applies
# to the tweet DataFrame.
# NOTE: this `clf` is unrelated to the local `clf` inside make_classifier().
clf = PredictionTransformer(clf_alc, clf_fpa, clf_fpl)

In [12]:
import os

# Directory holding the split tweet CSVs. The original machine-specific path
# stays the default, so behaviour is unchanged there; set TWIPSY_TWEETS_DIR to
# run the notebook on another machine without editing this cell.
folder = os.environ.get(
    'TWIPSY_TWEETS_DIR',
    'C:/Users/Tom Work/PycharmProjects/nyu-twipsy/tweets_split')

# Read tweets_0.csv and drop every row that contains a missing value.
all_tweets = pd.read_csv(folder + '/tweets_0.csv', encoding='utf8').dropna()

In [13]:
%%time
labeled= clf(all_tweets)
pickle.dump(labeled, open('June_labeled_all_simple.p', 'wb'))

Wall time: 9min 48s


In [15]:
# Index the labelled tweets by their parsed `created_at` timestamps.
# NOTE: this mutates `labeled` in place.
labeled.index = pd.to_datetime(labeled.created_at)

In [16]:
# Save again so the pickle carries the datetime index; this overwrites the
# file written right after labelling. `with` ensures the handle is closed.
with open('June_labeled_all_simple.p', 'wb') as out_file:
    pickle.dump(labeled, out_file)