# Labeling all data

In [9]:
import pandas as pd
import os

### Loading and running all classifiers

In [10]:
from __private import fs

In [11]:
# Print every entry returned by fs.list(), one per line. The names shown here
# are the strings passed to dao.ClassifierAccess.get_byfile() further down.
print(*fs.list(), sep="\n")

alcohol|accuracy:0.8143360752056404|f1:0.8192219679633866|type:LogisticRegression
alcohol|accuracy:0.8401880141010576|f1:0.8498896247240618|type:SVC
alcohol|accuracy:0.8425381903642774|f1:0.8562231759656651|type:RandomForestClassifier
first_person_label|accuracy:0.5637860082304527|f1:0.5574430033343769|type:SVC
first_person_label|accuracy:0.5637860082304527|f1:0.5643693591852614|type:LogisticRegression
first_person|accuracy:0.6951871657754011|f1:0.8034482758620688|type:RandomForestClassifier
first_person|accuracy:0.7005347593582888|f1:0.7751004016064257|type:LogisticRegression
first_person|accuracy:0.7032085561497327|f1:0.8062827225130889|type:RandomForestClassifier
first_person|accuracy:0.7112299465240641|f1:0.8021978021978021|type:SVC


In [12]:
from classification import dao

In [13]:
%%time
# Load one stored classifier per task, addressed by its full stored name
# (task|accuracy|f1|type).

# NOTE(review): this is the LogisticRegression entry (accuracy 0.814 in its
# name); the listing also contains alcohol SVC and RandomForestClassifier
# entries with higher scores in their names -- confirm the choice is intended.
alcohol_classifier = dao.ClassifierAccess.get_byfile(
    "alcohol|accuracy:0.8143360752056404|f1:0.8192219679633866|type:LogisticRegression"
)

# first_person task: the SVC entry.
firstperson_classifier = dao.ClassifierAccess.get_byfile(
    "first_person|accuracy:0.7112299465240641|f1:0.8021978021978021|type:SVC"
)

# first_person_label task: the LogisticRegression entry.
firstpersonlevel_classifier = dao.ClassifierAccess.get_byfile(
    "first_person_label|accuracy:0.5637860082304527|f1:0.5643693591852614|type:LogisticRegression"
)

CPU times: user 476 ms, sys: 260 ms, total: 736 ms
Wall time: 16.3 s


In [14]:
from data import DataAccess, LabelGetter

In [15]:
# Fetch the data via DataAccess.get_as_dataframe() and wrap it in a
# LabelGetter; L supplies the arguments for the classifier fit() calls below.
X = DataAccess.get_as_dataframe()
L = LabelGetter(X)

In [16]:
%%time
# Fit the first-person-level classifier. Whatever get_first_person_label()
# returns is unpacked into fit()'s positional arguments (presumably features
# and labels -- TODO confirm against LabelGetter).
firstpersonlevel_classifier.fit(*L.get_first_person_label())

CPU times: user 5.24 s, sys: 1.15 s, total: 6.39 s
Wall time: 6.4 s


Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('text', Pipeline(steps=[('getter', ItemGetter(key='text')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=F...one,
          solver='liblinear', tol=0.000449897709599141, verbose=0,
          warm_start=None))])

In [17]:
%%time
# Fit the first-person classifier. Whatever get_first_person() returns is
# unpacked into fit()'s positional arguments (presumably features and labels
# -- TODO confirm against LabelGetter). The recorded run of this cell took
# about 3min 45s of wall time.
firstperson_classifier.fit(*L.get_first_person())

CPU times: user 3min 45s, sys: 1.55 s, total: 3min 47s
Wall time: 3min 45s


Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('text', Pipeline(steps=[('getter', ItemGetter(key='text')), ('tfidf', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=F...probability=True,
  random_state=None, shrinking=True, tol=0.0008753898561476732,
  verbose=False))])

In [18]:
# Unbind the data frame and its label getter now that the fit() calls are done
# (presumably to release memory before the bulk prediction run).
del X, L

In [31]:
%%time

# Directory holding the input CSV files read by to_predict().
from_path = "./control_dump/"
# Directory where to_predict() writes its "predicted_<file>" outputs.
# Both paths end with a separator so a file name can be appended directly.
to_path = "./control_dump/predicted/"

def to_predict(x, alcohol_threshold=.75):
    """Score one dumped CSV with the three classifiers and save the result.

    The input file is read from ``from_path``, rows containing missing values
    are dropped, and the columns are named ``time``, ``id`` and ``text``.
    The following columns are then added:

    * ``prediction_alcohol_svc`` -- column 1 of
      ``alcohol_classifier.predict_proba`` for every row.
    * ``prediction_firstperson_svc`` -- column 1 of
      ``firstperson_classifier.predict_proba`` for rows whose alcohol score is
      above ``alcohol_threshold``; 0 for all other rows.
    * ``prediction_firstperson_level_0`` / ``_2`` / ``_3`` -- the three columns
      of ``firstpersonlevel_classifier.predict_proba`` for the same rows; 0 for
      all other rows.

    The frame is written to ``to_path`` as ``predicted_<file>``. A file whose
    output already exists is skipped without being read again.

    Parameters
    ----------
    x : tuple
        ``(i, file)`` -- a counter used only in progress messages and the
        name of the CSV file inside ``from_path``.
    alcohol_threshold : float, optional
        Alcohol score a row must exceed to be passed to the first-person
        classifiers. Defaults to the previously hard-coded ``.75``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the input file does not have exactly three columns.

    Notes
    -----
    NOTE(review): the ``_svc`` suffix is kept for compatibility with existing
    outputs although the alcohol classifier loaded in this notebook is a
    LogisticRegression entry. No ``fit`` call for ``alcohol_classifier`` is
    visible in this notebook, so it is presumably stored already fitted --
    confirm. The level suffixes ``0``/``2``/``3`` should be checked against
    the level classifier's class order.
    """
    i, file = x
    print("starting ", i)

    fn = os.path.join(to_path, "predicted_" + file)
    if os.path.isfile(fn):
        # Already labelled on an earlier run: skip before paying for the parse.
        return

    # The 'rU' mode was removed in Python 3.11; default text mode already uses
    # universal newlines. The context manager closes the handle that the
    # original left open.
    with open(os.path.join(from_path, file), encoding='utf-8') as handle:
        df = pd.read_csv(handle, engine='c').dropna()
    df.columns = ["time", "id", "text"]

    level_columns = [
        "prediction_firstperson_level_0",
        "prediction_firstperson_level_2",
        "prediction_firstperson_level_3",
    ]

    print("starting classification", i)
    # Create every output column as float up front so rows that are never
    # scored keep 0.0 and later float assignments do not change the dtype.
    df["prediction_alcohol_svc"] = 0.0
    df["prediction_firstperson_svc"] = 0.0
    for column in level_columns:
        df[column] = 0.0

    # Estimators cannot score zero rows, so only call them when there is data.
    if not df.empty:
        df["prediction_alcohol_svc"] = alcohol_classifier.predict_proba(df[["text"]])[:, 1]
        filter_alcohol = df["prediction_alcohol_svc"] > alcohol_threshold

        print("first person")

        if filter_alcohol.any():
            alcohol_rows = df.loc[filter_alcohol, ["text"]]
            df.loc[filter_alcohol, "prediction_firstperson_svc"] = (
                firstperson_classifier.predict_proba(alcohol_rows)[:, 1]
            )
            df.loc[filter_alcohol, level_columns] = (
                firstpersonlevel_classifier.predict_proba(alcohol_rows)
            )

    # The original selected df[list(range(1, 12))]; every column here has a
    # string name and the frame is only 8 columns wide, so that selection
    # cannot succeed. Write the whole labelled frame instead.
    os.makedirs(to_path, exist_ok=True)
    df.to_csv(fn)
    print(i, file)

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 24.1 µs


In [32]:
from os import listdir
from multiprocessing import Pool