In [234]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib

from sklearn.pipeline import Pipeline, FeatureUnion

from pipeline.text_cleaner import TextCleaner
from pipeline.stemmer import Stemmer
from pipeline.dataframe_vectorizer import DataframeVectorizer

from utils import dump_model, load_and_split, other_name, load_and_split_quick

from sklearn import metrics
import pickle
import datetime

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [224]:
# Module-level preprocessing pipelines, shared by the helper functions below
# (several of them bind these objects as default arguments).

# Full text pipeline: clean the 'text' field, stem, then vectorize with a
# bag-of-words CountVectorizer wrapped by the project's DataframeVectorizer.
DataPipeline = Pipeline(steps=[
    ('clean_words', TextCleaner(key='text')),
    ('stem', Stemmer()),
    ('vectorize', DataframeVectorizer(vectorizer=CountVectorizer())),
])

# Quick variant: vectorization only, no cleaning or stemming steps.
FastDataPipeline = Pipeline(steps=[
    ('vectorize', DataframeVectorizer(vectorizer=CountVectorizer())),
])

In [225]:
# Module-level classifier pipeline: a single logistic-regression step with
# scikit-learn's default hyper-parameters. Shared (and refitted in place) by
# every call that trains through it.
LearningPipeline = Pipeline(steps=[('logistic', LogisticRegression())])

In [226]:
def dump_models(name, f1_score, time_mark,
                data_pipeline=DataPipeline,
                learn_pipeline=LearningPipeline):
    """Write both pipelines to disk twice: a history copy and a "latest" copy.

    Files are written through the project's ``dump_model(path, obj)`` helper:

    * ``../dumps/history/<name>__<time_mark>__<f1_score>__{data,learn}.bin``
    * ``../dumps/<name>__{data,learn}.bin`` (same name on every call, so a
      later call for the same ``name`` targets the same paths)

    Args:
        name: Identifier used as the file-name prefix.
        f1_score: Score tag embedded in the history file name (formatted
            with ``%s``).
        time_mark: Timestamp tag embedded in the history file name.
        data_pipeline: Preprocessing pipeline to dump; defaults to the
            module-level ``DataPipeline``.
        learn_pipeline: Model pipeline to dump; defaults to the module-level
            ``LearningPipeline``.
    """
    # (template, values) pairs; each prefix is formatted only right before
    # its own pair of dumps, history first, then latest.
    targets = (
        ('../dumps/history/%s__%s__%s__', (name, time_mark, f1_score)),
        ('../dumps/%s__', name),
    )
    for template, values in targets:
        prefix = template % values
        dump_model(prefix + 'data.bin', data_pipeline)
        dump_model(prefix + 'learn.bin', learn_pipeline)

In [240]:
def store_results(name, y, predicted,
                  data_pipeline=DataPipeline,
                  learn_pipeline=LearningPipeline):
    """Score the predictions and dump both pipelines tagged with that score.

    The score is the accuracy of ``predicted`` against ``y``, rounded to five
    decimals. Note that it is handed to ``dump_models`` in the parameter
    that function calls ``f1_score``, so the number in the history file name
    is an accuracy, not an F1 score.

    Args:
        name: Identifier forwarded to ``dump_models`` as the file prefix.
        y: True labels.
        predicted: Predicted labels.
        data_pipeline: Preprocessing pipeline to dump; defaults to the
            module-level ``DataPipeline``.
        learn_pipeline: Model pipeline to dump; defaults to the module-level
            ``LearningPipeline``.
    """
    accuracy = round(metrics.accuracy_score(y, predicted), 5)

    # Minute-resolution local timestamp, e.g. 201701311542.
    time_mark = datetime.datetime.now().strftime('%Y%m%d%H%M')

    dump_models(
        name, accuracy, time_mark,
        data_pipeline=data_pipeline,
        learn_pipeline=learn_pipeline,
    )

In [228]:
def calculate_other_performance(name):
    """Report how the model trained on ``name`` does on its counterpart dataset.

    The counterpart is resolved with the project's ``other_name`` helper and
    loaded with ``load_and_split``; only the test part of that split is used.
    Evaluation goes through ``calculate_performance`` with its default
    pipelines.

    Args:
        name: Name of the dataset the current model was trained on.
    """
    counterpart = other_name(name)
    # The train part of the counterpart split is not needed here.
    _, X_test, _, y_test = load_and_split(counterpart)
    calculate_performance(name, counterpart, X_test, y_test)

In [241]:
def calculate_performance(trained, testing, X, y,
                         data_pipeline=DataPipeline,
                         learn_pipeline=LearningPipeline):
    """Predict on ``X`` with the given pipelines and print a quality report.

    Prints a header line, scikit-learn's classification report and the
    accuracy score.

    Args:
        trained: Name of the dataset the model was trained on (label only).
        testing: Name of the dataset being evaluated (label only).
        X: Raw inputs; passed through ``data_pipeline.transform``.
        y: True labels for ``X``.
        data_pipeline: Preprocessing pipeline, used for ``transform`` only;
            defaults to the module-level ``DataPipeline``.
        learn_pipeline: Model pipeline, used for ``predict`` only; defaults
            to the module-level ``LearningPipeline``.

    Returns:
        The predictions produced by ``learn_pipeline.predict``.
    """
    features = data_pipeline.transform(X)
    predicted = learn_pipeline.predict(features)

    header = 'Performance %s on %s' % (trained, testing)
    print(header)
    print(metrics.classification_report(y, predicted))
    print('Accuracy: ', metrics.accuracy_score(y, predicted))

    return predicted

In [230]:
def train(name, quick_mode=False):
    """Fit the pipelines on dataset ``name``, evaluate, and dump the results.

    Steps:
      1. Load a train/test split (``load_and_split_quick`` in quick mode,
         ``load_and_split`` otherwise).
      2. Fit the preprocessing pipeline on the training inputs and fit the
         module-level ``LearningPipeline`` on the result.
      3. Print test-set performance and dump both pipelines via
         ``store_results``.
      4. Print performance on the counterpart dataset via
         ``calculate_other_performance``.

    Args:
        name: Dataset name understood by the loader helpers.
        quick_mode: When true, use ``FastDataPipeline`` and the quick loader,
            and store the dumps under ``name + '-quick'`` so they do not
            overwrite the dumps of a full run.
    """
    if quick_mode:
        data_pipeline = FastDataPipeline
        suffix = '-quick'  # was misspelled `sufix`, so the tag was never applied
        X_train, X_test, y_train, y_test = load_and_split_quick(name)
    else:
        data_pipeline = DataPipeline
        suffix = ''
        X_train, X_test, y_train, y_test = load_and_split(name)

    X_after_processing = data_pipeline.fit_transform(X_train)
    print("Data processed!")

    LearningPipeline.fit(X_after_processing, y_train)
    print("Models trained!")

    predicted = calculate_performance(name, name, X_test, y_test,
                                      data_pipeline=data_pipeline)
    store_results(name + suffix, y_test, predicted,
                  data_pipeline=data_pipeline)

    # NOTE(review): calculate_other_performance only takes the dataset name,
    # so it evaluates with the default DataPipeline and load_and_split even
    # when quick_mode fitted FastDataPipeline above - confirm whether the
    # cross-dataset check is meaningful in quick mode.
    calculate_other_performance(name)
    
    
    

In [238]:
train('reviews_rt_all')

Data processed!
Models trained!
Performance reviews_rt_all on reviews_rt_all
             precision    recall  f1-score   support

          0       0.73      0.66      0.69      7590
          1       0.81      0.86      0.83     12932

avg / total       0.78      0.79      0.78     20522

Performance reviews_rt_all on imdb_small
             precision    recall  f1-score   support

          0       0.80      0.91      0.85      5000
          1       0.89      0.78      0.83      5000

avg / total       0.85      0.84      0.84     10000



In [239]:
train('imdb_small')

Data processed!
Models trained!
Performance imdb_small on imdb_small
             precision    recall  f1-score   support

          0       0.89      0.88      0.89      5000
          1       0.88      0.89      0.89      5000

avg / total       0.89      0.89      0.89     10000

Performance imdb_small on reviews_rt_all
             precision    recall  f1-score   support

          0       0.60      0.63      0.61      7590
          1       0.78      0.75      0.76     12932

avg / total       0.71      0.71      0.71     20522

