In [None]:
# %load utils.py
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split

def dump_model(path, model):
    """Pickle `model` into the file at `path`.

    The file is opened in binary write mode, so an existing file at that
    location is overwritten.
    """
    with open(path, 'wb') as out_file:
        pickle.Pickler(out_file).dump(model)

def load_dumped(path):
    """Return the object unpickled from the file at `path`.

    NOTE: unpickling can execute arbitrary code, so only load dumps that
    were produced by trusted code.
    """
    with open(path, 'rb') as in_file:
        return pickle.load(in_file)

def load_and_split(name):
    """Read ``../data/<name>.csv`` and split it into train and test parts.

    The CSV is parsed with ``|`` as the separator and must provide ``text``
    and ``label`` columns. The split holds out 20% of the rows, is stratified
    on ``label`` and uses a fixed seed (42).

    Returns the ``train_test_split`` result, which callers unpack as
    ``X_train, X_test, y_train, y_test``; the X parts are single-column
    DataFrames (``text``) and the y parts are the ``label`` Series.
    """
    frame = pd.read_csv("../data/%s.csv" % name, sep="|")
    labels = frame['label']
    features = frame[['text']]
    return train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels,
    )

def load_and_split_quick(name):
    """Read ``../data/<name>.csv`` and split it exactly like ``load_and_split``.

    In addition, ``../data/processed/<name>.bin`` is unpickled via
    ``load_dumped``, so that file has to exist for the call to succeed.

    Returns the ``train_test_split`` result for the ``text`` column and the
    ``label`` column (20% test, stratified on ``label``, seed 42).
    """
    frame = pd.read_csv("../data/%s.csv" % name, sep="|")
    # NOTE(review): the preprocessed dump is loaded but its contents are never
    # used, so the returned split is built from the raw CSV only. Presumably
    # the processed data was meant to replace the `text` column - confirm.
    load_dumped("../data/processed/%s.bin" % name)
    labels = frame['label']
    return train_test_split(
        frame[['text']],
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels,
    )


def other_name(name):
    """Return the name of the counterpart dataset.

    ``'imdb_small'`` maps to ``'reviews_rt_all'``; every other name maps to
    ``'imdb_small'``.
    """
    return 'reviews_rt_all' if name == 'imdb_small' else 'imdb_small'


In [3]:
# %load train.py

import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.externals import joblib

from sklearn.pipeline import Pipeline, FeatureUnion

from pipeline.text_cleaner import TextCleaner
from pipeline.stemmer import Stemmer
from pipeline.dataframe_vectorizer import DataframeVectorizer
from pipeline.lemmatizer import Lemmatizer

from utils import dump_model, load_and_split, other_name, load_file

import pickle
from sklearn.model_selection import train_test_split

from sklearn import metrics
import pickle
import datetime

# Words removed by the CountVectorizer inside DataPipeline.
STOPWORDS = [
    'a', 'an', 'by', 'did', 'does', 'was', 'were', 'i', 'the', 'and', 'if',
]

# Module-level preprocessing pipeline: the project's text cleaner, lemmatizer
# and stemmer steps, followed by unigram + bigram counts without STOPWORDS.
# This single object is the default `data_pipeline` argument of the functions
# below, so fitting it anywhere changes what all of them use.
DataPipeline = Pipeline(steps=[
    ('clean_words', TextCleaner(key='text')),
    ('lemmatizer', Lemmatizer()),
    ('stemmer', Stemmer()),
    (
        'vectorize',
        DataframeVectorizer(
            vectorizer=CountVectorizer(ngram_range=(1, 2), stop_words=STOPWORDS),
        ),
    ),
])

# Module-level lightweight alternative: default CountVectorizer only, with no
# cleaning, lemmatizing or stemming steps.
FastDataPipeline = Pipeline(steps=[
    ('vectorize', DataframeVectorizer(vectorizer=CountVectorizer())),
])

# Module-level model pipeline: cross-validated logistic regression using two
# worker processes and verbose output. Shared as the default `learn_pipeline`.
LearningPipeline = Pipeline(steps=[
    ('logistic', LogisticRegressionCV(n_jobs=2, verbose=1)),
])

def dump_models(name, f1_score, time_mark,
                data_pipeline=DataPipeline,
                learn_pipeline=LearningPipeline):
    """Write both pipelines to the history folder and to the "latest" slot.

    Files written, in this order:
      ../dumps/history/<name>__<time_mark>__<f1_score>__data.bin
      ../dumps/history/<name>__<time_mark>__<f1_score>__learn.bin
      ../dumps/<name>__data.bin
      ../dumps/<name>__learn.bin

    `f1_score` is only interpolated into the history file name; note that
    `store_results` passes a rounded accuracy in this position.
    The pipeline defaults are the module-level objects, bound when this
    function is defined.
    """
    history_prefix = '../dumps/history/%s__%s__%s__' % (name, time_mark, f1_score)
    latest_prefix = '../dumps/%s__' % (name)
    for prefix in (history_prefix, latest_prefix):
        dump_model(prefix + 'data.bin', data_pipeline)
        dump_model(prefix + 'learn.bin', learn_pipeline)

def store_results(name, y, predicted,
                  data_pipeline=DataPipeline,
                  learn_pipeline=LearningPipeline):
    """Dump both pipelines, tagging the history files with score and time.

    The score is the accuracy of `predicted` against `y`, rounded to five
    decimals; the time mark is the current local time as ``YYYYmmddHHMM``.
    Both are forwarded to `dump_models` together with the pipelines.
    """
    accuracy = round(metrics.accuracy_score(y, predicted), 5)
    time_mark = datetime.datetime.now().strftime('%Y%m%d%H%M')

    dump_models(
        name, accuracy, time_mark,
        data_pipeline=data_pipeline,
        learn_pipeline=learn_pipeline,
    )

def calculate_other_performance(name):
    """Report how the model trained on `name` performs on the other dataset.

    Loads the counterpart dataset (see `other_name`), keeps only its test
    split and passes it to `calculate_performance`, which uses the default
    module-level pipelines.
    """
    counterpart = other_name(name)
    # Only the held-out part of the counterpart dataset is evaluated.
    _, X_other, _, y_other = load_and_split(counterpart)
    calculate_performance(name, counterpart, X_other, y_other)

def calculate_performance(trained, testing, X, y,
                          data_pipeline=DataPipeline,
                          learn_pipeline=LearningPipeline):
    """Predict labels for `X`, print a report against `y`, return predictions.

    `X` is transformed with the already-fitted `data_pipeline` and classified
    with the already-fitted `learn_pipeline`. `trained` and `testing` are
    only used in the printed header line. The classification report and the
    accuracy are printed to stdout.
    """
    features = data_pipeline.transform(X)
    predicted = learn_pipeline.predict(features)

    header = 'Performance %s on %s' % (trained, testing)
    print(header)
    report = metrics.classification_report(y, predicted)
    print(report)
    accuracy = metrics.accuracy_score(y, predicted)
    print('Accuracy: ', accuracy)

    return predicted

def train(name):
    """Train on dataset `name`, evaluate, dump the models, then cross-evaluate.

    Steps:
      1. Load and split the dataset.
      2. Fit the module-level `DataPipeline` on the training part and the
         module-level `LearningPipeline` on the transformed features (both
         objects are fitted in place).
      3. Print performance on the held-out part of the same dataset.
      4. Dump both pipelines via `store_results`.
      5. Print performance on the counterpart dataset's test split.
    """
    X_train, X_test, y_train, y_test = load_and_split(name)

    train_features = DataPipeline.fit_transform(X_train)
    print("Data processed!")

    LearningPipeline.fit(train_features, y_train)
    print("Models trained!")

    predicted = calculate_performance(
        name, name, X_test, y_test,
        data_pipeline=DataPipeline,
    )
    store_results(name, y_test, predicted, data_pipeline=DataPipeline)
    calculate_other_performance(name)

In [4]:
# Split each dataset on its own so that the per-dataset test sets stay
# available for the separate evaluation cells further down.
X_train_rt, X_test_rt, y_train_rt, y_test_rt = load_and_split('reviews_rt_all')
X_train_imdb, X_test_imdb, y_train_imdb, y_test_imdb = load_and_split('imdb_small')

In [5]:
# Stack the two datasets' splits into one combined train set and one combined
# test set. pd.concat keeps each frame's own index, so index labels may repeat
# in the combined objects.
X_train = pd.concat([X_train_rt, X_train_imdb])
X_test = pd.concat([X_test_rt, X_test_imdb])
y_train = pd.concat([y_train_rt, y_train_imdb])
y_test = pd.concat([y_test_rt, y_test_imdb])

In [7]:
# Fit the preprocessing on the combined training data only. This fits the
# module-level DataPipeline object in place; later cells reuse it through
# transform() and it is the object that gets dumped at the end.
X_p = DataPipeline.fit_transform(X_train)

In [37]:
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import GridSearchCV

In [38]:
# Template estimator handed to the grid search below; the seed is fixed so
# repeated runs shuffle the data the same way.
l = PassiveAggressiveClassifier(warm_start=False, random_state=42)

In [39]:
# Search grid: step-size/regularization C and number of training passes
# n_iter, with the loss fixed to squared_hinge (5 x 3 x 1 = 15 candidates).
# NOTE(review): `n_iter` matches the estimator repr saved in this notebook;
# newer scikit-learn releases call this parameter `max_iter` - confirm against
# the installed version before re-running.
params={'C': [7,9,10,15,20], 'n_iter':[20,50,100],  'loss':['squared_hinge']}

In [55]:
# Grid search over `params`, scored by accuracy, using all CPU cores
# (n_jobs=-1). The cross-validation scheme is left at the library default.
clf=GridSearchCV(estimator=l, param_grid=params, scoring='accuracy', n_jobs=-1)

In [56]:
# Run the search on the preprocessed combined training features.
# NOTE(review): the GridSearchCV repr saved under this cell lists a different
# param_grid than `params`, so that output looks stale - re-run to refresh it.
clf.fit(X_p, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=PassiveAggressiveClassifier(C=1.0, class_weight=None, fit_intercept=True,
              loss='hinge', n_iter=5, n_jobs=1, random_state=42,
              shuffle=True, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'C': [0.01, 0.1, 1, 10, 100], 'n_iter': [5, 20], 'loss': ['squared_hinge', 'hinge']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [57]:
# Show the parameter combination with the best cross-validated accuracy.
print(clf.best_params_)

{'C': 10, 'loss': 'squared_hinge', 'n_iter': 20}


In [58]:
# Apply the already-fitted preprocessing to the combined test set
# (transform only - the pipeline is not refitted on test data).
X_test_d = DataPipeline.transform(X_test)


In [59]:
# Accuracy on the combined held-out split. clf.predict delegates to the best
# estimator that GridSearchCV refitted on the whole training set.
predict = clf.predict(X_test_d)
print('Validation:')
print('Accuracy: ', metrics.accuracy_score(y_test, predict))

Validation:
Accuracy:  0.819179608151


In [60]:
# Preprocessed features for the 'reviews_rt_all' test split on its own.
X_v_rt = DataPipeline.transform(X_test_rt)


In [64]:
# Accuracy on the 'reviews_rt_all' test split alone.
# Predict with `clf` (the fitted GridSearchCV), as the other evaluation cells
# do. `l` is only the template estimator: GridSearchCV fits clones of it, so
# `l` itself is never fitted and `l.predict` fails on a fresh kernel.
predict = clf.predict(X_v_rt)
print('test_RT:')
print('Accuracy: ', metrics.accuracy_score(y_test_rt, predict))

test_RT:
Accuracy:  0.795780138388


In [62]:
# Preprocessed features for the 'imdb_small' test split on its own.
X_v = DataPipeline.transform(X_test_imdb)


In [63]:
# Accuracy of the grid-searched model on the 'imdb_small' test split alone.
predict = clf.predict(X_v)
print('test_imdb:')
print('Accuracy: ', metrics.accuracy_score(y_test_imdb, predict))

test_imdb:
Accuracy:  0.8832


In [66]:
import joblib

In [68]:
# Persist the fitted preprocessing pipeline with the pickle-based helper.
dump_model('../dumps/mix_data.bin', DataPipeline)

In [71]:
# Build a standalone classifier from the best grid-search parameters.
# NOTE(review): this rebinds `clf` from the GridSearchCV object to a plain
# classifier, so the cell cannot be re-run by itself afterwards (the new `clf`
# has no `best_params_`); re-run the grid-search cells first.
clf=PassiveAggressiveClassifier(warm_start=False, random_state=42, **clf.best_params_)

In [72]:
# Fit the standalone classifier on the full preprocessed training set.
clf.fit(X_p, y_train)

PassiveAggressiveClassifier(C=10, class_weight=None, fit_intercept=True,
              loss='squared_hinge', n_iter=20, n_jobs=1, random_state=42,
              shuffle=True, verbose=0, warm_start=False)

In [73]:
# Sanity check for the standalone classifier on the combined held-out split;
# the saved output reports the same accuracy as the grid-search evaluation.
predict = clf.predict(X_test_d)
print('Validation:')
print('Accuracy: ', metrics.accuracy_score(y_test, predict))

Validation:
Accuracy:  0.819179608151


In [74]:
# Disabled pickle-based dump of the classifier; it is saved with joblib.dump in a later cell.
#dump_model('../dumps/mix_learn.bin', clf)

In [75]:
# Save the fitted preprocessing pipeline with joblib as well (an earlier cell
# already wrote a pickle of it through dump_model).
joblib.dump(DataPipeline, '../dumps/mix_data.pcl')

['../dumps/mix_data.pcl']

In [77]:
# Save the standalone classifier fitted with the best parameters next to the
# pipeline dump.
joblib.dump( clf, '../dumps/mix_learn.pcl')

['../dumps/mix_learn.pcl']