In [21]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, '../')

import pandas as pd
import re
import sys
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

from sklearn.decomposition import TruncatedSVD

from sklearn.externals import joblib

from sklearn.pipeline import Pipeline, FeatureUnion

from pipeline.text_cleaner import TextCleaner
from pipeline.stemmer import Stemmer
from pipeline.dataframe_vectorizer import DataframeVectorizer
from pipeline.lemmatizer import Lemmatizer

from sklearn.model_selection import train_test_split

from utils import dump_model, load_and_split, other_name, load_and_split_quick

from sklearn import metrics
import pickle
import datetime

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Module-level preprocessing pipeline; the helper functions in this notebook
# take it as their default `data_pipeline`. Steps run in the listed order:
# TextCleaner, Lemmatizer, Stemmer, then a DataframeVectorizer wrapping a
# CountVectorizer.
DataPipeline = Pipeline(steps=[
    ('clean_words', TextCleaner()),
    ('lemmatizer', Lemmatizer()),
    ('stemmer', Stemmer()),
    ('vectorize', DataframeVectorizer(vectorizer=CountVectorizer())),
])

In [9]:
# Module-level estimator pipeline; the helper functions in this notebook take
# it as their default `learn_pipeline`. It has a single logistic-regression
# step, created with verbose=1.
LearningPipeline = Pipeline(
    steps=[('logistic', LogisticRegression(verbose=1))]
)

In [10]:
def dump_models(name, f1_score, time_mark,
                data_pipeline=DataPipeline,
                learn_pipeline=LearningPipeline):
    """Dump both pipelines under a history path and under a per-name path.

    Four files are written via `dump_model`, in this order:
    ``dumps/history/<name>__<time_mark>__<f1_score>__data.bin``,
    ``dumps/history/<name>__<time_mark>__<f1_score>__learn.bin``,
    ``dumps/<name>__data.bin`` and ``dumps/<name>__learn.bin``.

    Args:
        name: identifier interpolated into every file name.
        f1_score: score interpolated into the history file names; it is only
            formatted into the path here, never computed or validated.
        time_mark: value interpolated into the history file names.
        data_pipeline: object dumped to the ``data.bin`` files.
        learn_pipeline: object dumped to the ``learn.bin`` files.
    """
    def _dump_pair(prefix):
        # Write the data pipeline first, then the learning pipeline.
        dump_model(prefix + 'data.bin', data_pipeline)
        dump_model(prefix + 'learn.bin', learn_pipeline)

    _dump_pair('dumps/history/%s__%s__%s__' % (name, time_mark, f1_score))
    _dump_pair('dumps/%s__' % (name))

In [11]:
def store_results(name, y, predicted,
                  data_pipeline=DataPipeline,
                  learn_pipeline=LearningPipeline):
    """Score the predictions and dump both pipelines tagged with the result.

    Args:
        name: identifier forwarded to `dump_models`.
        y: true labels.
        predicted: predicted labels, compared against `y`.
        data_pipeline: forwarded to `dump_models`.
        learn_pipeline: forwarded to `dump_models`.

    Note:
        The value passed in the `f1_score` position of `dump_models` is the
        accuracy rounded to 5 decimals, not an F1 score.
    """
    accuracy = round(metrics.accuracy_score(y, predicted), 5)

    # Minute-resolution timestamp, e.g. 201701311542.
    stamp = datetime.datetime.now().strftime('%Y%m%d%H%M')

    dump_models(
        name, accuracy, stamp,
        data_pipeline=data_pipeline,
        learn_pipeline=learn_pipeline,
    )

In [12]:
def calculate_other_performance(name):
    """Report how the fitted pipelines perform on the counterpart dataset.

    The counterpart's identifier comes from `other_name(name)`; only the test
    part of its split is evaluated. `calculate_performance` is called with its
    default pipelines and its return value is discarded.
    """
    counterpart = other_name(name)
    # The training halves of the split are not needed here.
    _, X_test, _, y_test = load_and_split(counterpart)
    calculate_performance(name, counterpart, X_test, y_test)

In [13]:
def calculate_performance(trained, testing, X, y,
                         data_pipeline=DataPipeline,
                         learn_pipeline=LearningPipeline):
    """Transform `X`, predict, print a report and return the predictions.

    Args:
        trained: label of the dataset the model was trained on (printed only).
        testing: label of the dataset being evaluated (printed only).
        X: raw inputs, passed through `data_pipeline.transform`.
        y: true labels for `X`.
        data_pipeline: already-fitted preprocessing pipeline.
        learn_pipeline: already-fitted estimator pipeline.

    Returns:
        The predictions produced by `learn_pipeline.predict`.
    """
    predicted = learn_pipeline.predict(data_pipeline.transform(X))

    print('Performance %s on %s' % (trained, testing))
    report = metrics.classification_report(y, predicted)
    print(report)
    accuracy = metrics.accuracy_score(y, predicted)
    print('Accuracy: ', accuracy)

    return predicted

In [14]:
def train(name):
    """Fit both global pipelines on dataset `name`, then evaluate and store.

    Steps: load and split the dataset, fit `DataPipeline` on the training
    inputs, fit `LearningPipeline` on the resulting features, report
    performance on the held-out part, dump the pipelines via `store_results`,
    and finally report performance on the counterpart dataset.
    """
    features_pipeline = DataPipeline
    X_train, X_test, y_train, y_test = load_and_split(name)

    train_features = features_pipeline.fit_transform(X_train)
    print("Data processed!")

    LearningPipeline.fit(train_features, y_train)
    print("Models trained!")

    test_predictions = calculate_performance(
        name, name, X_test, y_test,
        data_pipeline=features_pipeline,
    )
    store_results(
        name, y_test, test_predictions,
        data_pipeline=features_pipeline,
    )
    calculate_other_performance(name)
    
    
    

In [18]:
# Dataset identifier and the CSV path derived from it.
name = 'reviews_rt_all'
path = "../data/%s.csv" % name


In [19]:
# Load the dataset; fields in the file are separated by "|".
df = pd.read_csv(path, sep="|")

In [22]:
# Hold out 20% of the rows for testing. The split is stratified on the label
# column and seeded (random_state=42) so it can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    df[['text']],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label']
)

In [23]:
# Fit the preprocessing pipeline on the training rows and keep the resulting
# feature matrix.
X_p = DataPipeline.fit_transform(X_train)

In [24]:
from scipy.sparse.linalg import svds

In [25]:
# Truncated SVD of the feature matrix, keeping 200 singular triplets.
# asfptype() upcasts the matrix to floating point before it goes to svds.
# v (one row per retained component) is the projection basis used below.
u, s, v = svds(X_p.asfptype(), k=200)

In [26]:
# Sanity check: expect (k, n_features).
v.shape

(200, 34029)

In [27]:
# Sanity check: the second dimension must match v's second dimension.
X_p.shape

(82088, 34029)

In [28]:
# Project the features onto the retained components (matrix product, not
# element-wise): result has one column per component.
X_svd = X_p.dot(v.T)

In [29]:
# Check which container type the projection produced.
type(X_svd)

numpy.ndarray

In [30]:
# Fit the classifier on the projected training features.
LearningPipeline.fit(X_svd, y_train)

[LibLinear]

Pipeline(steps=[('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=False))])

In [31]:
# Predict on the same projected matrix the classifier was fitted on.
predicted = LearningPipeline.predict(X_svd)

In [32]:
# Training-set report: predictions are compared against y_train.
print(metrics.classification_report(y_train, predicted))

             precision    recall  f1-score   support

          0       0.63      0.38      0.47     30362
          1       0.70      0.87      0.78     51726

avg / total       0.67      0.69      0.66     82088



In [33]:
# Vectorize the held-out rows with the already-fitted pipeline.
# NOTE(review): this rebinds X_p, which until now held the training matrix;
# any later cell that reads X_p gets the test features unless it is rebuilt.
X_p = DataPipeline.transform(X_test)

In [34]:
# Project the current X_p onto the components in v (matrix product).
X_svd = X_p.dot(v.T)

In [35]:
# Predict labels for the projected features.
predicted = LearningPipeline.predict(X_svd)

In [37]:
# Held-out report: predictions are compared against y_test.
print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.62      0.37      0.47      7590
          1       0.70      0.87      0.78     12932

avg / total       0.67      0.69      0.66     20522



In [38]:
# Overall accuracy against y_test.
print(metrics.accuracy_score(y_test, predicted))

0.685118409512


In [33]:
# Rebuild the training matrix before decomposing it. An earlier cell rebinds
# X_p to the test-set features, so when the notebook runs top to bottom X_p no
# longer holds the training rows here, and the following fit against y_train
# would see mismatched row counts.
# Assumes transform() on the fitted DataPipeline reproduces the
# fit_transform() output for X_train — TODO confirm.
X_p = DataPipeline.transform(X_train)

# Truncated SVD keeping 2000 singular triplets.
u, s, v = svds(X_p.asfptype(), k=2000)  # very slow

In [34]:
# Project the current X_p onto the components in v (matrix product).
X_svd = X_p.dot(v.T)

In [35]:
# Refit the classifier on the projected features against the training labels.
LearningPipeline.fit(X_svd, y_train)

[LibLinear]

Pipeline(steps=[('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=False))])

In [36]:
# Predict on the same projected matrix the classifier was just fitted on.
predicted = LearningPipeline.predict(X_svd)

In [37]:
# Training-set report: predictions are compared against y_train.
print(metrics.classification_report(y_train, predicted))

             precision    recall  f1-score   support

          0       0.75      0.64      0.69     30362
          1       0.81      0.87      0.84     51726

avg / total       0.78      0.79      0.78     82088

