In [42]:
import os
import string
import pandas as pd
import xgboost
import numpy as np
import textblob

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import mlens
from sklearn import neighbors
from sklearn.externals import joblib
from catboost import CatBoostClassifier, Pool

import keras
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from mlens.ensemble import BlendEnsemble
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier, RUSBoostClassifier
from imblearn.pipeline import make_pipeline as make_pipeline
from imblearn.metrics import classification_report_imbalanced
from ast import literal_eval

In [5]:
RANDOM_STATE = 42
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True' #workaround for macOS mkl issue
BIG_CATEGORY = 'beauty'
# load dataset
data_directory = os.path.join(os.path.split(os.getcwd())[0], 'data')
prob_dir = os.path.join(data_directory, 'probabilities', BIG_CATEGORY)
train = pd.read_csv(os.path.join(data_directory, f'{BIG_CATEGORY}_train_split.csv'))
valid = pd.read_csv(os.path.join(data_directory, f'{BIG_CATEGORY}_valid_split.csv'))
test = pd.read_csv(os.path.join(data_directory, f'{BIG_CATEGORY}_test_split.csv'))
train_x, train_y = train['title'], train['Category']
valid_x, valid_y = valid['title'], valid['Category']
test_x = test['title']

In [6]:
itemid_train = train['itemid']
itemid_valid = valid['itemid']
itemid_test = test['itemid']

In [7]:
train['extractions'] = train['extractions'].map(literal_eval)
valid['extractions'] = valid['extractions'].map(literal_eval)
test['extractions'] = test['extractions'].map(literal_eval)
train['extractions'] = train['extractions'].map(lambda s: ' '.join(s) if s else pd.NaT)
valid['extractions'] = valid['extractions'].map(lambda s: ' '.join(s) if s else pd.NaT)
test['extractions'] = test['extractions'].map(lambda s: ' '.join(s) if s else pd.NaT)

In [32]:
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

count_vect = CountVectorizer(analyzer='word', strip_accents='unicode',#Stop words may not be needed as they seem to be already removed
                             stop_words=None, ngram_range=(1,8))  # \b[^\d\W]{3,}\b
count_vect.fit(train['title'])
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', strip_accents='unicode',
                             stop_words=None,) #token_pattern=r'\b[^\d\W]{3,}\b')
tfidf_vect.fit(train['title'])
# ngram level tf-idf
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', strip_accents='unicode',
                                   stop_words=None, ngram_range=(1,8)) 
tfidf_vect_ngram.fit(train['title'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 8), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [9]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, cross_validate=False,
                save_model=False, extract_probs=False, feature_vector_test=None, model_name='sklearn'):
    # fit the training dataset on the classifier
    #if isinstance(classifier, xgboost.XGBClassifier):
    #    feature_vector_train = feature_vector_train.to_csc()
    #    feature_vector_valid = feature_vector_valid.to_csc()
    if cross_validate:
        kfold = model_selection.StratifiedKFold(n_splits=5, random_state=7, shuffle=True)
        results = model_selection.cross_val_score(classifier, feature_vector_train, 
                                                  label, cv=kfold, n_jobs=-1)
        print("CV Accuracy: %.4f%% (%.4f%%)" % (results.mean()*100, results.std()*100))
        return results.mean()*100
    else:
        classifier.fit(feature_vector_train, label)
        # predict the labels on validation dataset
        predictions = classifier.predict(feature_vector_train)
        print('Train Acc: {}'.format(metrics.accuracy_score(predictions, label)))
        predictions = classifier.predict(feature_vector_valid)
    if extract_probs:
        val_preds = classifier.predict_proba(feature_vector_valid)
        test_preds = classifier.predict_proba(feature_vector_test)
        print(val_preds.shape)
        print(test_preds.shape)
        os.makedirs(os.path.join(prob_dir, model_name),exist_ok=True)
        np.save(os.path.join(prob_dir, model_name, 'valid.npy'), val_preds)
        np.save(os.path.join(prob_dir, model_name, 'test.npy'), test_preds)
    if save_model:
        model_path = os.path.join(data_directory, 'keras_checkpoints', 
                                  BIG_CATEGORY, model_name)
        os.makedirs(model_path, exist_ok=True)
        joblib.dump(classifier, os.path.join(model_path, model_name + '.joblib'))
        
    return metrics.accuracy_score(predictions, valid_y)

In [203]:
accuracy = train_model(make_pipeline(count_vect, naive_bayes.MultinomialNB(alpha=0.25)),
                       train_x, train_y, valid_x, cross_validate=False,
                       extract_probs=True, feature_vector_test=test_x, model_name='nb_ngrams_2')
print("NB, Count Vectors: ", accuracy)

Train Acc: 0.87346139418841
(57317, 17)
(76545, 17)
NB, Count Vectors:  0.771620985048066


In [227]:
# Linear Classifier on Count Vectors
accuracy = train_model(make_pipeline(count_vect,
                                     linear_model.LogisticRegression(solver='sag', n_jobs=6, multi_class='multinomial',
                                                                     tol=1e-4, C=1.e4 / 533292)),
                       train_x, train_y, valid_x, cross_validate=False,
                save_model=True, extract_probs=True, feature_vector_test=test_x, model_name='log_reg')
print("LR, Count Vectors: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(make_pipeline(tfidf_vect_ngram,
                                     linear_model.LogisticRegression(solver='sag', n_jobs=6, multi_class='multinomial',
                                                                     tol=1e-4, C=1.e4 / 533292)),
                       train_x, train_y, valid_x, cross_validate=False,
                save_model=True, extract_probs=True, feature_vector_test=test_x, model_name='log_reg_tfidf')
print("LR, N-Gram Vectors: ", accuracy)



Train Acc: 0.8361906989435933
(32065, 27)
(40417, 27)
LR, Count Vectors:  0.7988460938718228
Train Acc: 0.5994854402993802
(32065, 27)
(40417, 27)
LR, N-Gram Vectors:  0.6474348978637143


In [None]:
accuracy = train_model(make_pipeline(count_vect, ensemble.RandomForestClassifier(n_estimators=80, max_depth=580, min_samples_leaf=2)),
                       train_x, train_y, valid_x,cross_validate=False,
                        save_model=True, extract_probs=True, feature_vector_test=test_x, model_name='rf')
print("RF, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(make_pipeline(tfidf_vect, ensemble.RandomForestClassifier(n_estimators=80, max_depth=580, min_samples_leaf=2)),
                       train_x, train_y, valid_x, cross_validate=False,
                       save_model=True, extract_probs=True, feature_vector_test=test_x, model_name='rf_tfidf')
print("RF, WordLevel TF-IDF: ", accuracy)

In [36]:
# Extereme Gradient Boosting on Count Vectors
xgb = xgboost.XGBClassifier(max_depth=6, learning_rate=0.1, scale_pos_weight=1,
                          n_estimators=150, silent=True,
                          objective="binary:logistic", booster='gbtree',
                          n_jobs=12, nthread=None, gamma=0, min_child_weight=1,
                          max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1,
                          reg_alpha=0, reg_lambda=1)
accuracy = train_model(make_pipeline(count_vect, xgb), train_x, train_y,
                       valid_x, cross_validate=False, save_model=True, 
                       extract_probs=True, feature_vector_test=test_x, 
                       model_name='xgb')
print("Xgb,CountVec: ", accuracy)

Train Acc: 0.7656564863520976
(57317, 17)
(76545, 17)
Xgb,CountVec:  0.7565818169129578


In [39]:
xgb = xgboost.XGBClassifier(max_depth=6, learning_rate=0.1, scale_pos_weight=1,
                          n_estimators=150, silent=True,
                          objective="binary:logistic", booster='gbtree',
                          n_jobs=-1, nthread=None, gamma=0, min_child_weight=1,
                          max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1,
                          reg_alpha=0, reg_lambda=1)
accuracy = train_model(make_pipeline(tfidf_vect_ngram, xgb), train_x, train_y, valid_x,
                       cross_validate=False, save_model=True, 
                       extract_probs=True, feature_vector_test=test_x, 
                       model_name='xgb_tfidf')
print("Xgb,TFIDF ", accuracy)

Train Acc: 0.773983058979526
(57317, 17)
(76545, 17)
Xgb,TFIDF  0.7542439415880106


In [None]:
knn = neighbors.KNeighborsClassifier(n_neighbors=80, leaf_size=30)
accuracy = train_model(make_pipeline(tfidf_vect_ngram, knn),
                       train_x, train_y, valid_x, cross_validate=False, save_model=True, extract_probs=True,
                       model_name='knn80_tfidf', feature_vector_test=test_x)
print("KNN _ count: ", accuracy)

In [None]:
knn = neighbors.KNeighborsClassifier(n_neighbors=160, leaf_size=30)
accuracy = train_model(make_pipeline(tfidf_vect_ngram, knn),
                       train_x, train_y, valid_x, cross_validate=False, save_model=True, extract_probs=True,
                       model_name='knn160_tfidf', feature_vector_test=test_x)
print("KNN _ count: ", accuracy)

In [None]:
Train Acc: 0.7695209930822713
(57317, 17)
(76545, 17)
KNN _ count:  0.7393443480991678
    KNN10 tfidf

In [None]:
params = {
    'max_depth': [9, 11, 13],
    #'learning_rate': [0.05, 0.1, 0.2],
    #'n_estimators': range(50, 200, 50),
    #'gamma': [i/10.0 for i in range(0, 5)],
    #'subsample': [i/10.0 for i in range(6, 10)],
    #'colsample_bytree': [i/10.0 for i in range(6, 10)],
    #'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05]
}
ensemble = BlendEnsemble(scorer=accuracy_score, random_state=42, verbose=2)
ensemble.add([
    RandomForestClassifier(n_estimators=100, max_depth=58*10, min_samples_leaf=10),  
    LogisticRegression(solver='sag', n_jobs=12, multi_class='multinomial', tol=1e-4, C=1.e4 / 533292),
    naive_bayes.MultinomialNB(alpha=0.25),
    xgboost.XGBClassifier(max_depth=6, learning_rate=0.1, scale_pos_weight=1,
                          n_estimators=150, silent=True,
                          objective="binary:logistic", booster='gbtree',
                          n_jobs=12, nthread=None, gamma=0, min_child_weight=1,
                          max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1,
                          reg_alpha=0, reg_lambda=1),
], proba=True)

# Attach the final meta estimator
ensemble.add_meta(LogisticRegression(solver='sag', n_jobs=12, multi_class='multinomial',
                                     tol=1e-4, C=1.e4 / 533292))


accuracy = train_model(make_pipeline(tfidf_vect_ngram, ensemble),
                                     train_x, train_y, valid_x,
                                  cross_validate=False, save_model=True, 
                                   extract_probs=True, feature_vector_test=test_x, 
                                   model_name='blend_ensemble')


Fitting 2 layers
Processing layer-1             

ValueError("Classification metrics can't handle a mix of multiclass and continuous-multioutput targets",)
ValueError("Classification metrics can't handle a mix of multiclass and continuous-multioutput targets",)
ValueError("Classification metrics can't handle a mix of multiclass and continuous-multioutput targets",)
