In [36]:
import os
import string
from ast import literal_eval

import numpy as np
import pandas as pd
import xgboost

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.naive_bayes import MultinomialNB
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn import neighbors
from sklearn.externals import joblib
import mlens
from mlens.ensemble import BlendEnsemble

import keras
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier, RUSBoostClassifier
# Imported after sklearn.pipeline.make_pipeline, so this is the binding in effect below.
from imblearn.pipeline import make_pipeline as make_pipeline
from imblearn.metrics import classification_report_imbalanced

In [50]:
# --- Notebook-wide configuration ---
RANDOM_STATE = 42
BIG_CATEGORY = 'beauty'
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'  # workaround for macOS mkl issue

# --- Locate and read the train / valid / test splits ---
# The data folder is a sibling of the current working directory.
data_directory = os.path.join(os.path.dirname(os.getcwd()), 'data')
prob_dir = os.path.join(data_directory, 'probabilities', BIG_CATEGORY)
train, valid, test = (
    pd.read_csv(os.path.join(data_directory, f'{BIG_CATEGORY}_{split_name}_split.csv'))
    for split_name in ('train', 'valid', 'test')
)

# The single feature is `itemid`, reshaped into a one-column 2-D array;
# the target is the `Category` column (the test split has no target here).
train_x, valid_x, test_x = (
    frame['itemid'].values.reshape(-1, 1) for frame in (train, valid, test)
)
train_y, valid_y = train['Category'], valid['Category']

In [3]:
# Encode the string categories as integer class ids.
# The encoder is fitted on the training labels ONLY and then reused for the
# validation labels, so both splits share one label -> id mapping. Re-fitting on
# the validation split could assign different ids whenever its set of categories
# differs from the training set's. `transform` raises ValueError if the
# validation split contains a category that was never seen during fitting.
# Reading from the DataFrames (rather than from train_y / valid_y) keeps this
# cell idempotent: re-running it does not re-encode already-encoded labels.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train['Category'])
valid_y = encoder.transform(valid['Category'])

In [37]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid,
                save_model=False, extract_probs=False, feature_vector_test=None, model_name='sklearn',
                label_valid=None):
    """Fit ``classifier`` and return its accuracy on the validation features.

    Parameters
    ----------
    classifier : estimator exposing ``fit`` and ``predict`` (and ``predict_proba``
        when ``extract_probs`` is true).
    feature_vector_train, label : training features and their labels.
    feature_vector_valid : validation features.
    save_model : if true, dump the fitted classifier with ``joblib`` to
        ``<data_directory>/keras_checkpoints/<BIG_CATEGORY>/<model_name>/<model_name>.joblib``.
    extract_probs : if true, save the class probabilities for the validation and
        test features as ``valid.npy`` / ``test.npy`` under ``<prob_dir>/<model_name>``.
    feature_vector_test : test features; required when ``extract_probs`` is true.
    model_name : sub-directory / file stem used for the saved artefacts.
    label_valid : validation labels. When omitted, the notebook-global ``valid_y``
        is used, which keeps existing calls working.

    Returns
    -------
    float
        Accuracy of the predictions on ``feature_vector_valid``.

    Raises
    ------
    ValueError
        If ``extract_probs`` is true but ``feature_vector_test`` is None.
    """
    # Fail before the (potentially long) fit rather than after it.
    if extract_probs and feature_vector_test is None:
        raise ValueError('feature_vector_test is required when extract_probs=True')
    if label_valid is None:
        # Backward compatibility with callers that rely on the global labels.
        label_valid = valid_y

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # report accuracy on the data the model was fitted on
    train_predictions = classifier.predict(feature_vector_train)
    print('Train Acc: {}'.format(metrics.accuracy_score(label, train_predictions)))

    # predict the labels on validation dataset
    valid_predictions = classifier.predict(feature_vector_valid)

    if extract_probs:
        val_preds = classifier.predict_proba(feature_vector_valid)
        test_preds = classifier.predict_proba(feature_vector_test)
        print(val_preds.shape)
        print(test_preds.shape)
        probs_path = os.path.join(prob_dir, model_name)
        os.makedirs(probs_path, exist_ok=True)
        np.save(os.path.join(probs_path, 'valid.npy'), val_preds)
        np.save(os.path.join(probs_path, 'test.npy'), test_preds)

    if save_model:
        model_path = os.path.join(data_directory, 'keras_checkpoints',
                                  BIG_CATEGORY, model_name)
        os.makedirs(model_path, exist_ok=True)
        joblib.dump(classifier, os.path.join(model_path, model_name + '.joblib'))

    return metrics.accuracy_score(label_valid, valid_predictions)

In [12]:
# Gaussian Naive Bayes baseline trained on the single `itemid` feature.
# The printed label names the feature set actually used (train_x / valid_x hold
# item ids, not count vectors).
accuracy = train_model(naive_bayes.GaussianNB(), train_x, train_y, valid_x)
print("NB, ItemID: ", accuracy)

Train Acc: 0.2835134734326067
NB, Count Vectors:  0.2835110002268088


In [51]:
# k-nearest-neighbours on the single `itemid` feature. The fitted model is saved
# and its validation / test class probabilities are written under the
# 'KNN_itemid' model name.
# The printed label names the feature set actually used (item ids, not count vectors).
accuracy = train_model(neighbors.KNeighborsClassifier(n_neighbors=25, leaf_size=10),
                       train_x, train_y, valid_x, save_model=True, extract_probs=True,
                       model_name='KNN_itemid', feature_vector_test=test_x)
print("KNN, ItemID: ", accuracy)

Train Acc: 0.4271370373278201
(57317, 17)
(76545, 17)
NN, Count Vectors:  0.38239963710591973


In [52]:
# Random forest on the single `itemid` feature.
# random_state is pinned to the notebook-wide RANDOM_STATE so the reported
# accuracy is reproducible between runs.
# The printed label names the feature set actually used (item ids, not count vectors).
accuracy = train_model(
    ensemble.RandomForestClassifier(n_estimators=50, max_depth=58*10, min_samples_leaf=10,
                                    random_state=RANDOM_STATE),
    train_x, train_y, valid_x)
print("RF, ItemID: ", accuracy)

Train Acc: 0.4854492161942896
RF, Count Vectors:  0.3863949613552698


In [None]:
# Extreme Gradient Boosting with a grid search, run behind the `tfidf_vect` pipeline step.
# NOTE(review): this cell has no execution count. `params` is only assigned in a later
# cell, and `tfidf_vect` / `tfidf_vect_ngram` are not defined in any visible cell --
# confirm they exist before running.
# NOTE(review): train_x is built from the numeric `itemid` column in the loading cell;
# verify that feeding it to a TF-IDF step is intended.
# NOTE(review): objective is "binary:logistic" although the KNN cell's probability
# output has 17 columns -- confirm the objective for this multi-class target.
gridsearch = GridSearchCV(estimator=xgboost.XGBClassifier(max_depth=9, learning_rate=0.1, scale_pos_weight=1,
                                                          n_estimators=50, silent=True,
                                                          objective="binary:logistic", booster='gbtree',
                                                          n_jobs=6, nthread=None, gamma=0, min_child_weight=1,
                                                          max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1,
                                                          reg_alpha=0, reg_lambda=1),
                          param_grid=params, scoring='accuracy', n_jobs=-1, verbose=2)
accuracy = train_model(make_pipeline(tfidf_vect, gridsearch), train_x, train_y, valid_x)
# Best grid point and its cross-validated score, as recorded by the fitted search object.
print(gridsearch.best_params_, gridsearch.best_score_)
print("Xgb, WordLevel TF-IDF: ", accuracy)

# Extreme Gradient Boosting (booster='dart') with a grid search, run behind the
# `tfidf_vect_ngram` pipeline step. The search object is built inline here, so its
# best parameters are not printed.
accuracy = train_model(make_pipeline(tfidf_vect_ngram, GridSearchCV(estimator=xgboost.XGBClassifier(max_depth=5, learning_rate=0.1, scale_pos_weight=1,
                                                                    n_estimators=50, silent=True,
                                                                    objective="binary:logistic", booster='dart',
                                                                    n_jobs=6, nthread=None, gamma=0, min_child_weight=2,
                                                                    max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1,
                                                                    reg_alpha=0, reg_lambda=1),
                                                                    param_grid=params, scoring='accuracy', n_jobs=-1)),
                                                                    train_x, train_y, valid_x)
print("Xgb, N-Gram Vectors: ", accuracy)

In [108]:
# Preview the one-column itemid feature matrix built in the data-loading cell.
# (`itemid_train` is not defined in this notebook; `train_x` is
# train['itemid'].values.reshape(-1, 1), i.e. the same kind of array.)
train_x

array([[1307003048],
       [1074369881],
       [1734209676],
       ...,
       [1678022339],
       [ 926098260],
       [1727515811]])

In [110]:
# Random forest (max_depth=30) on the single `itemid` feature.
# Uses train_x / valid_x from the data-loading cell instead of the undefined
# `itemid_train` / `itemid_valid`, so the cell runs on a fresh kernel.
# random_state is pinned to RANDOM_STATE for reproducible accuracy.
accuracy = train_model(
    ensemble.RandomForestClassifier(n_estimators=50, max_depth=30, min_samples_leaf=10,
                                    random_state=RANDOM_STATE),
    train_x, train_y, valid_x)
print("RF, ItemID: ", accuracy)

Train Acc: 0.4844678234016383
RF, Count Vectors:  0.38560985397002634


In [114]:
# XGBoost on the single `itemid` feature.
# Uses train_x / valid_x from the data-loading cell instead of the undefined
# `itemid_train` / `itemid_valid`, so the cell runs on a fresh kernel.
# Hyper-parameters are unchanged.
accuracy = train_model(
    xgboost.XGBClassifier(
        max_depth=7, learning_rate=0.1, n_estimators=100, silent=True,
        objective='binary:logistic', booster='gbtree', n_jobs=6, nthread=None,
        gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,
        colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
        base_score=0.5, random_state=0, seed=None, missing=None),
    train_x, train_y, valid_x)
print("Xgb, ItemID: ", accuracy)

Train Acc: 0.3878944108590022
Xgb, ItemID:  0.371024303435281


In [None]:
# Hyper-parameter grid passed as `param_grid` to GridSearchCV.
# Only `max_depth` is active; the other candidate axes are switched off:
#   learning_rate:    [0.05, 0.1, 0.2]
#   n_estimators:     range(50, 200, 50)
#   gamma:            [i/10.0 for i in range(0, 5)]
#   subsample:        [i/10.0 for i in range(6, 10)]
#   colsample_bytree: [i/10.0 for i in range(6, 10)]
#   reg_alpha:        [0, 0.001, 0.005, 0.01, 0.05]
params = dict(max_depth=[9, 11, 13])
# Blend ensemble of four base learners with a logistic-regression meta learner.
# It is bound to `blend_ensemble` rather than `ensemble`: rebinding `ensemble`
# would shadow the `sklearn.ensemble` module that other cells reach through that
# name (e.g. ensemble.RandomForestClassifier).
# The seed is the notebook-wide RANDOM_STATE (`seed` was never defined).
blend_ensemble = BlendEnsemble(scorer=accuracy_score, random_state=RANDOM_STATE, verbose=2)
blend_ensemble.add([
    ensemble.RandomForestClassifier(n_estimators=100, max_depth=58*10, min_samples_leaf=10,
                                    random_state=RANDOM_STATE),
    # Disabled candidate: svm.LinearSVC(dual=False, tol=.01)
    LogisticRegression(solver='sag', n_jobs=6, multi_class='multinomial', tol=1e-4, C=1.e4 / 533292),
    naive_bayes.MultinomialNB(),
    xgboost.XGBClassifier(max_depth=11, learning_rate=0.1, scale_pos_weight=1,
                          n_estimators=100, silent=True,
                          objective="binary:logistic", booster='gbtree',
                          n_jobs=6, nthread=None, gamma=0, min_child_weight=2,
                          max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1,
                          reg_alpha=0, reg_lambda=1),
], proba=True)

# Attach the final meta estimator
blend_ensemble.add_meta(LogisticRegression(solver='sag', n_jobs=6, multi_class='multinomial',
                                           tol=1e-4, C=1.e4 / 533292))

# The grid-search options belong to GridSearchCV and the data arguments belong to
# train_model; the original parenthesisation passed them the other way round.
# NOTE(review): `tfidf_vect_ngram` is not defined in any visible cell, and
# `params` only searches `max_depth` -- confirm that is a tunable parameter of
# the ensemble before running this cell.
grid_search = GridSearchCV(estimator=blend_ensemble, param_grid=params,
                           scoring='accuracy', n_jobs=-1)
accuracy = train_model(make_pipeline(tfidf_vect_ngram, grid_search),
                       train_x, train_y, valid_x)