In [2]:
import os
import string
from ast import literal_eval

import pandas as pd
import xgboost
import numpy as np

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.naive_bayes import MultinomialNB
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn import neighbors
from sklearn.externals import joblib
import mlens
from mlens.ensemble import BlendEnsemble

import keras
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier, RUSBoostClassifier
# Keep this below the sklearn.pipeline import: the imblearn make_pipeline is the one in effect.
from imblearn.pipeline import make_pipeline as make_pipeline
from imblearn.metrics import classification_report_imbalanced

[MLENS] backend: threading
Using TensorFlow backend.


In [33]:
# Seed constant; presumably meant for stochastic estimators/splits.
RANDOM_STATE = 42
# Workaround for the macOS MKL duplicate-library issue.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# Top-level category whose splits are loaded below.
BIG_CATEGORY = 'mobile'

# The data folder is a sibling of the current working directory.
data_directory = os.path.join(os.path.dirname(os.getcwd()), 'data')
# Per-category folder under which class-probability dumps are written.
prob_dir = os.path.join(data_directory, 'probabilities', BIG_CATEGORY)

# Load the train / validation / test CSV splits for the chosen category.
train = pd.read_csv(os.path.join(data_directory, f'{BIG_CATEGORY}_train_split.csv'))
valid = pd.read_csv(os.path.join(data_directory, f'{BIG_CATEGORY}_valid_split.csv'))
test = pd.read_csv(os.path.join(data_directory, f'{BIG_CATEGORY}_test_split.csv'))

# The only feature is the item id, reshaped into a single-column 2-D array;
# the target is the 'Category' column (the test split gets features only).
train_x = train['itemid'].values.reshape(-1, 1)
train_y = train['Category']
valid_x = valid['itemid'].values.reshape(-1, 1)
valid_y = valid['Category']
test_x = test['itemid'].values.reshape(-1, 1)

In [34]:
# Encode the category labels as integers.
# The encoder is fitted on the training labels only and then *applied* to the
# validation labels, so both splits share one label -> integer mapping.
# Re-fitting on the validation split (the previous behaviour) can assign
# different integers whenever a class is absent from that split; `transform`
# instead raises ValueError if validation contains a label unseen in training.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(valid_y)

In [35]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, cross_validate=False,
                save_model=False, extract_probs=False, feature_vector_test=None, model_name='sklearn',
                label_valid=None):
    """Fit (or cross-validate) a classifier and report its accuracy.

    Parameters
    ----------
    classifier : estimator exposing ``fit`` / ``predict`` (and ``predict_proba``
        when ``extract_probs`` is set).
    feature_vector_train, label : training features and their labels.
    feature_vector_valid : validation features scored after fitting.
    cross_validate : if True, run 5-fold stratified cross-validation on the
        training data only and return immediately; ``save_model`` and
        ``extract_probs`` are ignored in that mode.
    save_model : if True, dump the fitted classifier with joblib under
        ``data_directory/keras_checkpoints/BIG_CATEGORY/model_name``.
    extract_probs : if True, save validation and test class probabilities as
        ``valid.npy`` / ``test.npy`` under ``prob_dir/model_name``.
    feature_vector_test : test features; required when ``extract_probs`` is True.
    model_name : sub-folder / file stem used for saved artefacts.
    label_valid : validation labels used for the returned score. Defaults to
        the notebook-level ``valid_y`` so existing calls keep working.

    Returns
    -------
    float
        Mean CV accuracy in percent (0-100) when ``cross_validate`` is True,
        otherwise validation accuracy as a fraction (0-1).

    Raises
    ------
    ValueError
        If ``extract_probs`` is True but ``feature_vector_test`` is None.
    """
    if cross_validate:
        kfold = model_selection.StratifiedKFold(n_splits=5, random_state=7, shuffle=True)
        results = model_selection.cross_val_score(classifier, feature_vector_train,
                                                  label, cv=kfold, n_jobs=-1)
        print("CV Accuracy: %.4f%% (%.4f%%)" % (results.mean()*100, results.std()*100))
        return results.mean()*100

    # Fail before the (potentially long) fit rather than inside predict_proba.
    if extract_probs and feature_vector_test is None:
        raise ValueError("feature_vector_test is required when extract_probs=True")
    if label_valid is None:
        # Backward-compatible fallback to the notebook-level validation labels.
        label_valid = valid_y

    # Fit the training dataset on the classifier.
    classifier.fit(feature_vector_train, label)

    # Accuracy on the data the model was fitted on.
    train_predictions = classifier.predict(feature_vector_train)
    print('Train Acc: {}'.format(metrics.accuracy_score(label, train_predictions)))

    # Predict the labels on the validation dataset.
    valid_predictions = classifier.predict(feature_vector_valid)

    if extract_probs:
        val_preds = classifier.predict_proba(feature_vector_valid)
        test_preds = classifier.predict_proba(feature_vector_test)
        print(val_preds.shape)
        print(test_preds.shape)
        probs_path = os.path.join(prob_dir, model_name)
        os.makedirs(probs_path, exist_ok=True)
        np.save(os.path.join(probs_path, 'valid.npy'), val_preds)
        np.save(os.path.join(probs_path, 'test.npy'), test_preds)

    if save_model:
        model_path = os.path.join(data_directory, 'keras_checkpoints',
                                  BIG_CATEGORY, model_name)
        os.makedirs(model_path, exist_ok=True)
        joblib.dump(classifier, os.path.join(model_path, model_name + '.joblib'))

    return metrics.accuracy_score(label_valid, valid_predictions)

In [26]:
# Multinomial naive Bayes behind a count vectoriser, evaluated with
# cross_validate=True (train_model then returns a cross-validated score).
# NOTE(review): count_vect is not defined in the cells visible here — confirm
# where it comes from before relying on this cell after a kernel restart.
accuracy = train_model(
    classifier=make_pipeline(count_vect, naive_bayes.MultinomialNB(alpha=0.25)),
    feature_vector_train=train_x,
    label=train_y,
    feature_vector_valid=valid_x,
    cross_validate=True,
    extract_probs=False,
    feature_vector_test=test_x,
    model_name='nb_ngrams_2',
)
print("NB, Count Vectors: ", accuracy)

Train Acc: 0.21363583206642497
NB, Count Vectors:  0.2150631529705286


In [18]:
# k-nearest-neighbours baseline on the item id feature. The fitted model and the
# validation/test class probabilities are saved under the name 'KNN_itemid_100'.
accuracy = train_model(neighbors.KNeighborsClassifier(n_neighbors=100, leaf_size=30),
                       train_x, train_y, valid_x, cross_validate=False, save_model=True, extract_probs=True,
                       model_name='KNN_itemid_100', feature_vector_test=test_x)
# train_x holds item ids, not count vectors, so label the score accordingly.
print("KNN, ItemID: ", accuracy)

Train Acc: 0.40376420252501977
(43941, 14)
(55440, 14)
NN, Count Vectors:  0.39468833208165494


In [32]:
# Random forest on the item id feature. The fitted model and the validation/test
# class probabilities are saved under the name 'rf_itemid'.
# random_state is pinned so the bootstrap/feature sampling — and therefore the
# reported accuracies and saved probabilities — are reproducible across runs.
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=150, max_depth=100, min_samples_leaf=10,
                                                       n_jobs=-1, random_state=RANDOM_STATE),
                       train_x, train_y, valid_x, cross_validate=False, save_model=True, extract_probs=True,
                       model_name='rf_itemid', feature_vector_test=test_x)
# train_x holds item ids, not count vectors, so label the score accordingly.
print("RF, ItemID: ", accuracy)

Train Acc: 0.48948660975324526
(32065, 27)
(40417, 27)
RF, Count Vectors:  0.3840012474660845


In [122]:

# Gradient-boosted trees (50 rounds) on the item id feature; nothing is saved.
# The target has more than two classes (other cells in this notebook save
# probability matrices with 14 and 27 columns), so the multi-class objective is
# stated explicitly instead of the misleading "binary:logistic".
accuracy = train_model(xgboost.XGBClassifier(max_depth=27, learning_rate=0.1, scale_pos_weight=1,
                                             n_estimators=50, silent=True,
                                             objective="multi:softprob", booster='gbtree',
                                             n_jobs=6, nthread=None, gamma=0.2, min_child_weight=5,
                                             max_delta_step=0, subsample=1, colsample_bytree=0.7,
                                             colsample_bylevel=1, reg_alpha=0, reg_lambda=1),
                       train_x, train_y, valid_x)
# train_x holds item ids, not n-gram vectors, so label the score accordingly.
print("Xgb, ItemID: ", accuracy)

Train Acc: 0.5055627022180642
Xgb, N-Gram Vectors:  0.3773273039139248


In [123]:
# Gradient-boosted trees (200 rounds, lower learning rate) on the item id
# feature. The fitted model and the validation/test class probabilities are
# saved under the name 'xgb_itemid_index'.
# The target has more than two classes (other cells in this notebook save
# probability matrices with 14 and 27 columns), so the multi-class objective is
# stated explicitly instead of the misleading "binary:logistic".
accuracy = train_model(xgboost.XGBClassifier(max_depth=27, learning_rate=0.05, scale_pos_weight=1,
                                             n_estimators=200, silent=True,
                                             objective="multi:softprob", booster='gbtree',
                                             n_jobs=6, nthread=None, gamma=0.2, min_child_weight=5,
                                             max_delta_step=0, subsample=1, colsample_bytree=0.8,
                                             colsample_bylevel=1, reg_alpha=0, reg_lambda=1),
                       train_x, train_y, valid_x,
                       save_model=True, extract_probs=True,
                       feature_vector_test=test_x, model_name='xgb_itemid_index')
# train_x holds item ids, not n-gram vectors, so label the score accordingly.
print("Xgb, ItemID: ", accuracy)

Train Acc: 0.5369976221104744
(32065, 27)
(40417, 27)
Xgb, N-Gram Vectors:  0.38147512864494


In [None]:
# Scratch value in a never-executed cell: it matches the leading digits of the
# validation accuracy printed by the 50-estimator XGBoost run (0.3773273039139248).
# NOTE(review): presumably a manual note of that score — consider moving it to markdown.
0.37732730

In [108]:
# Display itemid_train reshaped into a single-column 2-D array.
# NOTE(review): itemid_train is not defined in any cell visible here — likely
# left over from an earlier kernel session; confirm or remove before re-running.
itemid_train.values.reshape(-1, 1)

array([[1307003048],
       [1074369881],
       [1734209676],
       ...,
       [1678022339],
       [ 926098260],
       [1727515811]])

In [67]:
# Random forest fitted on train_index / valid_index instead of train_x / valid_x.
# NOTE(review): train_index and valid_index are not defined in the cells visible
# here; a later cell displays train_index as a 1-D array — confirm how these
# are built and shaped before re-running.
# random_state is pinned so the reported accuracies are reproducible across runs.
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=50, max_depth=40, min_samples_leaf=10,
                                                       random_state=RANDOM_STATE),
                       train_index, train_y, valid_index)
# The inputs here are the *_index arrays, so name them in the label.
print("RF, index: ", accuracy)

Train Acc: 0.33257438957368296
RF, ItemID:  0.27056545178568314


In [61]:
# Gradient-boosted trees (depth 15, 100 rounds) on the item id feature; nothing is saved.
# The target has more than two classes (other cells in this notebook save
# probability matrices with 14 and 27 columns), so the multi-class objective is
# stated explicitly instead of the misleading 'binary:logistic'.
accuracy = train_model(xgboost.XGBClassifier(
    max_depth=15, learning_rate=0.1, n_estimators=100, silent=True,
    objective='multi:softprob', booster='gbtree', n_jobs=6, nthread=None,
    gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,
    colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
    base_score=0.5, random_state=0, seed=None, missing=None),
    train_x, train_y, valid_x)
print("Xgb, ItemID: ", accuracy)

Train Acc: 0.47880627742447635
Xgb, ItemID:  0.39394943908439034


In [None]:
# Hyper-parameter grid for the grid search below (only max_depth is active).
# NOTE(review): 'max_depth' is not obviously a parameter of the blend ensemble
# itself — confirm the parameter name GridSearchCV should target.
params = {
    'max_depth': [9, 11, 13],
    #'learning_rate': [0.05, 0.1, 0.2],
    #'n_estimators': range(50, 200, 50),
    #'gamma': [i/10.0 for i in range(0, 5)],
    #'subsample': [i/10.0 for i in range(6, 10)],
    #'colsample_bytree': [i/10.0 for i in range(6, 10)],
    #'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05]
}

# Blend ensemble of four base learners whose class probabilities feed a
# logistic-regression meta learner.
# Named `blend_ensemble` (not `ensemble`) so the imported sklearn `ensemble`
# module is not overwritten for the rest of the kernel session.
# RANDOM_STATE replaces the previously undefined name `seed`.
blend_ensemble = BlendEnsemble(scorer=accuracy_score, random_state=RANDOM_STATE, verbose=2)
blend_ensemble.add([
    ensemble.RandomForestClassifier(n_estimators=100, max_depth=58*10, min_samples_leaf=10),
    #svm.LinearSVC(dual=False, tol=.01),
    LogisticRegression(solver='sag', n_jobs=6, multi_class='multinomial', tol=1e-4, C=1.e4 / 533292),
    naive_bayes.MultinomialNB(),
    # Multi-class objective stated explicitly: the target has more than two classes.
    xgboost.XGBClassifier(max_depth=11, learning_rate=0.1, scale_pos_weight=1,
                          n_estimators=100, silent=True,
                          objective="multi:softprob", booster='gbtree',
                          n_jobs=6, nthread=None, gamma=0, min_child_weight=2,
                          max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1,
                          reg_alpha=0, reg_lambda=1),
], proba=True)

# Attach the final meta estimator
blend_ensemble.add_meta(LogisticRegression(solver='sag', n_jobs=6, multi_class='multinomial',
                                           tol=1e-4, C=1.e4 / 533292))

# The grid-search options belong to GridSearchCV and the data arguments to
# train_model; the original parenthesisation had them swapped, which would
# raise a TypeError.
# NOTE(review): tfidf_vect_ngram is not defined in the cells visible here —
# confirm where it comes from before running this cell.
accuracy = train_model(make_pipeline(tfidf_vect_ngram,
                                     GridSearchCV(estimator=blend_ensemble, param_grid=params,
                                                  scoring='accuracy', n_jobs=-1)),
                       train_x, train_y, valid_x)

In [63]:
# Display train_index (the recorded output is a 1-D integer array starting at 0).
# NOTE(review): train_index is not defined in any cell visible here — confirm
# where it is created before relying on it after a kernel restart.
train_index

array([     0,      1,      2, ..., 229263, 229264, 229265])