In [1]:
from __future__ import print_function
from __future__ import division

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import GridSearchCV 

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.datasets import load_svmlight_file
from sklearn.preprocessing import OneHotEncoder
import sys 
import time
from sklearn.metrics import accuracy_score
from matplotlib import pyplot

In [None]:
data = np.load('../data/train_matrix.npz', allow_pickle = True)

In [None]:
from scipy import sparse

In [None]:
X = data['X'].item()
y = data['y'] - 1

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

In [None]:
# dtrain = xgb.DMatrix(X_train, y_train, feature_names=list([str(i) for i in range(X.shape[1])]))
# dtest = xgb.DMatrix(X_test, y_test, feature_names=list([str(i) for i in range(X.shape[1])]))

In [None]:
import gc
del X, y, data
gc.collect()

In [None]:
eval_set = [(X_train, y_train), (X_test, y_test)]

In [None]:
def accuracy(pred, label):
    return sum(np.where(pred == label, 1, 0))/len(label)

In [None]:
params1 = {'objective':['multi:softmax'],
            'learning_rate': [0.1], #so called `eta` value
            'n_estimators': [1000],
            'max_depth': range(3,10,2),
            'gamma': [0],
            'min_child_weight': range(1,6,2),
            'subsample': [0.7],
            'colsample_bytree': [0.6],
            'reg_alpha':[1],
            'tree_method': ['gpu_hist'],
            'gpu_id': [0]}

In [None]:
xgc = XGBClassifier()

xgb_grid = GridSearchCV(xgc, params1, cv = 3, n_jobs = 4, verbose=2)

In [None]:
xgb_grid.fit(X_train, y_train, eval_metric=["merror", "mlogloss"], eval_set = eval_set, verbose=True, early_stopping_rounds = 10)

In [None]:
xgb_grid.cv_results_['mean_test_score']

In [None]:
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

In [None]:
model = XGBClassifier(colsample_bytree = 0.6, gamma = 0, learning_rate = 0.1, max_depth = 5, min_child_weight = 3, reg_alpha = 1, subsample = 0.7, objective ='multi:softmax', tree_method = 'gpu_hist', gpu_id = 0, n_estimators = 700) # n_estimators = 2000
# model.fit(X_train, y_train, eval_metric=["merror", "mlogloss"], eval_set=eval_set, verbose=True, early_stopping_rounds = 10)
eval_set = [(X, y), (X, y)]
model.fit(X, y, eval_metric=["merror", "mlogloss"], eval_set=eval_set, verbose=True, early_stopping_rounds = 10)

In [None]:
y_pred = model.predict(X_train)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_train, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# retrieve performance metrics
results = model.evals_result()
epochs = len(results['validation_0']['merror'])
x_axis = range(0, epochs)

# plot log loss
fig1, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['mlogloss'], label='Train')
ax.plot(x_axis, results['validation_1']['mlogloss'], label='Test')
ax.legend()
pyplot.ylabel('Log Loss')
pyplot.title('XGBoost Log Loss')
pyplot.show()

# plot classification error
fig2, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['merror'], label='Train')
ax.plot(x_axis, results['validation_1']['merror'], label='Test')
ax.legend()
pyplot.ylabel('Classification Error')
pyplot.title('XGBoost Classification Error')
pyplot.show()

In [2]:
import sys
sys.path.append('../tools/')
from model_manager import model_manager as mm

xgc_mm = mm('xgc_700', '../models/')
xgc_mm.load()
lr_mm_pure = mm('lr_pure', '../models/')
lr_mm_pure.load()
# lr_mm = mm('lr', '../models/')
# lr_mm.load()
# xgb_enc_mm = mm('xgb_enc', '../models/')
# xgb_enc_mm.load()

Done
Done


In [None]:
lr = LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter = 1000, n_jobs = 10)

In [None]:
xgb_enc = OneHotEncoder()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=3)

In [None]:
d = xgc_mm.model.apply(X_test)

In [None]:
d.shape

In [None]:
lr.fit(d, y_test)

In [None]:
# xgb_enc.fit(xgc_mm.model.apply(X_test))
# lr.fit(xgb_enc.transform(xgc_mm.model.apply(X_test)), y_test)

In [None]:
# lr_mm = mm('lr', '../models/', lr)
# lr_mm.save()
# xgb_enc_mm = mm('xgb_enc', '../models/', xgb_enc)
# xgb_enc_mm.save()

In [None]:
lr_mm_pure = mm('lr_pure', '../models/', lr)
lr_mm_pure.save()

In [None]:
# gpu test
# xgr = XGBRegressor(tree_method='gpu_hist', gpu_id=0)
# xgr.fit(X_train, y_train)

In [3]:
test = dict(np.load('../data/test_matrix.npz', allow_pickle = True))

In [4]:
test = test['X'].item()

In [None]:
test_id = test.tocsc()[:, 0]
features = test.tocsc()[:, 1:]

In [6]:
# pred = model.predict(features)
pred = lr_mm_pure.model.predict(xgc_mm.model.apply(test.tocsc()[:, 1:10000]))

MemoryError: Unable to allocate 3.91 GiB for an array with shape (1050000000,) and data type float32

In [None]:
pred = pred + 1

In [None]:
pd.DataFrame({'Id': np.array(test_id.todense()).reshape(-1).astype(int), 'Score': pred}).sort_values(by = 'Id').reset_index(drop = True).to_csv('./submission.csv', index = False)