# Modeling
  1. Regression
  2. Classification

### Score
* quadratic weighted kappa
$$ k = 1-\dfrac{{\sum}_{i,i} w_{i,j} O_{i,j}}{{\sum}_{i,i} w_{i,j} E_{i,j}} $$

In [1]:
import numpy as np
import pandas as pd
import sklearn as sk

import matplotlib as mpl
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, roc_auc_score, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn import pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import auc

import joblib
from joblib import dump, load

pd.options.display.max_columns = 400
pd.options.display.max_rows = 200
pd.options.display.max_colwidth = 600
pd.options.display.precision = 10

In [2]:
df_train = pd.read_excel("./__data/excel/train.xlsx").fillna("")
df_test = pd.read_excel("./__data/excel/test.xlsx").fillna("")

In [3]:
train_X = joblib.load('train_X.pkl')
test_X = joblib.load('test_X.pkl')
y = joblib.load('y.pkl')

# Modeling (1) Regression

## Process
1. Sparse Matrix 의 특이값 분해
2. Scaling
3. Rgression

* TruncatedSVD 로 특이값을 분해하며 parameter 중 n_componetns 를 [200, 300, 400, 500, 600] 으로 gridsearch 진행

In [23]:
svd = TruncatedSVD()
scl = StandardScaler()
xgb_model = xgb.XGBRegressor()  

In [24]:
param_grid = {'svd__n_components': [200, 300, 400, 500, 600]}
pipe_svc = pipeline.Pipeline([('svd', TruncatedSVD()), ('scl', StandardScaler()), ('xgb', xgb.XGBRegressor() )])

In [26]:
reg_grid = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, cv=10)

In [27]:
%%time
reg_grid.fit(train_X, y)

Wall time: 1d 14h 40min


GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('svd', TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=None, tol=0.0)), ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('xgb', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learni...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'svd__n_components': [200, 300, 400, 500, 600]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [45]:
reg_grid.grid_scores_



[mean: 0.26356, std: 0.02805, params: {'svd__n_components': 200},
 mean: 0.28212, std: 0.03101, params: {'svd__n_components': 300},
 mean: 0.29235, std: 0.03490, params: {'svd__n_components': 400},
 mean: 0.28750, std: 0.03033, params: {'svd__n_components': 500},
 mean: 0.28917, std: 0.03034, params: {'svd__n_components': 600}]

In [38]:
print('Gridsearch Best score : ', reg_grid.best_score_)
print('Gridsearch Best parameter : ', reg_grid.best_params_)

Gridsearch Best score :  0.292349349144
Gridsearch Best parameter :  {'svd__n_components': 400}


In [28]:
joblib.dump(reg_grid, 'reg_gs.pkl')

['reg_gs.pkl']

#### SVD - n_componets=400 (Best parameter)

In [6]:
svd = TruncatedSVD(n_components=400)
scl = StandardScaler()
xgb_model = xgb.XGBRegressor()

xgb_reg = pipeline.Pipeline([('svd', svd), ('scl', scl), ('xgb', xgb_model)])

In [16]:
%%time
model_reg = xgb_reg.fit(train_X, y)

Wall time: 23min 31s


In [17]:
# pipeline 모델을 test data 에 적용
reg_pred = model_reg.predict(test_X)
reg_pred

array([ 3.49021912,  3.21249986,  3.39243627, ...,  2.53569579,
        3.4927175 ,  3.48737431], dtype=float32)

In [10]:
# float 형태의 예측값을 int 형태로 전환
def pred_round(pred):
    for num in range(len(pred)):
        pred[num] = round(pred[num])
    
    return pred

pred_round(reg1_pred)
reg1_pred = reg1_pred.astype(int)

In [19]:
# 제출이 가능하도록 id 와 prediction 을 하나로 쌍으로 합친 후 csv 파일 저장
reg1_answer = pd.concat([pd.DataFrame(df_test['id'], columns=['id']), pd.DataFrame(reg1_pred, columns=['prediction'])], axis=1)
reg1_answer.to_csv('./reg1_400_answer_title.csv', index=False)

* score : 0.34730

# Modeling (2) Classification

* DecisionTreeClassifier : max_depth=400, 0.39567
* RandomForestClassifier : max_depth=400, 0.22335
* ExtraTreesClassifier : max_depth=400, 0.29203

### Process
1. KSVM
2. OvO and OvR
3. Pipeline (SVD, Scaling, Best Model)

### (1) KSVM

#### only KSVM

In [278]:
%%time
poly_svc = SVC(kernel="poly", degree=2, gamma=1, coef0=0).fit(train_X, y)

Wall time: 52.3 s


In [53]:
cv = KFold(10)
cross_val_score(poly_svc, train_X, y, cv=cv)

array([ 0.68011811,  0.64074803,  0.66240157,  0.67224409,  0.6515748 ,
        0.65551181,  0.6742126 ,  0.67783251,  0.64433498,  0.6729064 ])

In [57]:
poly_pred = poly_svc.predict(test_X)

In [77]:
poly_answer = pd.concat([pd.DataFrame(df_test['id'], columns=['id']), pd.DataFrame(poly_pred, columns=['prediction'])], axis=1)
poly_answer.to_csv('./poly_pred_title.csv', index=False)

* score : 0.48602

#### SVD / Scaling / KSVM

In [281]:
svd = TruncatedSVD(n_components=400)
scl = StandardScaler()
svc = SVC(kernel="poly", degree=2, gamma=1, coef0=0)

In [282]:
pipe_svc = pipeline.Pipeline([('svd', svd), ('scl', scl), ('svc', svc)])

In [283]:
%%time
model_svc = pipe_svc.fit(train_X, y)

Wall time: 26min 58s


In [285]:
model_svc_pred = model_svc.predict(test_X)

In [287]:
model_svc_answer = pd.concat([pd.DataFrame(df_test['id'], columns=['id']), pd.DataFrame(model_svc_pred, columns=['prediction'])], axis=1)
model_svc_answer.to_csv('./model_svc_pred_title.csv', index=False)

* score : 0.40666

### (2) XGBClassifier

In [5]:
from xgboost import XGBClassifier

In [5]:
xgb_clf = XGBClassifier()
xgb_param_grid = {'max_depth': [200, 300, 400, 500, 600]}
xgb_grid = GridSearchCV(estimator=xgb_clf, param_grid=xgb_param_grid, cv=5)

In [260]:
%%time
xgb_grid.fit(train_X, y)

Wall time: 13h 16min 11s


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [200, 300, 400, 500, 600]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [262]:
xgb_grid.grid_scores_



[mean: 0.64310, std: 0.01109, params: {'max_depth': 200},
 mean: 0.64113, std: 0.01301, params: {'max_depth': 300},
 mean: 0.64192, std: 0.01023, params: {'max_depth': 400},
 mean: 0.64310, std: 0.01170, params: {'max_depth': 500},
 mean: 0.64212, std: 0.01161, params: {'max_depth': 600}]

In [270]:
print('XGBClassifier Best score : ', xgb_grid.best_score_)
print('XGBClassifier Best parameter : ', xgb_grid.best_params_)

XGBClassifier Best score :  0.643103278527
XGBClassifier Best parameter :  {'max_depth': 200}


In [263]:
joblib.dump(xgb_grid, 'xgb_grid.pkl')

['xgb_grid.pkl']

#### only XGB - max_depth=200 (Best parameter)

In [272]:
%%time
xgb = XGBClassifier(max_depth=200).fit(train_X, y)

Wall time: 30min 19s


In [273]:
%%time
xgb_pred = xgb.predict(test_X)

Wall time: 22.6 s


In [275]:
xgb_answer = pd.concat([pd.DataFrame(df_test['id'], columns=['id']), pd.DataFrame(xgb_pred, columns=['prediction'])], axis=1)
xgb_answer.to_csv('./xgb_pred_title.csv', index=False)

* score : 0.42686

#### SVD / Scaling / XGB

In [6]:
svd = TruncatedSVD(n_components=400)
scl = StandardScaler()
xgb = XGBClassifier(max_depth=200)

In [7]:
pipe_xgb = pipeline.Pipeline([('svd', svd), ('scl', scl), ('xgb', xgb)])

In [8]:
%%time
model_xgb = pipe_xgb.fit(train_X, y)

Wall time: 35min 3s


In [9]:
model_xgb_pred = model_xgb.predict(test_X)

In [13]:
model_xgb_answer = pd.concat([pd.DataFrame(df_test['id'], columns=['id']), pd.DataFrame(model_xgb_pred, columns=['prediction'])], axis=1)
model_xgb_answer.to_csv('./model_xgb_pred_title.csv', index=False)

* score : 0.47811

### Scroe ranking
* svm (kornel="poly") : **0.48602**
* xgbclassifier (max_depth=200) / TruncatedSVD / StandardScaler : **0.47811**
* xgbclassifier (max_depth=200) : **0.42686**
* svm(kornel="poly") / TruncatedSVD / StandardScaler : **0.40666**
* xgbregressor (n_components=200) : **0.34730**

## 위 model을 대상으로 OvO and OvR

* XGBClassifier / TruncatedSVD / StandardScaler

In [None]:
svd = TruncatedSVD(n_components=400)
scl = StandardScaler()
xgb = XGBClassifier(max_depth=200)

pipe_xgb = pipeline.Pipeline([('svd', svd), ('scl', scl), ('xgb', xgb)])

model_xgb = pipe_xgb.fit(train_X, y)

* XGBClassifier

In [19]:
%%time
xgb_clf = XGBClassifier(max_depth=200).fit(train_X, y)

KeyboardInterrupt: 

* KSVM

In [16]:
%%time
poly_ovr = OneVsRestClassifier(SVC(kernel="poly", degree=2, gamma=1, coef0=0)).fit(train_X, y)

Wall time: 4min 4s


In [25]:
poly_ovr_pred = poly_ovr.predict(test_X)

In [27]:
poly_ovr_answer = pd.concat([pd.DataFrame(df_test['id'], columns=['id']), pd.DataFrame(poly_ovr_pred, columns=['prediction'])], axis=1)
poly_ovr_answer.to_csv('./poly_ovr_answer_title.csv', index=False)

* score : 0.50764

In [18]:
svd = TruncatedSVD(n_components=400)
scl = StandardScaler()
xgb_clf_ovr = OneVsRestClassifier(SVC(kernel="poly", degree=2, gamma=1, coef0=0))

pipe_xgb_ovr = pipeline.Pipeline([('svd', svd), ('scl', scl), ('xgb', xgb_clf_ovr)])

In [None]:
%%time
model_xgb_ovr = pipe_xgb_ovr.fit(train_X, y)

In [17]:
poly_ovr

OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0,
  decision_function_shape='ovr', degree=2, gamma=1, kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1)

* XGBClassifier / TruncatedSVD / StandardScaler

In [6]:
svd = TruncatedSVD(n_components=400)
scl = StandardScaler()
xgb = XGBClassifier(max_depth=200)

pipe_xgb = pipeline.Pipeline([('svd', svd), ('scl', scl), ('xgb', xgb)])

In [7]:
%%time
xgb_pipe_ovr = OneVsRestClassifier(pipe_xgb).fit(train_X, y)

MemoryError: 

In [9]:
xgb_clf = XGBClassifier(max_depth=200)

In [10]:
%%time
xgb_clf_ovr = OneVsRestClassifier(xgb_clf).fit(train_X, y)

Wall time: 35min 10s


In [13]:
xgb_ovr_pred = xgb_clf_ovr.predict(test_X)

In [14]:
xgb_ovr_answer = pd.concat([pd.DataFrame(df_test['id'], columns=['id']), pd.DataFrame(xgb_ovr_pred, columns=['prediction'])], axis=1)
xgb_ovr_answer.to_csv('./xgb_ovr_answer_title.csv', index=False)

* score : 0.44522