# Modeling
  1. Regression
  2. Classification

### Score
* quadratic weighted kappa
$$ k = 1-\dfrac{{\sum}_{i,i} w_{i,j} O_{i,j}}{{\sum}_{i,i} w_{i,j} E_{i,j}} $$

In [2]:
import numpy as np
import pandas as pd
import sklearn as sk

import matplotlib as mpl
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, roc_auc_score, accuracy_score
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn import pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import auc

import joblib
from joblib import dump, load

pd.options.display.max_columns = 400
pd.options.display.max_rows = 200
pd.options.display.max_colwidth = 600
pd.options.display.precision = 10

In [3]:
df_train = pd.read_excel("./__data/excel/train.xlsx").fillna("")
df_test = pd.read_excel("./__data/excel/test.xlsx").fillna("")

In [4]:
train_X = joblib.load('train_X_mixed.pkl')
test_X = joblib.load('test_X_mixed.pkl')
y = joblib.load('y.pkl')

# Modeling (1) Regression

## Process
1. Sparse Matrix 의 특이값 분해
2. Scaling
3. Rgression

* TruncatedSVD 로 특이값을 분해하며 parameter 중 n_componetns 를 [200, 300, 400, 500, 600] 으로 gridsearch 진행

In [5]:
svd = TruncatedSVD()
scl = StandardScaler()
xgb_model = xgb.XGBRegressor()  

In [6]:
param_grid = {'svd__n_components': [200, 300, 400, 500, 600]}
pipe_svc = pipeline.Pipeline([('svd', TruncatedSVD()), ('scl', StandardScaler()), ('xgb', xgb.XGBRegressor() )])

In [7]:
reg_grid = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, cv=10)

In [27]:
%%time
reg_grid.fit(train_X, y)

Wall time: 1d 14h 40min


GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('svd', TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=None, tol=0.0)), ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('xgb', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learni...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'svd__n_components': [200, 300, 400, 500, 600]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [45]:
reg_grid.grid_scores_



[mean: 0.26356, std: 0.02805, params: {'svd__n_components': 200},
 mean: 0.28212, std: 0.03101, params: {'svd__n_components': 300},
 mean: 0.29235, std: 0.03490, params: {'svd__n_components': 400},
 mean: 0.28750, std: 0.03033, params: {'svd__n_components': 500},
 mean: 0.28917, std: 0.03034, params: {'svd__n_components': 600}]

In [38]:
print('Gridsearch Best score : ', reg_grid.best_score_)
print('Gridsearch Best parameter : ', reg_grid.best_params_)

Gridsearch Best score :  0.292349349144
Gridsearch Best parameter :  {'svd__n_components': 400}


In [28]:
joblib.dump(reg_grid, 'reg_gs.pkl')

['reg_gs.pkl']

### SVD - n_componets=400 (Best parameter)

In [6]:
svd = TruncatedSVD(n_components=400)
scl = StandardScaler()
xgb_model = xgb.XGBRegressor()

xgb_reg = pipeline.Pipeline([('svd', svd), ('scl', scl), ('xgb', xgb_model)])

In [16]:
%%time
model_reg = xgb_reg.fit(train_X, y)

Wall time: 23min 31s


In [17]:
# pipeline 모델을 test data 에 적용
reg_pred = model_reg.predict(test_X)
reg_pred

array([ 3.49021912,  3.21249986,  3.39243627, ...,  2.53569579,
        3.4927175 ,  3.48737431], dtype=float32)

In [10]:
# float 형태의 예측값을 int 형태로 전환
def pred_round(pred):
    for num in range(len(pred)):
        pred[num] = round(pred[num])
    
    return pred

pred_round(reg1_pred)
reg1_pred = reg1_pred.astype(int)

In [19]:
# 제출이 가능하도록 id 와 prediction 을 하나로 쌍으로 합친 후 csv 파일 저장
reg1_answer = pd.concat([pd.DataFrame(df_test['id'], columns=['id']), pd.DataFrame(reg1_pred, columns=['prediction'])], axis=1)
reg1_answer.to_csv('./reg1_400_answer_title.csv', index=False)

* Kaggle 제출 값
 - score : 0.34730

# Modeling (2) Classification

### Process
1. KSVM
2. OvO and OvR
3. Pipeline (SVD, Scaling, Best Model)

### (1) KSVM

In [None]:
polysvc = SVC(kernel="poly", degree=2, gamma=1, coef0=0).fit(train_X, y)
plot_xor(train_X, y, polysvc, "Polynomila SVC")

In [None]:
def plot_xor(X, y, model, title, xmin=-3, xmax=3, ymin=-3, ymax=3):
    XX, YY = np.meshgrid(np.arange(xmin, xmax, (xmax-xmin)/1000), np.arange(ymin, ymax, (ymax-ymin)/1000))
    ZZ = np.reshape(model.predict(np.array([XX.ravel(), YY.ravel()]).T), XX.shape)
    plt.contourf(XX, YY, ZZ, cmap=mpl.cm.Paired_r, alpha=0.5)
    plt.scatter(X[y== 1, 0], X[y== 1, 1], c='b', marker='o', label='+1', s=100)
    plt.scatter(X[y==-1, 0], X[y==-1, 1], c='r', marker='s', label='-1', s=100)
    plt.xlim(xmin, xmax)
    plt.ylim(ymin, ymax)
    plt.title(title)
    plt.show()

### OvO and OvR

In [7]:
cv = KFold(10)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(train_X, y, test_size=0.20, random_state=42)

In [9]:
%%time
poly_svm = SVC(kernel="poly", degree=2, gamma=1, coef0=0).fit(train_X, y)

Wall time: 52 s


In [10]:
%%time
model2 = OneVsRestClassifier(poly_svm).fit(train_X, y)

Wall time: 4min


In [14]:
%%time
poly_pred = model2.predict(test_X)

Wall time: 2min 32s


In [15]:
poly_pred

array([4, 4, 3, ..., 2, 4, 4], dtype=int64)

In [16]:
poly_answer = pd.concat([pd.DataFrame(df_test['id'], columns=['id']), pd.DataFrame(poly_pred, columns=['prediction'])], axis=1)

In [18]:
poly_answer.to_csv('./poly_answer.csv', index=False)

In [22]:
svd = TruncatedSVD(n_components=500)
scl = StandardScaler()
svm = SVC(kernel="poly", degree=2, gamma=1, coef0=0)

In [23]:
clf1 = pipeline.Pipeline([('svd', svd), ('scl', scl), ('svm', svm)])

In [24]:
%%time
model_clf1 = clf1.fit(train_X, y)

Wall time: 46min 22s


In [None]:
%%time
cross_val_score(model2, train_X, y, cv=cv)

In [25]:
%%time
pipe_poly_pred = model_clf1.predict(test_X)

Wall time: 3min 29s


In [26]:
pipe_poly_pred

array([3, 4, 3, ..., 2, 4, 3], dtype=int64)

In [27]:
pipe_poly_answer = pd.concat([pd.DataFrame(df_test['id'], columns=['id']), pd.DataFrame(pipe_poly_pred, columns=['prediction'])], axis=1)

In [28]:
pipe_poly_answer.to_csv('./pipe_poly_answer.csv', index=False)