# Modeling
  1. Regression
  2. Classification

In [1]:
import numpy as np
import pandas as pd
import sklearn as sk

import matplotlib as mpl
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, roc_auc_score, accuracy_score

import joblib
from joblib import dump, load

pd.options.display.max_columns = 400
pd.options.display.max_rows = 200
pd.options.display.max_colwidth = 600
pd.options.display.precision = 10

In [2]:
df_train = pd.read_excel("./__data/excel/train.xlsx").fillna("")
df_test = pd.read_excel("./__data/excel/test.xlsx").fillna("")

# Modeling (1) Regression

In [3]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn import pipeline
# from sklearn.model_selection import GridSearchCV
# from sklearn import grid_search
# from sklearn import metrics



In [4]:
train_X = joblib.load('train_X.pkl')
test_X = joblib.load('test_X.pkl')
y = joblib.load('y.pkl')

In [5]:
def pred_round(pred):
    for num in range(len(pred)):
        pred[num] = round(pred[num])
    
    return pred

### (1) SVD - n_componets=200

In [15]:
svd = TruncatedSVD(n_components=200)
scl = StandardScaler()
xgb_model = xgb.XGBRegressor()  

In [16]:
reg1 = pipeline.Pipeline([('svd', svd), ('scl', scl), ('xgb', xgb_model)])
reg2 = pipeline.Pipeline([('svd', svd), ('xgb', xgb_model)])

In [None]:
%%time
model_reg1 = reg1.fit(train_X, y)

In [None]:
reg1_pred = model_reg1.predict(test_X)
reg1_pred

In [None]:
pred_round(reg1_pred)
reg1_pred = reg1_pred.astype(int)

In [None]:
reg1_answer = pd.concat([pd.DataFrame(df_test['id'], columns=['id']), pd.DataFrame(reg1_pred, columns=['prediction'])], axis=1)

In [None]:
reg1_answer.to_csv('./reg1_200_answer_title.csv', index=False)

* TruncatedSVD(n_components=200) / StandardScaler() / xgb.XGBRegressor()
 - score : 0.33904

In [17]:
%%time
model_reg2 = reg2.fit(train_X, y)

Wall time: 5min 34s


In [18]:
reg2_pred = model_reg2.predict(test_X)
reg2_pred

array([ 3.55006528,  2.84637547,  3.21758842, ...,  2.1747117 ,
        3.41123295,  3.31762338], dtype=float32)

In [19]:
pred_round(reg2_pred)
reg2_pred = reg2_pred.astype(int)

In [20]:
reg2_answer = pd.concat([pd.DataFrame(df_test['id'], columns=['id']), pd.DataFrame(reg2_pred, columns=['prediction'])], axis=1)

In [22]:
reg2_answer.to_csv('./reg2_200_answer_title.csv', index=False)

* TruncatedSVD(n_components=200) / xgb.XGBRegressor()
 - score : 0.32523

### (2) SVD - n_componets=300

In [6]:
svd = TruncatedSVD(n_components=300)
scl = StandardScaler()
xgb_model = xgb.XGBRegressor()  

In [8]:
reg1 = pipeline.Pipeline([('svd', svd), ('scl', scl), ('xgb', xgb_model)])
reg2 = pipeline.Pipeline([('svd', svd), ('xgb', xgb_model)])

In [None]:
%%time
model_reg1 = reg1.fit(train_X, y)

In [None]:
reg1_pred = model_reg1.predict(test_X)
reg1_pred

In [None]:
pred_round(reg1_pred)
reg1_pred = reg1_pred.astype(int)

In [None]:
reg1_answer = pd.concat([pd.DataFrame(df_test['id'], columns=['id']), pd.DataFrame(reg1_pred, columns=['prediction'])], axis=1)

In [None]:
reg1_answer.to_csv('./reg1_300_answer_title.csv', index=False)

* TruncatedSVD(n_components=300) / StandardScaler() / xgb.XGBRegressor()
 - score : 0.34726

In [9]:
%%time
model_reg2 = reg2.fit(train_X, y)

Wall time: 13min 7s


In [10]:
reg2_pred = model_reg2.predict(test_X)
reg2_pred

array([ 3.55863261,  3.1498816 ,  3.31796885, ...,  2.306499  ,
        3.53847837,  3.24479866], dtype=float32)

In [11]:
pred_round(reg2_pred)
reg2_pred = reg2_pred.astype(int)

In [12]:
reg2_answer = pd.concat([pd.DataFrame(df_test['id'], columns=['id']), pd.DataFrame(reg2_pred, columns=['prediction'])], axis=1)

In [13]:
reg2_answer.to_csv('./reg2_300_answer_title.csv', index=False)

* TruncatedSVD(n_components=300) / xgb.XGBRegressor()
 - score : 0.35129

### (3) SVD - n_componets=400

In [6]:
svd = TruncatedSVD(n_components=300)
scl = StandardScaler()
xgb_model = xgb.XGBRegressor()  

In [8]:
reg1 = pipeline.Pipeline([('svd', svd), ('scl', scl), ('xgb', xgb_model)])
reg2 = pipeline.Pipeline([('svd', svd), ('xgb', xgb_model)])

In [None]:
%%time
model_reg1 = reg1.fit(train_X, y)

In [None]:
reg1_pred = model_reg1.predict(test_X)
reg1_pred

In [None]:
pred_round(reg1_pred)
reg1_pred = reg1_pred.astype(int)

In [None]:
reg1_answer = pd.concat([pd.DataFrame(df_test['id'], columns=['id']), pd.DataFrame(reg1_pred, columns=['prediction'])], axis=1)

In [None]:
reg1_answer.to_csv('./reg1_300_answer_title.csv', index=False)

* TruncatedSVD(n_components=300) / StandardScaler() / xgb.XGBRegressor()
 - score : 0.34726

In [9]:
%%time
model_reg2 = reg2.fit(train_X, y)

Wall time: 13min 7s


In [10]:
reg2_pred = model_reg2.predict(test_X)
reg2_pred

array([ 3.55863261,  3.1498816 ,  3.31796885, ...,  2.306499  ,
        3.53847837,  3.24479866], dtype=float32)

In [11]:
pred_round(reg2_pred)
reg2_pred = reg2_pred.astype(int)

In [12]:
reg2_answer = pd.concat([pd.DataFrame(df_test['id'], columns=['id']), pd.DataFrame(reg2_pred, columns=['prediction'])], axis=1)

In [13]:
reg2_answer.to_csv('./reg2_300_answer_title.csv', index=False)

* TruncatedSVD(n_components=300) / xgb.XGBRegressor()
 - score : 0.35129