### Data Science London + Scikit-learn
[Kaggle Site](https://www.kaggle.com/c/data-science-london-scikit-learn/overview)

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

# Ignore warning messages: with no category given, this filter silences
# every warning category for all code that runs after this cell.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the three headerless CSV files (training features, training labels,
# competition test features) in one pass; columns get integer names.
train, trainLabel, test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ('./data/train.csv', './data/trainLabels.csv', './data/test.csv')
)

# Summary statistics of the training features (displayed as the cell output).
train.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.025596,-0.024526,-0.024088,-0.002271,1.092329,-0.00625,0.497342,-0.037883,0.026391,-0.003597,...,0.030651,0.022951,-0.542491,-0.011608,-0.483507,0.033371,0.567185,0.006849,-0.892659,0.609451
std,1.008282,1.016298,0.979109,0.970575,4.538834,0.989128,2.118819,2.232256,1.001064,1.01352,...,1.011645,1.001375,2.239939,1.022456,2.121281,1.007044,2.227876,0.997635,2.022022,2.045439
min,-3.365711,-3.492086,-2.695602,-3.460471,-16.421901,-3.04125,-7.224761,-6.509084,-3.145588,-2.749812,...,-3.379194,-2.971125,-7.84089,-2.999564,-7.124105,-2.952358,-5.452254,-3.473913,-8.051722,-7.799086
25%,-0.66901,-0.693937,-0.69883,-0.617557,-1.801997,-0.732265,-0.838619,-1.604037,-0.677562,-0.68222,...,-0.659457,-0.696032,-2.121943,-0.66455,-1.879247,-0.642861,-1.059786,-0.691162,-2.220126,-0.565041
50%,0.027895,-0.033194,0.008145,0.002327,0.862818,0.027041,0.582321,0.018809,0.022092,-0.03611,...,0.049416,0.049778,-0.568262,-0.028097,-0.493575,0.037732,0.455474,0.038284,-0.85547,0.779944
75%,0.76252,0.682753,0.661434,0.640743,3.843172,0.671456,1.913664,1.438304,0.74131,0.665364,...,0.747031,0.699917,0.939348,0.651374,1.005795,0.6918,2.122157,0.693535,0.388698,1.992193
max,3.326246,3.58387,2.546507,3.088738,17.565345,3.102997,7.592666,7.130097,3.145258,3.919426,...,2.844792,3.688047,7.160379,3.353631,6.005818,3.420561,6.603499,3.492548,5.77412,6.803984


In [3]:
# Feature frame and labels flattened to a 1-D array.
X, y = train, np.ravel(trainLabel)

# Split BEFORE scaling, with a fixed seed so the hold-out set is the same on
# every run. Fitting the scaler on all rows first would let statistics of the
# hold-out rows leak into the preprocessing of the training rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=7
)

# Learn mean/std on the training portion only, then apply that same
# transformation to the hold-out portion.
ss = StandardScaler()
X_train = ss.fit_transform(X_train_raw)
X_test = ss.transform(X_test_raw)

In [4]:
# Baseline gradient-boosting model: only the seed is set, every other
# hyper-parameter keeps its library default. fit() returns the estimator.
gbc = GradientBoostingClassifier(random_state=7).fit(X_train, y_train)

# Predict the hold-out split, then show its mean accuracy as the cell output.
y_pred = gbc.predict(X_test)
gbc.score(X_test, y_test)

0.86

In [5]:
# Candidate values for each tuned hyper-parameter.
n_estimators = [100, 200, 300]
max_depth = [1, 3, 5]
param_grid = {'n_estimators': n_estimators, 'max_depth': max_depth}

# Build the search object from the model and the parameter-grid dictionary
# (n_jobs=-1 runs the fits in parallel on all CPUs); candidates are scored by F1.
grid_search = GridSearchCV(
    estimator=gbc,
    param_grid=param_grid,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)

# Run the search for the best parameter combination.
grid_result = grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    3.6s finished


In [6]:
# Rebuild the classifier with the best parameters found by the grid search.
# random_state=7 matches the estimator that was searched, so the final model
# is the configuration that was actually evaluated and is reproducible.
gbc_bestparam = GradientBoostingClassifier(random_state=7, **grid_result.best_params_)

# Train the model on the training split.
gbc_bestparam.fit(X_train, y_train)

# Predict the hold-out split and show its mean accuracy as the cell output.
y_pred = gbc_bestparam.predict(X_test)
gbc_bestparam.score(X_test, y_test)

0.884

In [7]:
# Scale the competition test set with the scaler `ss` that was already fitted
# on the training data. Fitting a new scaler on the test set would map the
# features with different mean/std than the model was trained on.
X_submit = ss.transform(test)
y_pred = gbc_bestparam.predict(X_submit)

# Submission layout: 1-based row Id followed by the predicted label.
submission = pd.DataFrame({
    'Id': np.arange(1, len(y_pred) + 1),
    'Solution': y_pred,
})
submission.to_csv('submission_with_scaling.csv', index=False)

In [8]:
# Second experiment: split the raw (unscaled) features.
X, y = train, np.ravel(trainLabel)

# A fixed seed keeps the train/hold-out partition identical across runs, so
# the scores reported below are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=7
)

In [9]:
# Baseline random forest. The seed fixes the bootstrap samples and feature
# sub-sampling so the reported score does not change between runs.
rfc = RandomForestClassifier(n_estimators=20, max_depth=4, random_state=7)

# Train the model on the training split.
rfc.fit(X_train, y_train)

# Predict the hold-out split and print its mean accuracy.
y_pred = rfc.predict(X_test)
print("Score: ", rfc.score(X_test, y_test))

Score:  0.868


In [10]:
# Candidate values for each tuned hyper-parameter.
n_estimators = [100, 200, 300, 400, 500]
max_depth = [4, 7, 9, 11, 15]
param_grid = {'n_estimators': n_estimators, 'max_depth': max_depth}

# Build the search object from the model and the parameter-grid dictionary
# (n_jobs=-1 runs the fits in parallel on all CPUs); candidates are scored
# by balanced accuracy.
grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    scoring='balanced_accuracy',
    n_jobs=-1,
    verbose=1,
)

# Run the search for the best parameter combination.
grid_result = grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    8.7s finished


In [11]:
# Rebuild the forest with the best parameters found by the grid search.
# A fixed seed makes the final model, and therefore the score and the
# submission derived from it, reproducible.
rfc_bestparam = RandomForestClassifier(random_state=7, **grid_result.best_params_)

# Train the model on the training split.
rfc_bestparam.fit(X_train, y_train)

# Predict the hold-out split and show its mean accuracy as the cell output.
y_pred = rfc_bestparam.predict(X_test)
rfc_bestparam.score(X_test, y_test)

0.884

In [12]:
# The random forest was trained on unscaled features, so it predicts on the
# raw competition test set.
y_pred = rfc_bestparam.predict(test)

# Submission layout: 1-based row Id followed by the predicted label.
submission = pd.DataFrame({
    'Id': np.arange(1, len(y_pred) + 1),
    'Solution': y_pred,
})

# Write to a separate file: reusing 'submission_with_scaling.csv' would
# overwrite the gradient-boosting submission and mislabel these unscaled
# predictions.
submission.to_csv('submission_without_scaling.csv', index=False)