In [5]:
import xgboost as xgb
import pandas as pd
import numpy as np
import sklearn
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.datasets import make_classification
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

In [36]:
print('preparing data...')
# Training labels: the first CSV row is the header, the first column becomes the index.
df_y_train = pd.read_csv('data/y_train.csv', index_col=0, header=0)
# Plain NumPy vector taken from the 'y' column.
y = df_y_train.loc[:, 'y'].values

preparing data...


In [22]:
# "bio" feature tables for train and test: the files are read without a header row
# and the first column is used as the row index.
x_train_bio = pd.read_csv('data/bioFeatures.csv', index_col=0, header=None)
x_test_bio = pd.read_csv('data/bioFeatures_test.csv', index_col=0, header=None)
# Report (rows, columns) for train, then test.
print(x_train_bio.shape)
print(x_test_bio.shape)

(5117, 208)
(3411, 208)


In [23]:
# "hrv" feature tables for train and test: the files are read without a header row
# and the first column is used as the row index.
x_train_hrv = pd.read_csv('data/hrvFeatures.csv', index_col=0, header=None)
x_test_hrv = pd.read_csv('data/hrvFeatures_test.csv', index_col=0, header=None)
# Report (rows, columns) for train, then test.
print(x_train_hrv.shape)
print(x_test_hrv.shape)

(5117, 100)
(3411, 100)


In [20]:
# "rpeak" feature tables for train and test. The first line of each file is skipped
# (presumably a header row — verify against the CSVs), no header is parsed, and the
# first column is used as the row index.
x_train_rpeak = pd.read_csv('data/rpeakfeature.csv', index_col=0, skiprows=1, header=None)
x_test_rpeak = pd.read_csv('data/rpeakfeature_test.csv', index_col=0, skiprows=1, header=None)
# Report (rows, columns) for train, then test.
print(x_train_rpeak.shape)
print(x_test_rpeak.shape)

(5117, 8)
(3411, 8)


In [21]:
# "other" feature tables for train and test. The first line of each file is skipped
# (presumably a header row — verify against the CSVs), no header is parsed, and the
# first column is used as the row index.
x_train_other = pd.read_csv('data/otherfeature_complete.csv', index_col=0, skiprows=1, header=None)
x_test_other = pd.read_csv('data/otherfeature_complete_test.csv', index_col=0, skiprows=1, header=None)
# Report (rows, columns) for train, then test.
print(x_train_other.shape)
print(x_test_other.shape)

(5117, 20)
(3411, 20)


In [38]:
# Join the four feature families column-wise. pd.concat(axis=1) aligns rows on the
# index, so every frame must carry the same row index.
train_parts = [x_train_bio, x_train_hrv, x_train_rpeak, x_train_other]
test_parts = [x_test_bio, x_test_hrv, x_test_rpeak, x_test_other]

merged_train = pd.concat(train_parts, axis=1)
print(merged_train.shape)
merged_test = pd.concat(test_parts, axis=1)
print(merged_test.shape)

# concat performs an outer join: if the per-file indices differ, the result silently
# gains NaN-padded rows and no longer lines up row-for-row with the label vector.
# Fail loudly here instead of training on a misaligned matrix.
assert all(len(part) == len(merged_train) for part in train_parts), \
    "training feature files do not share the same row index"
assert all(len(part) == len(merged_test) for part in test_parts), \
    "test feature files do not share the same row index"

# Plain NumPy matrices consumed by the model cells.
X = merged_train.values
X_test = merged_test.values

(5117, 336)
(3411, 336)


# XGBoost

In [56]:
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for evaluation; the fixed random_state makes the
# split reproducible.
train_X, eval_X, train_Y, eval_Y = train_test_split(X, y, test_size=0.3, random_state=42)
xg_train = xgb.DMatrix(train_X, label=train_Y)
xg_eval = xgb.DMatrix(eval_X, label=eval_Y)

# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
param['num_class'] = 4
# learning rate and tree-complexity / regularisation settings
param['eta'] = 0.2
param['gamma'] = 1.0
param['max_depth'] = 6
param['min_child_weight'] = 20
# row / column subsampling per tree
param['subsample'] = 0.8
param['colsample_bytree'] = 0.9
# NOTE(review): 'silent' is replaced by 'verbosity' in newer xgboost releases —
# confirm against the installed version before changing it.
param['silent'] = 1


def micro_f1_eval(preds, dmatrix):
    """Custom xgboost metric: micro-averaged F1 of `preds` against the DMatrix labels.

    Returns the (name, value) pair that xgb.train prints next to the built-in metric.
    """
    # f1_score expects (y_true, y_pred), so the labels go first.
    return "f1", f1_score(dmatrix.get_label(), preds, average='micro')


# Both splits are scored after every boosting round.
watchlist = [(xg_train, 'train'), (xg_eval, 'eval')]
num_round = 60
bst = xgb.train(param,
                xg_train,
                num_round,
                watchlist,
                feval=micro_f1_eval)

# get prediction on the held-out evaluation split
pred = bst.predict(xg_eval)
F1 = f1_score(eval_Y, pred, average='micro')
# The reported value is an F1 score on the evaluation split, not a test error.
print('Eval micro-F1 using softmax = {}'.format(F1))

[0]	train-merror:0.203016	eval-merror:0.220052	train-f1:0.796984	eval-f1:0.779948
[1]	train-merror:0.188495	eval-merror:0.217448	train-f1:0.811505	eval-f1:0.782552
[2]	train-merror:0.177325	eval-merror:0.21224	train-f1:0.822675	eval-f1:0.78776
[3]	train-merror:0.170623	eval-merror:0.20638	train-f1:0.829377	eval-f1:0.79362
[4]	train-merror:0.163362	eval-merror:0.201172	train-f1:0.836638	eval-f1:0.798828
[5]	train-merror:0.162245	eval-merror:0.201823	train-f1:0.837755	eval-f1:0.798177
[6]	train-merror:0.152471	eval-merror:0.203776	train-f1:0.847529	eval-f1:0.796224
[7]	train-merror:0.143815	eval-merror:0.195964	train-f1:0.856185	eval-f1:0.804036
[8]	train-merror:0.140743	eval-merror:0.19987	train-f1:0.859257	eval-f1:0.80013
[9]	train-merror:0.136833	eval-merror:0.197917	train-f1:0.863167	eval-f1:0.802083
[10]	train-merror:0.13432	eval-merror:0.196615	train-f1:0.86568	eval-f1:0.803385
[11]	train-merror:0.131807	eval-merror:0.195312	train-f1:0.868193	eval-f1:0.804688
[12]	train-merror:0.12

In [58]:
# Predict the test set with the trained booster and write the submission file.
xg_test = xgb.DMatrix(X_test)
# NOTE(review): 54 is hard-coded — presumably the best round read off the training
# log; confirm, or derive it from the evaluation results instead.
y_pred = bst.predict(xg_test, ntree_limit=54)

# The context manager closes the file even if a write fails part-way through.
with open("submission.csv", "w") as f:
    f.write("id,y\n")
    # Row ids are the 0-based positions of the predictions.
    for i, label in enumerate(y_pred):
        f.write("{},{}\n".format(i, label))

# SVM

In [69]:
# Linear-kernel SVM with balanced class weights, scored by 5-fold cross-validated
# micro-averaged F1.
# `gamma` is intentionally not passed: it only parameterises the rbf/poly/sigmoid
# kernels and has no effect on a linear kernel.
# NOTE(review): X is used as loaded, with no scaling step visible in this notebook;
# SVMs are scale-sensitive, so consider standardising the features — confirm.
clf = svm.SVC(kernel='linear', class_weight='balanced', C=1.0, random_state=0)
cross_val_score(clf, X, y, scoring='f1_micro', cv=5)

array([ 0.26660156,  0.25097656,  0.28027344,  0.25806452,  0.26223092])

In [68]:
from sklearn.svm import LinearSVC

# LinearSVC with a tight stopping tolerance and a fixed seed, scored by 5-fold
# cross-validation on micro-averaged F1. The score array is the cell's displayed value.
clf = LinearSVC(tol=1e-5, random_state=0)
cross_val_score(estimator=clf, X=X, y=y, cv=5, scoring='f1_micro')

array([ 0.62109375,  0.62792969,  0.62792969,  0.61779081,  0.62133072])

In [65]:
# Fit the estimator currently bound to `clf` on the full labelled training set (X, y).
# NOTE(review): `clf` is whichever estimator was assigned last in the kernel. The
# recorded output of this cell shows an SVC, while the preceding cell in file order
# builds a LinearSVC — confirm which model is intended before running top to bottom.
clf.fit(X, y) 

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=0,
  shrinking=True, tol=0.001, verbose=False)

In [66]:
# Predict labels for the test feature matrix with the estimator currently bound to `clf`.
# This rebinds `y_pred`, which is also assigned by the XGBoost prediction cell.
y_pred =clf.predict(X_test)

In [67]:
# Write the current `y_pred` to the submission file, one "id,label" row per prediction.
# The context manager closes the file even if a write fails part-way through.
with open("submission.csv", "w") as f:
    f.write("id,y\n")
    # Row ids are the 0-based positions of the predictions.
    for i, label in enumerate(y_pred):
        f.write("{},{}\n".format(i, label))