In [4]:
# feature engineering: interpolation for time-varying dataset
# classification: (kernel) SVM_classification,Neural Network
# regression: (kernel) SVM_regression,Neural Network
# imbalanced data: use class_weight or downsampling

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import random

In [5]:
# data loading
# Every file is comma separated and its first line is skipped (skiprows=1).
# Only the selected column ranges are read: columns 2..36 of the feature files
# and columns 1..15 of the label file.
csv_options = dict(delimiter=",", skiprows=1)
train_data = np.loadtxt("train_features.csv", usecols=range(2, 37), **csv_options)
train_labels = np.loadtxt("train_labels.csv", usecols=range(1, 16), **csv_options)  # 18995*15 matrix
test_data = np.loadtxt("test_features.csv", usecols=range(2, 37), **csv_options)

In [6]:
# data imputation and reshape:
# inside every block of `steps` consecutive rows each column is linearly
# interpolated in both directions; a column without any observed value in the
# block is filled with zeros. Each block is then flattened into a single row.
def data_imputation(data, steps=12):
    """Impute missing values block by block and flatten each block into one sample.

    ``data`` is cut into consecutive blocks of ``steps`` rows. Within a block,
    NaNs of every column are filled by linear interpolation; leading and
    trailing NaNs take the nearest observed value (``limit_direction='both'``).
    Columns that are entirely NaN within the block become zeros.

    Parameters
    ----------
    data : array-like of shape (n_rows, n_features)
        Numeric matrix in which NaN marks a missing value.
    steps : int, default 12
        Number of consecutive rows that form one block. The default keeps the
        previously hard-coded block length.

    Returns
    -------
    numpy.ndarray of shape (n_rows // steps, steps * n_features)
        One row per block, flattened column-major (``order='F'``): first all
        ``steps`` values of column 0, then all values of column 1, and so on.
        Trailing rows that do not fill a complete block are ignored.

    Raises
    ------
    ValueError
        If ``steps`` is smaller than 1.
    """
    if steps < 1:
        raise ValueError("steps must be a positive integer")
    data = np.asarray(data, dtype=float)
    # The feature count is taken from the input instead of being fixed to 35.
    n_features = data.shape[1]
    n_blocks = data.shape[0] // steps
    results = np.zeros((n_blocks, steps * n_features))
    for i in range(n_blocks):
        block = pd.DataFrame(data[steps * i:steps * (i + 1), :])
        filled = block.interpolate(method='linear', limit_direction='both').fillna(0)
        # order='F' keeps the values of one column contiguous in the output row
        results[i] = filled.values.reshape(steps * n_features, order='F')
    return results

In [7]:
def data_downSampling(train, label, seed=None):
    """Balance a binary data set by randomly down-sampling the 0-labelled rows.

    Parameters
    ----------
    train : numpy.ndarray of shape (n_samples, n_features)
        Feature matrix.
    label : 1-D array-like of length n_samples
        Labels; only entries equal to 1 or 0 are considered, anything else
        (e.g. NaN) is ignored.
    seed : int or None, default None
        Seed for the draw of the 0-labelled rows. With ``None`` the global
        ``random`` module state is used, so callers that seed ``random``
        themselves (or do not seed at all) behave as before.

    Returns
    -------
    (train, label) unchanged
        When there are fewer 0-labelled rows than 1-labelled rows.
    (downsampled_train, downsampled_label) otherwise
        ``downsampled_train`` is a float array holding all 1-labelled rows
        (in original order) followed by an equally sized random sample of
        0-labelled rows; ``downsampled_label`` is the matching vector of ones
        followed by zeros.
    """
    label_arr = np.asarray(label)
    # separate 0s and 1s
    one_index = np.flatnonzero(label_arr == 1)
    zero_index = np.flatnonzero(label_arr == 0)

    # for robustness: nothing to down-sample when the 0s are already the minority
    if len(zero_index) < len(one_index):
        return train, label

    # downsample the 0s to match the size of 1s
    ones = len(one_index)
    rng = random if seed is None else random.Random(seed)
    sampled_zero = np.array(rng.sample(zero_index.tolist(), ones), dtype=np.intp)
    picked = np.concatenate([one_index, sampled_zero])
    # fancy indexing copies the selected rows; the result is returned as float
    downsampled_train = np.asarray(train)[picked, :].astype(float, copy=False)
    downsampled_label = np.concatenate([np.ones(ones), np.zeros(ones)])
    return downsampled_train, downsampled_label

In [8]:
# Impute and flatten both feature matrices with the same routine:
# train -> 18995*420 matrix, test -> 12664*420 matrix
train_samples, test_samples = (
    data_imputation(raw_features) for raw_features in (train_data, test_data)
)

In [9]:
# config: keyword arguments handed to svm.SVC (classification) and svm.SVR (regression)
SVC_params = dict(C=1.0, kernel='rbf', probability=True)
SVR_params = dict(C=1.0, kernel='rbf', epsilon=0.1)

In [10]:
# prediction matrix: one row per test sample, 15 columns filled by the loops below
results = np.zeros(shape=(len(test_samples), 15))

In [11]:
# Make the cell reproducible: data_downSampling samples through the `random`
# module, and SVC with probability=True relies on a random shuffle internally.
SEED = 42
random.seed(SEED)
for i in range(11): # about 15min in total
    print(i/11*100,"%done")
    # balance the classes of label column i before fitting
    x,y = data_downSampling(train_samples,train_labels[:,i])
    # random_state is only a default; a value set in SVC_params takes precedence
    model = make_pipeline(StandardScaler(),svm.SVC(**{'random_state':SEED,**SVC_params}))
    model.fit(x,y)
    # column 1 of predict_proba is the probability of the second class in model.classes_
    results[:,i] = model.predict_proba(test_samples)[:,1]

0.0 %done
9.090909090909092 %done
18.181818181818183 %done
27.27272727272727 %done
36.36363636363637 %done
45.45454545454545 %done
54.54545454545454 %done
63.63636363636363 %done
72.72727272727273 %done
81.81818181818183 %done
90.9090909090909 %done


In [12]:
# Fit one SVR per regression target; targets live in label columns 11..14.
for i, target_col in enumerate(range(11, 15)): # about 5min in total
    print(i/4*100,"% done")
    target = train_labels[:, target_col]
    # NOTE(review): only the training half of this 50/50 split is fitted;
    # X_test / y_test are not used anywhere in this cell.
    X_train, X_test, y_train, y_test = train_test_split(
        train_samples, target, test_size=0.5, random_state=42
    )
    regressor = svm.SVR(**SVR_params)
    model = make_pipeline(StandardScaler(), regressor)
    model.fit(X_train, y_train)
    results[:, target_col] = model.predict(test_samples)
#print(results[0])

0.0 % done
25.0 % done
50.0 % done
75.0 % done


In [13]:
# write data: values are formatted with three decimals and no index column is written
predictions = pd.DataFrame(data=results)
predictions.to_csv('prediction.csv', float_format='%.3f', index=False)
# show example: first row of the prediction matrix
results[0]

array([ 0.75387777,  0.81844636,  0.77124992,  0.74811651,  0.7844246 ,
        0.74525592,  0.29490504,  0.55802176,  0.79253951,  0.6152522 ,
        0.5739433 , 18.60417832, 81.86481371, 97.15250078, 84.6593394 ])