In [1]:
import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

<h3>Preprocessing</h3>

In [2]:
hangover_df = pd.read_csv('./train-embedding/hangover.csv', header=None)
hangover_df['target'] = 1
normal_df   = pd.read_csv('./train-embedding/normal.csv', header=None)
normal_df['target'] = 0
df = pd.concat([hangover_df, normal_df])
df = shuffle(df)

msk = np.random.rand(len(df)) <= 0.9
train_df = df[msk]
test_df = df[~msk]

In [3]:
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,target
62,0.082332,0.11558,0.011484,0.141847,-0.04187,0.125534,0.029235,-0.004505,0.029337,0.069322,...,0.098125,0.032009,-0.010072,-0.055096,0.208765,0.129552,-0.050709,0.0441,0.04273,0
76,-0.012147,0.091161,-0.070147,-0.030081,0.050049,0.167613,0.013124,0.03597,-0.13111,0.157702,...,0.18872,0.102914,-0.07303,0.028014,0.09111,-0.141332,0.05006,-0.022183,0.084601,0
91,0.114124,-0.077783,-0.086178,-0.052333,0.109709,0.066981,0.026119,-0.009682,-0.044196,-0.26069,...,0.185594,-0.066444,0.069844,0.005636,0.018122,-0.006824,-0.055215,-0.019155,0.020464,1
83,0.060992,0.01036,-0.180835,0.080082,0.076127,-0.029076,-0.010619,-0.107285,0.006827,-0.0911,...,0.085843,-0.102468,0.136757,-0.003343,0.074511,-0.009724,0.002287,-0.03584,-0.041547,0
14,-0.019936,-0.064112,0.101069,0.006636,0.09464,0.103703,0.040171,0.000945,-0.102596,0.006707,...,0.034364,0.037204,-0.09531,-0.083566,-0.049263,-0.031641,0.131151,0.147703,0.190647,0
13,0.018436,0.057802,0.061157,-0.124911,0.118827,0.01047,-0.019737,-0.041294,-0.106877,0.003194,...,0.116197,0.065195,-0.018978,-0.017539,0.022261,-0.08153,0.037408,0.03656,0.016555,1
25,0.070075,0.079045,-0.155921,0.04785,0.05475,0.126134,-0.01709,-0.100432,-0.099305,0.099807,...,0.139557,-0.037178,-0.072808,0.002498,0.048477,-0.055768,0.039198,-0.04718,0.031304,0
30,0.103962,-0.132276,-0.160088,-0.003601,0.153191,0.120972,-0.027515,-0.013164,-0.017648,-0.186249,...,0.186487,0.010499,0.060644,0.115755,0.070034,-0.018712,-0.059361,0.051425,0.056528,0
20,-0.082405,0.060443,0.04556,-0.128009,0.071083,0.109485,-0.081349,0.030569,-0.006374,0.035442,...,0.102738,0.177443,-0.091745,0.004533,0.043872,0.029775,-0.067521,0.102697,0.136325,0
93,0.016159,0.061643,-0.127319,0.005559,0.064426,0.020554,-0.00878,-0.059373,-0.091717,-0.025443,...,0.15275,-0.100938,-0.05753,-0.077848,0.082762,-0.143422,-0.010725,-0.102129,0.049473,0


In [4]:
train = {'x': train_df.drop('target', axis=1), 'y': train_df['target']}
test  = {'x': test_df.drop('target', axis=1), 'y': test_df['target']}

<h3>Linear Model</h3>

In [5]:
model = linear_model.Ridge(alpha=0.1, random_state=1)
model.fit(train['x'], train['y'])

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=1, solver='auto', tol=0.001)

In [6]:
train['pred'] = model.predict(train['x'])
test['pred'] = model.predict(test['x'])

print('Train error', mean_squared_error(train['y'], train['pred']))
print('Test error', mean_squared_error(test['y'], test['pred']))

Train error 0.0921805556365
Test error 0.104213236455


In [7]:
tmp_df = pd.DataFrame()
tmp_df['y'] = test['y']
tmp_df['pred'] = test['pred']

In [8]:
danic_df = pd.read_csv('./train-embedding/reps.csv', header=None)
danic_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,-0.017694,0.162265,-0.118203,-0.05407,0.009281,0.230801,-0.119686,-0.119757,0.059174,0.067316,...,0.189606,0.10406,0.184823,-0.168545,0.055621,-0.033896,0.054627,0.00038,0.092291,0.066612
1,-0.026778,0.114405,0.070789,-0.133816,-0.022331,0.111301,-0.098673,0.025478,-0.018543,0.020422,...,0.086911,0.17539,0.110514,-0.030552,-0.094022,-0.078188,0.010718,-0.004407,0.025306,0.104452
2,0.019151,0.046371,0.003117,-0.038588,0.049059,0.106353,-0.060592,-0.009925,-0.158011,0.126521,...,0.040483,0.191458,0.076505,-0.027609,0.044342,-0.01973,-0.054163,0.060563,0.090053,0.05911


In [9]:
model.predict(danic_df)

array([ 0.59832453,  0.7866295 ,  0.55894058])

In [10]:
vova_df = pd.read_csv('./train-embedding/vova.csv', header=None)
vova_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,-0.026778,0.114405,0.070789,-0.133816,-0.022331,0.111301,-0.098673,0.025478,-0.018543,0.020422,...,0.086911,0.17539,0.110514,-0.030552,-0.094022,-0.078188,0.010718,-0.004407,0.025306,0.104452


In [11]:
model.predict(vova_df)

array([ 0.78662948])