In [1]:
import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

<h3>Preprocessing</h3>

In [3]:
hangover_df = pd.read_csv('./train-embedding/hangover.csv', header=None)
hangover_df['target'] = 1
normal_df   = pd.read_csv('./train-embedding/normal.csv', header=None)
normal_df['target'] = 0
df = pd.concat([hangover_df, normal_df])
df = shuffle(df)

msk = np.random.rand(len(df)) <= 0.8
train_df = df[msk]
test_df = df[~msk]

In [5]:
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,target
12,0.08088,-0.176592,0.038707,-0.00946,0.118051,0.070261,0.10163,0.153487,0.009066,-0.1037,...,0.108239,0.02252,0.1131,-0.026724,0.01982,0.100162,-0.01868,-0.027456,-0.15592,1
37,0.113941,-0.083191,-0.010968,0.001656,0.087452,0.063608,-0.028396,0.069848,-0.110994,-0.071103,...,0.138125,-0.04852,0.128077,0.074322,0.061818,-0.044193,0.026496,0.017908,-0.153994,1
79,0.124156,-0.054175,-0.092372,-0.062406,0.157984,0.153145,-0.009058,0.109545,0.045385,-0.182554,...,0.20369,0.036918,0.068059,-0.018924,0.114833,-0.011401,-0.077959,-0.024953,-0.047879,0
32,0.10974,-0.060122,-0.131905,0.015752,0.093418,0.099462,0.019249,0.082634,0.010525,-0.078638,...,0.125224,0.036419,0.155314,0.168345,0.043257,0.032027,-0.078397,0.086005,-0.085439,1
14,-0.034428,0.071713,0.003654,-0.052993,-0.099435,0.113485,-0.08749,0.032322,0.076029,-0.012988,...,0.135183,0.060403,-0.1507,-0.083001,-0.009732,0.091188,0.032803,0.079489,0.102148,1
8,0.148423,0.043876,-0.094534,0.070775,0.019919,0.177518,0.0403,-0.083676,-0.006649,-0.057782,...,0.170794,0.001772,-0.075166,-0.026812,0.087938,-0.020205,0.0435,-0.077304,-0.018615,0
83,0.060992,0.01036,-0.180835,0.080082,0.076127,-0.029076,-0.010619,-0.107285,0.006827,-0.0911,...,0.085843,-0.102468,0.136757,-0.003343,0.074511,-0.009724,0.002287,-0.03584,-0.041547,0
106,0.076569,0.038463,-0.056801,0.019768,0.060545,0.121453,0.032959,-0.054842,-0.065075,-0.128476,...,0.232446,0.009199,0.038069,0.04083,0.029512,0.002875,0.083149,0.107194,0.032708,0
89,0.048498,-0.074123,-0.064649,0.08293,0.123177,0.023074,0.018386,-0.057971,0.060657,-0.107098,...,0.153173,-0.042172,0.010676,-0.1541,0.083738,-0.111483,0.079466,-0.067158,0.029305,0
62,0.082332,0.11558,0.011484,0.141847,-0.04187,0.125534,0.029235,-0.004505,0.029337,0.069322,...,0.098125,0.032009,-0.010072,-0.055096,0.208765,0.129552,-0.050709,0.0441,0.04273,0


In [7]:
train = {'x': train_df.drop('target', axis=1), 'y': train_df['target']}
test  = {'x': test_df.drop('target', axis=1), 'y': test_df['target']}

<h3>Linear Model</h3>

In [8]:
model = linear_model.Ridge(alpha=0.1, random_state=1)
model.fit(train['x'], train['y'])

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=1, solver='auto', tol=0.001)

In [9]:
train['pred'] = model.predict(train['x'])
test['pred'] = model.predict(test['x'])

print('Train error', mean_squared_error(train['y'], train['pred']))
print('Test error', mean_squared_error(test['y'], test['pred']))

Train error 0.0940173725487
Test error 0.105564124553


In [15]:
tmp_df = pd.DataFrame()
tmp_df['y'] = test['y']
tmp_df['pred'] = test['pred']

In [21]:
danic_df = pd.read_csv('./train-embedding/reps.csv', header=None)
danic_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,-0.017694,0.162265,-0.118203,-0.05407,0.009281,0.230801,-0.119686,-0.119757,0.059174,0.067316,...,0.189606,0.10406,0.184823,-0.168545,0.055621,-0.033896,0.054627,0.00038,0.092291,0.066612
1,-0.026778,0.114405,0.070789,-0.133816,-0.022331,0.111301,-0.098673,0.025478,-0.018543,0.020422,...,0.086911,0.17539,0.110514,-0.030552,-0.094022,-0.078188,0.010718,-0.004407,0.025306,0.104452
2,0.019151,0.046371,0.003117,-0.038588,0.049059,0.106353,-0.060592,-0.009925,-0.158011,0.126521,...,0.040483,0.191458,0.076505,-0.027609,0.044342,-0.01973,-0.054163,0.060563,0.090053,0.05911


In [22]:
model.predict(danic_df)

array([ 0.54079195,  0.74249873,  0.40462095])

In [24]:
vova_df = pd.read_csv('./train-embedding/vova.csv', header=None)
vova_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,-0.026778,0.114405,0.070789,-0.133816,-0.022331,0.111301,-0.098673,0.025478,-0.018543,0.020422,...,0.086911,0.17539,0.110514,-0.030552,-0.094022,-0.078188,0.010718,-0.004407,0.025306,0.104452


In [25]:
model.predict(vova_df)

array([ 0.74249868])