In [1]:
import numpy as np
import pandas as pd
import gensim as gs
from gensim.models import Word2Vec as wv

from operator import itemgetter



### Load Case Data

Load the case data, count how often each event occurs per subject, and label every row with HF = 1.

In [2]:
# Switch between the two cleaned data sets: True selects the files without
# the "_w_time" suffix, False selects the "_w_time" variants.
without_time = False

caseFilename = (
    "./cleaned_data/case.csv"
    if without_time
    else "./cleaned_data/case_w_time.csv"
)

In [3]:
# Read the raw case event log (comma-separated, which is read_csv's default)
# and preview the first rows.
caseDF = pd.read_csv(caseFilename)
caseDF.head()

Unnamed: 0.1,Unnamed: 0,SUBJECT_ID,TIME,EVE_INDEX
0,0,10168,2110-12-02 14:57:00,507
1,1,10168,2110-12-02 14:57:00,1069
2,2,10168,2110-12-02 14:57:00,519
3,3,10168,2110-12-02 14:57:00,427
4,4,10168,2110-12-02 14:57:00,1042


In [4]:
# Collapse the raw event log to one row per (SUBJECT_ID, EVE_INDEX) pair;
# VALUE is the number of rows recorded for that pair.
#
# The count column is named directly in reset_index(): selecting
# ['EVE_INDEX'] before size() makes newer pandas releases name the resulting
# Series 'EVE_INDEX', and reset_index() then fails with
# "cannot insert EVE_INDEX, already exists".
#
# NOTE: this cell overwrites caseDF, so re-running it without reloading the
# CSV would count the already-aggregated rows (every VALUE becomes 1).
caseDF = (
    caseDF
    .groupby(['SUBJECT_ID', 'EVE_INDEX'])
    .size()
    .reset_index(name='VALUE')
)
caseDF["HF"] = 1  # every row coming from the case file is labelled HF = 1
caseDF.head()

Unnamed: 0,SUBJECT_ID,EVE_INDEX,VALUE,HF
0,111,208,1,1
1,111,368,1,1
2,111,372,1,1
3,111,467,1,1
4,111,484,1,1


### Load control data

Load the control data, count how often each event occurs per subject, and label every row with HF = 0.

In [5]:
# Pick the control file that matches the `without_time` switch.
controlFilename = (
    "./cleaned_data/control.csv"
    if without_time
    else "./cleaned_data/control_w_time.csv"
)

In [6]:
# Read the raw control event log (comma-separated, which is read_csv's
# default) and preview the first rows.
controlDF = pd.read_csv(controlFilename)
controlDF.head()

Unnamed: 0.1,Unnamed: 0,SUBJECT_ID,TIME,EVE_INDEX
0,0,10008,2142-06-28 00:00:00,4264
1,1,10008,2142-06-28 00:00:00,3505
2,2,10008,2142-06-28 00:00:00,4435
3,3,10008,2142-06-28 00:00:00,3520
4,4,10008,2142-06-28 09:04:00,1068


In [7]:
# Collapse the raw event log to one row per (SUBJECT_ID, EVE_INDEX) pair;
# VALUE is the number of rows recorded for that pair.
#
# The count column is named directly in reset_index(): selecting
# ['EVE_INDEX'] before size() makes newer pandas releases name the resulting
# Series 'EVE_INDEX', and reset_index() then fails with
# "cannot insert EVE_INDEX, already exists".
#
# NOTE: this cell overwrites controlDF, so re-running it without reloading the
# CSV would count the already-aggregated rows (every VALUE becomes 1).
controlDF = (
    controlDF
    .groupby(['SUBJECT_ID', 'EVE_INDEX'])
    .size()
    .reset_index(name='VALUE')
)
controlDF["HF"] = 0  # every row coming from the control file is labelled HF = 0
controlDF.head()

Unnamed: 0,SUBJECT_ID,EVE_INDEX,VALUE,HF
0,94,191,1,0
1,94,372,1,0
2,94,381,1,0
3,94,408,1,0
4,94,427,1,0


### Concatenate case and control Data

In [8]:
# Stack case rows on top of control rows and renumber the index from 0.
ccDF = pd.concat(objs=[caseDF, controlDF], axis=0, ignore_index=True)


In [9]:
# Preview the first rows of the combined case/control frame.
ccDF.head()

Unnamed: 0,SUBJECT_ID,EVE_INDEX,VALUE,HF
0,111,208,1,1
1,111,368,1,1
2,111,372,1,1
3,111,467,1,1
4,111,484,1,1


In [10]:
# Largest event index present in the combined data. The parenthesised call
# prints the same text on Python 2 and is also valid syntax on Python 3.
print(ccDF.EVE_INDEX.values.max())

4908


### Load Word2Vec Model



In [11]:
# Candidate Word2Vec model files; the trailing comment on each line records
# the number of "words" (vocabulary size) in that model. Exactly one
# assignment is meant to be active at a time.
# w2vFilename = "word2vec_model/w2vmodel" # number of "words" 12342
# w2vFilename = "word2vec_model/w2vmodel_mv" # number of "words" 10932
# w2vFilename = "word2vec_model/w2vmodel_mv_wt" # number of "words" 4898
w2vFilename = "word2vec_model/w2vmodel_wt" # number of "words" 4908

In [12]:
# Load the saved Word2Vec model. The class is spelled out via the `gs`
# package alias so it is not confused with the `model.wv` attribute used below.
model = gs.models.Word2Vec.load(w2vFilename)

In [13]:
# Disabled helper, kept as a bare string literal so nothing executes (the cell
# only echoes the text).
# NOTE(review): presumably it was run until the lookup raised on the first
# index missing from the vocabulary, which gives the word counts noted next to
# the model filenames -- confirm.
"""
# figure out how many words are in each file:
for i in xrange(1,100000):
    model.wv[str(i)]
"""


'\n# figure out how many words are in each file:\nfor i in xrange(1,100000):\n    model.wv[str(i)]\n'

In [14]:
# Number of event indices (1..numEvents) to look up in the model. 4908 matches
# the word count noted for "word2vec_model/w2vmodel_wt" and the largest
# EVE_INDEX printed for the combined data.
# NOTE(review): hard-coded -- must be changed by hand if another model file is selected.
numEvents = 4908

In [15]:
# Spot check: display the embedding vector stored under the key '67'.
model.wv['67']

array([  9.12749767e-02,   1.72770634e-01,  -8.64839330e-02,
         4.92569543e-02,   8.51974636e-02,   8.94424245e-02,
         8.10873434e-02,  -2.12554913e-02,  -9.07133892e-02,
        -4.44544643e-01,  -9.39155966e-02,  -1.42273277e-01,
        -9.20310393e-02,  -7.54970163e-02,   2.17612788e-01,
        -1.37646040e-02,   1.14285141e-01,  -1.86925665e-01,
         1.06518082e-01,   1.19199879e-01,  -8.87205675e-02,
         6.41254038e-02,  -7.46387243e-02,   1.46333680e-01,
        -2.16454089e-01,  -1.44739985e-01,   2.97411159e-03,
         7.57252797e-02,   4.98889163e-02,  -1.22849703e-01,
        -1.23999389e-02,   9.41355005e-02,   6.62142597e-03,
         1.22349113e-01,   7.98727274e-02,  -1.93570286e-01,
        -8.96727517e-02,  -8.83859843e-02,   2.83745900e-02,
        -2.79099997e-02,   1.36267290e-01,   9.97283384e-02,
        -3.14190164e-02,   7.15878233e-02,   2.59285029e-02,
        -1.68913707e-01,   1.46062225e-02,   2.18062967e-01,
         3.34368609e-02,

In [16]:
# Build [EVE_INDEX, VALUE] pairs, where VALUE is the mean of the components of
# the event's embedding vector. Vocabulary keys are the event indices as strings.
#
# Vectors are read through `model.wv`, matching the lookup used elsewhere in
# this notebook; indexing the model object directly is deprecated in gensim.
# `range` behaves the same as `xrange` here and also exists on Python 3.
w2vList = [
    [event, np.mean(model.wv[str(event)])]
    for event in range(1, numEvents + 1)
]

In [17]:
# Two-column lookup table: event index -> scalar embedding value.
w2vDF = pd.DataFrame(w2vList, columns=['EVE_INDEX', 'VALUE'])
w2vDF.head()

Unnamed: 0,EVE_INDEX,VALUE
0,1,0.00146
1,2,-0.005085
2,3,-0.007838
3,4,-0.000361
4,5,0.009049


In [18]:
# Attach the per-event embedding value to every (subject, event) row.
# Both frames carry a VALUE column, so pandas suffixes them: VALUE_x is the
# event count from ccDF, VALUE_y is the value from w2vDF.
fullDF = ccDF.merge(w2vDF, on='EVE_INDEX')
fullDF.head()

Unnamed: 0,SUBJECT_ID,EVE_INDEX,VALUE_x,HF,VALUE_y
0,111,208,1,1,-0.00886
1,394,208,1,1,-0.00886
2,665,208,1,1,-0.00886
3,1104,208,2,1,-0.00886
4,2586,208,1,1,-0.00886


In [19]:
# Keep the embedding value (VALUE_y) as the feature VALUE and drop the event
# count (VALUE_x).
fullDF = (
    fullDF[['SUBJECT_ID', 'EVE_INDEX', 'VALUE_y', 'HF']]
    .rename(columns={'VALUE_y': 'VALUE'})
)
fullDF.head()

Unnamed: 0,SUBJECT_ID,EVE_INDEX,VALUE,HF
0,111,208,-0.00886,1
1,394,208,-0.00886,1
2,665,208,-0.00886,1
3,1104,208,-0.00886,1
4,2586,208,-0.00886,1


### Assemble dict of (feature, value) tuples per patient

In [20]:
# SUBJECT_ID -> list of (EVE_INDEX, VALUE) pairs.
# itertuples() yields (index, SUBJECT_ID, EVE_INDEX, VALUE, HF), so the data
# columns start at position 1.
patient_features = {}
for record in fullDF.itertuples():
    subject_id = record[1]
    patient_features.setdefault(subject_id, []).append((record[2], record[3]))

hfDF = fullDF[['SUBJECT_ID', 'HF']]

# SUBJECT_ID -> 0/1 label; the first row seen for a subject decides its label.
hfLabel = {}
for record in hfDF.itertuples():
    subject_id = record[1]
    if subject_id in hfLabel:
        continue
    hfLabel[subject_id] = 1 if record[2] == 1 else 0

In [21]:
# Spot check: the (EVE_INDEX, VALUE) pairs collected for subject 394.
patient_features[394]

[(208, -0.0088597238063812256),
 (368, 0.02057897113263607),
 (372, 0.022947700694203377),
 (467, 0.022865088656544685),
 (496, -0.0096948128193616867),
 (507, 0.024237528443336487),
 (966, 0.025913562625646591),
 (1016, 0.023672915995121002),
 (1195, 0.016318388283252716),
 (1462, 0.029055427759885788),
 (1491, 0.023539276793599129),
 (1497, 0.013786486349999905),
 (2046, 0.03238409012556076),
 (2232, 0.0014141473220661283),
 (2233, 0.012805505655705929),
 (2557, -0.0031014252454042435),
 (2677, 0.026324596256017685),
 (2836, 0.0074840323068201542),
 (2892, 0.023581657558679581),
 (3137, 0.010779570788145065),
 (3152, 0.027649072930216789),
 (3173, -0.0073043350130319595),
 (3305, -0.013071508146822453),
 (3390, 0.036173474043607712),
 (3668, 0.033763088285923004),
 (3695, 0.011101148091256618),
 (3825, -0.013362876139581203),
 (3976, -0.011290004476904869),
 (4073, 0.0091795418411493301),
 (4265, 0.008207494392991066),
 (4326, 0.010863864794373512),
 (4735, 0.01918594166636467),
 (49

### Write to svmlight file

In [22]:
# filename1: svmlight-style file (label followed by index:value pairs).
# filename2: the same content with the patient id prepended to each line.
filename1, filename2 = (
    ("cleaned_data/features_svmlight_w2v.train",
     "cleaned_data/features_w2v.train")
    if without_time
    else ("cleaned_data/features_svmlight_w2v_wt.train",
          "cleaned_data/features_w2v_wt.train")
)


In [23]:
# Write one line per patient, in ascending patient-id order:
#   filename1: "<label> <index>:<value> ... \n"
#   filename2: "<patient> <label> <index>:<value> ... \n"
# Features are sorted by event index. The `with` block closes both files even
# if formatting or a lookup raises part-way through. Each line is built once
# and encoded to ASCII so the binary-mode handles accept it on Python 2 and 3.
with open(filename1, 'wb') as fileWriter1, open(filename2, 'wb') as fileWriter2:
    for patient in sorted(patient_features):
        label = '{:.0f}'.format(hfLabel[patient])
        records = sorted(patient_features[patient], key=itemgetter(0))
        features = ''.join(
            ' {:.0f}:{:.6f}'.format(index, value) for index, value in records
        )
        fileWriter1.write((label + features + " \n").encode('ascii'))
        fileWriter2.write(
            ('{:.0f} '.format(patient) + label + features + " \n").encode('ascii')
        )