In [None]:
# Cross-validation

# https://scikit-learn.org/stable/modules/cross_validation.html

# Logistic Regression
# https://www.datacamp.com/tutorial/understanding-logistic-regression-python

# SVM (support vector machine)
# https://scikit-learn.org/dev/versions.html
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html


# decision tree
# https://scikit-learn.org/stable/modules/tree.html

# random forest
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [1]:
import numpy as np
import pandas as pd

from operator import itemgetter

In [15]:
# Switch between the two cleaned exports: True selects the plain CSV,
# False selects the "_w_time" variant.
without_time = True

# Path of the case-cohort CSV matching the switch above.
caseFilename = (
    "../cleaned_data/case.csv"
    if without_time
    else "../cleaned_data/case_w_time.csv"
)

In [16]:
# Read the case events from the CSV chosen by `caseFilename`
# (comma is pandas' default separator) and preview the first five rows.
caseGD = pd.read_csv(caseFilename)
caseGD.head(5)

Unnamed: 0.1,Unnamed: 0,subject_id,time,eve_index
0,0,10026821,2134-06-02 07:12:00,26406
1,1,10026821,2134-06-02 07:12:00,6699
2,2,10026821,2134-06-02 07:12:00,5645
3,3,10026821,2134-06-02 07:12:00,5612
4,4,10026821,2134-06-03 08:00:00,35454


In [17]:
# Collapse the case events to one row per (subject_id, eve_index) pair:
#   value = number of rows in that group
#   GD    = 1 for every row of this (case) frame
caseGD = (
    caseGD
    .groupby(['subject_id', 'eve_index'])
    .size()
    .reset_index(name='value')
    .assign(GD=1)
)
caseGD.head(5)

Unnamed: 0,subject_id,eve_index,value,GD
0,10026821,5612,1,1
1,10026821,5645,1,1
2,10026821,6699,1,1
3,10026821,26406,1,1
4,10026821,28623,1,1


In [18]:
# Path of the control-cohort CSV; `without_time` picks the plain file
# or its "_w_time" variant.
controlFilename = (
    "../cleaned_data/control.csv"
    if without_time
    else "../cleaned_data/control_w_time.csv"
)

In [19]:
# Read the control events from the CSV chosen by `controlFilename`
# (comma is pandas' default separator) and preview the first five rows.
controlGD = pd.read_csv(controlFilename)
controlGD.head(5)

Unnamed: 0.1,Unnamed: 0,subject_id,time,eve_index
0,0,10006196,2204-09-22 18:42:00,5353
1,1,10006196,2204-09-22 19:00:00,28623
2,2,10006196,2204-09-22 19:00:00,30848
3,3,10006196,2204-09-22 19:00:00,29794
4,4,10006196,2204-09-22 19:00:00,33424


In [20]:
# Collapse the control events to one row per (subject_id, eve_index) pair:
#   value = number of rows in that group
#   GD    = 0 for every row of this (control) frame
controlGD = (
    controlGD
    .groupby(['subject_id', 'eve_index'])
    .size()
    .reset_index(name='value')
    .assign(GD=0)
)
controlGD.head(5)

Unnamed: 0,subject_id,eve_index,value,GD
0,10006196,5353,1,0
1,10006196,5362,1,0
2,10006196,5496,1,0
3,10006196,5683,1,0
4,10006196,5702,1,0


In [21]:
# Stack case rows followed by control rows into one frame with a fresh 0..n-1 index.
fullGD = pd.concat((caseGD, controlGD), axis=0, ignore_index=True)

In [22]:
# Preview the first five rows of the combined frame.
fullGD.head(n=5)

Unnamed: 0,subject_id,eve_index,value,GD
0,10026821,5612,1,1
1,10026821,5645,1,1
2,10026821,6699,1,1
3,10026821,26406,1,1
4,10026821,28623,1,1


In [23]:
# Largest event index present in the combined frame.
print(np.max(fullGD["eve_index"].values))

36765


In [11]:
# subject_id -> list of (eve_index, value) pairs, appended in fullGD row order.
patient_features = {}
for subject, event, count in zip(
    fullGD['subject_id'], fullGD['eve_index'], fullGD['value']
):
    patient_features.setdefault(subject, []).append((event, count))

hfGD = fullGD[['subject_id', 'GD']]

# subject_id -> 0/1 label. The first row seen for a subject decides its label;
# later rows for the same subject are ignored.
hfLabel = {}
for subject, flag in zip(hfGD['subject_id'], hfGD['GD']):
    if subject in hfLabel:
        continue
    hfLabel[subject] = 1 if flag == 1 else 0

In [12]:
# Show only the first five patients; displaying the whole dict floods the
# notebook output with one entry per patient.
dict(list(patient_features.items())[:5])

{10026821: [(5612, 1),
  (5645, 1),
  (6699, 1),
  (26406, 1),
  (28623, 1),
  (29535, 1),
  (29794, 1),
  (30848, 1),
  (32075, 1),
  (32554, 4),
  (32609, 1),
  (33355, 1),
  (33424, 1),
  (33649, 2),
  (34157, 1),
  (34261, 1),
  (35388, 1),
  (35454, 1),
  (35881, 1),
  (36802, 1),
  (36807, 1)],
 10078279: [(4333, 2),
  (5415, 1),
  (5625, 1),
  (5664, 1),
  (5727, 1),
  (6541, 1),
  (6699, 1),
  (6939, 1),
  (26406, 1),
  (28623, 1),
  (29535, 1),
  (29794, 1),
  (30848, 1),
  (31683, 2),
  (32075, 1),
  (32554, 4),
  (32609, 1),
  (33355, 1),
  (33424, 1),
  (33649, 1),
  (34157, 1),
  (34261, 1),
  (35388, 1),
  (35454, 1),
  (35881, 1),
  (36802, 1),
  (36807, 1)],
 10130191: [(112, 2),
  (18695, 1),
  (28623, 2),
  (30176, 2),
  (30848, 2),
  (31683, 2),
  (32408, 2),
  (34591, 1),
  (35452, 1),
  (35454, 2),
  (36397, 1),
  (36806, 1)],
 10183015: [(2700, 1), (26801, 1), (36807, 1)],
 10230631: [(4333, 2),
  (5214, 1),
  (5319, 1),
  (5934, 1),
  (28623, 1),
  (30176, 1),
  

In [13]:
# Output paths for the two feature files; `without_time` picks the plain
# names or the "_wt" variants.
#   filename1 -> "features_svmlight_*" file
#   filename2 -> "features_*" file
filename1, filename2 = (
    (
        "../cleaned_data/features_svmlight_eventCounts.train",
        "../cleaned_data/features_eventCounts.train",
    )
    if without_time
    else (
        "../cleaned_data/features_svmlight_eventCounts_wt.train",
        "../cleaned_data/features_eventCounts_wt.train",
    )
)

In [14]:
# Write one line per patient, patients in ascending subject_id order:
#   filename1: "<label> <eve_index>:<value> ... \n"
#   filename2: "<subject_id> <label> <eve_index>:<value> ... \n"
# Within a line, the pairs are sorted by eve_index.
# The files are opened in binary mode and receive encoded bytes, so the line
# terminator is always "\n" regardless of platform.
# The `with` block closes (and flushes) both files even if a lookup or write
# raises part-way through the loop.
with open(filename1, 'wb') as fileWriter1, open(filename2, 'wb') as fileWriter2:
    for patient in sorted(patient_features):
        label = '{:.0f}'.format(hfLabel[patient])
        # The " index:value" run is identical in both files, so build it once.
        features = ''.join(
            ' {:.0f}:{:.0f}'.format(record[0], record[1])
            for record in sorted(patient_features[patient], key=itemgetter(0))
        )
        fileWriter1.write((label + features + " \n").encode())
        fileWriter2.write(
            ('{:.0f} '.format(patient) + label + features + " \n").encode()
        )