In [1]:
# Prepare data files for exercise. Predimed data does not include ID or location, we made that up to make the exercise work.
# This notebook also divides the data into records and mapping CSVs

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('predimed.csv')

In [4]:
# adds a fictitious medical center / city
df['location-id'] = np.random.choice(range(1, 6), len(df))
df

Unnamed: 0,group,sex,age,smoke,bmi,waist,wth,htn,diab,hyperchol,famhist,hormo,p14,toevent,event,location-id
0,Control,Male,58,Former,33.53,122,0.753086,No,No,Yes,No,No,10,5.374401,Yes,4
1,Control,Male,77,Current,31.05,119,0.730061,Yes,Yes,No,No,No,10,6.097194,No,4
2,MedDiet + VOO,Female,72,Former,30.86,106,0.654321,No,Yes,No,Yes,No,8,5.946612,No,4
3,MedDiet + Nuts,Male,71,Former,27.68,118,0.694118,Yes,No,Yes,No,No,8,2.907598,Yes,4
4,MedDiet + VOO,Female,79,Never,35.94,129,0.806250,Yes,No,Yes,No,No,9,4.761123,No,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6319,Control,Female,66,Never,28.51,104,0.645963,Yes,No,Yes,Yes,No,8,3.550992,No,5
6320,Control,Male,80,Never,23.81,109,0.589189,Yes,Yes,Yes,Yes,No,8,2.743326,No,5
6321,MedDiet + Nuts,Male,57,Former,25.24,100,0.571429,Yes,No,Yes,No,,7,0.479124,No,3
6322,MedDiet + VOO,Female,71,Never,32.04,98,0.653333,Yes,No,Yes,Yes,No,6,2.587269,No,5


In [5]:
# for each location we give a patient-id
df['tmp'] = df.sort_values('location-id').groupby('location-id')['age'].cumsum()
df['patient-id'] = df.sort_values('tmp').groupby('location-id')['tmp'].rank().astype(int)
del df['tmp']
df

Unnamed: 0,group,sex,age,smoke,bmi,waist,wth,htn,diab,hyperchol,famhist,hormo,p14,toevent,event,location-id,patient-id
0,Control,Male,58,Former,33.53,122,0.753086,No,No,Yes,No,No,10,5.374401,Yes,4,436
1,Control,Male,77,Current,31.05,119,0.730061,Yes,Yes,No,No,No,10,6.097194,No,4,1130
2,MedDiet + VOO,Female,72,Former,30.86,106,0.654321,No,Yes,No,Yes,No,8,5.946612,No,4,1131
3,MedDiet + Nuts,Male,71,Former,27.68,118,0.694118,Yes,No,Yes,No,No,8,2.907598,Yes,4,1132
4,MedDiet + VOO,Female,79,Never,35.94,129,0.806250,Yes,No,Yes,No,No,9,4.761123,No,2,1111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6319,Control,Female,66,Never,28.51,104,0.645963,Yes,No,Yes,Yes,No,8,3.550992,No,5,120
6320,Control,Male,80,Never,23.81,109,0.589189,Yes,Yes,Yes,Yes,No,8,2.743326,No,5,118
6321,MedDiet + Nuts,Male,57,Former,25.24,100,0.571429,Yes,No,Yes,No,,7,0.479124,No,3,351
6322,MedDiet + VOO,Female,71,Never,32.04,98,0.653333,Yes,No,Yes,Yes,No,6,2.587269,No,5,499


In [6]:
df.to_csv('predimed_full.csv', index=False)
df

Unnamed: 0,group,sex,age,smoke,bmi,waist,wth,htn,diab,hyperchol,famhist,hormo,p14,toevent,event,location-id,patient-id
0,Control,Male,58,Former,33.53,122,0.753086,No,No,Yes,No,No,10,5.374401,Yes,4,436
1,Control,Male,77,Current,31.05,119,0.730061,Yes,Yes,No,No,No,10,6.097194,No,4,1130
2,MedDiet + VOO,Female,72,Former,30.86,106,0.654321,No,Yes,No,Yes,No,8,5.946612,No,4,1131
3,MedDiet + Nuts,Male,71,Former,27.68,118,0.694118,Yes,No,Yes,No,No,8,2.907598,Yes,4,1132
4,MedDiet + VOO,Female,79,Never,35.94,129,0.806250,Yes,No,Yes,No,No,9,4.761123,No,2,1111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6319,Control,Female,66,Never,28.51,104,0.645963,Yes,No,Yes,Yes,No,8,3.550992,No,5,120
6320,Control,Male,80,Never,23.81,109,0.589189,Yes,Yes,Yes,Yes,No,8,2.743326,No,5,118
6321,MedDiet + Nuts,Male,57,Former,25.24,100,0.571429,Yes,No,Yes,No,,7,0.479124,No,3,351
6322,MedDiet + VOO,Female,71,Never,32.04,98,0.653333,Yes,No,Yes,Yes,No,6,2.587269,No,5,499


In [7]:
# separate into two datasets to be joined during the exercise
df1 = df[['patient-id', 'location-id', 'sex', 'age', 'smoke', 'bmi', 'waist', 'wth', 'htn', 'diab', 'hyperchol', 'famhist', 'hormo', 'p14', 'toevent', 'event']]

In [8]:
df1.iloc[np.random.permutation(len(df1))]

Unnamed: 0,patient-id,location-id,sex,age,smoke,bmi,waist,wth,htn,diab,hyperchol,famhist,hormo,p14,toevent,event
562,1269,4,Male,61,Former,33.33,98,0.653333,Yes,Yes,No,Yes,No,7,0.410678,Yes
4639,278,4,Male,64,Former,28.44,90,0.562500,No,No,Yes,No,,9,5.514031,No
1792,1080,2,Female,72,Never,27.59,94,0.648276,Yes,Yes,No,Yes,,9,4.848734,No
4919,1137,5,Male,69,Current,30.06,110,0.611111,Yes,No,Yes,No,No,9,1.965777,No
1269,901,2,Male,65,Never,28.06,100,0.581395,Yes,No,Yes,Yes,No,10,4.405202,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3972,1192,4,Male,58,Never,29.55,103,0.631902,No,Yes,No,No,No,11,4.971937,No
2947,316,1,Male,62,Current,30.06,107,0.672956,Yes,No,No,Yes,No,8,4.774812,No
5352,566,1,Female,65,Never,33.24,109,0.721854,Yes,No,Yes,No,No,5,5.826147,No
1356,670,3,Male,71,Former,28.33,106,0.634731,Yes,Yes,No,No,,10,5.746749,No


In [9]:
df1.to_csv('../../data/predimed_records.csv', index=False)

In [10]:
# second table contaings only information from which patient was assigned to which condition
df2 =  df[['location-id', 'patient-id', 'group']]
df2 = df2.sort_values(['location-id', 'patient-id'])
df2

Unnamed: 0,location-id,patient-id,group
5219,1,1,MedDiet + VOO
1153,1,2,MedDiet + Nuts
5731,1,3,MedDiet + Nuts
4839,1,4,MedDiet + VOO
2839,1,5,Control
...,...,...,...
2388,5,1253,MedDiet + VOO
2390,5,1254,MedDiet + Nuts
2397,5,1255,MedDiet + VOO
5062,5,1256,MedDiet + Nuts


In [11]:
# we purposedly remove 37 records at random, to make the merge not so boring
ids_to_keep = np.random.permutation(len(df1))[0:len(df2) - 37]
ids_to_keep

array([2130,  181,  970, ..., 3855, 5116, 3827])

In [12]:
df2.iloc[ids_to_keep].to_csv('../../data/predimed_mapping.csv', index=False)

In [13]:
# now we randomly select 42 patients to be outliers

In [14]:
outliers_indices = np.random.permutation(len(df1))[0:42]
outliers_indices

array([3109,  578,  603, 4949, 2445, 5409, 1698, 2791, 3399, 2801, 3381,
       3747, 2121,  648, 5492, 5194, 3103, 4033, 4819, 1287,  390,  743,
       3624, 3994,  853, 5658,  422, 4166, 1920, 4610, 1140,  997, 5376,
        554, 1534, 4887, 1260, 5326, 5145, 5577, 3810, 6104])

In [15]:
df.iloc[outliers_indices][['location-id', 'patient-id']].to_csv('../tabular_join/dropped.csv', index=False)