In [112]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GroupShuffleSplit, GroupKFold

In [113]:
df1 = pd.read_csv('patras_data.csv', parse_dates=['date'])
df2 = pd.read_csv('sheffield_data.csv', parse_dates=['date'])

# Helper functions

In [114]:
def imputer(df, lookup_table, median_val, variable):
    '''
    If zero height value, impute mean height value depending on patient's gender/ethnicity combination.
    If patient's gender/ethnicity combination has no non-zero mean height value, impute the mean height value of all patients with non-zero heights.
    '''
    df2 = df.copy()
    for i in range(len(df)):
        if df2.at[i, variable] == 0:
            for j in range(len(lookup_table)):
                if df2.at[i, 'gender'] == lookup_table.at[j, 'gender'] and df2.at[i, 'ethnicity'] == lookup_table.at[j, 'ethnicity']:
                    df2.at[i, variable] = lookup_table.at[j, variable]
                    break
            else:
                df2.at[i, variable] = median_val
    
    return df2

In [115]:
def shuffle_data_by_group(df, group_column_name, random_state=1):
    random.seed(random_state)
    group_object = df.groupby(group_column_name)
    groups = [group_object.get_group(x) for x in group_object.groups]
    random.shuffle(groups)
    for i in range(len(groups)):
        groups[i][group_column_name] = i
    return pd.concat(groups).reset_index(drop=True)

In [116]:
def reset_group_id(df, group_column_name):
    group_object = df.groupby(group_column_name)
    groups = [group_object.get_group(x) for x in group_object.groups]
    for i in range(len(groups)):
        groups[i][group_column_name] = i
    return pd.concat(groups).reset_index(drop=True)

In [117]:
def datetime_to_days_diff(df, group_column_name, time_column_name):
    group_object = df.groupby(group_column_name)
    grouped_data = [group_object.get_group(x) for x in group_object.groups]
    new_group_list = []
    for group in grouped_data:
        group['times'] = None
        ref = group[time_column_name].iloc[0]
        for i in range(len(group)):
            group['times'].iloc[i] = (group[time_column_name].iloc[i] - ref) / np.timedelta64(1, 'D')
        group['times'] = pd.to_numeric(group['times'], downcast="integer")
        new_group_list.append(group)
    df_new = pd.concat(new_group_list, axis=0, ignore_index=True)
    return df_new

In [118]:
def train_test_split_grouped_interpolation(df, group_sizes, test_size=0.2, random_state=1):
    '''
    Train/test split, but test set contains at least one observation from each group in the training set, and contains no unseen groups.
    '''
    assert 0 < test_size < 1, "Test size must be strictly between 0 and 1"
    assert np.sum(group_sizes) == len(df), "Sum of group_sizes must be equal to length dataframe"
    assert group_sizes.all() > 0, "Group sizes should be non-negative"
    assert len(group_sizes) < len(df), "Number of groups should be less than number of observations"

    np.random.seed(random_state)
    df_len = len(df)
    test_len = int(test_size * df_len)
    no_groups = len(group_sizes)

    # Pick one observation from all groups
    sample_len = no_groups
    n_samples_chosen_per_group = np.ones_like(group_sizes)
    last_idx_arr = np.cumsum(group_sizes)-1 # Array of index of the last observation in each group within the overall dataset
    test_idx = [last_idx_arr[i] for i in range(no_groups)]
    group_sizes_new = group_sizes.copy()
    group_sizes_new -= 1

    # Keep picking more observations until the required number of test observations has been picked
    while sample_len < test_len:
        group_idx = np.random.randint(no_groups) # Pick a random group
        if group_sizes_new[group_idx] > 1:
            if test_len - sample_len > 1:
                n = np.random.randint(1, min([group_sizes_new[group_idx], test_len-sample_len])) # Pick a random sample of size 1<=n<group_size from the chosen group
            else:
                n = 1
            last_idx = last_idx_arr[group_idx]-n_samples_chosen_per_group[group_idx] # Index of the last observation remaining in each group within the overall dataset
            test_idx += [last_idx-i for i in range(n)]
            n_samples_chosen_per_group[group_idx] += n # Update number of samples chosen from the group
            group_sizes_new[group_idx] -= n # Update current group sizes
            sample_len += n

    mask = np.ones(df_len, dtype=bool)
    mask[test_idx] = False
    df_train, df_test = df[mask], df[~mask]

    return df_train, df_test, n_samples_chosen_per_group

In [119]:
def train_test_split_grouped_extrapolation(df, groups, test_size=0.2, random_state=1):
    '''
    Train/test split for a dataframe, but test set only contains only unseen groups.
    ``test_size`` represents the proportion of groups to include in the test split (rounded up).
    '''
    train_idx, test_idx = next(GroupShuffleSplit(test_size=test_size, random_state=random_state).split(df, groups=groups))
    df_train, df_test = df.iloc[train_idx], df.iloc[test_idx]

    return df_train, df_test

# Combine dataframes

In [120]:
# Reset Patient ID for both dataframes
df1.groupby(['ID']).ngroup()

0         0
1         0
2         0
3         0
4         0
       ... 
4348    558
4349    558
4350    558
4351    559
4352    559
Length: 4353, dtype: int64

In [121]:
df1['ID'] = df1.groupby(['ID']).ngroup()

In [122]:
df2.groupby(['ID']).ngroup() + df1['ID'].unique().size

0        560
1        560
2        560
3        560
4        560
        ... 
6637    1552
6638    1552
6639    1553
6640    1553
6641    1553
Length: 6642, dtype: int64

In [123]:
df2['ID'] = df2.groupby(['ID']).ngroup() + df1['ID'].unique().size

In [124]:
data = pd.concat([df1, df2], axis=0, ignore_index=True)

In [125]:
# Select non-control patients
data = data[data['control_patient']==False]
data.reset_index(inplace=True, drop=True)
data

Unnamed: 0,ID,site,date_entered_study,control_patient,dob_year,gender,ethnicity,employment,height,smoker,kidney_transplant,kidney_transplant_Date,Patient_died,Date_of_death,disease,CGN_disease,sampleId,date,egfr,ckd_stage,slope_allprior,N_allprior,Class_allprior,slope_2yprior,N_2yprior,Class_2yprior,bp.di,bp.sys,sc,bun,X24h.pr,sampletaken,ace_inhibitors,arbs,statins,rate_decline_egfr,urine_creatinine,one_over_slope,serum_cholestrol,serum_triglycerides,serum_calcium,serum_phosphorus,serum_ca_p,serum_PTH,protein_urea,serum_creatinin,urine_albumin,urineprotein_creat_ratio,urinealbumin_crea_ratio,pulse,Weight,bmi,bsa,waist_circum,hip_circum,midarm_circum,triceps_skinfold,diabetic,mt_glom_scler,mt_tub_fib,mt_vas_scler,serum_glucose,haematuria,micro_albuminuria,hba1c,c_reactive_protein,retinopathy,neuropathy,haemaglobin,wbc,platelets,vitamin_b12,folate,serum_fe,total_fe,AKI,Started_dialysis
0,0,Patras,16/2/16 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,False,,False,,DN,DN,,2011-04-10,69.0,,,1.0,,,1.0,,80.0,150.0,1.1,47.0,1250.0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,0,Patras,16/2/16 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,False,,False,,DN,DN,,2012-07-02,62.0,,-20.277778,2.0,Progressor,-20.277778,2.0,Progressor,0.0,0.0,1.2,64.0,2058.0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,0,Patras,16/2/16 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,False,,False,,DN,DN,,2012-11-09,56.0,,-13.424735,3.0,Progressor,-13.424735,3.0,Progressor,85.0,145.0,1.3,64.0,1993.0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,0,Patras,16/2/16 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,False,,False,,DN,DN,,2013-03-26,62.0,,-4.958831,4.0,Slow_progressor,-4.958831,4.0,Slow_progressor,90.0,160.0,1.2,55.0,1960.0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,0,Patras,16/2/16 17:37,False,1946.0,Male,Caucasian,Unemployed,1.7,Non-Smoker,False,,False,,DN,DN,,2013-09-17,50.0,,-7.323452,5.0,Progressor,-7.323452,5.0,Progressor,100.0,165.0,1.5,54.0,3936.0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10814,1552,Sheffield,9/6/19,False,1973.0,Female,Caucasian,,0.0,,False,,False,,Other,Other,,2019-06-13,65.0,,,1.0,,,1.0,,95.0,137.0,91.0,5.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10815,1552,Sheffield,9/6/19,False,1973.0,Female,Caucasian,,0.0,,False,,False,,Other,Other,2164.0,2019-12-09,64.0,0.0,-4.010989,2.0,Slow_progressor,-4.010989,2.0,Slow_progressor,81.0,139.0,,4.5,0.0,1,False,False,False,-4.010989,117.647059,-0.000479,0.0,0.0,2.28,0.97,2.2116,0.0,0.07,92.0,4.7,595.0,3995.0,,76.6,,0.0,,,,,False,0.0,0.0,0.0,,,,,,,,,,,,,,,False,False
10816,1553,Sheffield,9/6/19,False,1948.0,Female,Caucasian,,0.0,,False,,False,,HTN,HTN,,2019-07-25,5.0,,,1.0,,,1.0,,0.0,0.0,709.0,29.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10817,1553,Sheffield,9/6/19,False,1948.0,Female,Caucasian,,0.0,,False,,False,,HTN,HTN,2166.0,2019-12-09,32.0,0.0,201.122449,2.0,Stable,201.122449,2.0,Stable,81.0,165.0,,12.3,0.0,1,False,False,False,201.122449,69.004525,0.041223,0.0,0.0,2.50,1.29,3.2250,0.0,0.14,144.0,0.0,,,,90.0,,0.0,,,,,False,0.0,0.0,0.0,,,,,,,,,,,,,,,False,False


# Cleaning not involving imputing values

## Initial cleaning

In [126]:
# Dataframe after selecting relevant columns
data2 = data[['ID', 'site', 'dob_year','gender', 'ethnicity', 'height', 'Weight', 'smoker', 'kidney_transplant', 'Patient_died', 'disease', 'bp.sys', 'bun', 'date', 'egfr']]
data2 = data2.rename(columns={'Weight': 'weight', 'Patient_died': 'patient_died'})
data2['ID'] = data2.groupby(['ID']).ngroup()
data2

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
0,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,150.0,47.0,2011-04-10,69.0
1,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,0.0,64.0,2012-07-02,62.0
2,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,145.0,64.0,2012-11-09,56.0
3,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,160.0,55.0,2013-03-26,62.0
4,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,165.0,54.0,2013-09-17,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10814,1421,Sheffield,1973.0,Female,Caucasian,0.0,,,False,False,Other,137.0,5.0,2019-06-13,65.0
10815,1421,Sheffield,1973.0,Female,Caucasian,0.0,76.6,,False,False,Other,139.0,4.5,2019-12-09,64.0
10816,1422,Sheffield,1948.0,Female,Caucasian,0.0,,,False,False,HTN,0.0,29.0,2019-07-25,5.0
10817,1422,Sheffield,1948.0,Female,Caucasian,0.0,90.0,,False,False,HTN,165.0,12.3,2019-12-09,32.0


In [127]:
data2['smoker'] = data2['smoker'].replace({'Past Smoker': 'Past-Smoker'})

In [128]:
data2['smoker'].value_counts()

Non-Smoker     2602
Past-Smoker    1567
Smoker          896
Name: smoker, dtype: int64

In [129]:
# Notice that for this patient, egfr and ckd_stage appears to have been switched incorrectly, so we manualy set egfr to the correct value
data.loc[data['egfr']==3]

Unnamed: 0,ID,site,date_entered_study,control_patient,dob_year,gender,ethnicity,employment,height,smoker,kidney_transplant,kidney_transplant_Date,Patient_died,Date_of_death,disease,CGN_disease,sampleId,date,egfr,ckd_stage,slope_allprior,N_allprior,Class_allprior,slope_2yprior,N_2yprior,Class_2yprior,bp.di,bp.sys,sc,bun,X24h.pr,sampletaken,ace_inhibitors,arbs,statins,rate_decline_egfr,urine_creatinine,one_over_slope,serum_cholestrol,serum_triglycerides,serum_calcium,serum_phosphorus,serum_ca_p,serum_PTH,protein_urea,serum_creatinin,urine_albumin,urineprotein_creat_ratio,urinealbumin_crea_ratio,pulse,Weight,bmi,bsa,waist_circum,hip_circum,midarm_circum,triceps_skinfold,diabetic,mt_glom_scler,mt_tub_fib,mt_vas_scler,serum_glucose,haematuria,micro_albuminuria,hba1c,c_reactive_protein,retinopathy,neuropathy,haemaglobin,wbc,platelets,vitamin_b12,folate,serum_fe,total_fe,AKI,Started_dialysis
3539,373,Patras,17/7/17 12:39,False,1956.0,Male,Caucasian,Semi-Professional,1.7,Smoker,False,,False,,DN,DN,1042.0,2017-04-12,3.0,41.0,-74.118006,3.0,Progressor,-74.118006,3.0,Progressor,100.0,170.0,,73.0,1639.0,1,False,True,True,-74.118006,66.26,-0.162154,,,10.0,4.5,0.0,,71.3,1.8,475.0,,,70.0,80.0,,-99.99,,,,,True,0.0,0.0,0.0,,,,,,,,,,,,,,,False,False


In [130]:
data2.loc[data2['egfr']==3, 'egfr'] = 41

In [131]:
# Sort values by date
data3 = data2.sort_values(['ID', 'date'], ignore_index=True)

## Regrouping ethnicity

In [132]:
data4 = data3.copy()

In [133]:
data4['ethnicity'].value_counts()

Caucasian                        9209
Asian                             146
Black (afro caribean descent)     108
Others                             56
Asian - Indian                     12
Asian (Pakistan)                   11
INDIAN                             11
Somali                              9
Asian(INDIAN)                       6
Chinese                             6
Indian                              5
Asian( PAKISTANI)                   5
Asian ( BANGLADESHHI)               2
Arab                                1
Name: ethnicity, dtype: int64

In [134]:
# Combine categories
data4['ethnicity'] = data4['ethnicity'].replace(dict.fromkeys(['Asian - Indian', 'Asian (Pakistan)', 'INDIAN', 'Chinese', 'Asian(INDIAN)', 'Indian', 'Asian( PAKISTANI)', 'Arab', 'Asian ( BANGLADESHHI)'], 'Asian'))
data4['ethnicity'] = data4['ethnicity'].replace({'Somali': 'Others'})
data4['ethnicity'] = data4['ethnicity'].replace({'Black (afro caribean descent)': 'Black'})

In [135]:
data4['ethnicity'].value_counts()

Caucasian    9209
Asian         205
Black         108
Others         65
Name: ethnicity, dtype: int64

## egfr

In [136]:
data4['egfr'].unique()

array([ 69. ,  62. ,  56. ,  50. ,  54. ,  53. ,  46. ,  58. ,  49. ,
        90. , 103. ,  33. ,  31. ,  29. ,  36. ,  35. ,  45. ,  48. ,
        59. ,  72. ,  41. ,  44. ,  64. ,  60. ,  71. ,  47. ,  66. ,
        76. ,  52. ,  39. ,  28. ,  84. ,  89. ,  24. ,  38. ,  27. ,
        21. ,  22. ,  30. ,  23. ,  20. ,  19. ,  26. ,  16. ,  13. ,
        32. ,  34. ,  43. ,  37. ,  25. ,   9. ,  15. ,  10. , 107. ,
        82. ,  93. ,  63. , 106. ,  92. , 118. , 150. , 149. , 101. ,
       125. , 127. , 124. ,  67. ,  55. ,  40. ,  57. ,  68. ,  42. ,
        65. ,  73. ,  80. ,  81. ,  61. ,  18. ,  78. ,  70. ,  14. ,
        17. ,  11. ,  12. ,  85. ,  77. ,  79. , 102. ,  74. ,  51. ,
        97. ,  83. ,  96. , 108. ,  94. ,  86. ,  75. , 109. ,  91. ,
       104. ,  88. ,  87. ,  95. , 114. , 100. , 133. ,  99. , 113. ,
       112. ,   7. , 120. ,  98. , 158. , 128. , 129. , 145. , 122. ,
       111. , 117. , 110. , 131. ,   8. , 123. , 146. , 105. , 136. ,
       119. , 141. ,

In [137]:
# Missing values for egfr
data4['egfr'].isna().sum()

402

In [138]:
# Drop all rows with missing egfr values
data5 = data4.dropna(subset=['egfr'])
data5.reset_index(inplace=True, drop=True)
data5['ID'] = data5.groupby(['ID']).ngroup()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [139]:
# Drop row with 0 egfr value (1 row)
data6 = data5.loc[data5['egfr'] != 0]
data6.reset_index(inplace=True, drop=True)
data6['ID'] = data6.groupby(['ID']).ngroup()
data6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
0,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,150.0,47.0,2011-04-10,69.0
1,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,0.0,64.0,2012-07-02,62.0
2,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,145.0,64.0,2012-11-09,56.0
3,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,160.0,55.0,2013-03-26,62.0
4,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,165.0,54.0,2013-09-17,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10411,1370,Sheffield,1973.0,Female,Caucasian,0.0,,,False,False,Other,137.0,5.0,2019-06-13,65.0
10412,1370,Sheffield,1973.0,Female,Caucasian,0.0,76.6,,False,False,Other,139.0,4.5,2019-12-09,64.0
10413,1371,Sheffield,1948.0,Female,Caucasian,0.0,,,False,False,HTN,0.0,29.0,2019-07-25,5.0
10414,1371,Sheffield,1948.0,Female,Caucasian,0.0,90.0,,False,False,HTN,165.0,12.3,2019-12-09,32.0


## Height

In [140]:
# Some issues with height
data6.describe()

Unnamed: 0,ID,dob_year,height,weight,bp.sys,bun,egfr
count,10416.0,8674.0,10406.0,3161.0,10118.0,10399.0,10416.0
mean,604.731663,1959.417454,5.495013,104.621711,106.65339,101.503183,46.569931
std,398.776884,15.168561,27.748251,1201.547418,62.594163,6793.564265,25.039977
min,0.0,1926.0,0.0,1.48,0.0,0.0,4.0
25%,228.0,1948.0,0.0,69.3,100.0,9.0,27.0
50%,616.0,1957.0,1.56,79.7,132.0,18.0,42.0
75%,936.0,1969.0,1.7,90.4,147.0,52.0,61.0
max,1371.0,2002.0,188.0,67125.0,1158.0,692801.0,189.0


In [141]:
data6['height'].unique()

array([  1.7  ,   1.67 ,   1.6  ,   1.68 ,   1.65 ,   1.63 ,   1.66 ,
         1.45 ,   1.64 ,   1.8  ,   1.75 ,   1.73 ,   1.62 ,   1.56 ,
         1.76 ,   1.74 ,   1.57 ,   1.71 ,   1.5  ,   1.78 ,   1.48 ,
         1.72 ,   1.69 ,   1.89 ,   1.52 ,   1.59 ,   1.58 ,   1.87 ,
         1.82 ,   1.77 ,   1.53 ,   1.44 ,   1.61 ,   0.   ,   1.95 ,
         1.54 ,   1.88 ,   1.81 ,   1.51 , 169.   ,   1.55 ,   1.9  ,
         2.   ,   1.42 ,   1.41 ,   1.47 ,   1.85 ,   1.83 ,   1.84 ,
           nan,   1.86 ,   1.79 ,   1.92 ,   1.91 , 158.   ,   1.601,
         1.752,   1.625,   1.98 , 180.   , 173.   ,   1.49 , 174.   ,
         2.02 ,  97.6  , 172.   , 166.   , 167.   , 171.   , 142.   ,
       181.   , 188.   , 176.   , 184.   , 168.   , 178.   , 179.   ,
       155.   , 182.   , 183.   , 165.   , 159.   ])

In [142]:
data6['height'].isna().sum()

10

In [143]:
data6[data6['height'].isna().to_numpy()]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
3858,364,Patras,1954.0,Female,Caucasian,,76.0,Non-Smoker,False,False,GMN,125.0,53.0,2017-12-10,53.0
3859,364,Patras,1954.0,Female,Caucasian,,90.0,Non-Smoker,False,False,GMN,130.0,50.0,2018-05-03,53.0
5636,667,Sheffield,1988.0,Male,Caucasian,,,,False,False,HTN,142.0,9.0,2015-01-30,48.0
5637,667,Sheffield,1988.0,Male,Caucasian,,,,False,False,HTN,,10.0,2015-10-29,51.0
5638,667,Sheffield,1988.0,Male,Caucasian,,,,False,False,HTN,145.0,10.0,2016-05-02,48.0
5639,667,Sheffield,1988.0,Male,Caucasian,,73.6,,False,False,HTN,154.0,11.3,2017-02-17,50.0
10216,1316,Sheffield,1946.0,Female,,,,,False,False,GMN,144.0,9.0,2018-05-24,24.0
10217,1316,Sheffield,1946.0,Female,,,,,False,False,GMN,149.0,11.0,2018-10-18,30.0
10218,1316,Sheffield,1946.0,Female,,,71.0,,False,False,GMN,150.0,10.3,2019-03-28,34.0
10219,1316,Sheffield,1946.0,Female,,,69.2,,False,False,GMN,144.0,14.0,2019-10-10,32.0


In [144]:
# Remove NaN values
data7 = data6.dropna(subset=['height'])
data7.reset_index(inplace=True, drop=True)
data7['ID'] = data7.groupby(['ID']).ngroup()
data7

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
0,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,150.0,47.0,2011-04-10,69.0
1,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,0.0,64.0,2012-07-02,62.0
2,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,145.0,64.0,2012-11-09,56.0
3,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,160.0,55.0,2013-03-26,62.0
4,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,165.0,54.0,2013-09-17,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10401,1367,Sheffield,1973.0,Female,Caucasian,0.0,,,False,False,Other,137.0,5.0,2019-06-13,65.0
10402,1367,Sheffield,1973.0,Female,Caucasian,0.0,76.6,,False,False,Other,139.0,4.5,2019-12-09,64.0
10403,1368,Sheffield,1948.0,Female,Caucasian,0.0,,,False,False,HTN,0.0,29.0,2019-07-25,5.0
10404,1368,Sheffield,1948.0,Female,Caucasian,0.0,90.0,,False,False,HTN,165.0,12.3,2019-12-09,32.0


In [145]:
# Strange value
data7.loc[data7['height']==97.6]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
5994,711,Sheffield,,Male,Asian,97.6,,,False,False,GMN,0.0,0.0,2010-06-14,108.0
5995,711,Sheffield,,Male,Asian,97.6,,,False,False,GMN,0.0,0.0,2010-10-18,117.0
5996,711,Sheffield,,Male,Asian,97.6,107.8,,False,False,GMN,0.0,21.9,2010-11-10,18.0


In [146]:
# Remove strange values
data8 = data7.drop(data7.loc[data7['height']==97.6].index.to_numpy())
data8.reset_index(inplace=True, drop=True)
data8['ID'] = data8.groupby(['ID']).ngroup()
data8

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
0,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,150.0,47.0,2011-04-10,69.0
1,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,0.0,64.0,2012-07-02,62.0
2,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,145.0,64.0,2012-11-09,56.0
3,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,160.0,55.0,2013-03-26,62.0
4,0,Patras,1946.0,Male,Caucasian,1.7,,Non-Smoker,False,False,DN,165.0,54.0,2013-09-17,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10398,1366,Sheffield,1973.0,Female,Caucasian,0.0,,,False,False,Other,137.0,5.0,2019-06-13,65.0
10399,1366,Sheffield,1973.0,Female,Caucasian,0.0,76.6,,False,False,Other,139.0,4.5,2019-12-09,64.0
10400,1367,Sheffield,1948.0,Female,Caucasian,0.0,,,False,False,HTN,0.0,29.0,2019-07-25,5.0
10401,1367,Sheffield,1948.0,Female,Caucasian,0.0,90.0,,False,False,HTN,165.0,12.3,2019-12-09,32.0


In [147]:
# Convert all height measurements to meters
data8['height'] = np.where(data8['height'] > 5, data8['height'] / 100, data8['height'])

In [148]:
data8['height'].unique()

array([1.7  , 1.67 , 1.6  , 1.68 , 1.65 , 1.63 , 1.66 , 1.45 , 1.64 ,
       1.8  , 1.75 , 1.73 , 1.62 , 1.56 , 1.76 , 1.74 , 1.57 , 1.71 ,
       1.5  , 1.78 , 1.48 , 1.72 , 1.69 , 1.89 , 1.52 , 1.59 , 1.58 ,
       1.87 , 1.82 , 1.77 , 1.53 , 1.44 , 1.61 , 0.   , 1.95 , 1.54 ,
       1.88 , 1.81 , 1.51 , 1.55 , 1.9  , 2.   , 1.42 , 1.41 , 1.47 ,
       1.85 , 1.83 , 1.84 , 1.86 , 1.79 , 1.92 , 1.91 , 1.601, 1.752,
       1.625, 1.98 , 1.49 , 2.02 ])

In [149]:
# Too many rows with zero values for height to drop from the dataframe
print('Number of rows with zero height values: ',(data8['height']==0).sum())

# All patients with zero height values do not have previously-recorded height values
print('Do all patients with zero height values not have any previously-recorded height values? ', data8.groupby(['ID']).height.min().equals(data8.groupby(['ID']).height.max()))

# All patients with non-zero height values have the same height values for all of their individual patient-level observations (i.e. their heights do not change over time)
print('Do all patients with non-zero height values have the same height values for all of their individual patient-level observations (i.e. their heights do not change over time)? ', data8.loc[data8['height']!=0].groupby(['ID']).height.median().median() == data8.loc[data8['height']!=0].groupby(['ID']).first().height.median())

Number of rows with zero height values:  4761
Do all patients with zero height values not have any previously-recorded height values?  True
Do all patients with non-zero height values have the same height values for all of their individual patient-level observations (i.e. their heights do not change over time)?  True


## Age

In [150]:
# All patients with zero dob_year values do not have previously-recorded values
data8.groupby(['ID']).dob_year.min().equals(data8.groupby(['ID']).dob_year.max())

True

In [151]:
data9 = data8.copy()
data9['dob_year'].fillna(0, inplace=True)

## bp.sys

In [152]:
data10 = data9.copy()

In [153]:
# NaN, zero values and unusually small and large values
data10['bp.sys'].unique()

array([ 150.,    0.,  145.,  160.,  165.,  140.,  135.,  125.,  115.,
        120.,  180.,  130.,  110.,   nan,  155.,  137.,  122.,  105.,
        170.,  134.,  124.,  133.,  100.,  138.,  148.,  139.,  152.,
        121.,   96.,   95.,   90.,  200.,  136.,  128.,  127.,  132.,
        220.,  178.,  144.,  142.,   97.,  166.,  146.,  151.,  131.,
        117.,  126.,  300.,  123.,  175.,  149.,  190.,  156.,  114.,
         98.,  113.,  184.,  147.,  141.,  107.,   80.,  101.,  185.,
        195.,  154., 1158.,   14.,   85.,  106.,  143.,  164.,  103.,
        112.,  129.,  118.,  111.,  119.,  162.,  108.,  169.,  172.,
         92.,  177.,  153.,  159.,  174.,  116.,  173.,  168.,  183.,
        181.,  167.,  187.,  158.,  176.,  191.,  102.,  171.,  161.,
        163.,  188.,  192.,  157.,  196.,   91.,  199.,   88.,  109.,
        179.,   72.,  104.,  182.,  186.,  189.,  193.,  206.,  197.,
        209.,   17.,  237.,  202.,   99.,   93.,  215.,  225.,  207.,
        203.,  214.,

In [154]:
data10.loc[data10['bp.sys']==1158]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
2044,168,Patras,1956.0,Male,Caucasian,1.63,67.0,Past-Smoker,False,False,GMN,1158.0,34.0,2016-12-07,122.0


In [155]:
data10.iloc[2043:2046, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
2043,168,Patras,1956.0,Male,Caucasian,1.63,,Past-Smoker,False,False,GMN,120.0,34.0,2016-02-16,122.0
2044,168,Patras,1956.0,Male,Caucasian,1.63,67.0,Past-Smoker,False,False,GMN,1158.0,34.0,2016-12-07,122.0
2045,168,Patras,1956.0,Male,Caucasian,1.63,71.0,Past-Smoker,False,False,GMN,120.0,32.0,2017-01-31,146.0


In [156]:
# Change strange value
data10.loc[data10['bp.sys']==1158, ['bp.sys']] = 120.0

In [157]:
data10.loc[data10['bp.sys']==300]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
859,69,Patras,1952.0,Male,Caucasian,1.74,,Past-Smoker,True,False,Transplant,300.0,25.0,2015-03-26,72.0


In [158]:
data10.iloc[858:861, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
858,69,Patras,1952.0,Male,Caucasian,1.74,,Past-Smoker,True,False,Transplant,120.0,39.0,2014-09-22,65.0
859,69,Patras,1952.0,Male,Caucasian,1.74,,Past-Smoker,True,False,Transplant,300.0,25.0,2015-03-26,72.0
860,69,Patras,1952.0,Male,Caucasian,1.74,,Past-Smoker,True,False,Transplant,128.0,42.0,2015-05-10,72.0


In [159]:
# Change strange value
data10.loc[data10['bp.sys']==300, ['bp.sys']] = 120.0

In [160]:
data10.loc[data10['bp.sys']==17]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
5186,612,Sheffield,0.0,Male,Caucasian,1.78,,Non-Smoker,False,False,DN,17.0,17.0,2008-03-10,23.0


In [161]:
data10.iloc[5185:5188, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
5185,612,Sheffield,0.0,Male,Caucasian,1.78,,Non-Smoker,False,False,DN,160.0,17.0,2007-05-18,19.0
5186,612,Sheffield,0.0,Male,Caucasian,1.78,,Non-Smoker,False,False,DN,17.0,17.0,2008-03-10,23.0
5187,612,Sheffield,0.0,Male,Caucasian,1.78,,Non-Smoker,False,False,DN,127.0,15.0,2008-04-01,24.0


In [162]:
# Change strange value
data10.loc[data10['bp.sys']==17, ['bp.sys']] = 160.0

In [163]:
data10.loc[data10['bp.sys']==14]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
2057,169,Patras,1954.0,Female,Caucasian,1.58,,Non-Smoker,False,False,GMN,14.0,31.0,2015-09-06,60.0


In [164]:
data10.iloc[2056:2059, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
2056,169,Patras,1954.0,Female,Caucasian,1.58,,Non-Smoker,False,False,GMN,140.0,42.0,2015-01-27,77.0
2057,169,Patras,1954.0,Female,Caucasian,1.58,,Non-Smoker,False,False,GMN,14.0,31.0,2015-09-06,60.0
2058,169,Patras,1954.0,Female,Caucasian,1.58,,Non-Smoker,False,False,GMN,110.0,37.0,2015-09-29,60.0


In [165]:
# Change strange value
data10.loc[data10['bp.sys']==14, ['bp.sys']] = 140.0

In [166]:
# Replace NaN with zero for now
data10['bp.sys'].replace(0, np.nan, inplace=True)

## bun

In [167]:
data11 = data10.copy()

In [168]:
# NaN, zero values and unusually large values
# a = data11['bun'].unique()
# np.sort(a)

In [169]:
data11.loc[data11['bun']==692801]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
1851,151,Patras,1972.0,Male,Caucasian,1.71,,Non-Smoker,False,False,DN,90.0,692801.0,2015-07-29,23.0


In [170]:
data11.iloc[1850:1853, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
1850,151,Patras,1972.0,Male,Caucasian,1.71,,Non-Smoker,False,False,DN,,80.0,2015-01-28,29.0
1851,151,Patras,1972.0,Male,Caucasian,1.71,,Non-Smoker,False,False,DN,90.0,692801.0,2015-07-29,23.0
1852,151,Patras,1972.0,Male,Caucasian,1.71,,Non-Smoker,False,False,DN,101.0,72.0,2015-09-09,25.0


In [171]:
# Change strange value
data11.loc[data11['bun']==692801, ['bun']] = 80.0

In [172]:
data11.loc[data11['bun']==365]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
2048,168,Patras,1956.0,Male,Caucasian,1.63,69.0,Past-Smoker,False,False,GMN,125.0,365.0,2018-01-30,122.0


In [173]:
data11.iloc[2047:2050, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
2047,168,Patras,1956.0,Male,Caucasian,1.63,72.0,Past-Smoker,False,False,GMN,130.0,36.0,2017-09-26,122.0
2048,168,Patras,1956.0,Male,Caucasian,1.63,69.0,Past-Smoker,False,False,GMN,125.0,365.0,2018-01-30,122.0
2049,168,Patras,1956.0,Male,Caucasian,1.63,70.0,Past-Smoker,False,False,GMN,,42.0,2018-02-10,104.0


In [174]:
# Change strange value
data11.loc[data11['bun']==365, ['bun']] = 36.0

In [175]:
data11.loc[data11['bun']==471]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
2107,174,Patras,1985.0,Female,Caucasian,1.57,,Non-Smoker,False,False,GMN,,471.0,2015-01-06,105.0


In [176]:
data11.iloc[2106:2109, :]

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr
2106,173,Patras,1966.0,Male,Caucasian,1.67,103.0,Past-Smoker,False,False,HTN,135.0,166.0,2019-07-31,11.0
2107,174,Patras,1985.0,Female,Caucasian,1.57,,Non-Smoker,False,False,GMN,,471.0,2015-01-06,105.0
2108,174,Patras,1985.0,Female,Caucasian,1.57,,Non-Smoker,False,False,GMN,,60.0,2015-03-30,105.0


In [177]:
# Change strange value
data11.loc[data11['bun']==471, ['bun']] = 60.0

In [178]:
# Replace NaN with zero for now
data11['bun'].replace(0, np.nan, inplace=True)

## Weight

In [179]:
# No zero values for weight, but many NaN values
print(data11['weight'].isna().sum())
data11.loc[data11['weight']==0]

7248


Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr


In [180]:
data12 = pd.concat([data11.loc[(data11['weight'] > 30) & (data11['weight'] < 200)], data11.loc[data11['weight'].isna()]]).reset_index(drop=True)
data12['ID'] = data12.groupby(['ID']).ngroup()
data12.sort_values(['ID', 'date'], ignore_index=True, inplace=True)

# Cleaning involving imputing values
We process data in the following data sets **separately** to avoid data leak:


*   Test (extrapolation): "test_extrapolation"
*   Test (interpolation): "test_interpolation"
*   Full training set: "data_train_full"

For 5-fold cross-validation:
*   Train set 1, Val set 1
*   Train set 2, Val set 2
*   Train set 3, Val set 3
*   Train set 4, Val set 4
*   Train set 5, Val set 5

stored in "train_set_list", "val_set_list"

For validation (Deep ME):
*   Train set: "data_train"
*   Val set: "data_val"

All data in test and validation sets are imputed based solely on the training set(s)

In [181]:
# Convert dates to number of days since first observation for each patient
data13 = datetime_to_days_diff(data12, 'ID', 'date')

# Shuffle data by patient
data14 = shuffle_data_by_group(data13, 'ID', random_state=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view

In [182]:
data14_patras = data14.loc[data14['site'] == 'Patras'].reset_index(drop=True)
data14_patras['ID'] = data14_patras.groupby(['ID']).ngroup()
data14_sheffield = data14.loc[data14['site'] == 'Sheffield'].reset_index(drop=True)
data14_sheffield['ID'] = data14_sheffield.groupby(['ID']).ngroup()

## Create main train, test extrapolation, test interpolation sets

### Patras

In [183]:
# Get patients with only 1 observation, 80 in total
a = data14_patras.groupby(['ID']).size()==1
single_obs_rows_patras = data14_patras[data14_patras['ID'].map(a)]

In [184]:
# Remaining data
data15_patras = data14_patras[data14_patras['ID'].map(~a)]

# Number of remaining groups
data15_patras['ID'].unique().size

438

In [185]:
# Split test extrapolation set
data_temp_patras, test_extrapolation_patras = train_test_split_grouped_extrapolation(data15_patras, data15_patras['ID'], test_size=0.2, random_state=1)

In [186]:
test_extrapolation_patras = pd.concat([test_extrapolation_patras, single_obs_rows_patras]).reset_index(drop=True)
test_extrapolation_patras

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,times
0,5,Patras,1946.0,Male,Caucasian,1.76,,Past-Smoker,False,False,Other,,35.0,2017-03-05,53.0,0
1,5,Patras,1946.0,Male,Caucasian,1.76,,Past-Smoker,False,False,Other,,,2017-04-20,50.0,46
2,5,Patras,1946.0,Male,Caucasian,1.76,68.0,Past-Smoker,False,False,Other,140.0,48.0,2017-09-22,40.0,201
3,5,Patras,1946.0,Male,Caucasian,1.76,68.0,Past-Smoker,False,False,Other,135.0,44.0,2020-01-27,42.0,1058
4,6,Patras,1962.0,Male,Caucasian,1.65,,Smoker,False,False,DN,,43.0,2016-03-16,83.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
929,458,Patras,1930.0,Female,Caucasian,1.65,75.0,Non-Smoker,False,False,GMN,,104.0,2019-11-19,20.0,0
930,464,Patras,1947.0,Male,Caucasian,1.75,78.0,,False,False,Other,,46.0,2016-10-20,44.0,0
931,473,Patras,1997.0,Female,Caucasian,1.61,62.0,Non-Smoker,False,False,GMN,120.0,33.0,2019-04-16,112.0,0
932,474,Patras,1947.0,Male,Caucasian,1.78,75.0,Past-Smoker,False,False,GMN,110.0,78.0,2020-04-28,37.0,0


In [187]:
data_temp_patras.reset_index(inplace=True, drop=True)
data_temp_patras.sort_values(['ID', 'times'], ignore_index=True, inplace=True)
group_sizes = data_temp_patras.groupby(['ID']).size().to_numpy()
data_train_full_patras, test_interpolation_patras, n_samples_chosen_per_group_patras = train_test_split_grouped_interpolation(data_temp_patras, group_sizes, test_size=0.25, random_state=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [188]:
test_interpolation_patras.reset_index(inplace=True, drop=True)
test_interpolation_patras

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,times
0,0,Patras,1955.0,Male,Caucasian,1.75,80.0,Smoker,False,False,HTN,130.0,48.0,2016-05-27,60.0,391
1,1,Patras,1945.0,Male,Caucasian,1.67,,Non-Smoker,False,False,DN,,86.0,2016-02-19,40.0,1446
2,1,Patras,1945.0,Male,Caucasian,1.67,89.0,Non-Smoker,False,False,DN,140.0,201.0,2017-11-04,26.0,2070
3,3,Patras,1971.0,Male,Caucasian,1.75,,Past-Smoker,True,False,Transplant,,69.0,2013-02-07,64.0,101
4,3,Patras,1971.0,Male,Caucasian,1.75,,Past-Smoker,True,False,Transplant,150.0,78.0,2013-04-02,55.0,155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
830,490,Patras,1977.0,Male,Caucasian,1.65,85.0,Smoker,False,False,GMN,130.0,64.0,2016-09-19,60.0,958
831,490,Patras,1977.0,Male,Caucasian,1.65,85.0,Smoker,False,False,GMN,145.0,100.0,2017-04-12,47.0,1163
832,490,Patras,1977.0,Male,Caucasian,1.65,82.0,Smoker,False,False,GMN,140.0,65.0,2017-09-01,65.0,1305
833,490,Patras,1977.0,Male,Caucasian,1.65,86.0,Smoker,False,False,GMN,120.0,52.0,2017-12-06,60.0,1401


In [189]:
data_train_full_patras.reset_index(inplace=True, drop=True)
data_train_full_patras

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,times
0,0,Patras,1955.0,Male,Caucasian,1.75,,Smoker,False,False,HTN,,81.0,2015-05-02,41.0,0
1,0,Patras,1955.0,Male,Caucasian,1.75,,Smoker,False,False,HTN,,52.0,2015-07-05,60.0,64
2,0,Patras,1955.0,Male,Caucasian,1.75,,Smoker,False,False,HTN,,34.0,2015-11-15,51.0,197
3,0,Patras,1955.0,Male,Caucasian,1.75,80.0,Smoker,False,False,HTN,150.0,43.0,2016-05-10,72.0,374
4,1,Patras,1945.0,Male,Caucasian,1.67,,Non-Smoker,False,False,DN,,88.0,2012-03-05,36.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2503,491,Patras,1980.0,Male,Caucasian,1.78,68.0,Non-Smoker,True,False,Transplant,,46.0,2017-03-14,66.0,275
2504,491,Patras,1980.0,Male,Caucasian,1.78,68.0,Non-Smoker,True,False,Transplant,120.0,63.0,2017-06-06,61.0,359
2505,491,Patras,1980.0,Male,Caucasian,1.78,68.0,Non-Smoker,True,False,Transplant,,39.0,2017-09-28,66.0,473
2506,491,Patras,1980.0,Male,Caucasian,1.78,68.0,Non-Smoker,True,False,Transplant,120.0,45.0,2017-11-29,61.0,535


### Sheffield

In [190]:
# Get patients with only 1 observation, 80 in total
b = data14_sheffield.groupby(['ID']).size()==1
single_obs_rows_sheffield = data14_sheffield[data14_sheffield['ID'].map(b)]

In [191]:
# Remaining data
data15_sheffield = data14_sheffield[data14_sheffield['ID'].map(~b)]

# Number of remaining groups
data15_sheffield['ID'].unique().size

849

In [192]:
# Split test extrapolation set
data_temp_sheffield, test_extrapolation_sheffield = train_test_split_grouped_extrapolation(data15_sheffield, data15_sheffield['ID'], test_size=0.2, random_state=1)

In [193]:
test_extrapolation_sheffield = pd.concat([test_extrapolation_sheffield, single_obs_rows_sheffield]).reset_index(drop=True)
test_extrapolation_sheffield

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,times
0,3,Sheffield,1972.0,Female,Caucasian,0.00,,,True,False,Transplant,135.0,7.0,2015-06-05,52.0,0
1,3,Sheffield,1972.0,Female,Caucasian,0.00,,,True,False,Transplant,148.0,5.0,2016-06-01,55.0,362
2,3,Sheffield,1972.0,Female,Caucasian,0.00,,,True,False,Transplant,158.0,7.0,2016-08-31,54.0,453
3,3,Sheffield,1972.0,Female,Caucasian,0.00,,,True,False,Transplant,154.0,5.0,2017-04-01,54.0,666
4,3,Sheffield,1972.0,Female,Caucasian,0.00,,,True,False,Transplant,144.0,10.0,2017-09-20,51.0,838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1310,595,Sheffield,0.0,Female,Caucasian,0.00,84.8,,False,False,GMN,139.0,8.1,2010-09-17,50.0,0
1311,604,Sheffield,0.0,Male,Caucasian,1.74,,Past-Smoker,False,False,Other,177.0,14.0,2009-10-07,25.0,0
1312,613,Sheffield,1972.0,Male,,0.00,,,False,False,Other,,6.2,2019-03-27,90.0,0
1313,703,Sheffield,0.0,Male,Asian,1.63,,Past-Smoker,False,False,Vascular,185.0,14.0,2009-10-07,31.0,0


In [194]:
data_temp_sheffield.reset_index(inplace=True, drop=True)
data_temp_sheffield.sort_values(['ID', 'times'], ignore_index=True, inplace=True)
group_sizes = data_temp_sheffield.groupby(['ID']).size().to_numpy()
data_train_full_sheffield, test_interpolation_sheffield, n_samples_chosen_per_group_sheffield = train_test_split_grouped_interpolation(data_temp_sheffield, group_sizes, test_size=0.25, random_state=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [195]:
test_interpolation_sheffield.reset_index(inplace=True, drop=True)
test_interpolation_sheffield

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,times
0,0,Sheffield,1955.0,Male,,0.00,,,False,False,GMN,172.0,7.0,2017-02-23,75.0,1091
1,1,Sheffield,1959.0,Female,Caucasian,0.00,70.7,,True,True,Transplant,174.0,10.0,2017-06-14,29.0,1223
2,2,Sheffield,1958.0,Male,Caucasian,0.00,65.2,,True,False,Transplant,139.0,14.9,2019-03-20,26.0,1723
3,4,Sheffield,0.0,Male,Caucasian,0.00,,,False,False,GMN,,,2012-10-01,103.0,805
4,5,Sheffield,1941.0,Female,,0.00,66.0,,False,False,GMN,172.0,11.5,2016-10-28,29.0,173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,868,Sheffield,1967.0,Male,,1.79,83.2,,False,False,DN,167.0,31.3,2018-06-03,26.0,1594
1197,870,Sheffield,1958.0,Male,Caucasian,0.00,89.4,,True,False,Transplant,135.0,5.7,2019-12-02,64.0,1980
1198,872,Sheffield,1952.0,Female,Caucasian,0.00,57.4,,False,False,GMN,147.0,23.8,2019-08-02,25.0,945
1199,873,Sheffield,0.0,Female,Caucasian,0.00,,,False,False,HTN,142.0,,2010-10-15,16.0,854


In [196]:
data_train_full_sheffield.reset_index(inplace=True, drop=True)
data_train_full_sheffield

Unnamed: 0,ID,site,dob_year,gender,ethnicity,height,weight,smoker,kidney_transplant,patient_died,disease,bp.sys,bun,date,egfr,times
0,0,Sheffield,1955.0,Male,,0.0,,,False,False,GMN,,8.0,2014-02-28,73.0,0
1,0,Sheffield,1955.0,Male,,0.0,,,False,False,GMN,143.0,8.0,2014-08-15,62.0,168
2,0,Sheffield,1955.0,Male,,0.0,,,False,False,GMN,150.0,6.0,2014-12-19,74.0,294
3,0,Sheffield,1955.0,Male,,0.0,,,False,False,GMN,138.0,7.0,2015-07-05,77.0,492
4,0,Sheffield,1955.0,Male,,0.0,,,False,False,GMN,144.0,7.0,2015-10-29,70.0,608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3600,873,Sheffield,0.0,Female,Caucasian,0.0,,,False,False,HTN,119.0,,2010-09-07,19.0,816
3601,874,Sheffield,1949.0,Female,Caucasian,0.0,,,False,False,HTN,141.0,8.0,2015-10-21,22.0,0
3602,874,Sheffield,1949.0,Female,Caucasian,0.0,,,False,False,HTN,178.0,14.0,2016-02-25,25.0,127
3603,874,Sheffield,1949.0,Female,Caucasian,0.0,,,False,False,HTN,150.0,13.0,2017-01-19,26.0,456


## Create validation sets

In [197]:
# Cross-validation sets
train_set_list_patras, val_set_list_patras = [], []
group_kfold = GroupKFold(n_splits=5)
for train_index, val_index in group_kfold.split(data_train_full_patras, groups=data_train_full_patras['ID']):
    train_set_list_patras.append(data_train_full_patras.iloc[train_index].reset_index(drop=True))
    val_set_list_patras.append(data_train_full_patras.iloc[val_index].reset_index(drop=True))

train_set_list_sheffield, val_set_list_sheffield = [], []
group_kfold = GroupKFold(n_splits=5)
for train_index, val_index in group_kfold.split(data_train_full_sheffield, groups=data_train_full_sheffield['ID']):
    train_set_list_sheffield.append(data_train_full_sheffield.iloc[train_index].reset_index(drop=True))
    val_set_list_sheffield.append(data_train_full_sheffield.iloc[val_index].reset_index(drop=True))

In [198]:
# Training and validation sets
data_train_patras, data_val_patras = train_test_split_grouped_extrapolation(data_train_full_patras, data_train_full_patras['ID'], test_size=0.25, random_state=1)
data_train_patras.reset_index(inplace=True, drop=True)
data_val_patras.reset_index(inplace=True, drop=True)

data_train_sheffield, data_val_sheffield = train_test_split_grouped_extrapolation(data_train_full_sheffield, data_train_full_sheffield['ID'], test_size=0.25, random_state=1)
data_train_sheffield.reset_index(inplace=True, drop=True)
data_val_sheffield.reset_index(inplace=True, drop=True)

## Height

### Impute for data_train_full, test_extrapolation, test_interpolation

In [199]:
# Median height among all non-zero rows
median_height = data_train_full_patras.loc[data_train_full_patras['height']!=0].groupby(['ID']).height.median().median()

# Median heights by gender and ethnicity among patients with non-zero height values
height_table = pd.DataFrame({'height': data_train_full_patras.loc[data_train_full_patras['height']!=0].groupby(['ID']).first().groupby(['gender', 'ethnicity']).height.median()}).reset_index()

data_train_full_patras2 = imputer(data_train_full_patras, height_table, median_height, 'height')
test_extrapolation_patras2 = imputer(test_extrapolation_patras, height_table, median_height, 'height')
test_interpolation_patras2 = imputer(test_interpolation_patras, height_table, median_height, 'height')

# Median height among all non-zero rows
median_height = data_train_full_sheffield.loc[data_train_full_sheffield['height']!=0].groupby(['ID']).height.median().median()

# Median heights by gender and ethnicity among patients with non-zero height values
height_table = pd.DataFrame({'height': data_train_full_sheffield.loc[data_train_full_sheffield['height']!=0].groupby(['ID']).first().groupby(['gender', 'ethnicity']).height.median()}).reset_index()

data_train_full_sheffield2 = imputer(data_train_full_sheffield, height_table, median_height, 'height')
test_extrapolation_sheffield2 = imputer(test_extrapolation_sheffield, height_table, median_height, 'height')
test_interpolation_sheffield2 = imputer(test_interpolation_sheffield, height_table, median_height, 'height')

### Impute for data_train and data_val

In [200]:
# Median height among all non-zero rows
median_height = data_train_patras.loc[data_train_patras['height']!=0].groupby(['ID']).height.median().median()

# Median heights by gender and ethnicity among patients with non-zero height values
height_table = pd.DataFrame({'height': data_train_patras.loc[data_train_patras['height']!=0].groupby(['ID']).first().groupby(['gender', 'ethnicity']).height.median()}).reset_index()

data_train_patras2 = imputer(data_train_patras, height_table, median_height, 'height')
data_val_patras2 = imputer(data_val_patras, height_table, median_height, 'height')

median_height = data_train_sheffield.loc[data_train_sheffield['height']!=0].groupby(['ID']).height.median().median()

# Median heights by gender and ethnicity among patients with non-zero height values
height_table = pd.DataFrame({'height': data_train_sheffield.loc[data_train_sheffield['height']!=0].groupby(['ID']).first().groupby(['gender', 'ethnicity']).height.median()}).reset_index()

data_train_sheffield2 = imputer(data_train_sheffield, height_table, median_height, 'height')
data_val_sheffield2 = imputer(data_val_sheffield, height_table, median_height, 'height')

### Impute for CV sets

In [201]:
train_set_list_patras2, val_set_list_patras2 = [], []
train_set_list_sheffield2, val_set_list_sheffield2 = [], []
for i in range(len(train_set_list_patras)):
    train_set_patras = train_set_list_patras[i]
    val_set_patras = val_set_list_patras[i]
    median_height = train_set_patras.loc[train_set_patras['height']!=0].groupby(['ID']).height.median().median()
    height_table = pd.DataFrame({'height': train_set_patras.loc[train_set_patras['height']!=0].groupby(['ID']).first().groupby(['gender', 'ethnicity']).height.median()}).reset_index()
    train_set_list_patras2.append(imputer(train_set_patras, height_table, median_height, 'height'))
    val_set_list_patras2.append(imputer(val_set_patras, height_table, median_height, 'height'))

    train_set_sheffield = train_set_list_sheffield[i]
    val_set_sheffield = val_set_list_sheffield[i]
    median_height = train_set_sheffield.loc[train_set_sheffield['height']!=0].groupby(['ID']).height.median().median()
    height_table = pd.DataFrame({'height': train_set_sheffield.loc[train_set_sheffield['height']!=0].groupby(['ID']).first().groupby(['gender', 'ethnicity']).height.median()}).reset_index()
    train_set_list_sheffield2.append(imputer(train_set_sheffield, height_table, median_height, 'height'))
    val_set_list_sheffield2.append(imputer(val_set_sheffield, height_table, median_height, 'height'))

## Weight

### Impute for data_train_full, test_extrapolation, test_interpolation

In [202]:
# Median weight among all non-NaN rows
median_weight = data_train_full_patras2.loc[~data_train_full_patras2['weight'].isna()].groupby(['ID']).weight.median().median()

# Impute NaN values using the mean of each patient in the training set where available
data_train_full_patras2['weight'] = data_train_full_patras2['weight'].fillna(data_train_full_patras2.groupby('ID')['weight'].transform('mean'))
test_extrapolation_patras2['weight'] = test_extrapolation_patras2['weight'].fillna(data_train_full_patras2.groupby('ID')['weight'].transform('mean'))
test_interpolation_patras2['weight'] = test_interpolation_patras2['weight'].fillna(data_train_full_patras2.groupby('ID')['weight'].transform('mean'))

# Fill remaining NaN values with overall median
data_train_full_patras2['weight'].fillna(median_weight, inplace=True)
test_extrapolation_patras2['weight'].fillna(median_weight, inplace=True)
test_interpolation_patras2['weight'].fillna(median_weight, inplace=True)

# Median weight among all non-NaN rows
median_weight = data_train_full_sheffield2.loc[~data_train_full_sheffield2['weight'].isna()].groupby(['ID']).weight.median().median()

# Impute NaN values using the mean of each patient in the training set where available
data_train_full_sheffield2['weight'] = data_train_full_sheffield2['weight'].fillna(data_train_full_sheffield2.groupby('ID')['weight'].transform('mean'))
test_extrapolation_sheffield2['weight'] = test_extrapolation_sheffield2['weight'].fillna(data_train_full_sheffield2.groupby('ID')['weight'].transform('mean'))
test_interpolation_sheffield2['weight'] = test_interpolation_sheffield2['weight'].fillna(data_train_full_sheffield2.groupby('ID')['weight'].transform('mean'))

# Fill remaining NaN values with overall median
data_train_full_sheffield2['weight'].fillna(median_weight, inplace=True)
test_extrapolation_sheffield2['weight'].fillna(median_weight, inplace=True)
test_interpolation_sheffield2['weight'].fillna(median_weight, inplace=True)

### Impute for data_train and data_val

In [203]:
# Median weight among all non-NaN rows
median_weight = data_train_patras2.loc[~data_train_patras2['weight'].isna()].groupby(['ID']).weight.median().median()

# Impute NaN values using the mean of each patient in the training set where available
data_train_patras2['weight'] = data_train_patras2['weight'].fillna(data_train_patras2.groupby('ID')['weight'].transform('mean'))
data_val_patras2['weight'] = data_val_patras2['weight'].fillna(data_train_patras2.groupby('ID')['weight'].transform('mean'))

# Fill remaining NaN values with overall median
data_train_patras2['weight'].fillna(median_weight, inplace=True)
data_val_patras2['weight'].fillna(median_weight, inplace=True)

# Median weight among all non-NaN rows
median_weight = data_train_sheffield2.loc[~data_train_sheffield2['weight'].isna()].groupby(['ID']).weight.median().median()

# Impute NaN values using the mean of each patient in the training set where available
data_train_sheffield2['weight'] = data_train_sheffield2['weight'].fillna(data_train_sheffield2.groupby('ID')['weight'].transform('mean'))
data_val_sheffield2['weight'] = data_val_sheffield2['weight'].fillna(data_train_sheffield2.groupby('ID')['weight'].transform('mean'))

# Fill remaining NaN values with overall median
data_train_sheffield2['weight'].fillna(median_weight, inplace=True)
data_val_sheffield2['weight'].fillna(median_weight, inplace=True)

### Impute for CV sets

In [204]:
for i in range(len(train_set_list_patras2)):
    # Median weight among all non-NaN rows
    median_weight = train_set_list_patras2[i].loc[~train_set_list_patras2[i]['weight'].isna()].groupby(['ID']).weight.median().median()

    # Impute NaN values using the mean of each patient in the training set where available
    train_set_list_patras2[i]['weight'] = train_set_list_patras2[i]['weight'].fillna(train_set_list_patras2[i].groupby('ID')['weight'].transform('mean'))
    val_set_list_patras2[i]['weight'] = val_set_list_patras2[i]['weight'].fillna(train_set_list_patras2[i].groupby('ID')['weight'].transform('mean'))

    # Fill remaining NaN values with overall median
    train_set_list_patras2[i]['weight'].fillna(median_weight, inplace=True)
    val_set_list_patras2[i]['weight'].fillna(median_weight, inplace=True)
    
    # Median weight among all non-NaN rows
    median_weight = train_set_list_sheffield2[i].loc[~train_set_list_sheffield2[i]['weight'].isna()].groupby(['ID']).weight.median().median()

    # Impute NaN values using the mean of each patient in the training set where available
    train_set_list_sheffield2[i]['weight'] = train_set_list_sheffield2[i]['weight'].fillna(train_set_list_sheffield2[i].groupby('ID')['weight'].transform('mean'))
    val_set_list_sheffield2[i]['weight'] = val_set_list_sheffield2[i]['weight'].fillna(train_set_list_sheffield2[i].groupby('ID')['weight'].transform('mean'))

    # Fill remaining NaN values with overall median
    train_set_list_sheffield2[i]['weight'].fillna(median_weight, inplace=True)
    val_set_list_sheffield2[i]['weight'].fillna(median_weight, inplace=True)

## Age

### Impute for data_train_full, test_extrapolation, test_interpolation

In [205]:
# Median dob_year among all non-zero rows
median_dob = data_train_full_patras2.loc[data_train_full_patras2['dob_year']!=0].groupby(['ID']).dob_year.median().median()

# Median dob_year by gender and ethnicity among patients with non-zero dob_year values
dob_table = pd.DataFrame({'dob_year': data_train_full_patras2.loc[data_train_full_patras2['dob_year']!=0].groupby(['ID']).first().groupby(['gender', 'ethnicity']).dob_year.median()}).reset_index()

data_train_full_patras3 = imputer(data_train_full_patras2, dob_table, median_dob, 'dob_year')
test_extrapolation_patras3 = imputer(test_extrapolation_patras2, dob_table, median_dob, 'dob_year')
test_interpolation_patras3 = imputer(test_interpolation_patras2, dob_table, median_dob, 'dob_year')

# Convert dob_year to age
data_train_full_patras3.insert(3, 'age', data_train_full_patras3['date'].dt.year - data_train_full_patras3['dob_year'])
data_train_full_patras3.drop(columns=['dob_year', 'date'], inplace=True)
test_extrapolation_patras3.insert(3, 'age', test_extrapolation_patras3['date'].dt.year - test_extrapolation_patras3['dob_year'])
test_extrapolation_patras3.drop(columns=['dob_year', 'date'], inplace=True)
test_interpolation_patras3.insert(3, 'age', test_interpolation_patras3['date'].dt.year - test_interpolation_patras3['dob_year'])
test_interpolation_patras3.drop(columns=['dob_year', 'date'], inplace=True)

# Median dob_year among all non-zero rows
median_dob = data_train_full_sheffield2.loc[data_train_full_sheffield2['dob_year']!=0].groupby(['ID']).dob_year.median().median()

# Median dob_year by gender and ethnicity among patients with non-zero dob_year values
dob_table = pd.DataFrame({'dob_year': data_train_full_sheffield2.loc[data_train_full_sheffield2['dob_year']!=0].groupby(['ID']).first().groupby(['gender', 'ethnicity']).dob_year.median()}).reset_index()

data_train_full_sheffield3 = imputer(data_train_full_sheffield2, dob_table, median_dob, 'dob_year')
test_extrapolation_sheffield3 = imputer(test_extrapolation_sheffield2, dob_table, median_dob, 'dob_year')
test_interpolation_sheffield3 = imputer(test_interpolation_sheffield2, dob_table, median_dob, 'dob_year')

# Convert dob_year to age
data_train_full_sheffield3.insert(3, 'age', data_train_full_sheffield3['date'].dt.year - data_train_full_sheffield3['dob_year'])
data_train_full_sheffield3.drop(columns=['dob_year', 'date'], inplace=True)
test_extrapolation_sheffield3.insert(3, 'age', test_extrapolation_sheffield3['date'].dt.year - test_extrapolation_sheffield3['dob_year'])
test_extrapolation_sheffield3.drop(columns=['dob_year', 'date'], inplace=True)
test_interpolation_sheffield3.insert(3, 'age', test_interpolation_sheffield3['date'].dt.year - test_interpolation_sheffield3['dob_year'])
test_interpolation_sheffield3.drop(columns=['dob_year', 'date'], inplace=True)

### Impute for data_train and data_val

In [206]:
# Median dob_year among all non-zero rows
median_dob = data_train_patras2.loc[data_train_patras2['dob_year']!=0].groupby(['ID']).dob_year.median().median()

# Median dob_year by gender and ethnicity among patients with non-zero dob_year values
dob_table = pd.DataFrame({'dob_year': data_train_patras2.loc[data_train_patras2['dob_year']!=0].groupby(['ID']).first().groupby(['gender', 'ethnicity']).dob_year.median()}).reset_index()

data_train_patras3 = imputer(data_train_patras2, dob_table, median_dob, 'dob_year')
data_val_patras3 = imputer(data_val_patras2, dob_table, median_dob, 'dob_year')

# Convert dob_year to age
data_train_patras3.insert(3, 'age', data_train_patras3['date'].dt.year - data_train_patras3['dob_year'])
data_train_patras3.drop(columns=['dob_year', 'date'], inplace=True)
data_val_patras3.insert(3, 'age', data_val_patras3['date'].dt.year - data_val_patras3['dob_year'])
data_val_patras3.drop(columns=['dob_year', 'date'], inplace=True)

# Median dob_year among all non-zero rows
median_dob = data_train_sheffield2.loc[data_train_sheffield2['dob_year']!=0].groupby(['ID']).dob_year.median().median()

# Median dob_year by gender and ethnicity among patients with non-zero dob_year values
dob_table = pd.DataFrame({'dob_year': data_train_sheffield2.loc[data_train_sheffield2['dob_year']!=0].groupby(['ID']).first().groupby(['gender', 'ethnicity']).dob_year.median()}).reset_index()

data_train_sheffield3 = imputer(data_train_sheffield2, dob_table, median_dob, 'dob_year')
data_val_sheffield3 = imputer(data_val_sheffield2, dob_table, median_dob, 'dob_year')

# Convert dob_year to age
data_train_sheffield3.insert(3, 'age', data_train_sheffield3['date'].dt.year - data_train_sheffield3['dob_year'])
data_train_sheffield3.drop(columns=['dob_year', 'date'], inplace=True)
data_val_sheffield3.insert(3, 'age', data_val_sheffield3['date'].dt.year - data_val_sheffield3['dob_year'])
data_val_sheffield3.drop(columns=['dob_year', 'date'], inplace=True)

### Impute for CV sets

In [207]:
train_set_list_patras3, val_set_list_patras3 = [], []
train_set_list_sheffield3, val_set_list_sheffield3 = [], []
for i in range(len(train_set_list_patras2)):
    train_set_patras = train_set_list_patras2[i]
    val_set_patras = val_set_list_patras2[i]
    median_dob = train_set_patras.loc[train_set_patras['dob_year']!=0].groupby(['ID']).dob_year.median().median()
    dob_table = pd.DataFrame({'dob_year': train_set_patras.loc[train_set_patras['dob_year']!=0].groupby(['ID']).first().groupby(['gender', 'ethnicity']).dob_year.median()}).reset_index()
    train_set_patras3 = imputer(train_set_patras, dob_table, median_dob, 'dob_year')
    val_set_patras3 = imputer(val_set_patras, dob_table, median_dob, 'dob_year')
    train_set_patras3.insert(3, 'age', train_set_patras3['date'].dt.year - train_set_patras3['dob_year'])
    train_set_patras3.drop(columns=['dob_year', 'date'], inplace=True)
    val_set_patras3.insert(3, 'age', val_set_patras3['date'].dt.year - val_set_patras3['dob_year'])
    val_set_patras3.drop(columns=['dob_year', 'date'], inplace=True)
    train_set_list_patras3.append(train_set_patras3)
    val_set_list_patras3.append(val_set_patras3)

    train_set_sheffield = train_set_list_sheffield2[i]
    val_set_sheffield = val_set_list_sheffield2[i]
    median_dob = train_set_sheffield.loc[train_set_sheffield['dob_year']!=0].groupby(['ID']).dob_year.median().median()
    dob_table = pd.DataFrame({'dob_year': train_set_sheffield.loc[train_set_sheffield['dob_year']!=0].groupby(['ID']).first().groupby(['gender', 'ethnicity']).dob_year.median()}).reset_index()
    train_set_sheffield3 = imputer(train_set_sheffield, dob_table, median_dob, 'dob_year')
    val_set_sheffield3 = imputer(val_set_sheffield, dob_table, median_dob, 'dob_year')
    train_set_sheffield3.insert(3, 'age', train_set_sheffield3['date'].dt.year - train_set_sheffield3['dob_year'])
    train_set_sheffield3.drop(columns=['dob_year', 'date'], inplace=True)
    val_set_sheffield3.insert(3, 'age', val_set_sheffield3['date'].dt.year - val_set_sheffield3['dob_year'])
    val_set_sheffield3.drop(columns=['dob_year', 'date'], inplace=True)
    train_set_list_sheffield3.append(train_set_sheffield3)
    val_set_list_sheffield3.append(val_set_sheffield3)

## bp.sys

In [208]:
# Fill zero values with mean of each patient
data_train_full_patras3['bp.sys'] = data_train_full_patras3['bp.sys'].fillna(data_train_full_patras3.groupby('ID')['bp.sys'].transform('mean'))
test_extrapolation_patras3['bp.sys'] = test_extrapolation_patras3['bp.sys'].fillna(data_train_full_patras3.groupby('ID')['bp.sys'].transform('mean'))
test_interpolation_patras3['bp.sys'] = test_interpolation_patras3['bp.sys'].fillna(data_train_full_patras3.groupby('ID')['bp.sys'].transform('mean'))
data_train_patras3['bp.sys'] = data_train_patras3['bp.sys'].fillna(data_train_patras3.groupby('ID')['bp.sys'].transform('mean'))
data_val_patras3['bp.sys'] = data_val_patras3['bp.sys'].fillna(data_train_patras3.groupby('ID')['bp.sys'].transform('mean'))

# Fill remaining NaN with global mean
data_train_full_patras3['bp.sys'] = data_train_full_patras3['bp.sys'].fillna(data_train_full_patras3['bp.sys'].mean())
test_extrapolation_patras3['bp.sys'] = test_extrapolation_patras3['bp.sys'].fillna(data_train_full_patras3['bp.sys'].mean())
test_interpolation_patras3['bp.sys'] = test_interpolation_patras3['bp.sys'].fillna(data_train_full_patras3['bp.sys'].mean())
data_train_patras3['bp.sys'] = data_train_patras3['bp.sys'].fillna(data_train_patras3['bp.sys'].mean())
data_val_patras3['bp.sys'] = data_val_patras3['bp.sys'].fillna(data_train_patras3['bp.sys'].mean())

# Fill zero values with mean of each patient
data_train_full_sheffield3['bp.sys'] = data_train_full_sheffield3['bp.sys'].fillna(data_train_full_sheffield3.groupby('ID')['bp.sys'].transform('mean'))
test_extrapolation_sheffield3['bp.sys'] = test_extrapolation_sheffield3['bp.sys'].fillna(data_train_full_sheffield3.groupby('ID')['bp.sys'].transform('mean'))
test_interpolation_sheffield3['bp.sys'] = test_interpolation_sheffield3['bp.sys'].fillna(data_train_full_sheffield3.groupby('ID')['bp.sys'].transform('mean'))
data_train_sheffield3['bp.sys'] = data_train_sheffield3['bp.sys'].fillna(data_train_sheffield3.groupby('ID')['bp.sys'].transform('mean'))
data_val_sheffield3['bp.sys'] = data_val_sheffield3['bp.sys'].fillna(data_train_sheffield3.groupby('ID')['bp.sys'].transform('mean'))

# Fill remaining NaN with global mean
data_train_full_sheffield3['bp.sys'] = data_train_full_sheffield3['bp.sys'].fillna(data_train_full_sheffield3['bp.sys'].mean())
test_extrapolation_sheffield3['bp.sys'] = test_extrapolation_sheffield3['bp.sys'].fillna(data_train_full_sheffield3['bp.sys'].mean())
test_interpolation_sheffield3['bp.sys'] = test_interpolation_sheffield3['bp.sys'].fillna(data_train_full_sheffield3['bp.sys'].mean())
data_train_sheffield3['bp.sys'] = data_train_sheffield3['bp.sys'].fillna(data_train_sheffield3['bp.sys'].mean())
data_val_sheffield3['bp.sys'] = data_val_sheffield3['bp.sys'].fillna(data_train_sheffield3['bp.sys'].mean())

In [209]:
for i in range(len(train_set_list_patras3)):
    train_set_list_patras3[i]['bp.sys'] = train_set_list_patras3[i]['bp.sys'].fillna(train_set_list_patras3[i].groupby('ID')['bp.sys'].transform('mean'))
    val_set_list_patras3[i]['bp.sys'] = val_set_list_patras3[i]['bp.sys'].fillna(train_set_list_patras3[i].groupby('ID')['bp.sys'].transform('mean'))
    train_set_list_patras3[i]['bp.sys'] = train_set_list_patras3[i]['bp.sys'].fillna(train_set_list_patras3[i]['bp.sys'].mean())
    val_set_list_patras3[i]['bp.sys'] = val_set_list_patras3[i]['bp.sys'].fillna(train_set_list_patras3[i]['bp.sys'].mean())

    train_set_list_sheffield3[i]['bp.sys'] = train_set_list_sheffield3[i]['bp.sys'].fillna(train_set_list_sheffield3[i].groupby('ID')['bp.sys'].transform('mean'))
    val_set_list_sheffield3[i]['bp.sys'] = val_set_list_sheffield3[i]['bp.sys'].fillna(train_set_list_sheffield3[i].groupby('ID')['bp.sys'].transform('mean'))
    train_set_list_sheffield3[i]['bp.sys'] = train_set_list_sheffield3[i]['bp.sys'].fillna(train_set_list_sheffield3[i]['bp.sys'].mean())
    val_set_list_sheffield3[i]['bp.sys'] = val_set_list_sheffield3[i]['bp.sys'].fillna(train_set_list_sheffield3[i]['bp.sys'].mean())

## bun

In [210]:
# Fill zero values with mean of each patient
data_train_full_patras3['bun'] = data_train_full_patras3['bun'].fillna(data_train_full_patras3.groupby('ID')['bun'].transform('mean'))
test_extrapolation_patras3['bun'] = test_extrapolation_patras3['bun'].fillna(data_train_full_patras3.groupby('ID')['bun'].transform('mean'))
test_interpolation_patras3['bun'] = test_interpolation_patras3['bun'].fillna(data_train_full_patras3.groupby('ID')['bun'].transform('mean'))
data_train_patras3['bun'] = data_train_patras3['bun'].fillna(data_train_patras3.groupby('ID')['bun'].transform('mean'))
data_val_patras3['bun'] = data_val_patras3['bun'].fillna(data_train_patras3.groupby('ID')['bun'].transform('mean'))

# Fill remaining NaN with global mean
data_train_full_patras3['bun'] = data_train_full_patras3['bun'].fillna(data_train_full_patras3['bun'].mean())
test_extrapolation_patras3['bun'] = test_extrapolation_patras3['bun'].fillna(data_train_full_patras3['bun'].mean())
test_interpolation_patras3['bun'] = test_interpolation_patras3['bun'].fillna(data_train_full_patras3['bun'].mean())
data_train_patras3['bun'] = data_train_patras3['bun'].fillna(data_train_patras3['bun'].mean())
data_val_patras3['bun'] = data_val_patras3['bun'].fillna(data_train_patras3['bun'].mean())

# Fill zero values with mean of each patient
data_train_full_sheffield3['bun'] = data_train_full_sheffield3['bun'].fillna(data_train_full_sheffield3.groupby('ID')['bun'].transform('mean'))
test_extrapolation_sheffield3['bun'] = test_extrapolation_sheffield3['bun'].fillna(data_train_full_sheffield3.groupby('ID')['bun'].transform('mean'))
test_interpolation_sheffield3['bun'] = test_interpolation_sheffield3['bun'].fillna(data_train_full_sheffield3.groupby('ID')['bun'].transform('mean'))
data_train_sheffield3['bun'] = data_train_sheffield3['bun'].fillna(data_train_sheffield3.groupby('ID')['bun'].transform('mean'))
data_val_sheffield3['bun'] = data_val_sheffield3['bun'].fillna(data_train_sheffield3.groupby('ID')['bun'].transform('mean'))

# Fill remaining NaN with global mean
data_train_full_sheffield3['bun'] = data_train_full_sheffield3['bun'].fillna(data_train_full_sheffield3['bun'].mean())
test_extrapolation_sheffield3['bun'] = test_extrapolation_sheffield3['bun'].fillna(data_train_full_sheffield3['bun'].mean())
test_interpolation_sheffield3['bun'] = test_interpolation_sheffield3['bun'].fillna(data_train_full_sheffield3['bun'].mean())
data_train_sheffield3['bun'] = data_train_sheffield3['bun'].fillna(data_train_sheffield3['bun'].mean())
data_val_sheffield3['bun'] = data_val_sheffield3['bun'].fillna(data_train_sheffield3['bun'].mean())

In [211]:
for i in range(len(train_set_list_patras3)):
    train_set_list_patras3[i]['bun'] = train_set_list_patras3[i]['bun'].fillna(train_set_list_patras3[i].groupby('ID')['bun'].transform('mean'))
    val_set_list_patras3[i]['bun'] = val_set_list_patras3[i]['bun'].fillna(train_set_list_patras3[i].groupby('ID')['bun'].transform('mean'))
    train_set_list_patras3[i]['bun'] = train_set_list_patras3[i]['bp.sys'].fillna(train_set_list_patras3[i]['bp.sys'].mean())
    val_set_list_patras3[i]['bun'] = val_set_list_patras3[i]['bun'].fillna(train_set_list_patras3[i]['bun'].mean())

    train_set_list_sheffield3[i]['bun'] = train_set_list_sheffield3[i]['bun'].fillna(train_set_list_sheffield3[i].groupby('ID')['bun'].transform('mean'))
    val_set_list_sheffield3[i]['bun'] = val_set_list_sheffield3[i]['bun'].fillna(train_set_list_sheffield3[i].groupby('ID')['bun'].transform('mean'))
    train_set_list_sheffield3[i]['bun'] = train_set_list_sheffield3[i]['bun'].fillna(train_set_list_sheffield3[i]['bun'].mean())
    val_set_list_sheffield3[i]['bun'] = val_set_list_sheffield3[i]['bun'].fillna(train_set_list_sheffield3[i]['bun'].mean())

# Save

In [212]:
# Fill NaN values with Unknown
data_train_full_patras4 = data_train_full_patras3.fillna('Unknown')
test_extrapolation_patras4 = test_extrapolation_patras3.fillna('Unknown')
test_interpolation_patras4 = test_interpolation_patras3.fillna('Unknown')
data_train_patras4 = data_train_patras3.fillna('Unknown')
data_val_patras4 = data_val_patras3.fillna('Unknown')

data_train_full_sheffield4 = data_train_full_sheffield3.fillna('Unknown')
test_extrapolation_sheffield4 = test_extrapolation_sheffield3.fillna('Unknown')
test_interpolation_sheffield4 = test_interpolation_sheffield3.fillna('Unknown')
data_train_sheffield4 = data_train_sheffield3.fillna('Unknown')
data_val_sheffield4 = data_val_sheffield3.fillna('Unknown')

train_set_list_patras4, val_set_list_patras4 = [], []
train_set_list_sheffield4, val_set_list_sheffield4 = [], []
for i in range(len(train_set_list_patras3)):
    train_set_list_patras4.append(train_set_list_patras3[i].fillna('Unknown'))
    val_set_list_patras4.append(val_set_list_patras3[i].fillna('Unknown'))

    train_set_list_sheffield4.append(train_set_list_sheffield3[i].fillna('Unknown'))
    val_set_list_sheffield4.append(val_set_list_sheffield3[i].fillna('Unknown'))

In [213]:
data_train_full_patras4.to_csv('Patras/data_train_full_patras.csv', index=False)
test_extrapolation_patras4.to_csv('Patras/test_extrapolation_patras.csv', index=False)
test_interpolation_patras4.to_csv('Patras/test_interpolation_patras.csv', index=False)
data_train_patras4.to_csv('Patras/data_train_patras.csv', index=False)
data_val_patras4.to_csv('Patras/data_val_patras.csv', index=False)

data_train_full_sheffield4.to_csv('Sheffield/data_train_full_sheffield.csv', index=False)
test_extrapolation_sheffield4.to_csv('Sheffield/test_extrapolation_sheffield.csv', index=False)
test_interpolation_sheffield4.to_csv('Sheffield/test_interpolation_sheffield.csv', index=False)
data_train_sheffield4.to_csv('Sheffield/data_train_sheffield.csv', index=False)
data_val_sheffield4.to_csv('Sheffield/data_val_sheffield.csv', index=False)

for i in range(len(train_set_list_patras4)):
    train_set_list_patras4[i].to_csv('Patras/data_cv_train_patras_' + str(i+1) + '.csv', index=False)
    val_set_list_patras4[i].to_csv('Patras/data_cv_val_patras_' + str(i+1) + '.csv', index=False)

    train_set_list_sheffield4[i].to_csv('Sheffield/data_cv_train_sheffield_' + str(i+1) + '.csv', index=False)
    val_set_list_sheffield4[i].to_csv('Sheffield/data_cv_val_sheffield_' + str(i+1) + '.csv', index=False)

In [215]:
n_samples_chosen_per_group_series_patras = pd.Series(n_samples_chosen_per_group_patras)
n_samples_chosen_per_group_series_patras.to_csv('Patras/n_samples_chosen_per_group_series_patras.csv', index=False)

n_samples_chosen_per_group_series_sheffield = pd.Series(n_samples_chosen_per_group_sheffield)
n_samples_chosen_per_group_series_sheffield.to_csv('Sheffield/n_samples_chosen_per_group_series_sheffield.csv', index=False)

In [216]:
!zip -r /content/file_patras.zip /content/Patras
!zip -r /content/file_sheffield.zip /content/Sheffield

  adding: content/Patras/ (stored 0%)
  adding: content/Patras/data_cv_train_patras_1.csv (deflated 89%)
  adding: content/Patras/n_samples_chosen_per_group_series_patras.csv (deflated 77%)
  adding: content/Patras/data_cv_train_patras_3.csv (deflated 89%)
  adding: content/Patras/data_cv_val_patras_5.csv (deflated 86%)
  adding: content/Patras/data_cv_train_patras_5.csv (deflated 89%)
  adding: content/Patras/test_interpolation_patras.csv (deflated 84%)
  adding: content/Patras/data_cv_train_patras_4.csv (deflated 89%)
  adding: content/Patras/data_cv_train_patras_2.csv (deflated 89%)
  adding: content/Patras/data_train_full_patras.csv (deflated 87%)
  adding: content/Patras/data_cv_val_patras_2.csv (deflated 87%)
  adding: content/Patras/data_cv_val_patras_3.csv (deflated 86%)
  adding: content/Patras/data_val_patras.csv (deflated 86%)
  adding: content/Patras/data_cv_val_patras_1.csv (deflated 86%)
  adding: content/Patras/data_train_patras.csv (deflated 87%)
  adding: content/Patra