# Preprocessing of raw blood donation data for use in SVMs - Finnish data

### Importing packages

In [1]:
import numpy as np
import pandas as pd 
import datetime
import ast
import pickle
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Remove the limit on the number of columns pandas shows when displaying a DataFrame.
pd.set_option('display.max_columns', None)

### Loading original data files 
'vdonationc' contains all donation visits but information on genetic markers is present only in 'lastdon' so we will merge them.

In [None]:
# Read both source tables: 'lastdon' (carries the donor characteristics and
# genetic markers) and 'vdonationc' (all donation visits).
data_last = pd.read_csv('../../data/lastdon.csv', low_memory=False)
data_all = pd.read_csv('../../data/vdonationc.csv', low_memory=False)

# Columns taken over from 'lastdon'; 'vdonor' is the join key.
donor_columns = [
    'vdonor', 'height', 'weight', 'smoking', 'bmi',
    'snp_17_58358769', 'snp_6_32617727', 'snp_15_45095352', 'snp_1_169549811',
    'prs_anemia', 'prs_ferritin', 'prs_hemoglobin',
]

# Inner join: visits of donors that do not occur in 'lastdon' are dropped.
data_raw = pd.merge(data_all, data_last[donor_columns], how='inner', on='vdonor')
print(data_all.shape, data_raw.shape)

In [None]:
# List the column names of the merged table.
data_raw.columns

Out of 5 786 320 total donations in vdonationc, we have genetic markers for 645 978. 

### Selecting variables and relevant visits
Drop some columns that are not needed: zip code, city, donation site and blood group. Blood group might be interesting to include, but factors have to be dummy-coded for SVMs. For now, drop it. Only keep successful whole-blood donations (donat_phleb == 'K') and Hb-deferred ones (Hb_deferral == 1 & donat_phleb == '\*') from the past 5 years (2016-2020). 

In [None]:
# Columns that are not used further on.
unused_columns = ['Unnamed: 0', 'zip', 'city', 'site', 'aborh']
data = data_raw.drop(columns=unused_columns)

# Parse the visit date once and derive calendar year and month from it.
visit_date = pd.to_datetime(data['date'])
data['date'] = visit_date
data['year'] = visit_date.dt.year
data['month'] = visit_date.dt.month

# Keep visits coded 'K', plus visits coded '*' that are flagged as Hb deferral.
is_coded_k = data['donat_phleb'] == 'K'
is_hb_deferred = (data['donat_phleb'] == '*') & (data['Hb_deferral'] == 1)
data = data.loc[is_coded_k | is_hb_deferred]
print(data_raw.shape, data.shape)

data = data.drop(columns=['status', 'donat_phleb'])
# Optional checkpoint (disabled):
#data.to_pickle('../../data/alldata.pkl')

# Restrict to visits from 2016 onwards.
data = data.loc[data['year'] > 2015]
# Optional checkpoint (disabled):
#data.to_pickle('../../data/alldata_2016_2020.pkl')
print(data.shape)

This leaves us with 201 843 donation attempts.

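Add the lag variables HbPrev*n* and DaystoPrev*n* for n = 1, ..., 5: HbPrev*n* is the Hb value recorded at the donor's n-th previous visit in the data, and DaystoPrev*n* is the number of days between that visit and the current one.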

In [None]:
def add_prev_hb_time(df, number):
    """Return a copy of *df* extended with lag features `number` rows back.

    Two columns are appended (or overwritten if they already exist):

    * ``HbPrev<number>``     - the value of ``Hb`` found `number` rows earlier.
    * ``DaystoPrev<number>`` - the difference between ``date`` in the current
      row and ``date`` `number` rows earlier, expressed in days.

    The first `number` rows have no earlier row to look at and get missing
    values in both columns. Rows are taken in the order in which they appear
    in *df*; no sorting is done here.

    The input frame is left untouched. The function is meant to be passed to
    ``groupby(...).apply``, and pandas does not support functions that mutate
    the group they receive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``Hb`` and ``date`` (datetime-like).
    number : int
        How many rows to look back.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index as *df* and the two extra columns.
    """
    suffix = str(number)
    lagged = {
        'HbPrev' + suffix: df['Hb'].shift(number),
        'DaystoPrev' + suffix: (df['date'] - df['date'].shift(number)) / pd.Timedelta('1 day'),
    }
    return df.assign(**lagged)

# Drop every row that has a missing value in any column.
data = data.dropna(axis=0)

# Add HbPrev1..5 / DaystoPrev1..5 donor by donor, one lag at a time.
# NOTE(review): the lags follow the row order within each donor, so this
# assumes the visits of a donor are already in chronological order - confirm.
# group_keys=False keeps the original row index: since pandas 2.0 the default
# would prepend 'vdonor' to the index, after which the next groupby('vdonor')
# is ambiguous (index level and column share the name).
df_5 = data
for number in range(1, 6):
    df_5 = df_5.groupby('vdonor', group_keys=False).apply(add_prev_hb_time, number=number)
    print(datetime.datetime.now(), number, 'done')
df_5.head()

df_5.to_pickle('../../data/df_2016_2020.pkl')

### Checking pre-processed data

In [None]:
# Reload the table written by the lag-building step and show summary statistics.
df = pd.read_pickle('../../data/df_2016_2020.pkl')
df.describe()

Min, max and mean for most variables look sensible. However the minimum value for several DaystoPrevn variables is very low, so we investigate this. From the Veripalvelu website: *The minimum interval between blood donations is 91 days for women and 61 days for men.* Values smaller than this can occur if the previous visit was a deferral, however. This is the case for almost all these observations. There are a few (<100) observations that have DaystoPrevn variables smaller than the minimum donation interval, but they do appear to be valid donations. We leave them in the dataset.

A look at the distribution of number of donation attempts per donor:

In [None]:
# Number of rows (donation attempts) per donor.
dcounts = df.vdonor.value_counts()

# Histogram of those per-donor counts.
plt.hist(dcounts)
plt.show()

# Marginal distributions of variables per SVM

We want to describe the distributions of all variables in the data set separately for women and men, and separately for the five models SVM-1 through SVM-5. 

In [79]:
df = pd.read_pickle('../../data/df_2016_2020.pkl')

# Round the three polygenic-score columns to 10 decimals.
for score in ('prs_anemia', 'prs_ferritin', 'prs_hemoglobin'):
    df[score] = df[score].round(10)

In [80]:
# Preview the first rows.
df.head()

Unnamed: 0,vdonor,age,sex,date,Hb,Hb_deferral,height,weight,smoking,bmi,snp_17_58358769,snp_6_32617727,snp_15_45095352,snp_1_169549811,prs_anemia,prs_ferritin,prs_hemoglobin,year,month,HbPrev1,DaystoPrev1,HbPrev2,DaystoPrev2,HbPrev3,DaystoPrev3,HbPrev4,DaystoPrev4,HbPrev5,DaystoPrev5
49,MV4ALL7LH8X3,52,Women,2016-03-31,131.0,0,163,79,False,29.7339,0,1,0,0,1e-06,-1.51e-08,-2e-06,2016,3,,,,,,,,,,
50,MV4ALL7LH8X3,52,Women,2016-07-05,118.0,1,163,79,False,29.7339,0,1,0,0,1e-06,-1.51e-08,-2e-06,2016,7,131.0,96.0,,,,,,,,
51,MV4ALL7LH8X3,52,Women,2016-10-06,125.0,0,163,79,False,29.7339,0,1,0,0,1e-06,-1.51e-08,-2e-06,2016,10,118.0,93.0,131.0,189.0,,,,,,
52,MV4ALL7LH8X3,53,Women,2016-12-30,121.0,1,163,79,False,29.7339,0,1,0,0,1e-06,-1.51e-08,-2e-06,2016,12,125.0,85.0,118.0,178.0,131.0,274.0,,,,
53,MV4ALL7LH8X3,53,Women,2017-03-23,138.0,0,163,79,False,29.7339,0,1,0,0,1e-06,-1.51e-08,-2e-06,2017,3,121.0,83.0,125.0,168.0,118.0,261.0,131.0,357.0,,


In [77]:
def describe_numerical(x):
    """Return the five-number summary of *x*.

    The result is the tuple ``(minimum, Q1, median, Q3, maximum)``, i.e. the
    0th, 25th, 50th, 75th and 100th percentiles as computed by
    ``numpy.percentile``.
    """
    five_numbers = np.percentile(x, [0, 25, 50, 75, 100])
    return tuple(five_numbers)

def describe_categorical(x):
    """Tabulate the distinct values of the Series *x*.

    Returns a 3-tuple ``(values, counts, proportions)``: the distinct values,
    how often each occurs, and each count as a fraction of the total. The
    entries are not sorted by frequency (``sort=False``).
    """
    counts = x.value_counts(sort=False)
    shares = x.value_counts(sort=False, normalize=True)
    return counts.index, counts.values, shares.values

def describe_df(df, categorical=None):
    """Summarise every column of *df*, numerical and categorical separately.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to describe. It must contain every column named in
        *categorical*; all remaining columns are treated as numerical.
    categorical : list of str, optional
        Names of the columns to tabulate as categorical. Defaults to year,
        smoking and the four SNP columns.

    Returns
    -------
    res_num : pandas.DataFrame
        One row per numerical column, with the columns 'minimum', 'Q1',
        'median', 'Q3' and 'maximum' (see ``describe_numerical``).
    res_cat : list of pandas.DataFrame
        One table per categorical column, in the order of *categorical*,
        with the columns 'Value', 'Count' and 'Proportion'
        (see ``describe_categorical``).
    """
    if categorical is None:
        categorical = ['year', 'smoking', 'snp_17_58358769', 'snp_6_32617727',
                       'snp_15_45095352', 'snp_1_169549811']
    # A list is required: a tuple passed to drop() could be read as one label.
    categorical = list(categorical)

    # numerical variables: everything that is not listed as categorical
    df_num = df.drop(columns=categorical)
    res_num = df_num.apply(describe_numerical, axis='index').transpose()
    res_num.columns = ['minimum', 'Q1', 'median', 'Q3', 'maximum']

    # categorical variables: one value/count/proportion table each
    res_cat = []
    for variable in categorical:
        res = pd.DataFrame(describe_categorical(df[variable])).transpose()
        res.columns = ['Value', 'Count', 'Proportion']
        res_cat.append(res)
    return res_num, res_cat

### Male donors

In [78]:
# All rows of male donors, restricted to the variables that are described.
df_m = df.loc[df['sex'] == 'Men', ['year', 'age', 'month', 'smoking', 'height', 'weight', 'bmi',
                         'snp_17_58358769', 'snp_6_32617727', 'snp_15_45095352', 'snp_1_169549811', 'prs_anemia', 'prs_ferritin',
                         'prs_hemoglobin', 'HbPrev1', 'DaystoPrev1', 'HbPrev2', 'DaystoPrev2','HbPrev3', 'DaystoPrev3',
                         'HbPrev4', 'DaystoPrev4', 'HbPrev5', 'DaystoPrev5']]

# Same variables, in the same order, that describe_df tabulates as categorical.
categorical_vars = ['year', 'smoking', 'snp_17_58358769', 'snp_6_32617727',
                    'snp_15_45095352', 'snp_1_169549811']

for nback in range(1, 6):
    # Keep the rows that have a value for the nback-th previous visit.
    df_sub = df_m.dropna(subset=['HbPrev' + str(nback)])
    # Lags deeper than nback are not part of the nback-visit feature set and
    # can still hold missing values, for which np.percentile returns NaN, so
    # they are left out of the description.
    deeper_lags = [prefix + str(n)
                   for n in range(nback + 1, 6)
                   for prefix in ('HbPrev', 'DaystoPrev')]
    df_sub = df_sub.drop(columns=deeper_lags)

    print(nback, 'donations back')
    res_num, res_cat = describe_df(df_sub)
    print('Numerical variables:')
    display(res_num)
    for varname, table in zip(categorical_vars, res_cat):
        print('Variable', varname)
        display(table)

1 donations back
Numerical variables:


Unnamed: 0,minimum,Q1,median,Q3,maximum
age,18.0,38.0,52.0,60.0,71.0
month,1.0,3.0,6.0,10.0,12.0
height,149.0,175.0,180.0,184.0,210.0
weight,50.0,78.0,86.0,96.0,190.0
bmi,15.31969,24.56931,26.70362,29.42508,65.192744
prs_anemia,-4e-06,-8.773e-07,-3.88e-08,7.92e-07,5e-06
prs_ferritin,-7e-06,-1.271712e-06,-2.34799e-08,1.241847e-06,8e-06
prs_hemoglobin,-1.7e-05,-3.095667e-06,1.81646e-08,3.254924e-06,1.6e-05
HbPrev1,102.0,147.0,154.0,161.0,196.0
DaystoPrev1,1.0,71.0,98.0,147.0,1212.0


Variable year


Unnamed: 0,Value,Count,Proportion
0,2017.0,21036.0,0.235497
1,2018.0,24767.0,0.277265
2,2019.0,24552.0,0.274858
3,2016.0,12425.0,0.139097
4,2020.0,6546.0,0.073282


Variable smoking


Unnamed: 0,Value,Count,Proportion
0,False,79866,0.894096
1,True,9460,0.105904


Variable snp_17_58358769


Unnamed: 0,Value,Count,Proportion
0,0.0,87791.0,0.982816
1,1.0,1535.0,0.017184


Variable snp_6_32617727


Unnamed: 0,Value,Count,Proportion
0,1.0,43927.0,0.491761
1,2.0,18489.0,0.206983
2,0.0,26910.0,0.301256


Variable snp_15_45095352


Unnamed: 0,Value,Count,Proportion
0,0.0,78603.0,0.879957
1,1.0,10233.0,0.114558
2,2.0,490.0,0.005486


Variable snp_1_169549811


Unnamed: 0,Value,Count,Proportion
0,0.0,85788.0,0.960392
1,1.0,3503.0,0.039216
2,2.0,35.0,0.000392


2 donations back
Numerical variables:


Unnamed: 0,minimum,Q1,median,Q3,maximum
age,18.0,40.0,52.0,60.0,71.0
month,1.0,3.0,7.0,10.0,12.0
height,149.0,175.0,180.0,184.0,210.0
weight,50.0,78.0,86.0,96.0,187.0
bmi,15.31969,24.57787,26.72993,29.53099,65.192744
prs_anemia,-4e-06,-8.791e-07,-4.11e-08,7.8965e-07,5e-06
prs_ferritin,-7e-06,-1.270993e-06,-2.210532e-08,1.250257e-06,8e-06
prs_hemoglobin,-1.6e-05,-3.088784e-06,2.2888e-08,3.275019e-06,1.6e-05
HbPrev1,102.0,147.0,154.0,161.0,195.0
DaystoPrev1,1.0,70.0,97.0,139.0,1199.0


Variable year


Unnamed: 0,Value,Count,Proportion
0,2018.0,22981.0,0.293118
1,2019.0,23638.0,0.301497
2,2017.0,18597.0,0.237201
3,2016.0,6691.0,0.085342
4,2020.0,6495.0,0.082842


Variable smoking


Unnamed: 0,Value,Count,Proportion
0,False,70199,0.895373
1,True,8203,0.104627


Variable snp_17_58358769


Unnamed: 0,Value,Count,Proportion
0,0.0,77094.0,0.983317
1,1.0,1308.0,0.016683


Variable snp_6_32617727


Unnamed: 0,Value,Count,Proportion
0,1.0,38564.0,0.491875
1,2.0,16216.0,0.206831
2,0.0,23622.0,0.301293


Variable snp_15_45095352


Unnamed: 0,Value,Count,Proportion
0,0.0,69065.0,0.880909
1,1.0,8906.0,0.113594
2,2.0,431.0,0.005497


Variable snp_1_169549811


Unnamed: 0,Value,Count,Proportion
0,0.0,75280.0,0.96018
1,1.0,3092.0,0.039438
2,2.0,30.0,0.000383


3 donations back
Numerical variables:


Unnamed: 0,minimum,Q1,median,Q3,maximum
age,18.0,41.0,53.0,61.0,71.0
month,1.0,3.0,6.0,10.0,12.0
height,149.0,175.0,180.0,184.0,210.0
weight,50.0,78.0,86.0,96.0,187.0
bmi,16.706205,24.61521,26.76978,29.53686,65.192744
prs_anemia,-4e-06,-8.825e-07,-4.69e-08,7.872e-07,5e-06
prs_ferritin,-7e-06,-1.265253e-06,-1.78685e-08,1.26002e-06,8e-06
prs_hemoglobin,-1.6e-05,-3.081072e-06,4.24917e-08,3.295483e-06,1.6e-05
HbPrev1,102.0,147.0,154.0,161.0,195.0
DaystoPrev1,1.0,70.0,94.0,132.0,945.0


Variable year


Unnamed: 0,Value,Count,Proportion
0,2019.0,22433.0,0.328559
1,2018.0,21024.0,0.307922
2,2016.0,2973.0,0.043543
3,2017.0,15486.0,0.226811
4,2020.0,6361.0,0.093165


Variable smoking


Unnamed: 0,Value,Count,Proportion
0,False,61187,0.896158
1,True,7090,0.103842


Variable snp_17_58358769


Unnamed: 0,Value,Count,Proportion
0,0.0,67174.0,0.983845
1,1.0,1103.0,0.016155


Variable snp_6_32617727


Unnamed: 0,Value,Count,Proportion
0,1.0,33599.0,0.492098
1,2.0,14104.0,0.20657
2,0.0,20574.0,0.301331


Variable snp_15_45095352


Unnamed: 0,Value,Count,Proportion
0,0.0,60224.0,0.882054
1,1.0,7675.0,0.11241
2,2.0,378.0,0.005536


Variable snp_1_169549811


Unnamed: 0,Value,Count,Proportion
0,0.0,65543.0,0.959957
1,1.0,2708.0,0.039662
2,2.0,26.0,0.000381


4 donations back
Numerical variables:


Unnamed: 0,minimum,Q1,median,Q3,maximum
age,18.0,42.0,54.0,61.0,71.0
month,1.0,3.0,6.0,10.0,12.0
height,149.0,175.0,180.0,184.0,210.0
weight,50.0,78.0,86.0,96.0,187.0
bmi,16.706205,24.6181,26.77551,29.56195,65.192744
prs_anemia,-4e-06,-8.832e-07,-5.21e-08,7.826e-07,5e-06
prs_ferritin,-6e-06,-1.25905e-06,-9.8048e-09,1.264376e-06,8e-06
prs_hemoglobin,-1.6e-05,-3.074471e-06,5.43668e-08,3.32903e-06,1.6e-05
HbPrev1,102.0,147.0,154.0,161.0,195.0
DaystoPrev1,1.0,69.0,92.0,126.0,848.0


Variable year


Unnamed: 0,Value,Count,Proportion
0,2018.0,18891.0,0.3203
1,2019.0,21014.0,0.356296
2,2017.0,11927.0,0.202225
3,2020.0,6144.0,0.104173
4,2016.0,1003.0,0.017006


Variable smoking


Unnamed: 0,Value,Count,Proportion
0,True,6083,0.103138
1,False,52896,0.896862


Variable snp_17_58358769


Unnamed: 0,Value,Count,Proportion
0,0.0,58062.0,0.984452
1,1.0,917.0,0.015548


Variable snp_6_32617727


Unnamed: 0,Value,Count,Proportion
0,2.0,12183.0,0.206565
1,1.0,29014.0,0.491938
2,0.0,17782.0,0.301497


Variable snp_15_45095352


Unnamed: 0,Value,Count,Proportion
0,0.0,52094.0,0.883264
1,1.0,6557.0,0.111175
2,2.0,328.0,0.005561


Variable snp_1_169549811


Unnamed: 0,Value,Count,Proportion
0,1.0,2345.0,0.03976
1,0.0,56612.0,0.959867
2,2.0,22.0,0.000373


5 donations back
Numerical variables:


Unnamed: 0,minimum,Q1,median,Q3,maximum
age,18.0,43.0,54.0,62.0,71.0
month,1.0,3.0,6.0,10.0,12.0
height,149.0,175.0,180.0,184.0,210.0
weight,50.0,78.0,86.0,96.0,187.0
bmi,16.706205,24.6181,26.79494,29.5858,65.192744
prs_anemia,-4e-06,-8.842e-07,-5.59e-08,7.818e-07,5e-06
prs_ferritin,-6e-06,-1.255091e-06,-4.6771e-09,1.273029e-06,8e-06
prs_hemoglobin,-1.6e-05,-3.067383e-06,7.29404e-08,3.349797e-06,1.6e-05
HbPrev1,102.0,147.0,154.0,161.0,195.0
DaystoPrev1,1.0,68.0,90.0,119.0,833.0


Variable year


Unnamed: 0,Value,Count,Proportion
0,2018.0,16612.0,0.328554
1,2019.0,19434.0,0.384367
2,2017.0,8468.0,0.167481
3,2020.0,5875.0,0.116196
4,2016.0,172.0,0.003402


Variable smoking


Unnamed: 0,Value,Count,Proportion
0,True,5180,0.102451
1,False,45381,0.897549


Variable snp_17_58358769


Unnamed: 0,Value,Count,Proportion
0,0.0,49810.0,0.985147
1,1.0,751.0,0.014853


Variable snp_6_32617727


Unnamed: 0,Value,Count,Proportion
0,2.0,10455.0,0.20678
1,1.0,24855.0,0.491584
2,0.0,15251.0,0.301636


Variable snp_15_45095352


Unnamed: 0,Value,Count,Proportion
0,0.0,44723.0,0.884536
1,1.0,5556.0,0.109887
2,2.0,282.0,0.005577


Variable snp_1_169549811


Unnamed: 0,Value,Count,Proportion
0,1.0,2016.0,0.039873
1,0.0,48526.0,0.959752
2,2.0,19.0,0.000376


### Female donors

In [60]:
# All rows of female donors, restricted to the variables that are described.
df_f = df.loc[df['sex'] == 'Women', ['year', 'age', 'month', 'smoking', 'height', 'weight', 'bmi',
                         'snp_17_58358769', 'snp_6_32617727', 'snp_15_45095352', 'snp_1_169549811', 'prs_anemia', 'prs_ferritin',
                         'prs_hemoglobin', 'HbPrev1', 'DaystoPrev1', 'HbPrev2', 'DaystoPrev2','HbPrev3', 'DaystoPrev3',
                         'HbPrev4', 'DaystoPrev4', 'HbPrev5', 'DaystoPrev5']]

# Same variables, in the same order, that describe_df tabulates as categorical.
categorical_vars = ['year', 'smoking', 'snp_17_58358769', 'snp_6_32617727',
                    'snp_15_45095352', 'snp_1_169549811']

for nback in range(1, 6):
    # Keep the rows that have a value for the nback-th previous visit.
    df_sub = df_f.dropna(subset=['HbPrev' + str(nback)])
    # Lags deeper than nback are not part of the nback-visit feature set and
    # can still hold missing values, for which np.percentile returns NaN, so
    # they are left out of the description.
    deeper_lags = [prefix + str(n)
                   for n in range(nback + 1, 6)
                   for prefix in ('HbPrev', 'DaystoPrev')]
    df_sub = df_sub.drop(columns=deeper_lags)

    print(nback, 'donations back')
    res_num, res_cat = describe_df(df_sub)
    print('Numerical variables:')
    display(res_num)
    for varname, table in zip(categorical_vars, res_cat):
        print('Variable', varname)
        display(table)

1 donations back
Numerical variables:


Unnamed: 0,minimum,Q1,median,Q3,maximum
age,18.0,29.0,46.0,57.0,70.0
month,1.0,3.0,6.0,10.0,12.0
height,138.0,162.0,166.0,170.0,198.0
weight,49.0,64.0,71.0,82.0,172.0
bmi,15.942183,23.18367,25.78125,29.39469,64.776711
prs_anemia,-5e-06,-8.472577e-07,-2.26692e-09,8.279037e-07,5e-06
prs_ferritin,-7e-06,-1.193138e-06,3.22431e-08,1.279313e-06,8e-06
prs_hemoglobin,-1.6e-05,-3.010211e-06,3.85015e-08,3.105051e-06,1.8e-05
HbPrev1,86.0,133.0,140.0,147.0,187.0
DaystoPrev1,2.0,104.0,132.0,206.0,1421.0


Variable year


Unnamed: 0,Value,Count,Proportion
0,2016.0,9249.0,0.109488
1,2017.0,19230.0,0.227641
2,2018.0,24973.0,0.295626
3,2019.0,24768.0,0.293199
4,2020.0,6255.0,0.074046


Variable smoking


Unnamed: 0,Value,Count,Proportion
0,False,74317,0.879751
1,True,10158,0.120249


Variable snp_17_58358769


Unnamed: 0,Value,Count,Proportion
0,0.0,83166.0,0.984504
1,1.0,1303.0,0.015425
2,2.0,6.0,7.1e-05


Variable snp_6_32617727


Unnamed: 0,Value,Count,Proportion
0,1.0,41700.0,0.493637
1,0.0,26483.0,0.313501
2,2.0,16292.0,0.192862


Variable snp_15_45095352


Unnamed: 0,Value,Count,Proportion
0,0.0,73884.0,0.874626
1,1.0,10218.0,0.120959
2,2.0,373.0,0.004416


Variable snp_1_169549811


Unnamed: 0,Value,Count,Proportion
0,0.0,80802.0,0.95652
1,1.0,3603.0,0.042652
2,2.0,70.0,0.000829


2 donations back
Numerical variables:


Unnamed: 0,minimum,Q1,median,Q3,maximum
age,18.0,30.0,47.0,58.0,70.0
month,1.0,3.0,6.0,10.0,12.0
height,138.0,162.0,166.0,170.0,198.0
weight,49.0,64.0,71.0,82.0,172.0
bmi,15.942183,23.2438,25.81663,29.40779,64.776711
prs_anemia,-5e-06,-8.506054e-07,-3.1529e-09,8.258885e-07,5e-06
prs_ferritin,-7e-06,-1.188703e-06,3.38549e-08,1.281368e-06,8e-06
prs_hemoglobin,-1.6e-05,-3.010211e-06,4.06315e-08,3.106084e-06,1.8e-05
HbPrev1,96.0,133.0,140.0,147.0,187.0
DaystoPrev1,2.0,102.0,126.0,192.0,1200.0


Variable year


Unnamed: 0,Value,Count,Proportion
0,2016.0,3244.0,0.047041
1,2017.0,15041.0,0.218109
2,2018.0,21625.0,0.313583
3,2019.0,22922.0,0.332391
4,2020.0,6129.0,0.088876


Variable smoking


Unnamed: 0,Value,Count,Proportion
0,False,60850,0.882383
1,True,8111,0.117617


Variable snp_17_58358769


Unnamed: 0,Value,Count,Proportion
0,0.0,67929.0,0.985035
1,1.0,1028.0,0.014907
2,2.0,4.0,5.8e-05


Variable snp_6_32617727


Unnamed: 0,Value,Count,Proportion
0,1.0,33988.0,0.492858
1,0.0,21707.0,0.314772
2,2.0,13266.0,0.19237


Variable snp_15_45095352


Unnamed: 0,Value,Count,Proportion
0,0.0,60328.0,0.874813
1,1.0,8333.0,0.120836
2,2.0,300.0,0.00435


Variable snp_1_169549811


Unnamed: 0,Value,Count,Proportion
0,0.0,65952.0,0.956367
1,1.0,2949.0,0.042763
2,2.0,60.0,0.00087


3 donations back
Numerical variables:


Unnamed: 0,minimum,Q1,median,Q3,maximum
age,18.0,32.0,49.0,58.0,70.0
month,1.0,3.0,6.0,10.0,12.0
height,138.0,162.0,166.0,170.0,197.0
weight,49.0,64.0,72.0,82.0,172.0
bmi,15.942183,23.30668,25.86451,29.41176,64.776711
prs_anemia,-5e-06,-8.541987e-07,-5.800145e-09,8.252804e-07,5e-06
prs_ferritin,-7e-06,-1.185416e-06,3.38549e-08,1.285622e-06,8e-06
prs_hemoglobin,-1.6e-05,-3.001904e-06,4.48068e-08,3.117147e-06,1.6e-05
HbPrev1,96.0,133.0,140.0,147.0,187.0
DaystoPrev1,2.0,100.0,124.0,180.0,1155.0


Variable year


Unnamed: 0,Value,Count,Proportion
0,2016.0,563.0,0.010219
1,2017.0,10278.0,0.186561
2,2018.0,17924.0,0.325347
3,2019.0,20503.0,0.372159
4,2020.0,5824.0,0.105714


Variable smoking


Unnamed: 0,Value,Count,Proportion
0,False,48750,0.884883
1,True,6342,0.115117


Variable snp_17_58358769


Unnamed: 0,Value,Count,Proportion
0,0.0,54297.0,0.98557
1,1.0,792.0,0.014376
2,2.0,3.0,5.4e-05


Variable snp_6_32617727


Unnamed: 0,Value,Count,Proportion
0,1.0,27121.0,0.492286
1,0.0,17420.0,0.316198
2,2.0,10551.0,0.191516


Variable snp_15_45095352


Unnamed: 0,Value,Count,Proportion
0,0.0,48203.0,0.874955
1,1.0,6651.0,0.120725
2,2.0,238.0,0.00432


Variable snp_1_169549811


Unnamed: 0,Value,Count,Proportion
0,0.0,52683.0,0.956273
1,1.0,2358.0,0.042801
2,2.0,51.0,0.000926


4 donations back
Numerical variables:


Unnamed: 0,minimum,Q1,median,Q3,maximum
age,19.0,34.0,50.0,59.0,70.0
month,1.0,3.0,6.0,10.0,12.0
height,138.0,162.0,166.0,170.0,197.0
weight,50.0,64.0,72.0,82.0,170.0
bmi,15.942183,23.33547,25.91068,29.62963,64.776711
prs_anemia,-5e-06,-8.593617e-07,-7.3495e-09,8.235684e-07,5e-06
prs_ferritin,-7e-06,-1.182103e-06,3.38549e-08,1.292249e-06,8e-06
prs_hemoglobin,-1.6e-05,-2.989836e-06,5.256755e-08,3.119629e-06,1.6e-05
HbPrev1,96.0,134.0,140.0,148.0,187.0
DaystoPrev1,2.0,99.0,119.0,168.0,1027.0


Variable year


Unnamed: 0,Value,Count,Proportion
0,2017.0,5700.0,0.131957
1,2018.0,14231.0,0.329452
2,2019.0,17894.0,0.414251
3,2020.0,5368.0,0.124271
4,2016.0,3.0,6.9e-05


Variable smoking


Unnamed: 0,Value,Count,Proportion
0,False,38339,0.887559
1,True,4857,0.112441


Variable snp_17_58358769


Unnamed: 0,Value,Count,Proportion
0,0.0,42597.0,0.986133
1,1.0,597.0,0.013821
2,2.0,2.0,4.6e-05


Variable snp_6_32617727


Unnamed: 0,Value,Count,Proportion
0,1.0,21216.0,0.491157
1,0.0,13735.0,0.317969
2,2.0,8245.0,0.190874


Variable snp_15_45095352


Unnamed: 0,Value,Count,Proportion
0,0.0,37818.0,0.875498
1,1.0,5194.0,0.120243
2,2.0,184.0,0.00426


Variable snp_1_169549811


Unnamed: 0,Value,Count,Proportion
0,0.0,41300.0,0.956107
1,1.0,1854.0,0.042921
2,2.0,42.0,0.000972


5 donations back
Numerical variables:


Unnamed: 0,minimum,Q1,median,Q3,maximum
age,19.0,36.0,51.0,60.0,70.0
month,1.0,3.0,6.0,10.0,12.0
height,138.0,162.0,166.0,170.0,193.0
weight,50.0,65.0,72.0,82.0,170.0
bmi,16.302379,23.38435,25.95156,29.70564,64.776711
prs_anemia,-5e-06,-8.686403e-07,-8.4609e-09,8.243793e-07,5e-06
prs_ferritin,-7e-06,-1.182477e-06,3.198128e-08,1.292132e-06,8e-06
prs_hemoglobin,-1.6e-05,-2.976487e-06,6.106885e-08,3.125541e-06,1.5e-05
HbPrev1,97.0,134.0,141.0,148.0,187.0
DaystoPrev1,5.0,98.0,117.0,157.75,863.0


Variable year


Unnamed: 0,Value,Count,Proportion
0,2017.0,2505.0,0.075456
1,2018.0,10646.0,0.320682
2,2019.0,15234.0,0.458883
3,2020.0,4813.0,0.144979


Variable smoking


Unnamed: 0,Value,Count,Proportion
0,False,29546,0.889993
1,True,3652,0.110007


Variable snp_17_58358769


Unnamed: 0,Value,Count,Proportion
0,0.0,32757.0,0.986716
1,1.0,440.0,0.013254
2,2.0,1.0,3e-05


Variable snp_6_32617727


Unnamed: 0,Value,Count,Proportion
0,1.0,16301.0,0.491024
1,0.0,10576.0,0.318573
2,2.0,6321.0,0.190403


Variable snp_15_45095352


Unnamed: 0,Value,Count,Proportion
0,0.0,29065.0,0.875505
1,1.0,3994.0,0.120308
2,2.0,139.0,0.004187


Variable snp_1_169549811


Unnamed: 0,Value,Count,Proportion
0,0.0,31739.0,0.956052
1,1.0,1425.0,0.042924
2,2.0,34.0,0.001024


# Scaled based on training data

We need to scale all explanatory variables before fitting the SVMs. We use the StandardScaler class from the scikit-learn package, which rescales every variable to a mean of zero and a variance of one. We save the scalers for later use when we change time-related variables. Scalers are fitted using only the training data and then used to transform both training and test data.

Test data will be the last year of donations (visits after 1 May 2019, up to the end of April 2020) and training data everything up to and including 1 May 2019.

In [None]:
# Start again from the stored table with the lag variables.
df = pd.read_pickle('../../data/df_2016_2020.pkl')

In [None]:
# Column list: four leading identifier columns, then the predictors, then
# the five pairs of lag variables, and the outcome 'Hb_deferral' last.
var = ['vdonor', 'date', 'sex', 'year', 'age', 'month', 'smoking', 'height', 'weight', 'bmi',
       'snp_17_58358769', 'snp_6_32617727', 'snp_15_45095352', 'snp_1_169549811', 'prs_anemia', 'prs_ferritin',
       'prs_hemoglobin']
for n in range(1, 6):
    var += ['HbPrev'+str(n), 'DaystoPrev'+str(n)]
var += ['Hb_deferral']

df['smoking'] = df['smoking'].astype(int)

# The four identifier columns are not kept in the split tables.
kept_columns = var[4:]

# Rows dated 2019-05-01 or earlier are training data, later rows are test data.
is_train = df['date'] <= '2019-05-01'
is_test = df['date'] > '2019-05-01'
is_men = df['sex'] == 'Men'
is_women = df['sex'] == 'Women'

train_men = df.loc[is_men & is_train, kept_columns]
train_women = df.loc[is_women & is_train, kept_columns]

test_men = df.loc[is_men & is_test, kept_columns]
test_women = df.loc[is_women & is_test, kept_columns]

In [None]:
for nback in range(1, 6):
    # Predictors for the model that looks nback visits back.
    features = ['age', 'month', 'smoking', 'height', 'weight', 'bmi',
                'snp_17_58358769', 'snp_6_32617727', 'snp_15_45095352', 'snp_1_169549811',
                'prs_anemia', 'prs_ferritin', 'prs_hemoglobin']
    for n in range(1, nback+1):
        features.extend(['HbPrev'+str(n), 'DaystoPrev'+str(n)])
    # The outcome goes last and is never scaled.
    var = features + ['Hb_deferral']

    # Rows with a missing value in any selected column are dropped.
    train_men_sub = train_men[var].dropna()
    train_women_sub = train_women[var].dropna()
    test_men_sub = test_men[var].dropna()
    test_women_sub = test_women[var].dropna()

    # One scaler per sex, fitted on that sex's own training rows only.
    scaler_men = StandardScaler()
    scaler_women = StandardScaler()
    scaler_men.fit(train_men_sub[features])
    scaler_women.fit(train_women_sub[features])

    # The training-based scalers are applied to both training and test rows.
    train_men_sub[features] = scaler_men.transform(train_men_sub[features])
    train_women_sub[features] = scaler_women.transform(train_women_sub[features])
    test_men_sub[features] = scaler_men.transform(test_men_sub[features])
    test_women_sub[features] = scaler_women.transform(test_women_sub[features])

    # Store the fitted scalers; the with-blocks make sure the files are closed.
    with open('../results/scalers/men_'+str(nback)+'.pkl', 'wb') as scaler_file:
        pickle.dump(scaler_men, scaler_file)
    with open('../results/scalers/women_'+str(nback)+'.pkl', 'wb') as scaler_file:
        pickle.dump(scaler_women, scaler_file)

    train_men_sub.to_pickle('../../data/scaled/men_'+str(nback)+'_train.pkl')
    train_women_sub.to_pickle('../../data/scaled/women_'+str(nback)+'_train.pkl')
    test_men_sub.to_pickle('../../data/scaled/men_'+str(nback)+'_test.pkl')
    test_women_sub.to_pickle('../../data/scaled/women_'+str(nback)+'_test.pkl')

# Limited variables ('onlyhb'), scaled on training data
To draw comparisons with results on the Dutch data, we also fit the model on data only containing the variables present in the Dutch data. This has the same result as removing those variables from the scaled data created above.

In [None]:
# Start again from the stored table with the lag variables.
df = pd.read_pickle('../../data/df_2016_2020.pkl')

In [None]:
# Column list: four leading identifier columns, age and month, the five
# pairs of lag variables, and the outcome 'Hb_deferral' last.
var = ['vdonor', 'date', 'sex', 'year', 'age', 'month']
for n in range(1, 6):
    var += ['HbPrev'+str(n), 'DaystoPrev'+str(n)]
var += ['Hb_deferral']

# The four identifier columns are not kept in the split tables.
kept_columns = var[4:]

# Rows dated 2019-05-01 or earlier are training data, later rows are test data.
is_train = df['date'] <= '2019-05-01'
is_test = df['date'] > '2019-05-01'
is_men = df['sex'] == 'Men'
is_women = df['sex'] == 'Women'

train_men = df.loc[is_men & is_train, kept_columns]
train_women = df.loc[is_women & is_train, kept_columns]

test_men = df.loc[is_men & is_test, kept_columns]
test_women = df.loc[is_women & is_test, kept_columns]

In [None]:
for nback in range(1, 6):
    # Predictors for the model that looks nback visits back.
    features = ['age', 'month']
    for n in range(1, nback+1):
        features.extend(['HbPrev'+str(n), 'DaystoPrev'+str(n)])
    # The outcome goes last and is never scaled.
    var = features + ['Hb_deferral']

    # Rows with a missing value in any selected column are dropped.
    train_men_sub = train_men[var].dropna()
    train_women_sub = train_women[var].dropna()
    test_men_sub = test_men[var].dropna()
    test_women_sub = test_women[var].dropna()

    # One scaler per sex, fitted on that sex's own training rows only.
    scaler_men = StandardScaler()
    scaler_women = StandardScaler()
    scaler_men.fit(train_men_sub[features])
    scaler_women.fit(train_women_sub[features])

    # Store the fitted scalers; the with-blocks make sure the files are closed.
    with open('../results/scalers_onlyhb/men_'+str(nback)+'.pkl', 'wb') as scaler_file:
        pickle.dump(scaler_men, scaler_file)
    with open('../results/scalers_onlyhb/women_'+str(nback)+'.pkl', 'wb') as scaler_file:
        pickle.dump(scaler_women, scaler_file)

    # The training-based scalers are applied to both training and test rows.
    train_men_sub[features] = scaler_men.transform(train_men_sub[features])
    train_women_sub[features] = scaler_women.transform(train_women_sub[features])
    test_men_sub[features] = scaler_men.transform(test_men_sub[features])
    test_women_sub[features] = scaler_women.transform(test_women_sub[features])

    train_men_sub.to_pickle('../../data/scaled_onlyhb/men_'+str(nback)+'_train.pkl')
    train_women_sub.to_pickle('../../data/scaled_onlyhb/women_'+str(nback)+'_train.pkl')
    test_men_sub.to_pickle('../../data/scaled_onlyhb/men_'+str(nback)+'_test.pkl')
    test_women_sub.to_pickle('../../data/scaled_onlyhb/women_'+str(nback)+'_test.pkl')

# Another run with only Hb + genetic data
This excludes weight, height, bmi and smoking. This allows us to see the relative added value of genetic data only.

In [3]:
# Start again from the stored table with the lag variables and preview it.
df = pd.read_pickle('../../data/df_2016_2020.pkl')
df.head()

Unnamed: 0,vdonor,age,sex,date,Hb,Hb_deferral,height,weight,smoking,bmi,snp_17_58358769,snp_6_32617727,snp_15_45095352,snp_1_169549811,prs_anemia,prs_ferritin,prs_hemoglobin,year,month,HbPrev1,DaystoPrev1,HbPrev2,DaystoPrev2,HbPrev3,DaystoPrev3,HbPrev4,DaystoPrev4,HbPrev5,DaystoPrev5
49,MV4ALL7LH8X3,52,Women,2016-03-31,131.0,0,163,79,False,29.7339,0,1,0,0,1e-06,-1.510828e-08,-2e-06,2016,3,,,,,,,,,,
50,MV4ALL7LH8X3,52,Women,2016-07-05,118.0,1,163,79,False,29.7339,0,1,0,0,1e-06,-1.510828e-08,-2e-06,2016,7,131.0,96.0,,,,,,,,
51,MV4ALL7LH8X3,52,Women,2016-10-06,125.0,0,163,79,False,29.7339,0,1,0,0,1e-06,-1.510828e-08,-2e-06,2016,10,118.0,93.0,131.0,189.0,,,,,,
52,MV4ALL7LH8X3,53,Women,2016-12-30,121.0,1,163,79,False,29.7339,0,1,0,0,1e-06,-1.510828e-08,-2e-06,2016,12,125.0,85.0,118.0,178.0,131.0,274.0,,,,
53,MV4ALL7LH8X3,53,Women,2017-03-23,138.0,0,163,79,False,29.7339,0,1,0,0,1e-06,-1.510828e-08,-2e-06,2017,3,121.0,83.0,125.0,168.0,118.0,261.0,131.0,357.0,,


In [4]:
# Column list: four leading identifier columns, age, month and the genetic
# variables, the five pairs of lag variables, and the outcome 'Hb_deferral' last.
var = ['vdonor', 'date', 'sex', 'year', 'age', 'month', 'snp_17_58358769', 'snp_6_32617727',
       'snp_15_45095352', 'snp_1_169549811', 'prs_anemia', 'prs_ferritin', 'prs_hemoglobin']
for n in range(1, 6):
    var += ['HbPrev'+str(n), 'DaystoPrev'+str(n)]
var += ['Hb_deferral']

# The four identifier columns are not kept in the split tables.
kept_columns = var[4:]

# Rows dated 2019-05-01 or earlier are training data, later rows are test data.
is_train = df['date'] <= '2019-05-01'
is_test = df['date'] > '2019-05-01'
is_men = df['sex'] == 'Men'
is_women = df['sex'] == 'Women'

train_men = df.loc[is_men & is_train, kept_columns]
train_women = df.loc[is_women & is_train, kept_columns]

test_men = df.loc[is_men & is_test, kept_columns]
test_women = df.loc[is_women & is_test, kept_columns]

In [6]:
for nback in range(1, 6):
    # Predictors for the model that looks nback visits back.
    features = ['age', 'month', 'snp_17_58358769', 'snp_6_32617727', 'snp_15_45095352',
                'snp_1_169549811', 'prs_anemia', 'prs_ferritin', 'prs_hemoglobin']
    for n in range(1, nback+1):
        features.extend(['HbPrev'+str(n), 'DaystoPrev'+str(n)])
    # The outcome goes last and is never scaled.
    var = features + ['Hb_deferral']

    # Rows with a missing value in any selected column are dropped.
    train_men_sub = train_men[var].dropna()
    train_women_sub = train_women[var].dropna()
    test_men_sub = test_men[var].dropna()
    test_women_sub = test_women[var].dropna()

    # One scaler per sex, fitted on that sex's own training rows only.
    scaler_men = StandardScaler()
    scaler_women = StandardScaler()
    scaler_men.fit(train_men_sub[features])
    scaler_women.fit(train_women_sub[features])

    # Store the fitted scalers; the with-blocks make sure the files are closed.
    with open('../results/scalers_hbgen/men_'+str(nback)+'.pkl', 'wb') as scaler_file:
        pickle.dump(scaler_men, scaler_file)
    with open('../results/scalers_hbgen/women_'+str(nback)+'.pkl', 'wb') as scaler_file:
        pickle.dump(scaler_women, scaler_file)

    # The training-based scalers are applied to both training and test rows.
    train_men_sub[features] = scaler_men.transform(train_men_sub[features])
    train_women_sub[features] = scaler_women.transform(train_women_sub[features])
    test_men_sub[features] = scaler_men.transform(test_men_sub[features])
    test_women_sub[features] = scaler_women.transform(test_women_sub[features])

    train_men_sub.to_pickle('../../data/scaled_hbgen/men_'+str(nback)+'_train.pkl')
    train_women_sub.to_pickle('../../data/scaled_hbgen/women_'+str(nback)+'_train.pkl')
    test_men_sub.to_pickle('../../data/scaled_hbgen/men_'+str(nback)+'_test.pkl')
    test_women_sub.to_pickle('../../data/scaled_hbgen/women_'+str(nback)+'_test.pkl')