# Preprocessing of raw blood donation data for use in SVMs

### Importing packages

In [14]:
import numpy as np
import pandas as pd 
import datetime
import pickle
import ast
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', None)

### Loading original data files

Source data: 
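- all donations 2008 - 2020     - alledonaties_2008_2020.csv
- donations in 2021            - 2021Donatiesplus.txt
- donation type code mapping   - donatiesoortcodes.txt (used to recode the donation types in the 2008 - 2020 file to those used in the 2021 file)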

Sex and date of birth are already included in the donations file in this version, so the donor file is not needed. Matching columns in both data sets are selected and data is combined into one dataframe.

In [None]:
# Placeholder: replace `...` with the directory that holds the input files before running.
# Every later cell pastes this value directly in front of a file name
# (e.g. f'{data_path}data_clean.pkl'), so it has to end with a path separator.
data_path = ...

In [6]:
# Read both raw donation exports unchanged. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk.
donations_raw = pd.read_csv(f'{data_path}alledonaties_2008_2020.csv', low_memory=False)
don_2021_raw = pd.read_csv(f'{data_path}2021Donatiesplus.txt', sep='\t', low_memory=False)

# The donation-type mapping is stored as a Python dict literal in a text file.
# The context manager guarantees the file is closed even if reading fails.
with open(f'{data_path}donatiesoortcodes.txt', 'r') as file:
    contents = file.read()
donatiesoort_dict = ast.literal_eval(contents)

In [None]:
# Work on copies so the raw frames stay untouched and this cell can be re-run
don_2008_2020 = donations_raw.copy()
don_2021 = don_2021_raw.copy()

# Only keep donations with permission to use in research
don_2008_2020 = don_2008_2020.loc[(don_2008_2020['WOtoestemming'] == 'Ja') | (don_2008_2020['ToestemmingWO'] == 'Ja'), ]
don_2021 = don_2021.loc[don_2021['ToestemmingWO'] == 'Ja', ]

# Select relevant columns and harmonise the two column names that differ between the files
don_2008_2020 = don_2008_2020[['KeyID', 'Geslacht', 'Geboortedatum', 'Einnummer', 
                       'Donatiedatum', 'Donatie_Tijd_Start', 'Donatiecentrumcode',
                       'Donatiesoortcode', 'AfgenomenVolume',
                       'hb', 'HbGoedgekeurd', 'Ferritine']].rename(columns={'hb':'Hb'})
don_2021 = don_2021[['KeyID', 'Geslacht', 'Geboortedatum', 'EINnummer', 
                       'Donatiedatum', 'Donatie_Tijd_Start', 'Donatiecentrumcode',
                       'Donatiesoortcode', 'AfgenomenVolume',
                       'Hb', 'HbGoedgekeurd', 'Ferritine']].rename(columns={'EINnummer':'Einnummer'})

# Map column values for 'Donatiesoortcode' in donations to those of don_2021.
# The result is assigned back explicitly: calling replace(..., inplace=True) on a selected
# column is chained assignment and does not update the frame under pandas copy-on-write.
don_2008_2020['Donatiesoortcode'] = don_2008_2020['Donatiesoortcode'].replace(donatiesoort_dict)

# Combine into one dataframe and switch to English column names
donations = pd.concat([don_2008_2020, don_2021])
donations = donations.rename(columns = {
                            'Geslacht':'Sex', 'Geboortedatum':'DoB', 'Einnummer':'EIN',
                            'Donatiedatum':'Date', 'Donatie_Tijd_Start':'Time', 
                            'Donatiecentrumcode':'Center', 'Donatiesoortcode':'DonType', 
                            'AfgenomenVolume':'Volume', 'HbGoedgekeurd':'HbOK', 'Ferritine':'Ferritin'})

In [8]:
# Donation types other than 'V' and 'N' are treated as "other" types.
# ('N' is reported below as donor intake; 'V' is presumably whole blood - confirm with the code list.)
all_dontypes = np.unique(donations['DonType'])
oth_dontypes = all_dontypes[~((all_dontypes == 'V') | (all_dontypes == 'N'))]

# KeyIDs of every donor who has at least one donation of an "other" type
has_other_type = donations['DonType'].isin(oth_dontypes)
donors_oth_dontypes = donations.loc[has_other_type, 'KeyID']

# d_tot: all donors; d_wb: donors without any "other" donation;
# d_wbn: the subset of d_wb that also has a type 'N' record in the data
d_tot = np.unique(donations['KeyID'])
only_v_or_n = ~donations['KeyID'].isin(donors_oth_dontypes)
d_wb = np.unique(donations.loc[only_v_or_n, 'KeyID'])
intake_of_wb_donor = donations['KeyID'].isin(d_wb) & (donations['DonType'] == 'N')
d_wbn = np.unique(donations.loc[intake_of_wb_donor, 'KeyID'])

print(f'Total number of donors: {len(d_tot)}\n'
      f'Number of donors with only whole-blood donations:{len(d_wb)}\n'
      f'Number of donors with only whole-blood donations, with donor intake:{len(d_wbn)}\n')

Total number of donors: 931533 
Number of donors with only whole-blood donations: 756007 
Number of donors with only whole-blood donations, with donor intake: 485314


### Selecting donors and donations:

- Keep donors with only whole-blood donations
- Drop donors without donor intake in dataset
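- Drop rows where the Hb assessment is missing (HbOK == 'niet gekeurd'), where Hb is not measured (Hb == 'niet bepaald'), or where Hb cannot be read as a number
- Drop rows where sex is not 'M' or 'F'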

In [9]:
# Row filters are independent of each other, so they are combined into a single mask:
# selected donors only, Hb assessed and measured, and a known sex.
keep_row = (
    donations['KeyID'].isin(d_wbn)
    & (donations['HbOK'] != 'niet gekeurd')
    & (donations['Hb'] != 'niet bepaald')
    & donations['Sex'].isin(['M','F'])
)
data = donations.loc[keep_row, ].copy()

# Anything that still cannot be parsed as a number becomes NaN and is dropped
data['Hb'] = pd.to_numeric(data['Hb'], errors='coerce')
data = data.dropna(axis=0, subset=['Hb'])

### Column dtypes

In [4]:
def datestr_to_date(datestr):
    """Reorder a 'month/day/year' string into 'year/month/day'.

    Month and day are left-padded with zeros to two characters; the year part
    is copied unchanged. The input must contain exactly two '/' separators,
    otherwise unpacking raises ValueError.
    """
    parts = datestr.split('/')
    month, day, year = parts
    return '/'.join([year, month.zfill(2), day.zfill(2)])

def timestr_to_float(timestr):
    """Convert an 'hour:minute' string into fractional hours (e.g. '13:30' -> 13.5).

    The input must consist of exactly two integer parts separated by ':';
    anything else raises ValueError.
    """
    hour, minute = map(int, timestr.split(':'))
    return hour + minute / 60

In [None]:
# Dates: reorder the strings to year/month/day, then parse them into datetimes.
# Time of day: convert 'hour:minute' into fractional hours.
data['DoB'] = pd.to_datetime(data['DoB'].apply(datestr_to_date))
data['Date'] = pd.to_datetime(data['Date'].apply(datestr_to_date))
data['Time'] = data['Time'].apply(timestr_to_float)

# Recode the Hb assessment to '0'/'1' before the numeric conversion below.
# The result is assigned back explicitly: replace(..., inplace=True) on a selected column is
# chained assignment and does not update the frame under pandas copy-on-write, in which
# case the labels would all be coerced to NaN and every row would be dropped further down.
data['HbOK'] = data['HbOK'].replace({'afgekeurd':'0', 'goedgekeurd':'1'})
data[['Volume', 'HbOK', 'Ferritin']] = data[['Volume', 'HbOK', 'Ferritin']].apply(pd.to_numeric, errors='coerce')

# Keep only rows with a valid outcome and without missing values in the required columns
data = data.loc[data['HbOK'].isin([0, 1]), ]
data = data.dropna(axis=0, subset=['KeyID', 'Sex', 'DoB', 'EIN', 'Date', 'Time', 'DonType', 'Hb', 'HbOK'])

# Checkpoint: the next section starts from this file
data.to_pickle(f'{data_path}data_clean.pkl')

### Adding more variables

- Age at day of donation 
- Month of visit 
- Year of visit
- Number of previous donations with a collected volume above 250 in the 730 days before the visit (NumDon)
- Previous ferritin level
- Days since previous ferritin level
- Previous Hb (for up to 5 previous visits)
- Days since previous Hb (for up to 5 previous visits)

In [None]:
# Reload the cleaned checkpoint and derive the per-visit variables in one chain.
# Age is expressed in years of 365.25 days; Vol_over_250 flags visits whose volume exceeds 250.
# Rows end up ordered by donor and visit date with a fresh 0..n-1 index.
data = (
    pd.read_pickle(f'{data_path}data_clean.pkl')
    .assign(
        Age=lambda visits: (visits['Date'] - visits['DoB']) / pd.Timedelta('365.25d'),
        Month=lambda visits: visits['Date'].dt.month,
        Year=lambda visits: visits['Date'].dt.year,
        Vol_over_250=lambda visits: visits['Volume'] > 250,
    )
    .sort_values(['KeyID', 'Date'])
    .reset_index(drop=True)
)

In [21]:
def add_prev_hb_time(df, number):
    """Add the Hb value from `number` rows earlier and the days elapsed since that row.

    Writes two columns into `df` (which is modified in place and returned):
    - HbPrev{number}: 'Hb' shifted down by `number` rows
    - DaysSinceHb{number}: 'Date' minus the 'Date' of that earlier row, in days

    Rows that have fewer than `number` predecessors get NaN in both columns.
    The function works on row order, so `df` is expected to hold a single donor
    sorted by date.
    """
    hb_col = f'HbPrev{str(number)}'
    days_col = f'DaysSinceHb{str(number)}'
    earlier_date = df['Date'].shift(number)
    df[hb_col] = df['Hb'].shift(number)
    df[days_col] = df['Date'].sub(earlier_date) / pd.Timedelta('1 day')
    return df

def add_numdon_inner(df):
    """Add 'NumDon': the sum of 'Vol_over_250' over the 730 days before each row.

    `df` must have a monotonic datetime index for the time-based window.
    closed='left' excludes the current row itself from the window, so a row
    with no earlier row inside the window gets NaN. `df` is modified in place
    and returned.
    """
    window = df['Vol_over_250'].rolling('730d', closed='left')
    df['NumDon'] = window.sum()
    return df

def add_numdon(df):
    """Add the per-donor 'NumDon' column (see add_numdon_inner) to `df`.

    The original index is stored in an 'index' column (written into the frame
    passed in), the frame is temporarily indexed by 'Date' so the time-based
    rolling window can run per 'KeyID', and the stored index is restored on
    the returned frame.
    """
    df['index'] = df.index
    per_donor = df.set_index('Date', drop=False).groupby('KeyID')
    with_numdon = per_donor.apply(add_numdon_inner)
    return with_numdon.set_index('index')

def add_last_ferritin(df):
    """Attach the most recent earlier ferritin measurement to every row of `df`.

    Rows are matched on the index: each row receives the ferritin value and
    date of the closest row with a strictly smaller index label that has a
    non-null 'Ferritin' (exact index matches are excluded, so a row never
    receives its own measurement). `pd.merge_asof` requires the index to be
    sorted.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with at least the columns 'Date' and 'Ferritin'.

    Returns
    -------
    pandas.DataFrame
        `df` with two extra columns: 'Last_Fer_Date' and 'FerritinPrev'.
        Rows without an earlier measurement hold NaT / NaN. Both branches
        return the same two columns, so per-donor results can be concatenated
        without creating an extra, always-empty column.
    """
    fers = df.loc[df['Ferritin'].notnull(), ['Date', 'Ferritin']]
    if fers.shape[0] == 0:
        # No measurement at all: add empty columns with the same names and order as
        # the merge below. pd.NaT keeps 'Last_Fer_Date' a datetime column, and np.nan
        # is used because the np.NaN alias no longer exists in NumPy 2.
        df = df.assign(Last_Fer_Date=pd.NaT, FerritinPrev=np.nan)
    else:
        df = pd.merge_asof(df, fers, left_index=True, right_index=True,
                           allow_exact_matches=False, direction='backward',
                           suffixes=('', '_fer'))
        df = df.rename(columns={'Ferritin_fer':'FerritinPrev',
                                'Date_fer':'Last_Fer_Date'})
    return df

In [None]:
# Build all history-based predictors, printing a timestamp after every step.
#
# Every groupby-apply below passes group_keys=False so the result keeps the original row
# index. Without it, pandas >= 2.0 prepends KeyID as an index level, and the next
# groupby('KeyID') then fails because KeyID is both an index level and a column.
print(datetime.datetime.now(), 'Starting')
df = data.loc[data.Year > 2014, ].copy()
print(datetime.datetime.now(), 'Dropped pre-2015')
df = add_numdon(df)
print(datetime.datetime.now(), 'NumDon done')
df_1 = df.groupby('KeyID', group_keys=False).apply(add_prev_hb_time, number=1)
print(datetime.datetime.now(), 'Hb1 done')
df_2 = df_1.groupby('KeyID', group_keys=False).apply(add_prev_hb_time, number=2)
print(datetime.datetime.now(), 'Hb2 done')
df_3 = df_2.groupby('KeyID', group_keys=False).apply(add_prev_hb_time, number=3)
print(datetime.datetime.now(), 'Hb3 done')
df_4 = df_3.groupby('KeyID', group_keys=False).apply(add_prev_hb_time, number=4)
print(datetime.datetime.now(), 'Hb4 done')
df_5 = df_4.groupby('KeyID', group_keys=False).apply(add_prev_hb_time, number=5)
print(datetime.datetime.now(), 'Hb5 done')

# Previous ferritin value per donor, plus the number of days since that measurement
df_5f = df_5.groupby('KeyID', group_keys=False).apply(add_last_ferritin)
df_5f['DaysSinceFer'] = (df_5f['Date'] - df_5f['Last_Fer_Date']) / pd.Timedelta('1d')
print(datetime.datetime.now(), 'Ferritin done')

# Checkpoint in two formats; the later cells read the pickle
df_5f.to_csv(f'{data_path}df_allvars.csv', index=False)
df_5f.to_pickle(f'{data_path}df_allvars.pkl')

### Marginal distributions of variables per SVM

Age, time, month, numdon, ferritin, timetofer, hbprevn, timetoprevn

In [107]:
df = pd.read_pickle(f'{data_path}df_allvars.pkl')

# For each sex and each history length (SVM-1 ... SVM-5), summarise the complete cases
# of the variables listed below.
base_vars = ['Age', 'NumDon', 'FerritinPrev', 'DaysSinceFer']
for sex in ['F','M']:
    for nback in range(1, 6):
        history_vars = [name
                        for n in range(1, nback+1)
                        for name in (f'HbPrev{n}', f'DaysSinceHb{n}')]
        var = base_vars + history_vars + ['HbOK']

        # Keep only rows of this sex without missing values in any selected column
        df_sub = df.loc[df['Sex'] == sex, var].dropna()
        print(f'\n\nDistribution of predictor variables for {sex}, SVM-{nback}')
        print(df_sub.describe())



Distribution of predictor variables for F, SVM-1
                 Age         NumDon   FerritinPrev   DaysSinceFer  \
count  236994.000000  236994.000000  236994.000000  236994.000000   
mean       35.086816       1.690115      60.164509     289.471603   
std        13.664863       1.535758      46.850158     229.896911   
min        18.050650       0.000000       1.000000       5.000000   
25%        23.405886       0.000000      33.000000     124.000000   
50%        30.288843       1.000000      47.000000     232.000000   
75%        46.573580       3.000000      74.000000     416.000000   
max        74.182067       5.000000    2000.000000    1501.000000   

             HbPrev1   DaysSinceHb1           HbOK  
count  236994.000000  236994.000000  236994.000000  
mean        8.509235     161.574812       0.967408  
std         0.536882     115.353425       0.177565  
min         5.000000       1.000000       0.000000  
25%         8.100000     104.000000       1.000000  
50%      

# Scaled based on training data

We need to scale all explanatory variables before fitting the SVMs. We use the StandardScaler from the scikit-learn package, which rescales every variable to a mean of zero and a variance of one. We save the scalers for later use when we change time-related variables. Scalers are fitted using only the training data and then used to transform both training and test data.

Test data will be the last year of donations (1 January 2021 - 31 December 2021) and training data everything before that.

In [34]:
def save_scaled_train_test_sets(train_men, test_men, train_women, test_women, foldersuffix=''):
    """Scale the predictors for SVM-1 ... SVM-5 and save scalers and scaled data sets.

    For every history length nback (1 to 5) and for men and women separately:
    the predictor columns plus 'HbOK' are selected, rows with missing values are
    dropped, a StandardScaler is fitted on the training rows only, and that
    scaler is applied to both the training and the test rows. 'HbOK' is the
    outcome and is left unscaled.

    Parameters
    ----------
    train_men, test_men, train_women, test_women : pandas.DataFrame
        Frames holding at least 'Time', 'Age', 'Month', 'NumDon', 'FerritinPrev',
        'DaysSinceFer', 'HbPrev1'..'HbPrev5', 'DaysSinceHb1'..'DaysSinceHb5' and 'HbOK'.
    foldersuffix : str, optional
        Appended to both output folder names, e.g. '_randomsplit'.

    Output files
    ------------
    ../results/scalers{foldersuffix}/{men|women}_{nback}.pkl         fitted scaler
    {data_path}scaled{foldersuffix}/{men|women}_{nback}_train.pkl    scaled training set
    {data_path}scaled{foldersuffix}/{men|women}_{nback}_test.pkl     scaled test set

    Both output folders are created when they do not exist yet.
    """
    import os  # local import: only needed here to create the output folders

    scaler_dir = '../results/scalers' + foldersuffix
    scaled_dir = f'{data_path}scaled{foldersuffix}'
    os.makedirs(scaler_dir, exist_ok=True)
    os.makedirs(scaled_dir, exist_ok=True)

    # Men and women go through exactly the same steps, each with their own scaler
    cohorts = {'men': (train_men, test_men), 'women': (train_women, test_women)}

    for nback in range(1, 6):
        predictors = ['Time', 'Age', 'Month', 'NumDon', 'FerritinPrev', 'DaysSinceFer']
        for n in range(1, nback+1):
            predictors.extend([f'HbPrev{n}', f'DaysSinceHb{n}'])
        var = predictors + ['HbOK']

        for label, (train, test) in cohorts.items():
            train_sub = train[var].dropna()
            test_sub = test[var].dropna()

            # Fit on the training rows only, then transform both sets with that scaler
            scaler = StandardScaler()
            scaler.fit(train_sub[predictors])
            train_sub[predictors] = scaler.transform(train_sub[predictors])
            test_sub[predictors] = scaler.transform(test_sub[predictors])

            # The context manager closes the pickle file once the scaler is written
            with open(f'{scaler_dir}/{label}_{nback}.pkl', 'wb') as scaler_file:
                pickle.dump(scaler, scaler_file)

            train_sub.to_pickle(f'{scaled_dir}/{label}_{nback}_train.pkl')
            test_sub.to_pickle(f'{scaled_dir}/{label}_{nback}_test.pkl')

In [40]:
df = pd.read_pickle(f'{data_path}df_allvars.pkl')

# Identifier columns first, then the model columns in the order used by the SVMs
var = ['KeyID', 'Year', 'Sex', 'Time', 'Age', 'Month', 'NumDon', 'FerritinPrev', 'DaysSinceFer']
var += [name for n in range(1, 6) for name in (f'HbPrev{n}', f'DaysSinceHb{n}')]
var.append('HbOK')

# 'KeyID', 'Year' and 'Sex' are only needed to select rows, so the saved sets start at 'Time'
model_cols = var[3:]
is_male = df.Sex == 'M'
is_female = df.Sex == 'F'
in_train_years = df.Year <= 2020
in_test_year = df.Year == 2021

train_men = df.loc[is_male & in_train_years, model_cols]
train_women = df.loc[is_female & in_train_years, model_cols]
test_men = df.loc[is_male & in_test_year, model_cols]
test_women = df.loc[is_female & in_test_year, model_cols]

save_scaled_train_test_sets(train_men, test_men, train_women, test_women, foldersuffix='')

# Test set of random donors not in train set

A train/test split in time makes sense for the hypothetical implementation of this prediction model. However, this means that many donors are present in both the train and test set. To check whether the models also generalize to donors that are not in the train set, we also make a version of the models where the train/test split is done randomly on donor level. The test set includes the last donation visit from 20% of all unique donors, sampled within groups defined by sex, Hb outcome (HbOK) and number of recent donations (NumDon). The training set includes all donations from the remaining donors.

In [None]:
# Fixed seed so the donor-level split (and everything trained on it) is the same on every run
RANDOM_SEED = 42

df = pd.read_pickle(f'{data_path}df_allvars.pkl')

# One row per donor: their last visit in the file
df_donors = df.drop_duplicates(subset='KeyID', keep='last')

# Draw 20% of those rows within every Sex / HbOK / NumDon group
df_test = (df_donors
           .groupby(['Sex', 'HbOK', 'NumDon'], group_keys=False)
           .apply(lambda x: x.sample(frac=0.2, random_state=RANDOM_SEED)))

# Training data: every visit of every donor that was not drawn into the test set
df_train = df.loc[~df.KeyID.isin(df_test.KeyID), ]

In [20]:
# Scale and save the donor-level split, split by sex, under the '_randomsplit' output folders
save_scaled_train_test_sets(
    train_men=df_train[df_train['Sex'] == 'M'].copy(),
    test_men=df_test[df_test['Sex'] == 'M'].copy(),
    train_women=df_train[df_train['Sex'] == 'F'].copy(),
    test_women=df_test[df_test['Sex'] == 'F'].copy(),
    foldersuffix='_randomsplit',
)

### Checking the proportion of donations at mobile donation sites

Names of mobile donation centers start with 'ML'. Return rates of donors who donate at mobile centers may differ from donors who visit permanent centers, as mobile sites visit the same location one day every 3-5 weeks. If this proportion is high, the assumption that donors will visit within a week of an invitation will not hold.

In [104]:
df = pd.read_pickle(f'{data_path}df_allvars.pkl')
# Restrict to visits that have every variable needed for the SVM with one previous Hb value
df = df.dropna(subset=['KeyID', 'Year', 'Sex', 'Time', 'Age', 'Month', 'NumDon', 'FerritinPrev', 'DaysSinceFer', 'HbPrev1', 'DaysSinceHb1'])

# Visits per center. The column names are set explicitly because the names produced by
# value_counts().reset_index() differ between pandas versions.
centers = df['Center'].value_counts().rename_axis('center').reset_index(name='count')
mobile = centers['center'][centers['center'].str.startswith('ML', na=False)]

mobile_donations = df.loc[df.Center.isin(mobile), ]
mobile_donors = df.loc[df.Center.isin(mobile), 'KeyID']
dons_mobile_donors = df.loc[df.KeyID.isin(mobile_donors), ]

# Scale to a percentage before rounding; rounding first and multiplying afterwards can
# print floating-point artefacts such as 3.0500000000000003
n_mobile_visits = mobile_donations.shape[0]
n_visits = df.shape[0]
print('{} out of {} donation visits were at mobile sites ({}%)'.format(n_mobile_visits, 
                                                                       n_visits, 
                                                                       round(100 * n_mobile_visits / n_visits, 2)))

def count_mobile_visits(df):
    """Add the number of mobile-site rows and the total number of rows to `df`.

    A row counts as mobile when its 'Center' value starts with 'ML'; missing
    centers count as not mobile. Both counts are written to every row as
    'MobileDons' and 'TotalDons'. `df` is modified in place and returned.
    """
    at_mobile_site = df['Center'].str.startswith('ML', na=False)
    df['MobileDons'] = int(at_mobile_site.sum())
    df['TotalDons'] = len(df)
    return df

# Per donor with at least one mobile visit: number of mobile visits and total visits.
# group_keys=False keeps the original row index on the result in every pandas version.
dons_mobile_donors = (df.loc[df.KeyID.isin(mobile_donors), ]
                        .groupby('KeyID', group_keys=False)
                        .apply(count_mobile_visits))

# The counts are repeated on every row of a donor, so one row per donor is enough
dons_mobile_donors = dons_mobile_donors.drop_duplicates('KeyID')
dons_mobile_donors['PropMobile'] = dons_mobile_donors['MobileDons'] / dons_mobile_donors['TotalDons']

# Scale to a percentage before rounding; rounding first and multiplying afterwards can
# print floating-point artefacts such as 3.0500000000000003
n_only_mobile = dons_mobile_donors.loc[dons_mobile_donors.PropMobile == 1, ].shape[0]
n_donors = len(np.unique(df.KeyID))
print('{} out of {} unique donors visited only mobile sites ({}%)'.format(n_only_mobile,
                                                                          n_donors,
                                                                          round(100 * n_only_mobile / n_donors, 2)))


21532 out of 456384 donation visits were at mobile sites (4.72%)
4770 out of 156562 unique donors visited only mobile sites (3.05%)
