# Preprocessing of raw blood donation data for use in SVMs

### Importing packages

In [1]:
import numpy as np
import pandas as pd 
import datetime
import pickle
import ast
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', None)

### Loading original data files

Source data: 
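- all donations 2008 - 2020     - alledonaties_2008_2020.csv
- donations in 2021            - 2021Donatiesplus.txt
- donation type code mapping   - donatiesoortcodes.txt (used to recode the 2008 - 2020 donation types to the 2021 coding)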

Sex and date of birth are already included in the donations file in this version, so the donor file is not needed. Matching columns in both data sets are selected and data is combined into one dataframe. 

In [6]:
# NOTE(review): `data_path` is not defined in any visible cell of this notebook. It must exist
# before this cell runs, and since file names are appended with `+` it presumably is a string
# ending in a path separator — TODO confirm and define it in a configuration cell at the top.
donations_raw = pd.read_csv(data_path+'alledonaties_2008_2020.csv', low_memory=False)
don_2021_raw = pd.read_csv(data_path+'2021Donatiesplus.txt', sep='\t', low_memory=False)

# The mapping file holds a Python literal that is parsed into `donatiesoort_dict`.
# The context manager closes the handle even when reading raises.
with open(data_path+'donatiesoortcodes.txt', 'r') as file:
    contents = file.read()
donatiesoort_dict = ast.literal_eval(contents)

In [None]:
# Work on copies so the raw frames loaded from disk stay untouched when this cell is re-run.
don_2008_2020 = donations_raw.copy()
don_2021 = don_2021_raw.copy()

# Only keep donations with permission to use in research.
# The 2008-2020 file is checked on two permission columns, the 2021 file on one.
don_2008_2020 = don_2008_2020.loc[(don_2008_2020['WOtoestemming'] == 'Ja') | (don_2008_2020['ToestemmingWO'] == 'Ja'), ]
don_2021 = don_2021.loc[don_2021['ToestemmingWO'] == 'Ja', ]

# Select relevant columns; the renames align the two files ('hb' -> 'Hb', 'EINnummer' -> 'Einnummer')
don_2008_2020 = don_2008_2020[['KeyID', 'Geslacht', 'Geboortedatum', 'Einnummer', 
                       'Donatiedatum', 'Donatie_Tijd_Start', 'Donatiecentrumcode',
                       'Donatiesoortcode', 'AfgenomenVolume',
                       'hb', 'HbGoedgekeurd', 'Ferritine']].rename(columns={'hb':'Hb'})
don_2021 = don_2021[['KeyID', 'Geslacht', 'Geboortedatum', 'EINnummer', 
                       'Donatiedatum', 'Donatie_Tijd_Start', 'Donatiecentrumcode',
                       'Donatiesoortcode', 'AfgenomenVolume',
                       'Hb', 'HbGoedgekeurd', 'Ferritine']].rename(columns={'EINnummer':'Einnummer'})

# Map column values for 'Donatiesoortcode' in donations to those of don_2021.
# The result is assigned back instead of using `inplace=True` on the selected column:
# an in-place update of a column selection is a chained assignment and does not reach
# the parent DataFrame when pandas Copy-on-Write is active.
don_2008_2020['Donatiesoortcode'] = don_2008_2020['Donatiesoortcode'].replace(donatiesoort_dict)

# Combine into one dataframe and switch to English column names
donations = pd.concat([don_2008_2020, don_2021])
donations = donations.rename(columns = {
                            'Geslacht':'Sex', 'Geboortedatum':'DoB', 'Einnummer':'EIN',
                            'Donatiedatum':'Date', 'Donatie_Tijd_Start':'Time', 
                            'Donatiecentrumcode':'Center', 'Donatiesoortcode':'DonType', 
                            'AfgenomenVolume':'Volume', 'HbGoedgekeurd':'HbOK', 'Ferritine':'Ferritin'})

In [8]:
# Every donation type other than 'V' and 'N' counts as "other". Judging by the printed
# labels below, 'V'/'N' are the whole-blood types and 'N' marks the donor intake —
# NOTE(review): confirm against the donation type coding.
all_dontypes = np.unique(donations['DonType'])
oth_dontypes = all_dontypes[~np.isin(all_dontypes, ['V', 'N'])]

# KeyIDs of donors that have at least one donation of another type
has_other_type = donations['DonType'].isin(oth_dontypes)
donors_oth_dontypes = donations.loc[has_other_type, 'KeyID']

# All donors, donors without any other donation type, and the subset of those with an 'N' row
d_tot = np.unique(donations['KeyID'])
only_wb = ~donations['KeyID'].isin(donors_oth_dontypes)
d_wb = np.unique(donations.loc[only_wb, 'KeyID'])
is_intake = donations['DonType'] == 'N'
d_wbn = np.unique(donations.loc[donations['KeyID'].isin(d_wb) & is_intake, 'KeyID'])

print('Total number of donors:', len(d_tot),
      '\nNumber of donors with only whole-blood donations:', len(d_wb),
      '\nNumber of donors with only whole-blood donations, with donor intake:', len(d_wbn))

Total number of donors: 931533 
Number of donors with only whole-blood donations: 756007 
Number of donors with only whole-blood donations, with donor intake: 485314


### Selecting donors and donations:

- Keep donors with only whole-blood donations
- Drop donors without donor intake in dataset
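- Drop rows where Hb was not measured (Hb == 'niet bepaald'), was not assessed (HbOK == 'niet gekeurd'), or is not a number
- Keep only rows where sex is recorded as 'M' or 'F'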

In [9]:
# Start from the donors that only have whole-blood donations and an intake in the data set
from_selected_donor = donations['KeyID'].isin(d_wbn)
data = donations.loc[from_selected_donor].copy()

# Remove rows flagged as not assessed / not measured before converting Hb to a number
hb_available = (data['HbOK'] != 'niet gekeurd') & (data['Hb'] != 'niet bepaald')
data = data.loc[hb_available]

# Anything that still cannot be parsed as a number becomes NaN and is dropped at the end
data = data.assign(Hb=pd.to_numeric(data['Hb'], errors='coerce'))
data = data.loc[data['Sex'].isin(['M','F'])].dropna(axis=0, subset=['Hb'])

### Column dtypes

In [4]:
def datestr_to_date(datestr):
    """Reorder a 'month/day/year' string into 'year/month/day'.

    Month and day are left-padded with zeros to two characters; the year is
    passed through unchanged. Raises ValueError when the string does not
    consist of exactly three '/'-separated parts.
    """
    month, day, year = datestr.split('/')
    return '/'.join([year, month.zfill(2), day.zfill(2)])

def timestr_to_float(timestr):
    """Convert an 'hour:minute' string into a fractional number of hours.

    For example '13:30' becomes 13.5. Raises ValueError when the string does
    not consist of exactly two ':'-separated integer parts.
    """
    hours, minutes = (int(part) for part in timestr.split(':'))
    return hours + minutes / 60

In [None]:
# Dates are rewritten to 'year/month/day' strings first and then parsed to datetimes;
# the visit time becomes a fractional hour.
data['DoB'] = data['DoB'].apply(datestr_to_date)
data['Date'] = data['Date'].apply(datestr_to_date)
data['Time'] = data['Time'].apply(timestr_to_float)
data['DoB'] = pd.to_datetime(data['DoB'])
data['Date'] = pd.to_datetime(data['Date'])

# Recode the Hb assessment to '0'/'1' so that it survives the numeric conversion below.
# The result is assigned back rather than replaced in place on the selected column: an
# in-place update of a column selection is a chained assignment and does not reach `data`
# when pandas Copy-on-Write is active, in which case every HbOK value would turn into NaN
# in the next step and the following filter would remove all rows.
data['HbOK'] = data['HbOK'].replace({'afgekeurd':'0', 'goedgekeurd':'1'})
data[['Volume', 'HbOK', 'Ferritin']] = data[['Volume', 'HbOK', 'Ferritin']].apply(pd.to_numeric, errors='coerce')

# Keep only rows with a valid assessment and without missing values in the required columns
# (Volume, Center and Ferritin are allowed to be missing).
data = data.loc[data['HbOK'].isin([0, 1]), ]
data = data.dropna(axis=0, subset=['KeyID', 'Sex', 'DoB', 'EIN', 'Date', 'Time', 'DonType', 'Hb', 'HbOK'])

# Checkpoint that is re-read at the start of the feature-engineering section
data.to_pickle(data_path+'data_clean.pkl')

### Adding more variables

- Age at day of donation 
- Month of visit 
- Year of visit
- Number of previous donations with a collected volume above 250 in the past 2 years (730 days)
- Previous ferritin level
- Days since previous ferritin level
- Previous Hb (for up to 5 previous visits)
- Days since previous Hb (for up to 5 previous visits)

In [None]:
# Reload the cleaned checkpoint, derive the per-visit variables and order the rows per
# donor by visit date (the shift- and window-based features further down rely on this order).
data = (
    pd.read_pickle(data_path+'data_clean.pkl')
    .assign(
        # Age in years at the day of the visit
        Age=lambda visits: (visits['Date'] - visits['DoB']) / pd.Timedelta('365.25d'),
        Month=lambda visits: visits['Date'].dt.month,
        Year=lambda visits: visits['Date'].dt.year,
        # Flag used later to count previous donations
        Vol_over_250=lambda visits: visits['Volume'] > 250,
    )
    .sort_values(['KeyID', 'Date'])
    .reset_index(drop=True)
)
data.head()

In [5]:
def add_prev_hb_time(df, number):
    """Add the Hb value of the visit `number` rows back and the days elapsed since it.

    Writes two columns into `df` itself, 'HbPrev<number>' and
    'TimetoPrev<number>', and returns the same frame. Rows without such an
    earlier row get NaN in both columns. The rows are expected to be in visit
    order for a single donor.
    """
    suffix = str(number)
    earlier_date = df['Date'].shift(number)
    df['HbPrev' + suffix] = df['Hb'].shift(number)
    df['TimetoPrev' + suffix] = (df['Date'] - earlier_date) / pd.Timedelta('1 day')
    return df

def add_numdon_inner(df):
    """Count, per row, the flagged donations in the 730 days before that row.

    Sums 'Vol_over_250' over a 730-day window that excludes the current row
    (closed='left') and stores the result in 'Num_Don' on `df` itself, which
    is also returned. The time-based window requires a datetime index.
    """
    preceding_two_years = df['Vol_over_250'].rolling('730d', closed='left')
    df['Num_Don'] = preceding_two_years.sum()
    return df

def add_numdon(df):
    """Add 'Num_Don' (flagged donations in the preceding 730 days) for every donor.

    The frame is temporarily indexed by 'Date' because the time-based rolling
    window in `add_numdon_inner` needs a datetime index; the original index is
    carried along in a helper column and restored afterwards.

    Parameters
    ----------
    df : DataFrame with at least 'KeyID', 'Date' and 'Vol_over_250', with the
        rows of each donor in date order.

    Returns
    -------
    A new DataFrame with the extra column 'Num_Don' and the original index
    values (the index is named 'index'). The input frame is left unchanged.
    """
    # Build the helper column on a derived frame so the caller's DataFrame does not
    # gain a stray 'index' column as a side effect.
    by_date = df.assign(index=df.index).set_index('Date', drop=False)
    # group_keys=False keeps KeyID out of the result index regardless of the pandas version.
    by_date = by_date.groupby('KeyID', group_keys=False).apply(add_numdon_inner)
    return by_date.set_index('index')

def add_last_ferritin(df):
    """Attach the most recent earlier ferritin measurement to every row.

    Rows are matched on the index: each row receives the ferritin value and date
    of the nearest row with a strictly smaller index value that has a ferritin
    measurement (exact index matches are excluded, so a row never receives its
    own measurement). The index therefore has to be sorted and has to follow
    visit order for a single donor.

    Parameters
    ----------
    df : DataFrame with at least the columns 'Date' and 'Ferritin'.

    Returns
    -------
    A new DataFrame with the additional columns 'Last_Fer' and 'Last_Fer_Date';
    both are missing where no earlier measurement exists. The input frame is
    not modified.
    """
    fers = df.loc[df['Ferritin'].notnull(), ['Date', 'Ferritin']]
    if fers.empty:
        # No measurement at all for this donor. `np.nan` is used because the `np.NaN`
        # alias no longer exists in NumPy 2.0, and the date placeholder gets the dtype of
        # 'Date' (NaT) so the column keeps one dtype when the per-donor results are
        # concatenated.
        return df.assign(
            Last_Fer=np.nan,
            Last_Fer_Date=pd.Series(pd.NaT, index=df.index, dtype=df['Date'].dtype),
        )
    merged = pd.merge_asof(df, fers, left_index=True, right_index=True,
                           allow_exact_matches=False, direction='backward',
                           suffixes=('', '_fer'))
    return merged.rename(columns={'Ferritin_fer': 'Last_Fer',
                                  'Date_fer': 'Last_Fer_Date'})

In [6]:
# Build the full feature table. Each stage prints a timestamp because the per-donor
# groupby/apply steps are slow (the captured output below shows roughly 20 minutes each).
print(datetime.datetime.now(), 'Starting')
df = data.loc[data.Year > 2014, ].copy()
print(datetime.datetime.now(), 'Dropped pre-2015')
df = add_numdon(df)
print(datetime.datetime.now(), 'NumDon done')

# Previous Hb and days since that visit, for 1 up to 5 visits back.
# group_keys=False keeps KeyID out of the result index: with the pandas >= 2.0 default,
# KeyID would become an index level as well as a column, and the next groupby('KeyID')
# would be ambiguous. Reusing one name also avoids keeping five copies of the table in memory.
df_5 = df
for n_back in range(1, 6):
    df_5 = df_5.groupby('KeyID', group_keys=False).apply(add_prev_hb_time, number=n_back)
    print(datetime.datetime.now(), 'Hb'+str(n_back)+' done')

# Most recent earlier ferritin measurement and the number of days since it was taken
df_5f = df_5.groupby('KeyID', group_keys=False).apply(add_last_ferritin)
df_5f['TimetoFer'] = (df_5f['Date'] - df_5f['Last_Fer_Date']) / pd.Timedelta('1d')
print(datetime.datetime.now(), 'Ferritin done')

# Store the result both as CSV (without the index) and as pickle; the scaling section reads the pickle
df_5f.to_csv(data_path+'df_allvars.csv', index=False)
df_5f.to_pickle(data_path+'df_allvars.pkl')

2022-07-07 18:32:20.154873 Starting
2022-07-07 18:32:20.642739 Dropped pre-2015
2022-07-07 18:49:51.315182 NumDon done
2022-07-07 19:11:02.106720 Hb1 done
2022-07-07 19:34:27.382617 Hb2 done
2022-07-07 19:54:32.050661 Hb3 done
2022-07-07 20:16:21.923161 Hb4 done
2022-07-07 20:40:45.329132 Hb5 done
2022-07-07 21:20:44.572392 Ferritin done


### Marginal distributions of variables per SVM

Age, time, month, ferritin, timetofer, hbprevn, timetoprevn

In [7]:
# TO DO

### Scaling based on training data

We need to scale all explanatory variables before doing anything with the SVM. We use the `StandardScaler` from the scikit-learn package, which transforms every variable to have a mean of zero and a variance of one. We save the scalers for later use when we change time-related variables. Scalers are fitted using only the training data and then used to transform both training and test data.

Test data are the donations from the last year (1 January 2021 - 31 December 2021); training data are all earlier donations in the feature table, which starts in 2015 (so 2015 - 2020).

In [None]:
# Reload the feature table from its pickle checkpoint so the scaling section can run
# without repeating the slow feature-engineering cell.
df = pd.read_pickle(f'{data_path}df_allvars.pkl')
df.head()

In [16]:
# Column list: three identifying columns, the explanatory variables, the Hb history for
# one to five visits back, and the outcome 'HbOK' as the last column.
var = ['KeyID', 'Year', 'Sex', 'Time', 'Age', 'Month', 'Num_Don', 'Last_Fer', 'TimetoFer']
for n in range(1, 6):
    var += [f'HbPrev{n}', f'TimetoPrev{n}']
var += ['HbOK']

# Row masks: split by sex, and by year into training (up to 2020) and test (2021)
is_man = df.Sex == 'M'
is_woman = df.Sex == 'F'
in_train = df.Year <= 2020
in_test = df.Year == 2021

# Select the listed columns, then drop the first three (KeyID, Year, Sex), which are
# not model inputs.
train_men = df.loc[is_man & in_train, var].iloc[:, 3:]
train_women = df.loc[is_woman & in_train, var].iloc[:, 3:]

test_men = df.loc[is_man & in_test, var].iloc[:, 3:]
test_women = df.loc[is_woman & in_test, var].iloc[:, 3:]

In [20]:
def _fit_and_scale(train, test, var):
    """Select `var`, drop incomplete rows, and scale all columns except the last one.

    The scaler is fitted on the training rows only and then applied to both the
    training and the test rows. Returns (scaler, scaled_train, scaled_test).
    """
    features = var[:-1]  # the last entry of `var` is the outcome 'HbOK', which is not scaled
    train_sub = train[var].dropna()
    test_sub = test[var].dropna()
    scaler = StandardScaler()
    scaler.fit(train_sub[features])
    train_sub[features] = scaler.transform(train_sub[features])
    test_sub[features] = scaler.transform(test_sub[features])
    return scaler, train_sub, test_sub


# One data set per sex and per number of previous Hb measurements used (1 up to 5)
for nback in range(1, 6):
    var = ['Time', 'Age', 'Month', 'Num_Don', 'Last_Fer', 'TimetoFer']
    for n in range(1, nback+1):
        var.extend(['HbPrev'+str(n), 'TimetoPrev'+str(n)])
    var.append('HbOK')

    # Each sex gets its own scaler and its own column selection (the women's training
    # columns were previously selected through the men's frame).
    scaler_men, train_men_sub, test_men_sub = _fit_and_scale(train_men, test_men, var)
    scaler_women, train_women_sub, test_women_sub = _fit_and_scale(train_women, test_women, var)

    # Context managers make sure the scaler files are flushed and closed
    with open('../results/scalers/men_'+str(nback)+'.pkl', 'wb') as handle:
        pickle.dump(scaler_men, handle)
    with open('../results/scalers/women_'+str(nback)+'.pkl', 'wb') as handle:
        pickle.dump(scaler_women, handle)

    train_men_sub.to_pickle(data_path+'scaled/men_'+str(nback)+'_train.pkl')
    train_women_sub.to_pickle(data_path+'scaled/women_'+str(nback)+'_train.pkl')
    test_men_sub.to_pickle(data_path+'scaled/men_'+str(nback)+'_test.pkl')
    test_women_sub.to_pickle(data_path+'scaled/women_'+str(nback)+'_test.pkl')