In [None]:
# Import modules

import sys
import pandas as pd
import numpy as np
import random
import pickle
import datetime
import matplotlib.pyplot as plt
%matplotlib inline
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import cufflinks as cf
from lifelines import KaplanMeierFitter, CoxPHFitter

import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [None]:
# Load the donor table from the project share
# NOTE(review): read_pickle can execute arbitrary code - only load pickles from a trusted source
donor_file = 'X:/201902 Hb Trajectories/donors.pkl'
df_donors = pd.read_pickle(donor_file)

# KeyIDs (the index of df_donors) split by the value of 'Geslacht': 'V' and 'M'
is_v = (df_donors['Geslacht'] == 'V').values
is_m = (df_donors['Geslacht'] == 'M').values
id_f = df_donors.index[is_v].tolist()
id_m = df_donors.index[is_m].tolist()

df_donors.head()

In [None]:
# Load the donation records and show the first rows
donation_file = 'X:/Brondata/donatiedata_200601_201902.pkl'
df = pd.read_pickle(donation_file)
df.head()

In [None]:
# Preview an evenly spaced grid from 6.0 towards 12.0 in steps of 0.1 (the same
# expression is used as histogram bin edges in the next cell). The stop value is
# normally excluded by np.arange, so the last edge is about 11.9.
np.arange(6.0, 12.0, 0.1)

In [None]:
# Histogram of the 'Hb' column for all donations whose KeyID is in id_m
from_id_m = df['KeyID'].isin(id_m)
df_m = df.loc[from_id_m]

hb_bins = np.arange(6.0, 12.0, 0.1)
ax = df_m['Hb'].hist(bins=hb_bins)
ax.set_xlim((6, 12))

# Enlarge the figure that the histogram was drawn on
fig = ax.get_figure()
fig.set_size_inches(15, 8)

In [None]:
# Create dataframes (women/men separately) of the number of low and good Hb values per day


def _daily_hb_counts(donations):
    """Count donations per calendar day, in total and split by Hb outcome.

    Parameters
    ----------
    donations : pandas.DataFrame
        Donation records with the columns 'KeyID', 'DateTime' (datetime-like,
        required for the daily resampling) and 'HbLowHigh'.

    Returns
    -------
    pandas.DataFrame
        Indexed by day, with the columns 'Total', 'Low Hb' and 'Good Hb'.
    """
    def per_day(rows, label):
        # Count rows per timestamp, then add those counts up per calendar day
        return (rows[['KeyID', 'DateTime']]
                .groupby('DateTime').count()
                .resample('1d').sum()
                .rename(columns={'KeyID': label}))

    daily = per_day(donations, 'Total')
    for outcome, label in (('low', 'Low Hb'), ('good', 'Good Hb')):
        counts = per_day(donations.loc[donations['HbLowHigh'] == outcome], label)[label]
        # The subset can start later / end earlier than the full series; a day
        # without such donations is a count of 0, not a missing value.
        daily[label] = counts.reindex(daily.index, fill_value=0)
    return daily


df_f = df.loc[df['KeyID'].isin(id_f), :]
donations_f = _daily_hb_counts(df_f)

df_m = df.loc[df['KeyID'].isin(id_m), :]
donations_m = _daily_hb_counts(df_m)

donations_f.head()

In [None]:
# Plot number of low/good Hb by specified interval

@interact_manual
def plot_donations_by_interval_line(interval='Q',
                                    figsize_x=8,
                                    figsize_y=6,
                                    color1='#FF7800',
                                    color2='darkblue'):
    """Draw stacked area plots of low and good Hb donation counts per interval.

    Reads the notebook-level frames `donations_f` and `donations_m`, sums them
    per `interval`, and draws one panel for women (top) and one for men
    (bottom) on a shared y-axis.

    Parameters
    ----------
    interval : str
        Resampling rule passed to `DataFrame.resample` (e.g. 'Q', 'M', 'W').
    figsize_x, figsize_y : number
        Figure width and height in inches.
    color1, color2 : str
        Fill colours for the 'Low Hb' and 'Healthy Hb' areas respectively.
    """
    # Describe the chosen interval in the y-label instead of always claiming
    # "per quarter"; rules without a known name are shown verbatim.
    interval_names = {'D': 'day', 'W': 'week',
                      'M': 'month', 'ME': 'month',
                      'Q': 'quarter', 'QE': 'quarter',
                      'A': 'year', 'Y': 'year', 'YE': 'year'}
    period = interval_names.get(interval, 'interval {}'.format(interval))
    ylabel = 'Number of donations \n (per {})'.format(period)

    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(figsize_x, figsize_y), sharey=True)

    panels = (('Donations by women', donations_f),
              ('Donations by men', donations_m))
    for ax, (title, daily) in zip(axes, panels):
        plot_df = daily.resample(interval, closed='left', label='left').sum()
        ax.stackplot(plot_df.index, plot_df['Low Hb'], plot_df['Good Hb'],
                     labels=['Low Hb', 'Healthy Hb'], colors=[color1, color2])
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.legend()
        # Fixed display window, unchanged from the original figure
        ax.set_xlim((pd.Timestamp(2006, 1, 1), pd.Timestamp(2018, 3, 31)))

    plt.tight_layout()
    plt.show()

In [None]:
def getSurvivalDf(donations, donors, lim=None):
    """Build a per-donor table of donations until the first low Hb value.

    For every donor the donations are ordered by 'DateTime' and the first one
    is left out. The remaining donations are numbered 1, 2, ...; 'time' is the
    number of the first donation flagged 'low' (deferred = 1) or, when none is
    flagged, the number of the last donation (deferred = 0). 'age' is the year
    of that donation minus the donor's 'Geboortejaar'.

    Parameters
    ----------
    donations : pandas.DataFrame
        Donation records with the columns 'KeyID', 'DateTime' (datetime-like)
        and 'HbLowHigh'.
    donors : pandas.DataFrame
        Donor table indexed by KeyID with the columns 'Geslacht' and
        'Geboortejaar'.
    lim : int, optional
        Process only the first `lim` donor IDs in sorted order. All donors are
        processed when omitted.

    Returns
    -------
    pandas.DataFrame
        Indexed like `donors`, with the object-typed columns 'time',
        'deferred', 'sex' and 'age'. Donors that were not processed or have
        fewer than two donations keep NaN in every column.

    Raises
    ------
    KeyError
        If a processed donor with at least two donations is missing from
        `donors.index`.
    """
    columns = ['time', 'deferred', 'sex', 'age']
    progress_marks = {10, 100, 1000, 2000, 3000, 4000, 10000, 100000, 200000,
                      300000, 400000, 500000, 600000, 700000}

    donor_ids = np.unique(donations['KeyID'])
    if lim is not None:
        donor_ids = donor_ids[:lim]

    # Filter and sort once, then walk the groups. The previous per-donor boolean
    # masks rescanned the whole donation table for every single donor.
    selected = (donations.loc[donations['KeyID'].isin(donor_ids)]
                .sort_values('DateTime', ascending=True, kind='mergesort'))

    processed_ids = []
    rows = []
    counter = 0
    for d_id, df_sub in selected.groupby('KeyID', sort=True):
        if df_sub.shape[0] < 2:
            # Nothing is left once the first donation is excluded
            continue

        follow_up = df_sub.iloc[1:]
        is_low = (follow_up['HbLowHigh'] == 'low').values
        deferred = 1 if is_low.any() else 0
        # Position of the first low value, or of the last donation when the
        # donor never had a low value
        pos = int(is_low.argmax()) if deferred else len(follow_up) - 1

        age = follow_up['DateTime'].iloc[pos].year - donors.loc[d_id, 'Geboortejaar']
        sex = donors.loc[d_id, 'Geslacht']

        processed_ids.append(d_id)
        rows.append([pos + 1, deferred, sex, age])

        counter += 1
        if counter in progress_marks:
            print(datetime.datetime.now())
            print(counter)

    df_surv = pd.DataFrame(rows, index=processed_ids, columns=columns, dtype=object)
    # Align with the donor table; donors without a result become all-NaN rows
    return df_surv.reindex(donors.index)

In [None]:
# Proxy for new donors: drop every donor who has a donation dated before 2010,
# which leaves donors whose donations are all in 2010 or later
donated_before_2010 = df['DateTime'].dt.year < 2010
old_donors = np.unique(list(df.loc[donated_before_2010, 'KeyID']))

is_old_donor = df['KeyID'].isin(old_donors)
df2 = df.loc[~is_old_donor]
df2.shape

In [None]:
# Quick look at the first rows of the donor table
df_donors.head(n=5)

In [None]:
# Quick look at the first rows of the filtered donation table
df2.head(n=5)

In [None]:
# Attach each donor's birth year and sex to the donation records and export them.
# A vectorised lookup on the donor index replaces the per-donor loop, which
# rescanned df2 for every one of the 266605 donors and had to run overnight.

# Fail loudly, as the loop did, when a donation belongs to a donor that is
# missing from the donor table
unknown_donor = ~df2['KeyID'].isin(df_donors.index)
if unknown_donor.any():
    raise KeyError('{} donations belong to donors missing from df_donors'.format(int(unknown_donor.sum())))

# `assign` returns a new frame, so nothing is written into a slice of `df`.
# The lookup requires the index of df_donors (KeyID) to be unique.
df2 = df2.assign(
    Geboortejaar=df2['KeyID'].map(df_donors['Geboortejaar']),
    Geslacht=df2['KeyID'].map(df_donors['Geslacht']),
)

df2.to_csv('X:/donatiesR.csv')

In [None]:
# Survival table for the first 3000 donor IDs; donors without a result are dropped
df_surv = getSurvivalDf(df2, df_donors, lim=3000).dropna()

# getSurvivalDf returns object-typed columns; make the numeric ones numeric
for col in ('time', 'deferred', 'age'):
    df_surv[col] = pd.to_numeric(df_surv[col])

# Encode sex as one indicator column: V = 1 where 'sex' is 'V', otherwise 0.
# Comparing directly, instead of unpacking get_dummies into ['M', 'V'], also
# works when the sample happens to contain only one sex.
df_surv['V'] = (df_surv['sex'] == 'V').astype(int)
df_surv = df_surv.drop(columns='sex')

In [None]:
# Quick look at the first rows of the survival table
df_surv.head(n=5)

In [None]:
# Kaplan-Meier curves of the number of donations until the first low Hb value,
# fitted separately for V == 1 (labelled 'Women') and V == 0 (labelled 'Men')
kmf = KaplanMeierFitter()

ax = None
for label, v_value, color in (('Women', 1, '#FF7800'), ('Men', 0, 'darkblue')):
    group = df_surv.loc[df_surv['V'] == v_value]
    kmf.fit(group['time'], event_observed=group['deferred'], label=label)
    # The first call creates the axes, the second call draws onto the same ones
    ax = kmf.plot(ax=ax, color=color)
    print(kmf.survival_function_)

ax.set_xlabel('Number of donations')
ax.set_ylabel('Probability of 0 deferrals')
ax.set_xlim((0, 30))
ax.set_ylim((0, 1))

# `frameon` is no longer accepted by savefig in current Matplotlib releases;
# transparent=True already yields a transparent background.
plt.savefig('survival.pdf', transparent=True)
plt.show()

In [None]:
# Fit a Cox proportional-hazards model to df_surv, with 'time' as the duration
# column and 'deferred' as the event indicator, then print the fit summary
cph = CoxPHFitter()
cph.fit(df_surv, duration_col='time', event_col='deferred')
cph.print_summary()