# Propensity Score Analysis
We will perform a propensity score matching analysis. The input is the training dataset and the output is a smaller dataset that is homogeneous with respect to the lymph node status variable (`lni`).

Matching discards a large share of the patients, and this will be reflected in the final predictions. To counter that, we will have to calculate adjusted probabilities after making predictions.

Propensity score matching is done on a binary variable, yet in our case `lni` is a three-level variable ("Unknown", "0" and "1"). To solve this, propensity score matching is performed twice: first "Unknown" vs. "0", then the merged group {"Unknown", "0"} vs. "1".

In [14]:
import pickle
import pandas as pd
import numpy as np

# student's t-test
from scipy.stats import ttest_ind

# propensity score matching modules
from psmpy import PsmPy

In [11]:
def students_t_test_control_vs_treatment(control, treatment, alpha = 0.05):
    """Print a two-sample t-test comparison of a control and a treatment sample.

    Prints the mean of each sample, the p-value returned by
    ``scipy.stats.ttest_ind`` (called with its default arguments) and a
    one-line interpretation of that p-value.

    Parameters
    ----------
    control, treatment : objects exposing ``.mean()`` and accepted by
        ``ttest_ind`` (e.g. pandas Series or numpy arrays).
    alpha : float, default 0.05
        Significance level; H0 is reported as not rejected when ``p > alpha``.

    Returns
    -------
    None -- the results are only printed.
    """
    control_mean = control.mean()
    treatment_mean = treatment.mean()
    print(f'Control mean: {control_mean:.3f}\nTreatment mean: {treatment_mean:.3f}')

    # Only the p-value of the test result is needed; the statistic is ignored.
    p_value = ttest_ind(control, treatment).pvalue
    print(f'p = {p_value:.3f}')

    same_message = 'same distributions/same group mean (fail to reject H0 - we do not have enough evidence to reject H0)'
    different_message = 'different distributions/different group mean (reject H0)'
    # Keep the comparison as `p > alpha` so that a NaN p-value falls into the
    # "different" branch, as it always has.
    print(same_message if p_value > alpha else different_message)

In [12]:
# Location of the pickled training dataset.
trainFilePath = 'data/data_train.pkl'

print('--------------------------------------',
      f'Reading train data from: {trainFilePath} ........',
      sep='\n')

# NOTE: pickle.load can execute arbitrary code embedded in the file --
# only load pickles that come from a trusted source.
with open(trainFilePath, 'rb') as train_file:
    data_train = pickle.load(train_file)

print(f'Finished reading, loaded train dataset shape: {data_train.shape}',
      f'Read train column names: {data_train.columns}\n',
      sep='\n')

--------------------------------------
Reading train data from: data/data_train.pkl ........
Finished reading, loaded train dataset shape: (1251, 21)
Read train column names: Index(['age', 'psa', 'clinical_stage', 'biopsy_gleason', 'biopsy_gleason_gg',
       'pathological_gleason_gg', 'pathologic_stage', 'lni',
       'surgical_margin_status', 'persistent_psa', 'survival_months',
       'pathologic_gleason', 'TRYSgrupes', 'PLNDO1', 'survival_months_bcr',
       'survival_months_mts', 'patient_id', 'bcr', 'mts',
       'death_from_other_causes', 'cancer_specific_mortality'],
      dtype='object')



## Propensity score matching for lymph nodes status "Unknown" and "0"
First, we match patients with lymph node status "Unknown" against patients with lymph node status "0".

In [36]:
# prepare data

# First we'll perform propensity score matching on LNI "unknown" vs "0",
# so the patients with LNI "1" are left out of this working copy.
train_lni_Unknown_0 = data_train[data_train.lni != '1.0'].copy()
print('LNI counts excluding (1):\n', train_lni_Unknown_0.lni.value_counts(), '\n')

# To make things simple, we'll rename values to '0' and '1':
# "0.0" becomes '1' and "unknown" becomes '0'.
# The result is assigned back to the column instead of calling
# `df['lni'].replace(..., inplace=True)`: that chained in-place form operates
# on a temporary Series, triggers a chained-assignment warning in pandas 2.x
# and no longer updates the DataFrame once Copy-on-Write is enabled.
train_lni_Unknown_0['lni'] = (
    train_lni_Unknown_0['lni']
    .replace('0.0', '1')
    .replace('unknown', '0')
)
print('LNI counts after renaming values for simplicity:\n', train_lni_Unknown_0.lni.value_counts(), '\n')

LNI counts excluding (1):
 unknown    766
0.0        430
Name: lni, dtype: int64 

LNI counts after renaming values for simplicity:
 0    766
1    430
Name: lni, dtype: int64 



In [75]:
# One-hot encode the categorical variables before handing the frame to PsmPy.
dummy_columns = ['clinical_stage', 'pathological_gleason_gg',
                 'biopsy_gleason_gg', 'pathologic_stage',
                 'surgical_margin_status', 'persistent_psa',
                 'TRYSgrupes']
X_propensity = pd.get_dummies(train_lni_Unknown_0, columns=dummy_columns)

# Columns handed to PsmPy's `exclude` argument.
psm_excluded_columns = [
    'cancer_specific_mortality', 'death_from_other_causes', 'mts', 'bcr',
    'survival_months', 'PLNDO1', 'survival_months_bcr', 'survival_months_mts',
]

print('Initiating propensity score matchin class.....')
print('treatment column: lni\n index column: patient_id')
print('feature columns: ', X_propensity.drop(columns=psm_excluded_columns).columns)

# Initiate propensity score matching class
psm = PsmPy(X_propensity, treatment='lni', indx='patient_id',
            exclude=psm_excluded_columns)

# Calculate propensity scores
psm.logistic_ps(balance=False)

# Match patients using propensity logit
psm.knn_matched(matcher='propensity_logit', replacement=False, caliper=1.38)

# Optional visual check of the matching result:
# psm.plot_match(Title='Matching Result', Ylabel='# of obs', Xlabel= 'propensity logit', names = ['treatment', 'control'])

# Keep only the rows of the id table that actually received a match
# (rows without one carry NaN in `matched_ID`).
has_match = psm.matched_ids.matched_ID.notna()
matched_ids = psm.matched_ids[has_match]
all_ids = pd.concat([matched_ids.patient_id, matched_ids.matched_ID]).values

# select only a subset of data with matched patients
train_lni_Unknown_0_matched = psm.df_matched[psm.df_matched.patient_id.isin(all_ids)]

print('\nBefore matching:\n', train_lni_Unknown_0.lni.value_counts())
print('\nAfter  matching:\n', train_lni_Unknown_0_matched.lni.value_counts())

Initiating propensity score matchin class.....
treatment column: lni
 index column: patient_id
feature columns:  Index(['age', 'psa', 'biopsy_gleason', 'lni', 'pathologic_gleason',
       'patient_id', 'clinical_stage_1', 'clinical_stage_2',
       'clinical_stage_3', 'pathological_gleason_gg_1',
       'pathological_gleason_gg_2', 'pathological_gleason_gg_3',
       'pathological_gleason_gg_4', 'pathological_gleason_gg_5',
       'biopsy_gleason_gg_1', 'biopsy_gleason_gg_2', 'biopsy_gleason_gg_3',
       'biopsy_gleason_gg_4', 'biopsy_gleason_gg_5', 'pathologic_stage_0',
       'pathologic_stage_1', 'pathologic_stage_2', 'surgical_margin_status_0',
       'surgical_margin_status_1', 'persistent_psa_0', 'persistent_psa_1',
       'TRYSgrupes_0', 'TRYSgrupes_1', 'TRYSgrupes_2'],
      dtype='object')

Before matching:
 0    766
1    430
Name: lni, dtype: int64

After  matching:
 1    282
0    281
Name: lni, dtype: int64




homogeneity assumption testing

In [76]:
# student's t-test for numerical variables after matching
# Split the matched data once into the two lni groups (0 = control, 1 = treatment).
matched_control_rows = train_lni_Unknown_0_matched[train_lni_Unknown_0_matched.lni == 0]
matched_treatment_rows = train_lni_Unknown_0_matched[train_lni_Unknown_0_matched.lni == 1]

# (printed heading, column name) for every numerical variable to test
numeric_checks = [
    ('PSA', 'psa'),
    ('Age', 'age'),
    ('biopsy_gleason', 'biopsy_gleason'),
    ('pathologic_gleason', 'pathologic_gleason'),
]
for heading, column in numeric_checks:
    print(f'\n{heading}:')
    students_t_test_control_vs_treatment(control=matched_control_rows[column],
                                         treatment=matched_treatment_rows[column])


PSA:
Control mean: 8.215
Treatment mean: 8.676
p = 0.200
same distributions/same group mean (fail to reject H0 - we do not have enough evidence to reject H0)

Age:
Control mean: 64.331
Treatment mean: 64.326
p = 0.993
same distributions/same group mean (fail to reject H0 - we do not have enough evidence to reject H0)

biopsy_gleason:
Control mean: 6.509
Treatment mean: 6.539
p = 0.599
same distributions/same group mean (fail to reject H0 - we do not have enough evidence to reject H0)

pathologic_gleason:
Control mean: 6.854
Treatment mean: 6.840
p = 0.818
same distributions/same group mean (fail to reject H0 - we do not have enough evidence to reject H0)


## Propensity score matching for lymph nodes status "{Unknown, 0}" and "1"
Next, we match the merged group {"Unknown", "0"} (the output of the first matching step) against patients with lymph node status "1".

In [77]:
# prepare data

# Because the previous matched dataset will already have dummy variables, we need to do the same
# for this one at the start. get_dummies returns a new frame, so data_train itself is not modified.
train_lni_1 = pd.get_dummies(data_train, columns=['clinical_stage', 'pathological_gleason_gg',
                                                  'biopsy_gleason_gg', 'pathologic_stage',
                                                  'surgical_margin_status', 'persistent_psa',
                                                  'TRYSgrupes'])

# We need only data for lni == 1 this time.
# The explicit copy makes the column assignment below operate on an independent frame.
train_lni_1 = train_lni_1[train_lni_1.lni == '1.0'].copy()
print('LNI counts only (1):\n', train_lni_1.lni.value_counts(), '\n')

# Assign the replaced column back instead of `df['lni'].replace(..., inplace=True)`:
# the chained in-place form works on a temporary Series, warns in pandas 2.x and
# does not update the DataFrame under Copy-on-Write.
train_lni_1['lni'] = train_lni_1['lni'].replace('1.0', '1')
train_lni_1 = train_lni_1.astype({'lni': 'int32'})

# We take the matched dataset without the propensity columns added by the first matching
# step and with a fresh 0..n-1 index (drop/reset_index both return new frames).
train_lni_Unknown0_1 = (
    train_lni_Unknown_0_matched
    .drop(['propensity_score', 'propensity_logit'], axis=1)
    .reset_index(drop=True)
)

# "Unknown" and "0" lni's will both be replaced with value "0"
print('LNI counts of matched dataset:\n', train_lni_Unknown0_1.lni.value_counts(), '\n')
train_lni_Unknown0_1['lni'] = train_lni_Unknown0_1['lni'].replace(1, 0)
print('LNI counts of matched dataset ("unknown" and "0" have been merged):\n', train_lni_Unknown0_1.lni.value_counts(), '\n')

# We combine matched dataset and dataset where lni == 1
train_lni_Unknown0_1 = pd.concat([train_lni_Unknown0_1, train_lni_1])
print('LNI counts after combining dataset:\n', train_lni_Unknown0_1.lni.value_counts())

LNI counts only (1):
 1.0    55
Name: lni, dtype: int64 

LNI counts of matched dataset:
 1    282
0    281
Name: lni, dtype: int64 

LNI counts of matched dataset ("unknown" and "0" have been merged):
 0    563
Name: lni, dtype: int64 

LNI counts after combining dataset:
 0    563
1     55
Name: lni, dtype: int64


In [85]:
# Columns handed to PsmPy's `exclude` argument; `matched_ID` is left over
# from the first matching step.
psm_excluded_columns = [
    'cancer_specific_mortality', 'death_from_other_causes', 'mts', 'bcr',
    'survival_months', 'PLNDO1', 'survival_months_bcr', 'survival_months_mts',
    'matched_ID',
]

print('Initiating propensity score matchin class.....')
print('treatment column: lni\n index column: patient_id')
print('feature columns: ', train_lni_Unknown0_1.drop(columns=psm_excluded_columns).columns)

# Initiate propensity score matching class
psm = PsmPy(train_lni_Unknown0_1, treatment='lni', indx='patient_id',
            exclude=psm_excluded_columns)

# Calculate propensity scores
psm.logistic_ps(balance=False)

# Match patients using propensity logit
psm.knn_matched(matcher='propensity_logit', replacement=False, caliper=2.9)

# Optional visual check of the matching result:
# psm.plot_match(Title='Matching Result', Ylabel='# of obs', Xlabel= 'propensity logit', names = ['treatment', 'control'])

# Keep only the rows of the id table that actually received a match
# (rows without one carry NaN in `matched_ID`).
has_match = psm.matched_ids.matched_ID.notna()
matched_ids = psm.matched_ids[has_match]
all_ids = pd.concat([matched_ids.patient_id, matched_ids.matched_ID]).values

# select only a subset of data with matched patients
train_lni_Unknown0_1_matched = psm.df_matched[psm.df_matched.patient_id.isin(all_ids)]

print('\nBefore matching:\n', train_lni_Unknown0_1.lni.value_counts())
print('\nAfter  matching:\n', train_lni_Unknown0_1_matched.lni.value_counts())

Initiating propensity score matchin class.....
treatment column: lni
 index column: patient_id
feature columns:  Index(['patient_id', 'age', 'psa', 'biopsy_gleason', 'pathologic_gleason',
       'clinical_stage_1', 'clinical_stage_2', 'clinical_stage_3',
       'pathological_gleason_gg_1', 'pathological_gleason_gg_2',
       'pathological_gleason_gg_3', 'pathological_gleason_gg_4',
       'pathological_gleason_gg_5', 'biopsy_gleason_gg_1',
       'biopsy_gleason_gg_2', 'biopsy_gleason_gg_3', 'biopsy_gleason_gg_4',
       'biopsy_gleason_gg_5', 'pathologic_stage_0', 'pathologic_stage_1',
       'pathologic_stage_2', 'surgical_margin_status_0',
       'surgical_margin_status_1', 'persistent_psa_0', 'persistent_psa_1',
       'TRYSgrupes_0', 'TRYSgrupes_1', 'TRYSgrupes_2', 'lni'],
      dtype='object')

Before matching:
 0    563
1     55
Name: lni, dtype: int64

After  matching:
 1    44
0    44
Name: lni, dtype: int64




homogeneity assumption testing

In [86]:
# student's t-test for numerical variables after matching
# Split the matched data once into the two lni groups (0 = control, 1 = treatment).
final_control_rows = train_lni_Unknown0_1_matched[train_lni_Unknown0_1_matched.lni == 0]
final_treatment_rows = train_lni_Unknown0_1_matched[train_lni_Unknown0_1_matched.lni == 1]

# (printed heading, column name) for every numerical variable to test
numeric_checks = [
    ('PSA', 'psa'),
    ('Age', 'age'),
    ('biopsy_gleason', 'biopsy_gleason'),
    ('pathologic_gleason', 'pathologic_gleason'),
]
for heading, column in numeric_checks:
    print(f'\n{heading}:\n')
    students_t_test_control_vs_treatment(control=final_control_rows[column],
                                         treatment=final_treatment_rows[column])


PSA:

Control mean: 12.410
Treatment mean: 13.278
p = 0.597
same distributions/same group mean (fail to reject H0 - we do not have enough evidence to reject H0)

Age:

Control mean: 64.045
Treatment mean: 63.727
p = 0.796
same distributions/same group mean (fail to reject H0 - we do not have enough evidence to reject H0)

biopsy_gleason:

Control mean: 7.068
Treatment mean: 7.364
p = 0.147
same distributions/same group mean (fail to reject H0 - we do not have enough evidence to reject H0)

pathologic_gleason:

Control mean: 7.705
Treatment mean: 8.068
p = 0.062
same distributions/same group mean (fail to reject H0 - we do not have enough evidence to reject H0)


In [87]:
# Patient ids that remain after the second matching step.
all_ids = train_lni_Unknown0_1_matched.patient_id.values

# Keep only the rows of the original training dataset whose patient_id is among them.
is_matched_patient = data_train.patient_id.isin(all_ids)
data_train_homogenous = data_train[is_matched_patient]

In [88]:
# Persist the matched (homogenous) training subset as a pickle file.
homogenous_train_path = 'data/data_train_homogenous.pkl'

print('Saving homogenous train dataset.......')
with open(homogenous_train_path, 'wb') as out_file:
    pickle.dump(data_train_homogenous, out_file)
print('Finished saving homogenous train dataset', '\n')

Saving homogenous train dataset.......
Finished saving homogenous train dataset 

