# Cleaning insurance and socioeconomic status prior to survival analysis 

In [1]:
import numpy as np
import pandas as pd

from flatiron_cleaner import DataProcessorUrothelial

## Import data

In [2]:
# Load the full cohort file; its PatientID and StartDate columns are used below.
cohort_path = '../outputs/full_cohort.csv'
df = pd.read_csv(cohort_path)

In [3]:
# Dimensions (rows, columns) of the loaded cohort frame.
df.shape

(6461, 3)

In [4]:
# Preview the first three cohort rows to confirm the expected columns are present.
df.head(3)

Unnamed: 0,PatientID,LineName,StartDate
0,F5AAF96C85477,Pembrolizumab,2021-07-08
1,F788831A66E9A,Pembrolizumab,2023-02-22
2,F75847DF35E43,Atezolizumab,2019-04-25


## Clean CSV files

In [5]:
# Instantiate the flatiron_cleaner processor; its process_insurance method is called below.
processor = DataProcessorUrothelial()

### Process insurance 

In [6]:
# Process Insurance.csv against each patient's index date (the cohort's StartDate).
# NOTE(review): the exact meaning of days_before=None / days_after=0 and of the
# 'liberal' missing-date strategy is defined by flatiron_cleaner -- confirm there.
insurance_df = processor.process_insurance(
    file_path='../data/Insurance.csv',
    index_date_df=df,
    index_date_column='StartDate',
    days_before=None,
    days_after=0,
    missing_date_strategy='liberal',
)

2025-04-20 22:54:16,813 - INFO - Successfully read Insurance.csv file with shape: (53709, 14) and unique PatientIDs: 12391
2025-04-20 22:54:16,853 - INFO - Successfully merged Insurance.csv df with index_date_df resulting in shape: (27499, 15) and unique PatientIDs: 6137
2025-04-20 22:54:16,922 - INFO - Successfully processed Insurance.csv file with final shape: (6461, 5) and unique PatientIDs: 6461


### Process socioeconomic status (SES)

In [7]:
# Load the social-determinants-of-health file, which carries the SES index per patient.
ses_path = '../data/SocialDeterminantsOfHealth.csv'
ses = pd.read_csv(ses_path)

In [8]:
# Preview the first three rows of the SES file.
ses.head(3)

Unnamed: 0,PatientID,SESIndex2015_2019
0,F5AAF96C85477,1 - Lowest SES
1,F43136CF07859,4
2,F6FAD468C5AE0,2


In [9]:
# Tabulate the raw SES levels, counting missing values as their own row.
ses['SESIndex2015_2019'].value_counts(dropna=False)

SESIndex2015_2019
4                  2830
3                  2576
5 - Highest SES    2336
2                  2288
1 - Lowest SES     1745
NaN                1354
Name: count, dtype: int64

In [10]:
# Give rows whose SES index is missing an explicit 'unknown' label.
ses = ses.fillna(value={'SESIndex2015_2019': 'unknown'})

In [11]:
# Collapse the three middle SES levels into a single '2-4' level; every other
# value is carried over unchanged.
middle_levels = ['2', '3', '4']
is_middle = ses['SESIndex2015_2019'].isin(middle_levels)
ses['ses_mod'] = np.where(is_middle, '2-4', ses['SESIndex2015_2019'])

In [12]:
# Tabulate the collapsed SES levels, counting missing values as their own row.
ses['ses_mod'].value_counts(dropna=False)

ses_mod
2-4                7694
5 - Highest SES    2336
1 - Lowest SES     1745
unknown            1354
Name: count, dtype: int64

In [13]:
# Store the collapsed SES level as a pandas categorical.
ses = ses.astype({'ses_mod': 'category'})

### Merge dataframes

In [14]:
# Keep only the cohort's PatientID column and left-join the processed insurance
# columns onto it, so every cohort row is retained.
patient_ids = df[['PatientID']]
df = patient_ids.merge(insurance_df, how='left', on='PatientID')

In [15]:
# Attach the collapsed SES level to the cohort. The left join keeps every cohort
# row, so a patient with no row in the SES file comes back with a missing ses_mod;
# label those 'unknown' as well, consistent with how missing SES values are
# labelled before the join.
n_rows_before = len(df)
df = pd.merge(
    df.drop(columns=['ses_mod'], errors='ignore'),  # lets the cell be re-run safely
    ses[['PatientID', 'ses_mod']],
    on='PatientID',
    how='left')
# Round-trip through object so 'unknown' can be filled even if it is not yet a category.
df['ses_mod'] = df['ses_mod'].astype('object').fillna('unknown').astype('category')

# Duplicate PatientIDs in the SES file would silently multiply cohort rows.
assert len(df) == n_rows_before, 'SES merge changed the number of cohort rows'

In [16]:
# Dimensions of the merged frame; the row count should still equal the cohort size.
df.shape

(6461, 6)

In [17]:
# Column dtypes of the merged frame before it is written out.
df.dtypes

PatientID            object
commercial            Int64
medicaid              Int64
medicare              Int64
other_insurance       Int64
ses_mod            category
dtype: object

In [18]:
# Write the merged insurance + SES table to disk without the positional index.
output_path = '../outputs/insurance_ses_df.csv'
df.to_csv(output_path, index=False)