In [1]:
import numpy as np
import pandas as pd
import scipy.stats as st

root_path = "C:/Users/Abdul Zakkar/Documents/UICOM/research/salahudeen/analysis_04122024/"
path_sybil = root_path + "sybil_predictions_05202024.csv"
path_dx = root_path + "202300334/REQ_202300334_DX_OUT.csv"
path_famhx = root_path + "202300334/REQ_202300334_FAM_HX_OUT.csv"
path_dem = root_path + "202300334/REQ_202300334_DEM2_OUT.csv"
path_vital = root_path + "202300334/REQ_202300334_VITAL2_OUT.csv"

test_out_path = root_path + '../test_train_sets/test/'

In [2]:
STUDY_DATE = pd.to_datetime('2024-03-16')
REMOVE_UNKNOWN = True

In [3]:
df_sybil = pd.read_csv(path_sybil)
df_dx = pd.read_csv(path_dx)
df_famhx = pd.read_csv(path_famhx)
df_dem = pd.read_csv(path_dem)
df_vital = pd.read_csv(path_vital)

print("df_sybil shape", df_sybil.shape)
print("df_dx shape", df_dx.shape)
print("df_famhx shape", df_famhx.shape)
print("df_dem shape", df_dem.shape)
print("df_vital shape", df_vital.shape)

df_sybil shape (53918, 7)
df_dx shape (26765, 3)
df_famhx shape (475, 4)
df_dem shape (11653, 15)
df_vital shape (11154, 2)


In [4]:
def trim_path(path, index):
    indexes = [index for index, char in enumerate(path) if char=='/']
    return path[indexes[index]+1:]

df_sybil["path"] = df_sybil["path"].apply(trim_path, args=[4])

In [5]:
def get_pid(path):
    return int(path.split('/')[2].split('_')[1])

df_sybil["pid"] = df_sybil["path"].apply(get_pid)

In [6]:
def get_dicom_date(path):
    return pd.to_datetime(path.split('/')[2].split('_')[2])

df_sybil["date"] = df_sybil["path"].apply(get_dicom_date)

In [7]:
def get_entry_date(pid):
    return np.min(df_sybil[df_sybil['pid'] == pid]['date'])

df_dem['ENTRY_DATE'] = df_dem['PATIENT_ID'].apply(get_entry_date)

In [8]:
df_dem = df_dem.dropna(subset=['ENTRY_DATE'])

display(df_dem.shape)
df_dem[['PATIENT_ID', 'ENTRY_DATE']].head()

(8758, 16)

Unnamed: 0,PATIENT_ID,ENTRY_DATE
0,1,2021-04-01
1,2,2018-09-12
2,3,2017-09-07
3,4,2017-02-26
4,5,2018-07-18


In [9]:
def get_adjusted_val(row, val, base_date):
    days = base_date - row['ENTRY_DATE']
    return row[val] - int(round(days.days / 365.0))

df_dem['ADJUSTED_AGE'] = df_dem.apply(
    get_adjusted_val, axis=1, args=['CURRENT_AGE_IF_ALIVE', STUDY_DATE])

df_dem['ADJUSTED_TOBACCO_USED_YEARS'] = df_dem.apply(
    get_adjusted_val, axis=1, args=['TOBACCO_USED_YEARS', STUDY_DATE])
    # This assumes that everyone is a current smoker

df_dem[['PATIENT_ID', 'ENTRY_DATE', 'CURRENT_AGE_IF_ALIVE',
    'ADJUSTED_AGE', 'TOBACCO_USED_YEARS', 'ADJUSTED_TOBACCO_USED_YEARS']].head()

Unnamed: 0,PATIENT_ID,ENTRY_DATE,CURRENT_AGE_IF_ALIVE,ADJUSTED_AGE,TOBACCO_USED_YEARS,ADJUSTED_TOBACCO_USED_YEARS
0,1,2021-04-01,63,60,,
1,2,2018-09-12,62,56,45.0,39.0
2,3,2017-09-07,81,74,,
3,4,2017-02-26,71,64,44.0,37.0
4,5,2018-07-18,65,59,,


In [10]:
def get_dx_date(patient_id, df_dx):
    search = df_dx[df_dx["PATIENT_ID"] == patient_id]
    if search.shape[0] == 0:
        return None
    return pd.to_datetime(search["DX_DATE"].values[0])

df_dem['DX_DATE'] = df_dem['PATIENT_ID'].apply(get_dx_date, args = [df_dx])

df_dem[~pd.isna(df_dem['DX_DATE'])][['PATIENT_ID', 'ENTRY_DATE', 'DX_DATE']].head()

Unnamed: 0,PATIENT_ID,ENTRY_DATE,DX_DATE
1,2,2018-09-12,2023-05-24
2,3,2017-09-07,2017-11-03
4,5,2018-07-18,2018-11-29
6,7,2017-02-07,2023-10-08
14,15,2017-02-22,2017-05-17


In [11]:
def get_cancer(row, year, base_date):
    cancer_by = row['ENTRY_DATE'] + pd.offsets.DateOffset(years=year)
    if pd.isna(row['DX_DATE']):
        if REMOVE_UNKNOWN:
            if cancer_by > base_date: # To remove unknown future cancer events
                return np.nan
        return 0
    return 1 if row['DX_DATE'] <= cancer_by else 0

for i in range(1,6+1):
    df_dem['CANC_YR' + str(i)] = df_dem.apply(get_cancer, axis=1, args=[i, STUDY_DATE])

display(df_dem.shape)
df_dem[['ENTRY_DATE','DX_DATE','CANC_YR1','CANC_YR2','CANC_YR3','CANC_YR4','CANC_YR5','CANC_YR6']].head()

(8758, 25)

Unnamed: 0,ENTRY_DATE,DX_DATE,CANC_YR1,CANC_YR2,CANC_YR3,CANC_YR4,CANC_YR5,CANC_YR6
0,2021-04-01,NaT,0.0,0.0,,,,
1,2018-09-12,2023-05-24,0.0,0.0,0.0,0.0,1.0,1.0
2,2017-09-07,2017-11-03,1.0,1.0,1.0,1.0,1.0,1.0
3,2017-02-26,NaT,0.0,0.0,0.0,0.0,0.0,0.0
4,2018-07-18,2018-11-29,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
def race2_to_svm(row):
    if pd.isna(row["RACE"]) or row["RACE"] == "No Information":
        if row["ETHNICITY"] == "Hispanic":
            return 0
        return np.nan
    if row["RACE"] == "Black or African American":
        return 1
    return 0
# If someone is Hispanic, they likely have missing Race information.
# So Hispanic patients are assumed to be White since the majority of Hispanic patients are White.

df_dem['RACE_2'] = df_dem.apply(race2_to_svm, axis=1)

display(df_dem.shape)
df_dem[['RACE','ETHNICITY','RACE_2']].head()

(8758, 26)

Unnamed: 0,RACE,ETHNICITY,RACE_2
0,Black or African American,Not Hispanic,1.0
1,White,Not Hispanic,0.0
2,Black or African American,Not Hispanic,1.0
3,No Information,Hispanic,0.0
4,Black or African American,Not Hispanic,1.0


In [13]:
def ethnic_to_svm(ethnic):
    if ethnic == "No Information":
        return np.nan
    if ethnic == "Hispanic":
        return 1
    return 0

df_dem['ETHNICITY'] = df_dem['ETHNICITY'].apply(ethnic_to_svm)

In [14]:
df_dem['CIGS_PER_DAY'] = df_dem['PACKS_PER_DAY'] * 20

In [15]:
def marital_to_svm(marital):
    if marital == "No Information":
        return np.nan
    if marital == "Divorced" or marital == "Widow":
        return 1
    return 0

df_dem['DIVORCED_OR_WIDOWED'] = df_dem['MARITAL_STATUS'].apply(marital_to_svm)

In [16]:
def bmi_to_svm(pid, df):
    if pid not in df['pid'].values:
        return np.nan
    return df[df['pid'] == pid]['bmi'].values[0]

df_dem['BMI'] = df_dem['PATIENT_ID'].apply(bmi_to_svm, args=[df_vital])

In [17]:
cols = ['PATIENT_ID','AGE_AT_FIRST_SCAN','RACE_2','ETHNICITY','CIGS_PER_DAY',
    'ADJUSTED_TOBACCO_USED_YEARS','DIVORCED_OR_WIDOWED','BMI','SEX','RACE',
    'CANC_YR1','CANC_YR2','CANC_YR3','CANC_YR4','CANC_YR5','CANC_YR6']

df_dem_test = df_dem[cols]
display(df_dem_test[cols].isna().sum())

df_dem_test = df_dem_test[cols].dropna(subset=[
    'AGE_AT_FIRST_SCAN','RACE_2','ETHNICITY','CIGS_PER_DAY',
    'ADJUSTED_TOBACCO_USED_YEARS','DIVORCED_OR_WIDOWED','CANC_YR1'
])

# Rename columns
rename_dict = {
    'PATIENT_ID': 'pid',
    'AGE_AT_FIRST_SCAN': 'age',
    'ETHNICITY': 'ethnic',
    'RACE_2': 'race_2',
    'CIGS_PER_DAY': 'smokeday',
    'ADJUSTED_TOBACCO_USED_YEARS': 'smokeyr',
    'DIVORCED_OR_WIDOWED': 'divorced_or_widowed',
    'BMI': 'bmi',
    'SEX': 'sex',
    'RACE': 'race',
    'CANC_YR1': 'canc_yr_1',
    'CANC_YR2': 'canc_yr_2',
    'CANC_YR3': 'canc_yr_3',
    'CANC_YR4': 'canc_yr_4',
    'CANC_YR5': 'canc_yr_5',
    'CANC_YR6': 'canc_yr_6'
}
df_dem_test = df_dem_test.rename(columns=rename_dict)

display(df_dem_test.shape)
display(df_dem_test.head(10))

print("CANCER COUNTS IN FULL SET")
for i in range(1,6+1):
    print('TEST YEAR', i, ':', int(df_dem_test['canc_yr_'+str(i)].sum()),
        'of', df_dem_test.shape[0]-df_dem_test['canc_yr_'+str(i)].isna().sum())

PATIENT_ID                        0
AGE_AT_FIRST_SCAN                 0
RACE_2                          600
ETHNICITY                       772
CIGS_PER_DAY                   5745
ADJUSTED_TOBACCO_USED_YEARS    6060
DIVORCED_OR_WIDOWED             521
BMI                             769
SEX                               0
RACE                              0
CANC_YR1                       1243
CANC_YR2                       2315
CANC_YR3                       3412
CANC_YR4                       4096
CANC_YR5                       5001
CANC_YR6                       5848
dtype: int64

(1759, 16)

Unnamed: 0,pid,age,race_2,ethnic,smokeday,smokeyr,divorced_or_widowed,bmi,sex,race,canc_yr_1,canc_yr_2,canc_yr_3,canc_yr_4,canc_yr_5,canc_yr_6
1,2,57,0.0,0.0,40.0,39.0,0.0,28.106899,Female,White,0.0,0.0,0.0,0.0,1.0,1.0
8,9,66,1.0,0.0,40.0,30.0,1.0,33.877022,Female,Black or African American,0.0,0.0,0.0,0.0,0.0,
16,17,64,1.0,0.0,20.0,36.0,0.0,39.354894,Female,Black or African American,0.0,0.0,0.0,0.0,,
17,18,62,0.0,0.0,20.0,41.0,0.0,29.757902,Male,White,0.0,0.0,0.0,0.0,0.0,0.0
20,21,56,0.0,0.0,20.0,27.0,0.0,25.747675,Female,White,0.0,0.0,0.0,,,
22,23,57,1.0,0.0,0.0,-2.0,0.0,31.351079,Female,Black or African American,1.0,1.0,1.0,1.0,1.0,1.0
24,25,62,1.0,0.0,20.0,25.0,0.0,27.690511,Male,Black or African American,0.0,0.0,0.0,0.0,0.0,
34,35,58,1.0,0.0,20.0,41.0,0.0,45.838329,Female,Black or African American,0.0,,,,,
35,36,54,0.0,0.0,20.0,36.0,0.0,25.718086,Female,White,0.0,0.0,0.0,0.0,0.0,0.0
47,48,76,0.0,0.0,20.0,48.0,0.0,21.521741,Male,Asian,0.0,0.0,0.0,0.0,,


CANCER COUNTS IN FULL SET
TEST YEAR 1 : 40 of 1759
TEST YEAR 2 : 61 of 1432
TEST YEAR 3 : 74 of 1063
TEST YEAR 4 : 87 of 891
TEST YEAR 5 : 96 of 704
TEST YEAR 6 : 101 of 537


In [18]:
def filter_path(path, exclude, at_least_one, include, trim=2):
    path = trim_path(path, trim)
    for value in exclude:
        if value.lower() in path.lower():
            return False
    for value in include:
        if value.lower() not in path.lower():
            return False
    for group in at_least_one:
        is_ok = False
        for value in group:
            if value.lower() in path.lower():
                is_ok = True
                break
        if not is_ok:
            return False
    return True

df_sybil["filter_ldct"] = df_sybil["path"].apply(filter_path, args=[
    [ # EXCLUDE
        "pt", "mr",
        "rt", "right", "lt", "left",
        "abd", "pelvis", 
        "skull",
        "cta", "angio", "angiography",
        "with contrast", "w cont", "wwo", "w iv contrast", "with and without contrast"
    ],
    [
        ["chest", "lung", "thorax"], # AT LEAST ONE
        ["screening", "ld", "low dose"]
    ], 
    ["ct"] # INCLUDE
])

df_sybil["filter_ctwocont"] = df_sybil["path"].apply(filter_path, args=[
    [ # EXCLUDE
        "pt", "mr",
        "rt", "right", "lt", "left",
        "abd", "pelvis", 
        "skull",
        "cta", "angio", "angiography",
        "with contrast", "w cont", "wwo", "w iv contrast", "with and without contrast",
        "screening", "ld", "low dose"
    ],
    [
        ["chest", "lung", "thorax"], # AT LEAST ONE
    ], 
    ["ct"] # INCLUDE
])

In [19]:
print(df_sybil.shape)

(53918, 11)


In [20]:
df_dx["DX_DATE"] = pd.to_datetime(df_dx["DX_DATE"])
earliest_dates = df_dx.groupby("PATIENT_ID")["DX_DATE"].min().reset_index()
df_dx = pd.merge(df_dx, earliest_dates, on=["PATIENT_ID", "DX_DATE"], how="inner")
df_dx = df_dx.drop_duplicates(subset="PATIENT_ID", keep="first")
print(df_dx.shape)
df_dx.head(10)

(870, 3)


Unnamed: 0,PATIENT_ID,DX,DX_DATE
0,2,ICD10CM:C34.12,2023-05-17
1,3,ICD10CM:C34.90,2017-11-03
2,5,ICD10CM:C34.90,2018-09-27
3,7,ICD10CM:C34.12,2023-10-02
5,8,ICD10CM:C34.2,2017-11-27
6,11,ICD10CM:C34.90,2017-03-16
7,12,ICD10CM:C34.11,2021-04-22
8,15,ICD10CM:C34.12,2017-05-17
9,16,ICD10CM:C34.90,2019-01-31
10,19,ICD10CM:C34.11,2021-08-05


In [21]:
df_sybil["dx_date"] = df_sybil["pid"].apply(get_dx_date, args=[df_dx])
df_sybil.head()

Unnamed: 0,path,pred_yr1,pred_yr2,pred_yr3,pred_yr4,pred_yr5,pred_yr6,pid,date,filter_ldct,filter_ctwocont,dx_date
0,screening_batch_1/limited/screening_660_2018-0...,0.00196,0.01048,0.01579,0.02472,0.03451,0.04836,660,2018-01-05,True,False,NaT
1,screening_batch_1/limited/screening_660_2018-0...,0.00196,0.00573,0.011,0.01572,0.02111,0.03595,660,2018-01-05,True,False,NaT
2,screening_batch_1/limited/screening_660_2018-0...,0.00042,0.00205,0.00502,0.00884,0.01417,0.02166,660,2018-01-05,True,False,NaT
3,screening_batch_1/limited/screening_660_2018-0...,0.00178,0.00528,0.0093,0.01345,0.0195,0.03296,660,2018-01-05,True,False,NaT
4,screening_batch_1/limited/screening_2035_2023-...,0.00178,0.00528,0.0093,0.0144,0.02111,0.0341,2035,2023-10-15,True,False,NaT


In [22]:
def get_dx_days(row):
    if pd.isna(row["dx_date"]):
        return np.nan
    return (row["dx_date"] - row["date"]).days

df_sybil["dx_days"] = df_sybil.apply(get_dx_days, axis=1)
df_sybil[~df_sybil['dx_days'].isna()].head()

Unnamed: 0,path,pred_yr1,pred_yr2,pred_yr3,pred_yr4,pred_yr5,pred_yr6,pid,date,filter_ldct,filter_ctwocont,dx_date,dx_days
289,neoplasm_batch_1/limited/neoplasm_3_2017-09-07...,0.25563,0.351,0.36531,0.38905,0.41002,0.45655,3,2017-09-07,False,False,2017-11-03,57.0
290,neoplasm_batch_1/limited/neoplasm_3_2017-09-07...,0.08556,0.14352,0.17286,0.1978,0.21198,0.2766,3,2017-09-07,False,False,2017-11-03,57.0
291,neoplasm_batch_1/limited/neoplasm_1589_2017-03...,0.07079,0.13432,0.16666,0.19426,0.20745,0.2766,1589,2017-03-03,False,False,2017-08-25,175.0
292,neoplasm_batch_1/limited/neoplasm_1589_2017-03...,0.00178,0.00528,0.0093,0.01345,0.0195,0.03296,1589,2017-03-03,False,False,2017-08-25,175.0
293,neoplasm_batch_1/limited/neoplasm_1539_2022-02...,0.02812,0.05019,0.07192,0.07927,0.09585,0.13842,1539,2022-02-03,False,True,2022-03-14,39.0


In [23]:
def get_cancer(row, year):
    if pd.isna(row['dx_days']):
        if REMOVE_UNKNOWN:
            if row['date'] + pd.DateOffset(year * 365) > pd.Timestamp('2024-03-16'): 
                return np.nan
        return 0
    return 1 if row['dx_days'] <= year * 365 else 0

n_years = 6
for year in range(1,n_years+1):
    df_sybil["canc_yr_" + str(year)] = df_sybil.apply(get_cancer, axis=1, args=[year])

In [24]:
df_dem_test.head()

Unnamed: 0,pid,age,race_2,ethnic,smokeday,smokeyr,divorced_or_widowed,bmi,sex,race,canc_yr_1,canc_yr_2,canc_yr_3,canc_yr_4,canc_yr_5,canc_yr_6
1,2,57,0.0,0.0,40.0,39.0,0.0,28.106899,Female,White,0.0,0.0,0.0,0.0,1.0,1.0
8,9,66,1.0,0.0,40.0,30.0,1.0,33.877022,Female,Black or African American,0.0,0.0,0.0,0.0,0.0,
16,17,64,1.0,0.0,20.0,36.0,0.0,39.354894,Female,Black or African American,0.0,0.0,0.0,0.0,,
17,18,62,0.0,0.0,20.0,41.0,0.0,29.757902,Male,White,0.0,0.0,0.0,0.0,0.0,0.0
20,21,56,0.0,0.0,20.0,27.0,0.0,25.747675,Female,White,0.0,0.0,0.0,,,


In [26]:
columns_to_add = [
    'pid', 'age', 'ethnic', 'race_2', 'bmi', 'smokeday', 'smokeyr', 'divorced_or_widowed', 'sex', 'race'
]
df_sybil_svm = pd.merge(df_sybil, df_dem_test[columns_to_add], on='pid', how='left')
df_sybil_svm = df_sybil_svm.dropna(subset=(columns_to_add + ['canc_yr_1']))

display(df_sybil_svm.shape)
display(df_sybil_svm.head())
display(df_sybil_svm.isna().sum())
PATH = (
    'C:/' +
    'Users/' +
    'Abdul Zakkar/' +
    'Documents/' +
    'UICOM/' +
    'research/' +
    'salahudeen/'
)
df_sybil_svm.to_csv(PATH + 'uic_complete_table_20240613.csv', index=False)

(10192, 28)

Unnamed: 0,path,pred_yr1,pred_yr2,pred_yr3,pred_yr4,pred_yr5,pred_yr6,pid,date,filter_ldct,...,canc_yr_6,age,ethnic,race_2,bmi,smokeday,smokeyr,divorced_or_widowed,sex,race
0,screening_batch_1/limited/screening_660_2018-0...,0.00196,0.01048,0.01579,0.02472,0.03451,0.04836,660,2018-01-05,True,...,0.0,56.0,0.0,1.0,27.168436,20.0,36.0,0.0,Male,Black or African American
1,screening_batch_1/limited/screening_660_2018-0...,0.00196,0.00573,0.011,0.01572,0.02111,0.03595,660,2018-01-05,True,...,0.0,56.0,0.0,1.0,27.168436,20.0,36.0,0.0,Male,Black or African American
2,screening_batch_1/limited/screening_660_2018-0...,0.00042,0.00205,0.00502,0.00884,0.01417,0.02166,660,2018-01-05,True,...,0.0,56.0,0.0,1.0,27.168436,20.0,36.0,0.0,Male,Black or African American
3,screening_batch_1/limited/screening_660_2018-0...,0.00178,0.00528,0.0093,0.01345,0.0195,0.03296,660,2018-01-05,True,...,0.0,56.0,0.0,1.0,27.168436,20.0,36.0,0.0,Male,Black or African American
24,screening_batch_1/limited/screening_2049_2023-...,0.00196,0.00679,0.01362,0.01729,0.02169,0.03595,2049,2023-03-01,True,...,,60.0,0.0,1.0,35.8744,20.0,39.0,0.0,Female,Black or African American


path                      0
pred_yr1                  0
pred_yr2                  0
pred_yr3                  0
pred_yr4                  0
pred_yr5                  0
pred_yr6                  0
pid                       0
date                      0
filter_ldct               0
filter_ctwocont           0
dx_date                9300
dx_days                9300
canc_yr_1                 0
canc_yr_2              2471
canc_yr_3              4388
canc_yr_4              5331
canc_yr_5              6385
canc_yr_6              7212
age                       0
ethnic                    0
race_2                    0
bmi                       0
smokeday                  0
smokeyr                   0
divorced_or_widowed       0
sex                       0
race                      0
dtype: int64

In [26]:
df_sybil_svm_ldct = df_sybil_svm[df_sybil_svm["filter_ldct"] == True]
df_sybil_svm_ctwocont = df_sybil_svm[df_sybil_svm["filter_ctwocont"] == True]


In [27]:
print("N unique PIDs with LCDT:", len(df_sybil_svm_ldct['pid'].unique()))
print("N unique PIDs with CT wo contrast:", len(df_sybil_svm_ctwocont['pid'].unique()))
print("N unique PIDs across both groups:",
    len(pd.concat([df_sybil_svm_ldct['pid'],df_sybil_svm_ctwocont['pid']]).unique()))

N unique PIDs with LCDT: 695
N unique PIDs with CT wo contrast: 512
N unique PIDs across both groups: 1084


In [28]:
test_uic_svm6_cols = [
    'age','ethnic','race_2','smokeday','smokeyr','divorced_or_widowed',
    'canc_yr_1','canc_yr_2','canc_yr_3','canc_yr_4','canc_yr_5','canc_yr_6'
]
test_uic_svm6 = df_dem_test[test_uic_svm6_cols]

test_uic_svm7_cols = [
    'age','ethnic','race_2','bmi','smokeday','smokeyr','divorced_or_widowed',
    'canc_yr_1','canc_yr_2','canc_yr_3','canc_yr_4','canc_yr_5','canc_yr_6'
]
test_uic_svm7 = df_dem_test[test_uic_svm7_cols].dropna(subset=['bmi'])

test_uic_svm6sybil_cols = [
    'age','ethnic','race_2','smokeday','smokeyr','divorced_or_widowed',
    'pred_yr1','pred_yr2','pred_yr3','pred_yr4','pred_yr5','pred_yr6',
    'canc_yr_1','canc_yr_2','canc_yr_3','canc_yr_4','canc_yr_5','canc_yr_6'
]

test_uic_svm6sybil = df_sybil_svm_ldct[test_uic_svm6sybil_cols]
test_uic_svm6sybil_white = df_sybil_svm_ldct[df_sybil_svm_ldct['race'] == 'White'][test_uic_svm6sybil_cols]
test_uic_svm6sybil_black = df_sybil_svm_ldct[df_sybil_svm_ldct['race'] == 'Black or African American'][test_uic_svm6sybil_cols]

test_uic_svm7sybil_cols = [
    'age','ethnic','race_2','bmi','smokeday','smokeyr','divorced_or_widowed',
    'pred_yr1','pred_yr2','pred_yr3','pred_yr4','pred_yr5','pred_yr6',
    'canc_yr_1','canc_yr_2','canc_yr_3','canc_yr_4','canc_yr_5','canc_yr_6'
]

test_uic_svm7sybil = df_sybil_svm_ldct[test_uic_svm7sybil_cols].dropna(subset=['bmi'])
test_uic_svm7sybil_white = df_sybil_svm_ldct[df_sybil_svm_ldct['race'] == 'White'][test_uic_svm7sybil_cols].dropna(subset=['bmi'])
test_uic_svm7sybil_black = df_sybil_svm_ldct[df_sybil_svm_ldct['race'] == 'Black or African American'][test_uic_svm7sybil_cols].dropna(subset=['bmi'])

test_uic_svm2sybil_cols = [
    'age','race_2',
    'pred_yr1','pred_yr2','pred_yr3','pred_yr4','pred_yr5','pred_yr6',
    'canc_yr_1','canc_yr_2','canc_yr_3','canc_yr_4','canc_yr_5','canc_yr_6'
]

test_uic_svm2sybil = df_sybil_svm_ldct[test_uic_svm2sybil_cols]
test_uic_svm2sybil_white = df_sybil_svm_ldct[df_sybil_svm_ldct['race'] == 'White'][test_uic_svm2sybil_cols]
test_uic_svm2sybil_black = df_sybil_svm_ldct[df_sybil_svm_ldct['race'] == 'Black or African American'][test_uic_svm2sybil_cols]

test_uic_sybil_cols = [
    'pred_yr1','pred_yr2','pred_yr3','pred_yr4','pred_yr5','pred_yr6',
    'canc_yr_1','canc_yr_2','canc_yr_3','canc_yr_4','canc_yr_5','canc_yr_6'
]

test_uic_sybil = df_sybil_svm_ldct[test_uic_sybil_cols]
test_uic_sybil_white = df_sybil_svm_ldct[df_sybil_svm_ldct['race'] == 'White'][test_uic_sybil_cols]
test_uic_sybil_black = df_sybil_svm_ldct[df_sybil_svm_ldct['race'] == 'Black or African American'][test_uic_sybil_cols]
test_uic_sybil_male = df_sybil_svm_ldct[df_sybil_svm_ldct['sex'] == 'Male'][test_uic_sybil_cols]
test_uic_sybil_female = df_sybil_svm_ldct[df_sybil_svm_ldct['sex'] == 'Female'][test_uic_sybil_cols]
test_uic_sybil_bmih = df_sybil_svm_ldct[df_sybil_svm_ldct['bmi'] >= 30.0][test_uic_sybil_cols]
test_uic_sybil_bmil = df_sybil_svm_ldct[df_sybil_svm_ldct['bmi'] < 30.0][test_uic_sybil_cols]

test_uic_sybil_ctwocont = df_sybil_svm_ctwocont[test_uic_sybil_cols]
test_uic_sybil_ctwocont_white = df_sybil_svm_ctwocont[df_sybil_svm_ctwocont['race'] == 'White'][test_uic_sybil_cols]
test_uic_sybil_ctwocont_black = df_sybil_svm_ctwocont[df_sybil_svm_ctwocont['race'] == 'Black or African American'][test_uic_sybil_cols]
test_uic_sybil_ctwocont_male = df_sybil_svm_ctwocont[df_sybil_svm_ctwocont['sex'] == 'Male'][test_uic_sybil_cols]
test_uic_sybil_ctwocont_female = df_sybil_svm_ctwocont[df_sybil_svm_ctwocont['sex'] == 'Female'][test_uic_sybil_cols]
test_uic_sybil_ctwocont_bmih = df_sybil_svm_ctwocont[df_sybil_svm_ctwocont['bmi'] >= 30.0][test_uic_sybil_cols]
test_uic_sybil_ctwocont_bmil = df_sybil_svm_ctwocont[df_sybil_svm_ctwocont['bmi'] < 30.0][test_uic_sybil_cols]

keepall = '' if REMOVE_UNKNOWN else '_keepall'
test_uic_svm6.to_csv(test_out_path + 'test_uic_svm6' + keepall + '.csv', index=False)
test_uic_svm7.to_csv(test_out_path + 'test_uic_svm7' + keepall + '.csv', index=False)
test_uic_svm2sybil.to_csv(test_out_path + 'test_uic_svm2sybil' + keepall + '.csv', index=False)
test_uic_svm2sybil_white.to_csv(test_out_path + 'test_uic_svm2sybil_white' + keepall + '.csv', index=False)
test_uic_svm2sybil_black.to_csv(test_out_path + 'test_uic_svm2sybil_black' + keepall + '.csv', index=False)
test_uic_svm6sybil.to_csv(test_out_path + 'test_uic_svm6sybil' + keepall + '.csv', index=False)
test_uic_svm6sybil_white.to_csv(test_out_path + 'test_uic_svm6sybil_white' + keepall + '.csv', index=False)
test_uic_svm6sybil_black.to_csv(test_out_path + 'test_uic_svm6sybil_black' + keepall + '.csv', index=False)
test_uic_svm7sybil.to_csv(test_out_path + 'test_uic_svm7sybil' + keepall + '.csv', index=False)
test_uic_svm7sybil_white.to_csv(test_out_path + 'test_uic_svm7sybil_white' + keepall + '.csv', index=False)
test_uic_svm7sybil_black.to_csv(test_out_path + 'test_uic_svm7sybil_black' + keepall + '.csv', index=False)
test_uic_sybil.to_csv(test_out_path + 'test_uic_sybil' + keepall + '.csv', index=False)
test_uic_sybil_white.to_csv(test_out_path + 'test_uic_sybil_white' + keepall + '.csv', index=False)
test_uic_sybil_black.to_csv(test_out_path + 'test_uic_sybil_black' + keepall + '.csv', index=False)
test_uic_sybil_male.to_csv(test_out_path + 'test_uic_sybil_male' + keepall + '.csv', index=False)
test_uic_sybil_female.to_csv(test_out_path + 'test_uic_sybil_female' + keepall + '.csv', index=False)
test_uic_sybil_bmih.to_csv(test_out_path + 'test_uic_sybil_bmih' + keepall + '.csv', index=False)
test_uic_sybil_bmil.to_csv(test_out_path + 'test_uic_sybil_bmil' + keepall + '.csv', index=False)
test_uic_sybil_ctwocont.to_csv(test_out_path + 'test_uic_sybil_ctwocont' + keepall + '.csv', index=False)
test_uic_sybil_ctwocont_white.to_csv(test_out_path + 'test_uic_sybil_ctwocont_white' + keepall + '.csv', index=False)
test_uic_sybil_ctwocont_black.to_csv(test_out_path + 'test_uic_sybil_ctwocont_black' + keepall + '.csv', index=False)
test_uic_sybil_ctwocont_male.to_csv(test_out_path + 'test_uic_sybil_ctwocont_male' + keepall + '.csv', index=False)
test_uic_sybil_ctwocont_female.to_csv(test_out_path + 'test_uic_sybil_ctwocont_female' + keepall + '.csv', index=False)
test_uic_sybil_ctwocont_bmih.to_csv(test_out_path + 'test_uic_sybil_ctwocont_bmih' + keepall + '.csv', index=False)
test_uic_sybil_ctwocont_bmil.to_csv(test_out_path + 'test_uic_sybil_ctwocont_bmil' + keepall + '.csv', index=False)