In [4]:
import numpy as np
import pandas as pd
from random import sample
        
# Set the random seed
# Seeds NumPy's global RNG. pandas' DataFrame.sample falls back to this global state
# whenever no random_state is passed, so the draws made by the functions below are
# repeatable only when the cells are executed in this order from a fresh kernel.
# NOTE(review): `from random import sample` above is not used by any cell visible here,
# and Python's `random` module is not seeded by this call.
np.random.seed(1234)

In [5]:
def different_cenored_data_support(support_data_csv_file, n_uncensored_per_group=1000,
                                   n_censored_per_group=500, random_state=None):
    """Build an uncensored baseline and four censoring variants from the SUPPORT csv.

    Rows are split on ``status`` (1 = uncensored, 0 = censored) and on ``race``
    (1 = white, used as the majority group; 2 = non-white, used as the minority group).
    A short summary of the group sizes is printed as a side effect.

    Parameters
    ----------
    support_data_csv_file : str or path-like
        CSV file read with ``pd.read_csv``; must contain ``status`` and ``race`` columns.
    n_uncensored_per_group : int, default 1000
        Number of uncensored subjects drawn from each race group for the baseline.
    n_censored_per_group : int, default 500
        Number of censored subjects per group added in the "increment" variants, and
        number of baseline subjects per group relabelled as censored in the "induced"
        variants.
    random_state : int, numpy.random.RandomState or None, default None
        Seed (or generator) shared by every draw in this call, so the same value gives
        the same five frames regardless of what ran before. ``None`` keeps the previous
        behaviour: pandas draws from NumPy's global random state.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(uncenored_data, increment_majority, increment_minority, induced_majority,
        induced_minority)``.

        * ``uncenored_data`` - baseline of uncensored subjects (white block, then non-white block).
        * ``increment_majority`` / ``increment_minority`` - baseline plus censored subjects
          from the white / non-white group, rows shuffled.
        * ``induced_majority`` / ``induced_minority`` - baseline where part of the white /
          non-white group has ``status`` overwritten with 0, rows shuffled. Only the
          ``status`` column is changed.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when a group has fewer rows than requested.
    """
    data = pd.read_csv(support_data_csv_file)
    data_uncen = data.loc[data["status"] == 1, :]
    data_cen = data.loc[data["status"] == 0, :]

    # One generator shared by all draws: successive samples differ from each other,
    # but the whole call is repeatable for a given seed.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    def _group(frame, race_code):
        # Rows of `frame` belonging to one race code.
        return frame.loc[frame["race"] == race_code, :]

    def _percent(part, whole):
        # Share in percent; NaN instead of ZeroDivisionError when there are no censored rows.
        return part / whole * 100 if whole else float("nan")

    def _draw(frame, n):
        # Sample n rows without replacement.
        return frame.sample(n=n, random_state=rng)

    def _shuffled(parts):
        # Stack the parts and return the rows in random order.
        return pd.concat(parts).sample(frac=1.0, random_state=rng)

    uncen_white_all = _group(data_uncen, 1)
    uncen_non_white_all = _group(data_uncen, 2)
    cen_white_all = _group(data_cen, 1)
    cen_non_white_all = _group(data_cen, 2)

    ## Print the summary of the dataset
    print('Dimension of uncensored:', data_uncen.shape)
    print('Dimension of censored:', data_cen.shape)
    print('Percentage of white censored: {:.2f}%'.format(_percent(cen_white_all.shape[0], data_cen.shape[0])))
    print('Percentage of non-white censored: {:.2f}%'.format(_percent(cen_non_white_all.shape[0], data_cen.shape[0])))
    print('No. of uncensored white:', uncen_white_all.shape[0])
    print('No. of uncensored non-white:', uncen_non_white_all.shape[0])
    print('No. of censored white:', cen_white_all.shape[0])
    print('No. of censored non-white:', cen_non_white_all.shape[0])

    # Baseline: equal-sized uncensored samples from both groups.
    uncen_white = _draw(uncen_white_all, n_uncensored_per_group)
    uncen_non_white = _draw(uncen_non_white_all, n_uncensored_per_group)
    uncenored_data = pd.concat([uncen_white, uncen_non_white])

    # "Increment" variants: genuinely censored subjects are appended to the baseline.
    increment_cen_white = _draw(cen_white_all, n_censored_per_group)
    increment_cen_non_white = _draw(cen_non_white_all, n_censored_per_group)
    # NOTE(review): increment_all is built but is not part of the returned tuple.
    # It is kept because its shuffle consumes random numbers; dropping it would change
    # every later draw for callers relying on the global seed. Confirm whether it
    # should be returned or removed.
    increment_all = _shuffled([uncenored_data, increment_cen_white, increment_cen_non_white])
    increment_majority = _shuffled([uncenored_data, increment_cen_white])
    increment_minority = _shuffled([uncenored_data, increment_cen_non_white])

    # "Induced" variants: baseline subjects are relabelled as censored (status 1 -> 0).
    # assign() returns a new frame, so the baseline samples keep status == 1.
    induced_cen_white = _draw(uncen_white, n_censored_per_group).assign(status=0)
    induced_uncen_white = uncen_white[~uncen_white.index.isin(induced_cen_white.index)]

    induced_cen_non_white = _draw(uncen_non_white, n_censored_per_group).assign(status=0)
    induced_uncen_non_white = uncen_non_white[~uncen_non_white.index.isin(induced_cen_non_white.index)]

    induced_majority = _shuffled([induced_uncen_white, induced_cen_white, uncen_non_white])
    induced_minority = _shuffled([induced_uncen_non_white, induced_cen_non_white, uncen_white])

    return uncenored_data, increment_majority, increment_minority, induced_majority, induced_minority

In [6]:
# Build the SUPPORT variants; the function prints the group-size summary shown below.
# NOTE(review): the path is absolute and machine-specific, so this cell fails on any other host.
# NOTE(review): the FLChain and SEER cells further down assign to these same five names,
# so after a full run they no longer hold the SUPPORT frames.
uncenored_data, increment_majority, increment_minority, induced_majority, induced_minority=different_cenored_data_support('/home/local/AD/mrahman6/0.KDD_2022/0.FISA_FINAL_EXPERIMENT/Data/SUPPORT/support.csv')

Dimension of uncensored: (6168, 34)
Dimension of censored: (2894, 34)
Percentage of white censored: 77.51%
Percentage of non-white censored: 22.49%
No. of uncensored white: 4947
No. of uncensored non-white: 1221
No. of censored white: 2243
No. of censored non-white: 651


In [7]:
def different_cenored_flchain(flchain_data_csv_file, n_uncensored_per_group=500,
                              n_censored_per_group=250, random_state=None):
    """Build an uncensored baseline and four censoring variants from the FLChain csv.

    Rows are split on ``status`` (1 = uncensored, 0 = censored) and on ``sex``
    (1 = male, 0 = female). Male is used as the "majority" group and female as the
    "minority" group. A short summary of the group sizes is printed as a side effect.

    Parameters
    ----------
    flchain_data_csv_file : str or path-like
        CSV file read with ``pd.read_csv``; must contain ``status`` and ``sex`` columns.
    n_uncensored_per_group : int, default 500
        Number of uncensored subjects drawn from each sex group for the baseline.
    n_censored_per_group : int, default 250
        Number of censored subjects per group added in the "increment" variants, and
        number of baseline subjects per group relabelled as censored in the "induced"
        variants.
    random_state : int, numpy.random.RandomState or None, default None
        Seed (or generator) shared by every draw in this call, so the same value gives
        the same five frames regardless of what ran before. ``None`` keeps the previous
        behaviour: pandas draws from NumPy's global random state.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(uncenored_data, increment_majority, increment_minority, induced_majority,
        induced_minority)``.

        * ``uncenored_data`` - baseline of uncensored subjects (male block, then female block).
        * ``increment_majority`` / ``increment_minority`` - baseline plus censored subjects
          from the male / female group, rows shuffled.
        * ``induced_majority`` / ``induced_minority`` - baseline where part of the male /
          female group has ``status`` overwritten with 0, rows shuffled. Only the
          ``status`` column is changed.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when a group has fewer rows than requested.
    """
    # NOTE(review): the summary printed for flchain.csv in this notebook lists more female
    # than male subjects, yet male is labelled "majority" here - confirm this is intended.
    data = pd.read_csv(flchain_data_csv_file)
    data_uncen = data.loc[data["status"] == 1, :]
    data_cen = data.loc[data["status"] == 0, :]

    # One generator shared by all draws: successive samples differ from each other,
    # but the whole call is repeatable for a given seed.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    def _group(frame, sex_code):
        # Rows of `frame` belonging to one sex code.
        return frame.loc[frame["sex"] == sex_code, :]

    def _percent(part, whole):
        # Share in percent; NaN instead of ZeroDivisionError when there are no censored rows.
        return part / whole * 100 if whole else float("nan")

    def _draw(frame, n):
        # Sample n rows without replacement.
        return frame.sample(n=n, random_state=rng)

    def _shuffled(parts):
        # Stack the parts and return the rows in random order.
        return pd.concat(parts).sample(frac=1.0, random_state=rng)

    uncen_male_all = _group(data_uncen, 1)
    uncen_female_all = _group(data_uncen, 0)
    cen_male_all = _group(data_cen, 1)
    cen_female_all = _group(data_cen, 0)

    ## Print the summary of the dataset
    print('Dimension of uncensored:', data_uncen.shape)
    print('Dimension of censored:', data_cen.shape)
    print('Percentage of male censored: {:.2f}%'.format(_percent(cen_male_all.shape[0], data_cen.shape[0])))
    print('Percentage of female censored: {:.2f}%'.format(_percent(cen_female_all.shape[0], data_cen.shape[0])))
    print('No. of uncensored male:', uncen_male_all.shape[0])
    print('No. of uncensored female:', uncen_female_all.shape[0])
    print('No. of censored male:', cen_male_all.shape[0])
    print('No. of censored female:', cen_female_all.shape[0])

    # Baseline: equal-sized uncensored samples from both groups.
    uncen_male = _draw(uncen_male_all, n_uncensored_per_group)
    uncen_female = _draw(uncen_female_all, n_uncensored_per_group)
    uncenored_data = pd.concat([uncen_male, uncen_female])

    # "Increment" variants: genuinely censored subjects are appended to the baseline.
    increment_cen_male = _draw(cen_male_all, n_censored_per_group)
    increment_cen_female = _draw(cen_female_all, n_censored_per_group)
    # NOTE(review): increment_all is built but is not part of the returned tuple.
    # It is kept because its shuffle consumes random numbers; dropping it would change
    # every later draw for callers relying on the global seed. Confirm whether it
    # should be returned or removed.
    increment_all = _shuffled([uncenored_data, increment_cen_male, increment_cen_female])
    increment_majority = _shuffled([uncenored_data, increment_cen_male])
    increment_minority = _shuffled([uncenored_data, increment_cen_female])

    # "Induced" variants: baseline subjects are relabelled as censored (status 1 -> 0).
    # assign() returns a new frame, so the baseline samples keep status == 1.
    induced_cen_male = _draw(uncen_male, n_censored_per_group).assign(status=0)
    induced_uncen_male = uncen_male[~uncen_male.index.isin(induced_cen_male.index)]

    induced_cen_female = _draw(uncen_female, n_censored_per_group).assign(status=0)
    induced_uncen_female = uncen_female[~uncen_female.index.isin(induced_cen_female.index)]

    induced_majority = _shuffled([induced_uncen_male, induced_cen_male, uncen_female])
    induced_minority = _shuffled([induced_uncen_female, induced_cen_female, uncen_male])

    return uncenored_data, increment_majority, increment_minority, induced_majority, induced_minority

In [8]:
# Build the FLChain variants; the function prints the group-size summary shown below.
# NOTE(review): the path is absolute and machine-specific, so this cell fails on any other host.
# NOTE(review): these five names were bound to the SUPPORT frames by an earlier cell and are
# overwritten here; the SEER cell further down overwrites them again.
uncenored_data, increment_majority, increment_minority, induced_majority, induced_minority=different_cenored_flchain('/home/local/AD/mrahman6/0.KDD_2022/0.FISA_FINAL_EXPERIMENT/Data/FLC/flchain.csv')

Dimension of uncensored: (1959, 10)
Dimension of censored: (4562, 10)
Percentage of male censored: 44.76%
Percentage of female censored: 55.24%
No. of uncensored male: 890
No. of uncensored female: 1069
No. of censored male: 2042
No. of censored female: 2520


In [11]:
def different_cenored_seer(seer_data_csv_file, n_uncensored_per_group=500,
                           n_censored_per_group=250, random_state=None):
    """Build an uncensored baseline and four censoring variants from the SEER csv.

    Rows with ``Race_ord == 5`` are dropped. The rest are split on ``status``
    (1 = uncensored, 0 = censored) and on ``Race_ord`` (1 = white, 2 = black, 3 = asian,
    4 = hispanic). White is used as the "majority" group; black, asian and hispanic
    together form the "minority" groups. A short summary of the group sizes is printed
    as a side effect.

    Parameters
    ----------
    seer_data_csv_file : str or path-like
        CSV file read with ``pd.read_csv``; must contain ``status`` and ``Race_ord`` columns.
    n_uncensored_per_group : int, default 500
        Number of uncensored subjects drawn from each of the four race groups for the baseline.
    n_censored_per_group : int, default 250
        Number of censored subjects per group added in the "increment" variants, and
        number of baseline subjects per group relabelled as censored in the "induced"
        variants. The minority variants therefore involve three times this many
        censored rows (one set per minority group).
    random_state : int, numpy.random.RandomState or None, default None
        Seed (or generator) shared by every draw in this call, so the same value gives
        the same five frames regardless of what ran before. ``None`` keeps the previous
        behaviour: pandas draws from NumPy's global random state.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(uncenored_data, increment_majority, increment_minority, induced_majority,
        induced_minority)``.

        * ``uncenored_data`` - baseline of uncensored subjects (white, black, asian, hispanic blocks).
        * ``increment_majority`` - baseline plus censored white subjects, rows shuffled.
        * ``increment_minority`` - baseline plus censored black, asian and hispanic subjects, rows shuffled.
        * ``induced_majority`` - baseline where part of the white group has ``status``
          overwritten with 0, rows shuffled.
        * ``induced_minority`` - baseline where part of each of the black, asian and hispanic
          groups has ``status`` overwritten with 0, rows shuffled.

        Only the ``status`` column is changed in the induced variants.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when a group has fewer rows than requested.
    """
    data = pd.read_csv(seer_data_csv_file)
    data = data[data['Race_ord'] != 5]
    data_uncen = data.loc[data["status"] == 1, :]
    data_cen = data.loc[data["status"] == 0, :]

    # One generator shared by all draws: successive samples differ from each other,
    # but the whole call is repeatable for a given seed.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    def _percent(part, whole):
        # Share in percent; NaN instead of ZeroDivisionError when there are no censored rows.
        return part / whole * 100 if whole else float("nan")

    def _draw(frame, n):
        # Sample n rows without replacement.
        return frame.sample(n=n, random_state=rng)

    def _shuffled(parts):
        # Stack the parts and return the rows in random order.
        return pd.concat(parts).sample(frac=1.0, random_state=rng)

    # Group order is significant: it fixes both the order of the random draws and the
    # order in which frames are stacked before shuffling.
    race_groups = ((1, 'white'), (2, 'black'), (3, 'asian'), (4, 'hispanic'))
    majority_code = 1
    minority_codes = (2, 3, 4)

    uncen_by_race = {code: data_uncen.loc[data_uncen["Race_ord"] == code, :] for code, _ in race_groups}
    cen_by_race = {code: data_cen.loc[data_cen["Race_ord"] == code, :] for code, _ in race_groups}

    ## Print the summary of the dataset
    print('Dimension of uncensored:', data_uncen.shape)
    print('Dimension of censored:', data_cen.shape)
    for code, label in race_groups:
        print('Percentage of {} censored: {:.2f}%'.format(label, _percent(cen_by_race[code].shape[0], data_cen.shape[0])))
    for code, label in race_groups:
        print('No. of uncensored {}:'.format(label), uncen_by_race[code].shape[0])
    for code, label in race_groups:
        print('No. of censored {}:'.format(label), cen_by_race[code].shape[0])

    # Baseline: equal-sized uncensored samples from every group.
    baseline = {code: _draw(uncen_by_race[code], n_uncensored_per_group) for code, _ in race_groups}
    uncenored_data = pd.concat([baseline[code] for code, _ in race_groups])

    # "Increment" variants: genuinely censored subjects are appended to the baseline.
    added = {code: _draw(cen_by_race[code], n_censored_per_group) for code, _ in race_groups}
    # NOTE(review): increment_all is built but is not part of the returned tuple.
    # It is kept because its shuffle consumes random numbers; dropping it would change
    # every later draw for callers relying on the global seed. Confirm whether it
    # should be returned or removed.
    increment_all = _shuffled([uncenored_data] + [added[code] for code, _ in race_groups])
    increment_majority = _shuffled([uncenored_data, added[majority_code]])
    increment_minority = _shuffled([uncenored_data] + [added[code] for code in minority_codes])

    # "Induced" variants: baseline subjects are relabelled as censored (status 1 -> 0).
    # assign() returns a new frame, so the baseline samples keep status == 1.
    induced_cen = {}
    induced_uncen = {}
    for code, _ in race_groups:
        relabelled = _draw(baseline[code], n_censored_per_group).assign(status=0)
        induced_cen[code] = relabelled
        induced_uncen[code] = baseline[code][~baseline[code].index.isin(relabelled.index)]

    induced_majority = _shuffled(
        [induced_uncen[majority_code], induced_cen[majority_code]]
        + [baseline[code] for code in minority_codes]
    )

    minority_parts = []
    for code in minority_codes:
        minority_parts.extend([induced_uncen[code], induced_cen[code]])
    minority_parts.append(baseline[majority_code])
    induced_minority = _shuffled(minority_parts)

    return uncenored_data, increment_majority, increment_minority, induced_majority, induced_minority

In [12]:
# Build the SEER variants; the function prints the group-size summary shown below.
# NOTE(review): the path is absolute and machine-specific, so this cell fails on any other host.
# NOTE(review): these five names were bound to the SUPPORT and then the FLChain frames by
# earlier cells; after this cell they hold only the SEER frames.
uncenored_data, increment_majority, increment_minority, induced_majority, induced_minority=different_cenored_seer('/home/local/AD/mrahman6/0.KDD_2022/0.FISA_FINAL_EXPERIMENT/Data/SEER/seer_data.csv')

Dimension of uncensored: (6396, 15)
Dimension of censored: (18923, 15)
Percentage of white censored: 54.44%
Percentage of black censored: 9.97%
Percentage of asian censored: 25.83%
Percentage of hispanic censored: 9.76%
No. of uncensored white: 3506
No. of uncensored black: 1002
No. of uncensored asian: 1319
No. of uncensored hispanic: 569
No. of censored white: 10301
No. of censored black: 1887
No. of censored asian: 4888
No. of censored hispanic: 1847
