In [1]:
import numpy as np
import pandas as pd
import scipy.stats as st

# File paths
root_path = 'C:/Users/Abdul Zakkar/Documents/UICOM/research/salahudeen/'

nlst_path = root_path + 'package-nlst-1125.2023-11-21/participant.data.d040722.csv/participant_d040722.csv'
sybil_path = root_path + 'sybil_predictions.csv'
split_path = root_path + 'pid2split.csv'

train_out_path = root_path + 'test_train_sets/train/'
test_out_path = root_path + 'test_train_sets/test/'

In [2]:
REMOVE_UNKNOWN = True

In [3]:
# Import dataframes.
df_nlst = pd.read_csv(nlst_path)
df_sybil = pd.read_csv(sybil_path)
df_split = pd.read_csv(split_path)

  df_nlst = pd.read_csv(nlst_path)


In [4]:
# Replace missing value indicators with NaN.

df_nlst['educat'] = df_nlst['educat'].replace([8, 95, 98, 99], np.nan)

df_nlst['ethnic'] = df_nlst['ethnic'].replace([7, 95, 98, 99], np.nan)
df_nlst['ethnic'] = df_nlst['ethnic'].replace(2, 0)

df_nlst['marital'] = df_nlst['marital'].replace([7, 9], np.nan)

df_nlst['race'] = df_nlst['race'].replace([7, 95, 96, 98, 99], np.nan)

In [5]:
# Create new columns.

df_nlst['smoking_quit_time'] = np.where(df_nlst['age_quit'].isna(), 0, df_nlst['age'] - df_nlst['age_quit']).astype(int)
df_nlst['bmi'] = (df_nlst['weight'] * 703) / (df_nlst['height'] * df_nlst['height'])
cancer_columns = [col for col in df_nlst.columns if 'canc' in col and len(col) == 8]
df_nlst["cancer_hist"] = df_nlst[cancer_columns].any(axis=1).astype(int)
family_columns = [col for col in df_nlst.columns if 'fam' in col]
df_nlst["family_hist_lung_cancer"] = df_nlst[family_columns].any(axis=1).astype(int)
df_nlst["divorced_or_widowed"] = np.where(
    (df_nlst['marital'] == 3) | (df_nlst['marital'] == 5),
    1,
    np.where(
        np.isnan(df_nlst['marital']),
        np.nan,
        0
    )
)
df_nlst["bachelors_and_above"] = np.where(
    (df_nlst['educat'] == 6) | (df_nlst['educat'] == 7),
    1,
    np.where(
        np.isnan(df_nlst['educat']),
        np.nan,
        0
    )
)

In [6]:
def get_race_plco(row):
    if row['ethnic'] == 1:
        return 8
    if row['race'] == 6:
        return np.nan
    return row['race']

def get_race_n(row, n):
    if row['race'] == n:
        return 1
    if np.isnan(row['race']):
        return np.nan
    return 0

def get_educat_plco(row):
    if row['educat'] > 1:
        return row['educat'] - 1
    return row['educat']

# Create race columns for use by my SVM and PLCOm2012
df_nlst['race_plco'] = df_nlst.apply(get_race_plco, axis=1)
df_nlst['race_2'] = df_nlst.apply(get_race_n, n=2, axis=1)

# Create educat column to match PLCOm2012
df_nlst['educat_plco'] = df_nlst.apply(get_educat_plco, axis=1)

In [7]:
# Add truth columns
def ground_truth(row, year):
    if pd.isna(row['candx_days']):
        if REMOVE_UNKNOWN:
            if row['canc_free_days'] <= year*365:
                return np.nan
        return 0
    return 1 if row['candx_days'] <= year*365 else 0

for year in range(1,6+1):
    df_nlst["canc_yr_" + str(year)] = df_nlst.apply(ground_truth, axis=1, args=[year])
    n_cancer = df_nlst["canc_yr_" + str(year)].sum()
    n_total = df_nlst.shape[0] - df_nlst["canc_yr_" + str(year)].isna().sum()
    print(f"{int(n_cancer)} positive out of {n_total} known outcomes by year {year}.")

484 positive out of 52717 known outcomes by year 1.
794 positive out of 52093 known outcomes by year 2.
1129 positive out of 51404 known outcomes by year 3.
1363 positive out of 50642 known outcomes by year 4.
1624 positive out of 49817 known outcomes by year 5.
1922 positive out of 47296 known outcomes by year 6.


In [8]:
year = 6

# For exposure years, convert missing values to 0
corr_dict = {}
for col in df_nlst.columns:
    if 'yrs' == col[:3] and len(col) == 7:
        df_nlst[col] = df_nlst[col].replace([np.nan], 0)
        current_df = df_nlst[[col, 'canc_yr_' + str(year)]]
        current_df = current_df.dropna()
        corr = st.pearsonr(current_df[col], current_df['canc_yr_' + str(year)])
        corr_dict[col] = corr[1]
top_work_exposure = []
for key, value in corr_dict.items():
    if value < 0.05:
        top_work_exposure.append(key)
        print(key, value)

# New top work exposures column
df_nlst["max_top_work_exposure"] = df_nlst[top_work_exposure].max(axis=1)

yrsasbe 0.00673578720985411
yrschem 0.03746494091043966
yrscoal 0.005748144195031929
yrsfoun 5.411920331810031e-07
yrssand 0.011245120604115864


In [9]:
# Add data split column

output = []
for pid in df_nlst['pid']:
    value = df_split[df_split['PID'] == pid]['SPLIT'].values
    if len(value) > 0:
        output.append(value[0])
    else:
        output.append('unseen')
df_nlst['pid2split'] = output

In [10]:
columns = [
    'age', 'ethnic', 'race_2', 'race_plco', 'educat_plco', 'cigsmok',
    'smokeday', 'smokeyr', 'smoking_quit_time', 'bmi', 'diagcopd', 'cancer_hist',
    'family_hist_lung_cancer', 'divorced_or_widowed', 'bachelors_and_above',
    'max_top_work_exposure',

    'candx_days', 'canc_free_days',
    'canc_yr_1', 'canc_yr_2', 'canc_yr_3', 'canc_yr_4', 'canc_yr_5', 'canc_yr_6',

    'pid', 'rndgroup', 'pid2split'
]

df = df_nlst[columns]

In [11]:
# Count missing values
print(df.shape)
df.isna().sum()

(53452, 27)


age                            0
ethnic                       399
race_2                       370
race_plco                    835
educat_plco                 1234
cigsmok                        0
smokeday                       0
smokeyr                        0
smoking_quit_time              0
bmi                          361
diagcopd                     336
cancer_hist                    0
family_hist_lung_cancer        0
divorced_or_widowed          339
bachelors_and_above         1234
max_top_work_exposure          0
candx_days                 51394
canc_free_days                 0
canc_yr_1                    735
canc_yr_2                   1359
canc_yr_3                   2048
canc_yr_4                   2810
canc_yr_5                   3635
canc_yr_6                   6156
pid                            0
rndgroup                       0
pid2split                      0
dtype: int64

In [12]:
display(df.shape)
df.head()

(53452, 27)

Unnamed: 0,age,ethnic,race_2,race_plco,educat_plco,cigsmok,smokeday,smokeyr,smoking_quit_time,bmi,...,canc_free_days,canc_yr_1,canc_yr_2,canc_yr_3,canc_yr_4,canc_yr_5,canc_yr_6,pid,rndgroup,pid2split
0,70,0.0,0.0,1.0,1.0,1,30,66,0,19.224898,...,2353,0.0,0.0,0.0,0.0,0.0,0.0,100001,2,unseen
1,66,0.0,0.0,1.0,2.0,1,20,52,0,26.605753,...,2391,0.0,0.0,0.0,0.0,0.0,0.0,100002,1,train
2,64,0.0,0.0,1.0,2.0,1,30,44,0,25.82449,...,2381,0.0,0.0,0.0,0.0,0.0,0.0,100003,2,unseen
3,60,0.0,0.0,1.0,4.0,0,40,17,15,29.411224,...,2688,0.0,0.0,0.0,0.0,0.0,0.0,100004,1,dev
4,64,0.0,0.0,1.0,1.0,0,40,46,3,34.453108,...,2435,0.0,0.0,0.0,0.0,0.0,0.0,100005,1,test


In [13]:
df_clin = df.dropna(subset=df.columns.difference(
    ['candx_days', 'pid', 'pid2split', 'rndgroup'] + ['canc_yr_'+str(i) for i in range(1,6+1)]))

display(df_clin.shape)
df_clin.head()

(50849, 27)

Unnamed: 0,age,ethnic,race_2,race_plco,educat_plco,cigsmok,smokeday,smokeyr,smoking_quit_time,bmi,...,canc_free_days,canc_yr_1,canc_yr_2,canc_yr_3,canc_yr_4,canc_yr_5,canc_yr_6,pid,rndgroup,pid2split
0,70,0.0,0.0,1.0,1.0,1,30,66,0,19.224898,...,2353,0.0,0.0,0.0,0.0,0.0,0.0,100001,2,unseen
1,66,0.0,0.0,1.0,2.0,1,20,52,0,26.605753,...,2391,0.0,0.0,0.0,0.0,0.0,0.0,100002,1,train
2,64,0.0,0.0,1.0,2.0,1,30,44,0,25.82449,...,2381,0.0,0.0,0.0,0.0,0.0,0.0,100003,2,unseen
3,60,0.0,0.0,1.0,4.0,0,40,17,15,29.411224,...,2688,0.0,0.0,0.0,0.0,0.0,0.0,100004,1,dev
4,64,0.0,0.0,1.0,1.0,0,40,46,3,34.453108,...,2435,0.0,0.0,0.0,0.0,0.0,0.0,100005,1,test


In [14]:
df_clin_test = df_clin[df_clin['pid2split'] == 'test']
df_clin_train = df_clin[~(df_clin['pid2split'] == 'test')]

display(df_clin_test.shape)
display(df_clin_train.shape)

(2221, 27)

(48628, 27)

In [16]:
plco_columns = [
    'age', 'race_plco', 'educat_plco', 'bmi', 'diagcopd', 'cancer_hist',
    'family_hist_lung_cancer', 'cigsmok', 'smokeday', 'smokeyr', 'smoking_quit_time',
    'canc_yr_1', 'canc_yr_2', 'canc_yr_3', 'canc_yr_4', 'canc_yr_5', 'canc_yr_6'
]
       
svm11_columns = [
    'age', 'ethnic', 'race_2', 'cigsmok', 'smokeday', 'smokeyr', 'bmi',
    'family_hist_lung_cancer', 'divorced_or_widowed', 'bachelors_and_above', 'max_top_work_exposure',
    'canc_yr_1', 'canc_yr_2', 'canc_yr_3', 'canc_yr_4', 'canc_yr_5', 'canc_yr_6'
]

svm6_columns = [
    'age', 'ethnic', 'race_2', 'smokeday', 'smokeyr', 'divorced_or_widowed',
    'canc_yr_1', 'canc_yr_2', 'canc_yr_3', 'canc_yr_4', 'canc_yr_5', 'canc_yr_6'
]

svm7_columns = [
    'age', 'ethnic', 'race_2', 'bmi', 'smokeday', 'smokeyr', 'divorced_or_widowed',
    'canc_yr_1', 'canc_yr_2', 'canc_yr_3', 'canc_yr_4', 'canc_yr_5', 'canc_yr_6'
]

df_clin_test_plco = df_clin_test[plco_columns]
df_clin_train_plco = df_clin_train[plco_columns]

df_clin_all_plco_white = df_clin[df_clin['race_plco'] == 1][plco_columns]
df_clin_all_plco_black = df_clin[df_clin['race_plco'] == 2][plco_columns]

plco_col_dict = {
    'age': 'age',
    'race_plco': 'race',
    'educat_plco': 'education',
    'bmi': 'bmi',
    'diagcopd': 'copd',
    'cancer_hist': 'cancer_hist',
    'family_hist_lung_cancer': 'family_hist_lung_cancer',
    'cigsmok': 'smoking_status',
    'smokeday': 'cig_day',
    'smokeyr': 'smoking_years',
    'smoking_quit_time': 'quit_years'
}

df_clin_test_plco = df_clin_test_plco.rename(columns = plco_col_dict)
df_clin_train_plco = df_clin_train_plco.rename(columns = plco_col_dict)
df_clin_all_plco_white = df_clin_all_plco_white.rename(columns = plco_col_dict)
df_clin_all_plco_black = df_clin_all_plco_black.rename(columns = plco_col_dict)

df_clin_test_svm11 = df_clin_test[svm11_columns]
df_clin_train_svm11 = df_clin_train[svm11_columns]

df_clin_test_svm6 = df_clin_test[svm6_columns]
df_clin_train_svm6 = df_clin_train[svm6_columns]

df_clin_test_svm7 = df_clin_test[svm7_columns]
df_clin_train_svm7 = df_clin_train[svm7_columns]

print('df_clin_test_plco shape:', df_clin_test_plco.shape)
print('df_clin_train_plco shape:', df_clin_train_plco.shape)
print('df_clin_all_plco_white shape:', df_clin_all_plco_white.shape)
print('df_clin_all_plco_black shape:', df_clin_all_plco_black.shape)
print('df_clin_test_svm11 shape:', df_clin_test_svm11.shape)
print('df_clin_train_svm11 shape:', df_clin_train_svm11.shape)
print('df_clin_test_svm6 shape:', df_clin_test_svm6.shape)
print('df_clin_train_svm6 shape:', df_clin_train_svm6.shape)
print('df_clin_test_svm7 shape:', df_clin_test_svm7.shape)
print('df_clin_train_svm7 shape:', df_clin_train_svm7.shape)

#Write
keepall = '' if REMOVE_UNKNOWN else '_keepall'
df_clin_train_svm11.to_csv(train_out_path + 'train_nlst_svm11' + keepall + '.csv', index=False)
df_clin_train_svm6.to_csv(train_out_path + 'train_nlst_svm6' + keepall + '.csv', index=False)
df_clin_train_svm7.to_csv(train_out_path + 'train_nlst_svm7' + keepall + '.csv', index=False)

df_clin_test_plco.to_csv(test_out_path + 'test_nlst_plcom2012' + keepall + '.csv', index=False)
df_clin_all_plco_white.to_csv(test_out_path + 'test_nlst_all_plcom2012_white' + keepall + '.csv', index=False)
df_clin_all_plco_black.to_csv(test_out_path + 'test_nlst_all_plcom2012_black' + keepall + '.csv', index=False)
df_clin_test_svm11.to_csv(test_out_path + 'test_nlst_svm11' + keepall + '.csv', index=False)
df_clin_test_svm6.to_csv(test_out_path + 'test_nlst_svm6' + keepall + '.csv', index=False)
df_clin_test_svm7.to_csv(test_out_path + 'test_nlst_svm7' + keepall + '.csv', index=False)

df_clin_test_plco shape: (2221, 17)
df_clin_train_plco shape: (48628, 17)
df_clin_all_plco_white shape: (46454, 17)
df_clin_all_plco_black shape: (2254, 17)
df_clin_test_svm11 shape: (2221, 17)
df_clin_train_svm11 shape: (48628, 17)
df_clin_test_svm6 shape: (2221, 12)
df_clin_train_svm6 shape: (48628, 12)
df_clin_test_svm7 shape: (2221, 13)
df_clin_train_svm7 shape: (48628, 13)


In [47]:
columns_to_add = [
    'age', 'ethnic', 'race_2', 'cigsmok', 'smokeday', 'smokeyr', 'bmi',
	'family_hist_lung_cancer', 'divorced_or_widowed', 'bachelors_and_above',
	'max_top_work_exposure', 'race_plco',
	
    'candx_days', 'canc_free_days',

    'pid', 'pid2split'
]
df_sybil=pd.merge(df_sybil, df[columns_to_add], on='pid', how='left')

In [48]:
study_day = []
for index, row in df_sybil.iterrows():
    study_day.append(
        df_nlst.loc[(df_nlst['pid'] == row['pid'])]['scr_days' + str(row['study_yr'])].values[0]
    )
df_sybil['study_day'] = study_day

In [49]:
output = []
for index, row in df_sybil.iterrows():
    if np.isnan(row["candx_days"]):
        if REMOVE_UNKNOWN:
            output.append([0 if row['canc_free_days'] >= i else np.nan for i in [(j+1)*365 for j in range(6)]])
        else:
            output.append([0 for i in range(6)])
    else:
        days_diff = row["candx_days"] - row["study_day"]
        output.append([1 if days_diff <= i else 0 for i in [(j+1)*365 for j in range(6)]])
df_sybil[["canc_yr_" + str(y) for y in range(1,6+1)]] = output

In [50]:
def increase_age(row):
    new_age = row['age'] + np.floor(row['study_day'] / 365)
    return new_age
df_sybil['age'] = df_sybil.apply(increase_age, axis=1)

In [51]:
def increase_smokeyr(row):
    new_smokeyr = row['smokeyr'] + np.floor(row['study_day'] / 365) * row['cigsmok']
    return new_smokeyr
df_sybil['smokeyr'] = df_sybil.apply(increase_smokeyr, axis=1)

In [52]:
display(df_sybil.columns)
df_sybil.head()

Index(['pid', 'study_yr', 'unique_id', 'pred_yr1', 'pred_yr2', 'pred_yr3',
       'pred_yr4', 'pred_yr5', 'pred_yr6', 'age', 'ethnic', 'race_2',
       'cigsmok', 'smokeday', 'smokeyr', 'bmi', 'family_hist_lung_cancer',
       'divorced_or_widowed', 'bachelors_and_above', 'max_top_work_exposure',
       'race_plco', 'candx_days', 'canc_free_days', 'pid2split', 'study_day',
       'canc_yr_1', 'canc_yr_2', 'canc_yr_3', 'canc_yr_4', 'canc_yr_5',
       'canc_yr_6'],
      dtype='object')

Unnamed: 0,pid,study_yr,unique_id,pred_yr1,pred_yr2,pred_yr3,pred_yr4,pred_yr5,pred_yr6,age,...,candx_days,canc_free_days,pid2split,study_day,canc_yr_1,canc_yr_2,canc_yr_3,canc_yr_4,canc_yr_5,canc_yr_6
0,100002,0,0OPAGELSPLUSLUNG3602.512080.00.11.5,0.00042,0.00205,0.00746,0.01084,0.01532,0.02727,66.0,...,,2391,train,13.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100002,0,0OPAGELSPLUSD3602.512080.00.11.5,0.00117,0.00255,0.00783,0.0128,0.01923,0.03296,66.0,...,,2391,train,13.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100002,1,1OPAGELSPLUSLUNG3602.512080.00.11.5,0.0,0.00157,0.00403,0.00685,0.01038,0.01911,66.0,...,,2391,train,349.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100002,1,1OPAGELSPLUSD3602.512080.00.11.5,0.00117,0.00255,0.00783,0.01104,0.01844,0.03089,66.0,...,,2391,train,349.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100002,2,2OPAGELSPLUSLUNG3602.512080.00.11.5,0.00117,0.00205,0.00746,0.01084,0.01532,0.02609,68.0,...,,2391,train,731.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
cols = [
    'age', 'ethnic', 'race_2', 'cigsmok', 'smokeday', 'smokeyr', 'bmi',
	'family_hist_lung_cancer', 'divorced_or_widowed', 'bachelors_and_above',
	'max_top_work_exposure',
	
    'pred_yr1', 'pred_yr2', 'pred_yr3', 'pred_yr4', 'pred_yr5', 'pred_yr6', 
    'canc_yr_1', 'canc_yr_2', 'canc_yr_3', 'canc_yr_4', 'canc_yr_5', 'canc_yr_6',

    'pid2split', 'race_plco'
]
df_sybil = df_sybil[cols]

In [54]:
cancer_columns = ['canc_yr_' + str(i) for i in range(1,6+1)]

# Training set
df_sybil_svm11_train = df_sybil[df_sybil['pid2split'].isin(['train', 'dev'])]
df_sybil_svm11_train = df_sybil_svm11_train.drop(columns=['pid2split', 'race_plco'])
df_sybil_svm11_train = df_sybil_svm11_train.dropna(subset=df_sybil_svm11_train.columns.difference(cancer_columns))

df_sybil_svm11_train_nn = df_sybil[df_sybil['pid2split'].isin(['train'])]
df_sybil_svm11_train_nn = df_sybil_svm11_train_nn.drop(columns=['pid2split', 'race_plco'])
df_sybil_svm11_train_nn = df_sybil_svm11_train_nn.dropna(subset=df_sybil_svm11_train_nn.columns.difference(cancer_columns))

df_sybil_svm11_val_nn = df_sybil[df_sybil['pid2split'].isin(['dev'])]
df_sybil_svm11_val_nn = df_sybil_svm11_val_nn.drop(columns=['pid2split', 'race_plco'])
df_sybil_svm11_val_nn = df_sybil_svm11_val_nn.dropna(subset=df_sybil_svm11_val_nn.columns.difference(cancer_columns))

df_sybil_svm7_train = df_sybil_svm11_train.drop(columns=[
    'cigsmok', 'family_hist_lung_cancer', 'bachelors_and_above', 'max_top_work_exposure'])

df_sybil_svm6_train = df_sybil_svm11_train.drop(columns=[
    'cigsmok', 'bmi', 'family_hist_lung_cancer', 'bachelors_and_above', 'max_top_work_exposure'])

df_sybil_svm2_train = df_sybil_svm11_train.drop(columns=[
    'ethnic', 'cigsmok', 'smokeday', 'smokeyr', 'bmi', 'family_hist_lung_cancer',
    'divorced_or_widowed', 'bachelors_and_above', 'max_top_work_exposure'])

# Test set
df_sybil_svm11_test = df_sybil[df_sybil['pid2split'].isin(['test'])]
df_sybil_svm11_test = df_sybil_svm11_test.drop(columns='pid2split')
df_sybil_svm11_test = df_sybil_svm11_test.dropna(subset=df_sybil_svm11_test.columns.difference(cancer_columns))

df_sybil_svm6_test = df_sybil_svm11_test.drop(columns=[
    'cigsmok', 'bmi', 'family_hist_lung_cancer', 'bachelors_and_above', 'max_top_work_exposure'])

df_sybil_svm7_test = df_sybil_svm11_test.drop(columns=[
    'cigsmok', 'family_hist_lung_cancer', 'bachelors_and_above', 'max_top_work_exposure'])

df_sybil_svm2_test = df_sybil_svm11_test.drop(columns=[
    'ethnic', 'cigsmok', 'smokeday', 'smokeyr', 'bmi', 'family_hist_lung_cancer',
    'divorced_or_widowed', 'bachelors_and_above', 'max_top_work_exposure'])

df_sybil_test = df_sybil_svm6_test.drop(columns=[
    'age', 'ethnic', 'race_2', 'smokeday', 'smokeyr', 'divorced_or_widowed'])

df_sybil_svm11_test_white = df_sybil_svm11_test[df_sybil_svm11_test['race_plco'] == 1].drop(columns=['race_plco'])
df_sybil_svm11_test_black = df_sybil_svm11_test[df_sybil_svm11_test['race_plco'] == 2].drop(columns=['race_plco'])

df_sybil_test_white = df_sybil_test[df_sybil_test['race_plco'] == 1].drop(columns=['race_plco'])
df_sybil_test_black = df_sybil_test[df_sybil_test['race_plco'] == 2].drop(columns=['race_plco'])

df_sybil_test_bmih = df_sybil_svm11_test[df_sybil_svm11_test['bmi'] >= 30.0].drop(columns=[
    'cigsmok', 'bmi', 'family_hist_lung_cancer', 'bachelors_and_above', 'max_top_work_exposure',
    'age', 'ethnic', 'race_2', 'smokeday', 'smokeyr', 'divorced_or_widowed']).drop(columns=['race_plco'])
df_sybil_test_bmil = df_sybil_svm11_test[df_sybil_svm11_test['bmi'] < 30.0].drop(columns=[
    'cigsmok', 'bmi', 'family_hist_lung_cancer', 'bachelors_and_above', 'max_top_work_exposure',
    'age', 'ethnic', 'race_2', 'smokeday', 'smokeyr', 'divorced_or_widowed']).drop(columns=['race_plco'])

df_sybil_svm11_test = df_sybil_svm11_test.drop(columns=['race_plco'])
df_sybil_svm6_test = df_sybil_svm6_test.drop(columns=['race_plco'])
df_sybil_test = df_sybil_test.drop(columns=['race_plco'])

print('df_sybil_svm11_train shape:', df_sybil_svm11_train.shape)
print('df_sybil_svm11_train_nn shape:', df_sybil_svm11_train_nn.shape)
print('df_sybil_svm11_val_nn shape:', df_sybil_svm11_val_nn.shape)
print('df_sybil_svm6_train shape:', df_sybil_svm6_train.shape)
print('df_sybil_svm7_train shape:', df_sybil_svm7_train.shape)
print('df_sybil_svm2_train shape:', df_sybil_svm2_train.shape)

print('df_sybil_svm11_test shape:', df_sybil_svm11_test.shape)
print('df_sybil_svm11_test_white shape:', df_sybil_svm11_test_white.shape)
print('df_sybil_svm11_test_black shape:', df_sybil_svm11_test_black.shape)
print('df_sybil_svm6_test shape:', df_sybil_svm6_test.shape)
print('df_sybil_svm7_test shape:', df_sybil_svm7_test.shape)
print('df_sybil_svm2_test shape:', df_sybil_svm2_test.shape)
print('df_sybil_test shape:', df_sybil_test.shape)
print('df_sybil_test_white shape:', df_sybil_test_white.shape)
print('df_sybil_test_black shape:', df_sybil_test_black.shape)
print('df_sybil_test_bmih shape:', df_sybil_test_bmih.shape)
print('df_sybil_test_bmil shape:', df_sybil_test_bmil.shape)

# Write
keepall = '' if REMOVE_UNKNOWN else '_keepall'
df_sybil_svm11_train.to_csv(train_out_path + 'train_nlst_svm11sybil' + keepall + '.csv', index=False)
df_sybil_svm11_train_nn.to_csv(train_out_path + 'trainNN_nlst_svm11sybil' + keepall + '.csv', index=False)
df_sybil_svm11_val_nn.to_csv(train_out_path + 'valNN_nlst_svm11sybil' + keepall + '.csv', index=False)
df_sybil_svm11_train.to_csv(train_out_path + 'train_nlst_svm11sybil' + keepall + '.csv', index=False)
df_sybil_svm6_train.to_csv(train_out_path + 'train_nlst_svm6sybil' + keepall + '.csv', index=False)
df_sybil_svm7_train.to_csv(train_out_path + 'train_nlst_svm7sybil' + keepall + '.csv', index=False)
df_sybil_svm2_train.to_csv(train_out_path + 'train_nlst_svm2sybil' + keepall + '.csv', index=False)

df_sybil_svm11_test.to_csv(test_out_path + 'test_nlst_svm11sybil' + keepall + '.csv', index=False)
df_sybil_svm11_test_white.to_csv(test_out_path + 'test_nlst_svm11sybil_white' + keepall + '.csv', index=False)
df_sybil_svm11_test_black.to_csv(test_out_path + 'test_nlst_svm11sybil_black' + keepall + '.csv', index=False)
df_sybil_svm6_test.to_csv(test_out_path + 'test_nlst_svm6sybil' + keepall + '.csv', index=False)
df_sybil_svm7_test.to_csv(test_out_path + 'test_nlst_svm7sybil' + keepall + '.csv', index=False)
df_sybil_svm2_test.to_csv(test_out_path + 'test_nlst_svm2sybil' + keepall + '.csv', index=False)
df_sybil_test.to_csv(test_out_path + 'test_nlst_sybil' + keepall + '.csv', index=False)
df_sybil_test_white.to_csv(test_out_path + 'test_nlst_sybil_white' + keepall + '.csv', index=False)
df_sybil_test_black.to_csv(test_out_path + 'test_nlst_sybil_black' + keepall + '.csv', index=False)
df_sybil_test_bmih.to_csv(test_out_path + 'test_nlst_sybil_bmih' + keepall + '.csv', index=False)
df_sybil_test_bmil.to_csv(test_out_path + 'test_nlst_sybil_bmil' + keepall + '.csv', index=False)

df_sybil_svm11_train shape: (65161, 23)
df_sybil_svm11_train_nn shape: (52443, 23)
df_sybil_svm11_val_nn shape: (12718, 23)
df_sybil_svm6_train shape: (65161, 18)
df_sybil_svm7_train shape: (65161, 19)
df_sybil_svm2_train shape: (65161, 14)
df_sybil_svm11_test shape: (11980, 23)
df_sybil_svm11_test_white shape: (11377, 23)
df_sybil_svm11_test_black shape: (291, 23)
df_sybil_svm6_test shape: (11980, 18)
df_sybil_svm7_test shape: (11980, 20)
df_sybil_svm2_test shape: (11980, 15)
df_sybil_test shape: (11980, 12)
df_sybil_test_white shape: (11377, 12)
df_sybil_test_black shape: (291, 12)
df_sybil_test_bmih shape: (3579, 12)
df_sybil_test_bmil shape: (8401, 12)
