In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Load the raw data table; the first CSV column is used as the row index.
# Reading this file may take a while.
raw_csv_path = "../data/fourcancers/fourcancers_orig.csv"
df = pd.read_csv(raw_csv_path, index_col=0)

In [3]:
# Dimensions (rows, columns) of the raw table as loaded.
df.shape

(500, 127588)

In [4]:
# Swap rows and columns so that every column of the raw table becomes a row.
# NOTE: this cell is not idempotent -- running it a second time flips the table back.
df = df.T

In [5]:
# Load the three metadata tables; in each file the first CSV column is used as the index.
patients, status, celltype = (
    pd.read_csv(metadata_path, index_col=0)
    for metadata_path in (
        "../data/fourcancers/patients.csv",
        "../data/fourcancers/status.csv",
        "../data/fourcancers/celltype.csv",
    )
)



In [6]:
# Replace the uninformative column name 'x' with 'patient'.
# The guard makes the cell safe to re-run: once 'x' has been replaced there is nothing
# left to do (without the guard a second run fails with KeyError: 'x').
if 'x' in patients.columns:
    # Add 'patient' as a copy of 'x', then drop 'x' -- same resulting columns and
    # column order as assigning the new column and dropping the old one separately.
    patients = patients.assign(patient=patients['x']).drop(columns=['x'])

In [32]:
# Count how many rows of the metadata table belong to each patient label.
patients['patient'].value_counts()

Lung3     18283
Endo1     17604
Lung2     14860
Lung4     13919
Lung1     13015
Lung6     10477
Endo2      7565
Renal2     7036
Lung5      6592
Endo3      5528
Renal1     5039
Colon1     4037
Renal3     2863
Colon2      770
Name: patient, dtype: int64

In [9]:
# Attach the patient label to every row of df BY POSITION.
# Assigning the Series directly (df['patient'] = patients['patient']) aligns on index
# labels. After the transpose df is indexed by the column labels of the raw CSV, whereas
# patients carries a 1-based integer index, so label alignment would not match the rows
# and the new column would be filled with NaN. Converting to a plain array sidesteps
# alignment; the length check guards the positional correspondence this relies on.
assert len(df) == len(patients), (
    f"row count mismatch: df has {len(df)} rows, patients has {len(patients)}"
)
df['patient'] = patients['patient'].to_numpy()

## IID separation: every client gets patients from (almost) every cancer type

In [14]:
# List the distinct patient labels present in the metadata.
patients['patient'].unique()

array(['Lung1', 'Lung2', 'Lung3', 'Lung4', 'Lung5', 'Lung6', 'Endo1',
       'Endo2', 'Endo3', 'Colon1', 'Colon2', 'Renal1', 'Renal2', 'Renal3'],
      dtype=object)

In [67]:
# Define which client holds which patients: every client receives patients from
# (almost) every cancer type.
client1_patients = ['Lung1', 'Lung2', 'Endo1', 'Colon1', 'Renal1']
client2_patients = ['Lung3', 'Lung4', 'Endo2', 'Colon2', 'Renal2']
client3_patients = ['Lung5', 'Lung6', 'Endo3', 'Renal3']  # no colon patient: only 2 exist

# Pick the rows of df positionally with a boolean mask built from the patient labels.
# This replaces `df.iloc[idx.index.values - 1]`, which is only correct while the index of
# `patients` is exactly 1..N in row order; the mask needs no index arithmetic and raises
# an IndexError if df and patients ever differ in length.
client1_mask = patients['patient'].isin(client1_patients).to_numpy()
client1_idx = patients.loc[client1_mask]
client1_df = df.iloc[client1_mask, :]

client2_mask = patients['patient'].isin(client2_patients).to_numpy()
client2_idx = patients.loc[client2_mask]
client2_df = df.iloc[client2_mask, :]

client3_mask = patients['patient'].isin(client3_patients).to_numpy()
client3_idx = patients.loc[client3_mask]
client3_df = df.iloc[client3_mask, :]

In [68]:
# Number of rows assigned to client 1, client 2 and client 3 (IID split).
tuple(len(part) for part in (client1_df, client2_df, client3_df))

(54555, 47573, 25460)

In [69]:
# Total number of rows over the three client subsets (IID split).
sum(len(part) for part in (client1_df, client2_df, client3_df))

127588

In [70]:
# Write each client's IID subset to its own CSV file in the working directory.
# index=False: the row index of the frames is not written to the files.
for client_frame, out_name in (
    (client1_df, "client1_iid.csv"),
    (client2_df, "client2_iid.csv"),
    (client3_df, "client3_iid.csv"),
):
    client_frame.to_csv(out_name, index=False)

## Non-IID separation: each client gets all patients of exactly one of Endo/Colon/Renal; the lung patients are split across all 3 clients

In [71]:
# Define which client holds which patients: the lung patients are spread over all three
# clients, while each of the other cancer types goes to a single client.
client1_patients = ['Lung1', 'Lung2', 'Endo1', 'Endo2', 'Endo3']     # lung + all endo
client2_patients = ['Lung3', 'Lung4', 'Colon1', 'Colon2']            # lung + all colon
client3_patients = ['Lung5', 'Lung6', 'Renal1', 'Renal2', 'Renal3']  # lung + all renal

# Pick the rows of df positionally with a boolean mask built from the patient labels.
# This replaces `df.iloc[idx.index.values - 1]`, which is only correct while the index of
# `patients` is exactly 1..N in row order; the mask needs no index arithmetic and raises
# an IndexError if df and patients ever differ in length.
client1_mask = patients['patient'].isin(client1_patients).to_numpy()
client1_idx = patients.loc[client1_mask]
client1_df = df.iloc[client1_mask, :]

client2_mask = patients['patient'].isin(client2_patients).to_numpy()
client2_idx = patients.loc[client2_mask]
client2_df = df.iloc[client2_mask, :]

client3_mask = patients['patient'].isin(client3_patients).to_numpy()
client3_idx = patients.loc[client3_mask]
client3_df = df.iloc[client3_mask, :]

In [72]:
# Number of rows assigned to client 1, client 2 and client 3 (non-IID split).
tuple(len(part) for part in (client1_df, client2_df, client3_df))

(58572, 37009, 32007)

In [73]:
# Total number of rows over the three client subsets (non-IID split).
sum(len(part) for part in (client1_df, client2_df, client3_df))

127588

In [74]:
# Write each client's non-IID subset to its own CSV file in the working directory.
# index=False: the row index of the frames is not written to the files.
for client_frame, out_name in (
    (client1_df, "client1_noniid.csv"),
    (client2_df, "client2_noniid.csv"),
    (client3_df, "client3_noniid.csv"),
):
    client_frame.to_csv(out_name, index=False)