In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

In [2]:
# Read the raw expression table; the first CSV column is used as the index.
# Reading this file may take a while.
df = pd.read_csv(
    "../data/fourcancers/fourcancers_orig.csv",
    index_col=0,
)

In [3]:
# Swap rows and columns. NOTE: `df` is rebound to its own transpose, so
# running this cell a second time flips the table back -- run it exactly once.
df = df.T

In [4]:
# Load the three annotation tables (patient, status, cell type); each one
# uses its first CSV column as the index.
patients, status, celltype = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../data/fourcancers/patients.csv",
        "../data/fourcancers/status.csv",
        "../data/fourcancers/celltype.csv",
    )
)



In [5]:
# Each annotation table carries its values in a column named 'x'; give the
# patient column a meaningful name. The guard keeps the cell re-runnable:
# once 'x' has been renamed, a second run no longer fails with a KeyError.
if 'x' in patients.columns:
    patients = patients.rename(columns={'x': 'patient'})

# Build the combined per-cell table on a copy, so that adding columns to
# `metadata` does not silently add them to `patients` as well.
metadata = patients.copy()
metadata['status'] = status['x']      # pandas aligns the values on the index labels
metadata['celltype'] = celltype['x']


In [12]:
# Preview the first rows of the combined per-cell annotation table.
metadata.head()

Unnamed: 0,patient,status,celltype
1,Lung1,Tumor,Lung
2,Lung1,Tumor,Lung
3,Lung1,Tumor,Lung
4,Lung1,Tumor,Lung
5,Lung1,Tumor,Lung


In [13]:
# Which status labels occur among the cells of patient 'Lung1'?
metadata.loc[metadata['patient'] == 'Lung1', 'status'].unique()

array(['Tumor', 'NAT'], dtype=object)

In [9]:
# Number of rows (cells) per status label.
status['x'].value_counts()

Tumor    68905
NAT      58683
Name: x, dtype: int64

In [32]:
# Number of rows (cells) contributed by each patient.
patients['patient'].value_counts()

Lung3     18283
Endo1     17604
Lung2     14860
Lung4     13919
Lung1     13015
Lung6     10477
Endo2      7565
Renal2     7036
Lung5      6592
Endo3      5528
Renal1     5039
Colon1     4037
Renal3     2863
Colon2      770
Name: patient, dtype: int64

In [15]:
# Keep only the NAT rows (tumor cells are dropped).
# The metadata index starts at 1 while .iloc is 0-based, hence the "- 1".
NAT_idx = metadata.index[(metadata['status'] == 'NAT').to_numpy()]
df_NAT = df.iloc[NAT_idx.to_numpy() - 1]


In [19]:
# Inspect the 1-based metadata labels of the NAT cells selected above.
# (`client1_idx` is not defined until later cells, so displaying it here
# fails on a fresh top-to-bottom run.)
NAT_idx

Int64Index([  7096,   7097,   7098,   7099,   7100,   7101,   7102,   7103,
              7104,   7105,
            ...
            127579, 127580, 127581, 127582, 127583, 127584, 127585, 127586,
            127587, 127588],
           dtype='int64', length=58683)

## IID separation: cancer cells only

In [33]:
# define which client has which patients
client1_patients = ['Lung1', 'Lung2', 'Endo1', 'Colon1', 'Renal1']
client2_patients = ['Lung3', 'Lung4', 'Endo2', 'Colon2', 'Renal2']
client3_patients = ['Lung5', 'Lung6', 'Endo3', 'Renal3'] # only 2 patients with colon

# Keep only the tumor cells (NAT cells are discarded). The status and patient
# filters are combined into one boolean mask on `metadata`, instead of
# chaining two .loc calls and mixing `patients` with `metadata`.
tumor_mask = metadata['status'] == 'Tumor'

client1_idx = metadata.loc[tumor_mask & metadata['patient'].isin(client1_patients)]
client1_df = df.iloc[client1_idx.index.values - 1, :] # index column is 1-indexed, hence the -1

client2_idx = metadata.loc[tumor_mask & metadata['patient'].isin(client2_patients)]
client2_df = df.iloc[client2_idx.index.values - 1, :]

client3_idx = metadata.loc[tumor_mask & metadata['patient'].isin(client3_patients)]
client3_df = df.iloc[client3_idx.index.values - 1, :]

In [34]:
# Write each client's subset to its own CSV; index=False leaves the row
# labels out of the files.
for frame, csv_name in (
    (client1_df, "client1_iid_Tumor.csv"),
    (client2_df, "client2_iid_Tumor.csv"),
    (client3_df, "client3_iid_Tumor.csv"),
):
    frame.to_csv(csv_name, index=False)

#### IID, PCA

In [8]:
# Project every row of `df` onto its first 5 principal components.
# random_state is pinned because scikit-learn may pick a randomized SVD solver
# for large inputs, which would make the components differ between runs.
# NOTE(review): the projection is fitted on the pooled data before it is split
# across clients -- confirm this is intended.
X_full_PCA = PCA(n_components=5, random_state=0).fit_transform(df.values)
df_PCA = pd.DataFrame(X_full_PCA)  # default 0..n-1 index, same row order as df


In [10]:
# define which client has which patients
client1_patients = ['Lung1', 'Lung2', 'Endo1', 'Colon1', 'Renal1']
client2_patients = ['Lung3', 'Lung4', 'Endo2', 'Colon2', 'Renal2']
client3_patients = ['Lung5', 'Lung6', 'Endo3', 'Renal3'] # only 2 patients with colon

# Keep only the tumor cells (NAT cells are discarded). The status and patient
# filters are combined into one boolean mask on `metadata`, instead of
# chaining two .loc calls and mixing `patients` with `metadata`.
tumor_mask = metadata['status'] == 'Tumor'

client1_idx = metadata.loc[tumor_mask & metadata['patient'].isin(client1_patients)]
client1_df = df_PCA.iloc[client1_idx.index.values - 1, :] # index column is 1-indexed, hence the -1

client2_idx = metadata.loc[tumor_mask & metadata['patient'].isin(client2_patients)]
client2_df = df_PCA.iloc[client2_idx.index.values - 1, :]

client3_idx = metadata.loc[tumor_mask & metadata['patient'].isin(client3_patients)]
client3_df = df_PCA.iloc[client3_idx.index.values - 1, :]

In [11]:
# Write each client's PCA-projected subset to its own CSV; index=False leaves
# the row labels out of the files.
for frame, csv_name in (
    (client1_df, "client1_iid_Tumor_PCA.csv"),
    (client2_df, "client2_iid_Tumor_PCA.csv"),
    (client3_df, "client3_iid_Tumor_PCA.csv"),
):
    frame.to_csv(csv_name, index=False)

## Non-IID separation: cancer cells only

In [12]:
# define which client has which patients
client1_patients = ['Lung1', 'Lung2', 'Endo1', 'Endo2', 'Endo3']
client2_patients = ['Lung3', 'Lung4', 'Colon1', 'Colon2']
client3_patients = ['Lung5', 'Lung6', 'Renal1', 'Renal2', 'Renal3']

# Keep only the tumor cells (NAT cells are discarded). The status and patient
# filters are combined into one boolean mask on `metadata`, instead of
# chaining two .loc calls and mixing `patients` with `metadata`.
tumor_mask = metadata['status'] == 'Tumor'

client1_idx = metadata.loc[tumor_mask & metadata['patient'].isin(client1_patients)]
client1_df = df.iloc[client1_idx.index.values - 1, :] # index column is 1-indexed, hence the -1

client2_idx = metadata.loc[tumor_mask & metadata['patient'].isin(client2_patients)]
client2_df = df.iloc[client2_idx.index.values - 1, :]

client3_idx = metadata.loc[tumor_mask & metadata['patient'].isin(client3_patients)]
client3_df = df.iloc[client3_idx.index.values - 1, :]

In [13]:
# Write each client's subset to its own CSV; index=False leaves the row
# labels out of the files.
for frame, csv_name in (
    (client1_df, "client1_niid_Tumor.csv"),
    (client2_df, "client2_niid_Tumor.csv"),
    (client3_df, "client3_niid_Tumor.csv"),
):
    frame.to_csv(csv_name, index=False)

#### non-IID: PCA

In [None]:
# Project every row of `df` onto its first 5 principal components.
# random_state is pinned because scikit-learn may pick a randomized SVD solver
# for large inputs, which would make the components differ between runs.
# NOTE(review): the projection is fitted on the pooled data before it is split
# across clients -- confirm this is intended.
X_full_PCA = PCA(n_components=5, random_state=0).fit_transform(df.values)
df_PCA = pd.DataFrame(X_full_PCA)  # default 0..n-1 index, same row order as df


In [14]:
# define which client has which patients
client1_patients = ['Lung1', 'Lung2', 'Endo1', 'Endo2', 'Endo3']
client2_patients = ['Lung3', 'Lung4', 'Colon1', 'Colon2']
client3_patients = ['Lung5', 'Lung6', 'Renal1', 'Renal2', 'Renal3']

# Keep only the tumor cells (NAT cells are discarded). The status and patient
# filters are combined into one boolean mask on `metadata`, instead of
# chaining two .loc calls and mixing `patients` with `metadata`.
tumor_mask = metadata['status'] == 'Tumor'

client1_idx = metadata.loc[tumor_mask & metadata['patient'].isin(client1_patients)]
client1_df = df_PCA.iloc[client1_idx.index.values - 1, :] # index column is 1-indexed, hence the -1

client2_idx = metadata.loc[tumor_mask & metadata['patient'].isin(client2_patients)]
client2_df = df_PCA.iloc[client2_idx.index.values - 1, :]

client3_idx = metadata.loc[tumor_mask & metadata['patient'].isin(client3_patients)]
client3_df = df_PCA.iloc[client3_idx.index.values - 1, :]

In [15]:
# Write each client's PCA-projected subset to its own CSV; index=False leaves
# the row labels out of the files.
for frame, csv_name in (
    (client1_df, "client1_niid_Tumor_PCA.csv"),
    (client2_df, "client2_niid_Tumor_PCA.csv"),
    (client3_df, "client3_niid_Tumor_PCA.csv"),
):
    frame.to_csv(csv_name, index=False)

## IID separation: everyone gets patients from (almost) every cell type: NAT cells only

In [24]:
# define which client has which patients
client1_patients = ['Lung1', 'Lung2', 'Endo1', 'Colon1', 'Renal1']
client2_patients = ['Lung3', 'Lung4', 'Endo2', 'Colon2', 'Renal2']
client3_patients = ['Lung5', 'Lung6', 'Endo3', 'Renal3'] # only 2 patients with colon

# Keep only the NAT cells (tumor cells are discarded). The status and patient
# filters are combined into one boolean mask on `metadata`, instead of
# chaining two .loc calls and mixing `patients` with `metadata`.
nat_mask = metadata['status'] == 'NAT'

client1_idx = metadata.loc[nat_mask & metadata['patient'].isin(client1_patients)]
client1_df = df.iloc[client1_idx.index.values - 1, :] # index column is 1-indexed, hence the -1

client2_idx = metadata.loc[nat_mask & metadata['patient'].isin(client2_patients)]
client2_df = df.iloc[client2_idx.index.values - 1, :]

client3_idx = metadata.loc[nat_mask & metadata['patient'].isin(client3_patients)]
client3_df = df.iloc[client3_idx.index.values - 1, :]

In [25]:
# Row count of each client's subset.
len(client1_df), len(client2_df), len(client3_df)

(24101, 21879, 12703)

In [26]:
# Total rows over the three clients (compare with the NAT count from status['x'].value_counts()).
sum(len(part) for part in (client1_df, client2_df, client3_df))

58683

In [32]:
# Preview the first rows of client 1's subset.
client1_df.head()

Unnamed: 0,IGJ,LOC100293211,LOC102725018,DERL3,MZB1,CXCL13,CST3,LYZ,RN7SK,IGLL5,...,LIMCH1,CLEC5A,IL6,ANXA13,CRIP2,ZNF812,CST6,ACP5,COTL1,TNFAIP3
LN1_AAACCTGCACGACGAA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.475897,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.475897,2.409183,0.0
LN1_AAACCTGCATTCACTT-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.756799,0.0
LN1_AAACCTGGTCAAAGCG-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LN1_AAACCTGTCCCGGATG-1,0.0,0.0,0.0,0.0,0.0,0.0,1.648982,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.241067
LN1_AAACCTGTCCTAGGGC-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Write each client's subset to its own CSV; index=False leaves the row
# labels out of the files.
for frame, csv_name in (
    (client1_df, "client1_iid_NAT.csv"),
    (client2_df, "client2_iid_NAT.csv"),
    (client3_df, "client3_iid_NAT.csv"),
):
    frame.to_csv(csv_name, index=False)

## non-IID: NAT cells only

In [28]:
# define which client has which patients
client1_patients = ['Lung1', 'Lung2', 'Endo1', 'Endo2', 'Endo3']
client2_patients = ['Lung3', 'Lung4', 'Colon1', 'Colon2']
client3_patients = ['Lung5', 'Lung6', 'Renal1', 'Renal2', 'Renal3']

# Keep only the NAT cells (tumor cells are discarded). The status and patient
# filters are combined into one boolean mask on `metadata`, instead of
# chaining two .loc calls and mixing `patients` with `metadata`.
nat_mask = metadata['status'] == 'NAT'

client1_idx = metadata.loc[nat_mask & metadata['patient'].isin(client1_patients)]
client1_df = df.iloc[client1_idx.index.values - 1, :] # index column is 1-indexed, hence the -1

client2_idx = metadata.loc[nat_mask & metadata['patient'].isin(client2_patients)]
client2_df = df.iloc[client2_idx.index.values - 1, :]

client3_idx = metadata.loc[nat_mask & metadata['patient'].isin(client3_patients)]
client3_df = df.iloc[client3_idx.index.values - 1, :]

In [29]:
# Row count of each client's subset.
len(client1_df), len(client2_df), len(client3_df)

(20479, 18879, 19325)

In [30]:
# Total rows over the three clients (compare with the NAT count from status['x'].value_counts()).
sum(len(part) for part in (client1_df, client2_df, client3_df))

58683

In [31]:
# Write each client's subset to its own CSV; index=False leaves the row
# labels out of the files.
for frame, csv_name in (
    (client1_df, "client1_niid_NAT.csv"),
    (client2_df, "client2_niid_NAT.csv"),
    (client3_df, "client3_niid_NAT.csv"),
):
    frame.to_csv(csv_name, index=False)

## IID separation: everyone gets patients from (almost) every cancer type

In [14]:
# List the distinct patient identifiers.
patients['patient'].unique()

array(['Lung1', 'Lung2', 'Lung3', 'Lung4', 'Lung5', 'Lung6', 'Endo1',
       'Endo2', 'Endo3', 'Colon1', 'Colon2', 'Renal1', 'Renal2', 'Renal3'],
      dtype=object)

In [67]:
# Patient-to-client assignment: every client receives lung, endo and renal
# patients; there are only two colon patients, so client 3 gets none.
client1_patients = ['Lung1', 'Lung2', 'Endo1', 'Colon1', 'Renal1']
client2_patients = ['Lung3', 'Lung4', 'Endo2', 'Colon2', 'Renal2']
client3_patients = ['Lung5', 'Lung6', 'Endo3', 'Renal3']

# Select each client's rows (tumor and NAT alike). The annotation index is
# 1-based while .iloc is 0-based, hence the "- 1".
client1_idx = patients[patients['patient'].isin(client1_patients)]
client1_df = df.iloc[client1_idx.index.to_numpy() - 1]

client2_idx = patients[patients['patient'].isin(client2_patients)]
client2_df = df.iloc[client2_idx.index.to_numpy() - 1]

client3_idx = patients[patients['patient'].isin(client3_patients)]
client3_df = df.iloc[client3_idx.index.to_numpy() - 1]

In [68]:
# Row count of each client's subset.
len(client1_df), len(client2_df), len(client3_df)

(54555, 47573, 25460)

In [69]:
# Total rows over the three clients (compare with the overall number of cells).
sum(len(part) for part in (client1_df, client2_df, client3_df))

127588

In [70]:
# Write each client's subset to its own CSV; index=False leaves the row
# labels out of the files.
for frame, csv_name in (
    (client1_df, "client1_iid.csv"),
    (client2_df, "client2_iid.csv"),
    (client3_df, "client3_iid.csv"),
):
    frame.to_csv(csv_name, index=False)

## Non-IID separation: each client gets all patients of one of Endo/Colon/Renal; the Lung patients are split across all 3 clients

In [71]:
# Patient-to-client assignment: the lung patients are spread over all three
# clients, while the endo, colon and renal patients each go to one client.
client1_patients = ['Lung1', 'Lung2', 'Endo1', 'Endo2', 'Endo3']
client2_patients = ['Lung3', 'Lung4', 'Colon1', 'Colon2']
client3_patients = ['Lung5', 'Lung6', 'Renal1', 'Renal2', 'Renal3']

# Select each client's rows (tumor and NAT alike). The annotation index is
# 1-based while .iloc is 0-based, hence the "- 1".
client1_idx = patients[patients['patient'].isin(client1_patients)]
client1_df = df.iloc[client1_idx.index.to_numpy() - 1]

client2_idx = patients[patients['patient'].isin(client2_patients)]
client2_df = df.iloc[client2_idx.index.to_numpy() - 1]

client3_idx = patients[patients['patient'].isin(client3_patients)]
client3_df = df.iloc[client3_idx.index.to_numpy() - 1]

In [72]:
# Row count of each client's subset.
len(client1_df), len(client2_df), len(client3_df)

(58572, 37009, 32007)

In [73]:
# Total rows over the three clients (compare with the overall number of cells).
sum(len(part) for part in (client1_df, client2_df, client3_df))

127588

In [74]:
# Write each client's subset to its own CSV; index=False leaves the row
# labels out of the files.
for frame, csv_name in (
    (client1_df, "client1_noniid.csv"),
    (client2_df, "client2_noniid.csv"),
    (client3_df, "client3_noniid.csv"),
):
    frame.to_csv(csv_name, index=False)