In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math

from scipy.stats import powerlaw

## Prerequisites

In [2]:
# map the textual condition onto a numeric class (0 for no AML, 1 for AML)
def convert(x):
    """Translate a condition string into its numeric class label.

    Returns 0 for "CONTROL", 1 for "CASE", and None for any other value.
    """
    return 0 if x == "CONTROL" else (1 if x == "CASE" else None)

In [3]:
# Load the three expression tables and their annotation tables (all tab-separated).
# The expression files store the samples as columns, so each table is flipped
# right after reading to get one row per sample.
data_A1 = pd.read_csv("../datasets/AML_base/dataset_A1_RMA.txt", sep="\t").transpose()
data_A2 = pd.read_csv("../datasets/AML_base/dataset_A2_RMA.txt", sep="\t").transpose()
data_A3 = pd.read_csv("../datasets/AML_base/dataset_A3_DESeq2.txt", sep="\t").transpose()

annotation_A1 = pd.read_csv("../datasets/AML_base/dataset_A1_annotation.txt", sep="\t")
annotation_A2 = pd.read_csv("../datasets/AML_base/dataset_A2_annotation.txt", sep="\t")
annotation_A3 = pd.read_csv("../datasets/AML_base/dataset_A3_annotation.txt", sep="\t")

## Preprocessing of the A1–A3 datasets

In [4]:
# the "Condition" column of each annotation table holds the class labels
labels_A1, labels_A2, labels_A3 = (
    annotation["Condition"]
    for annotation in (annotation_A1, annotation_A2, annotation_A3)
)

# store the labels as a "label" column on the matching expression table
# NOTE(review): column assignment aligns on the index, so this presumably relies
# on data and annotation sharing the same sample index -- confirm.
for frame, labels in (
    (data_A1, labels_A1),
    (data_A2, labels_A2),
    (data_A3, labels_A3),
):
    frame["label"] = labels

In [5]:
# Shuffle the rows of every dataset so the labels are in random order.
# A fixed seed is used so that "Restart & Run All" reproduces the same order
# (and therefore the same train/test split further down).
SEED = 42
data_A1 = data_A1.sample(frac=1, random_state=SEED).reset_index(drop=True)
data_A2 = data_A2.sample(frac=1, random_state=SEED).reset_index(drop=True)
data_A3 = data_A3.sample(frac=1, random_state=SEED).reset_index(drop=True)

# convert labels into 0 and 1 (see function at the top)
# Caution: convert() returns None for anything other than "CONTROL"/"CASE", so
# running this cell a second time would turn the already numeric labels into None.
data_A1["label"] = data_A1["label"].apply(convert)
data_A2["label"] = data_A2["label"].apply(convert)
data_A3["label"] = data_A3["label"].apply(convert)

In [6]:
# 80/20 train/test split for A1: the rows were shuffled earlier, so the first
# 80% of the rows (by position) are tagged "train" and the remainder "test"
full_sample_size = len(data_A1)
train_sample_size = math.floor(full_sample_size * 0.8)
test_sample_size = full_sample_size - train_sample_size

# row positions below the cut-off belong to the training part
is_train_row = np.arange(full_sample_size) < train_sample_size
data_A1["test/train"] = np.where(is_train_row, "train", "test")

In [7]:
# same 80/20 positional split for A2: first 80% of the rows "train", rest "test"
full_sample_size = len(data_A2)
train_sample_size = math.floor(full_sample_size * 0.8)
test_sample_size = full_sample_size - train_sample_size

# row positions below the cut-off belong to the training part
is_train_row = np.arange(full_sample_size) < train_sample_size
data_A2["test/train"] = np.where(is_train_row, "train", "test")

In [8]:
# same 80/20 positional split for A3: first 80% of the rows "train", rest "test"
full_sample_size = len(data_A3)
train_sample_size = math.floor(full_sample_size * 0.8)
test_sample_size = full_sample_size - train_sample_size

# row positions below the cut-off belong to the training part
is_train_row = np.arange(full_sample_size) < train_sample_size
data_A3["test/train"] = np.where(is_train_row, "train", "test")


In [9]:
# class balance of A1: number of rows labelled 0, then number labelled 1
for label_value in (0, 1):
    print((data_A1["label"] == label_value).sum())

1451
1049


In [10]:
# class balance of A2: number of rows labelled 0, then number labelled 1
for label_value in (0, 1):
    print((data_A2["label"] == label_value).sum())

5760
2588


In [15]:
# inspect the column index of A1 (the "label" and "test/train" columns were added above)
data_A1.columns

Index(['PAX8', 'CCL5', 'MMP14', 'DTX2P1-UPK3BP1-PMS2P11', 'BAD', 'PRPF8',
       'CAPNS1', 'RPL35', 'EIF4G2', 'EIF3D',
       ...
       'FKBP15', 'LRCH4', 'MEX3D', 'BCAN', 'ACTB', 'GAPDH', 'MIR3648-2',
       'MIR3648-1', 'label', 'test/train'],
      dtype='object', length=12710)

### To CSV

## Partitioning the A2 dataset across 10 clients (IID, class-imbalanced (CI), sample-imbalanced (SI))

In [25]:
# create 10 equally sized client datasets from A2

# shuffle before splitting up into parts; the seed is fixed so that a fresh
# run of the notebook produces the same partitions
SEED = 42
data_A2 = data_A2.sample(frac=1, random_state=SEED).reset_index(drop=True)

num_samples = data_A2.shape[0]
num_clients = 10

# every client receives the same number of consecutive rows; because of the
# floor, the last (num_samples % num_clients) rows are not assigned to anyone
samples_per_client = math.floor(num_samples / num_clients)
IID_sets = [
    data_A2.iloc[i * samples_per_client:(i + 1) * samples_per_client]
    for i in range(num_clients)
]

In [22]:
# create 10 more datasets, this time sample-imbalanced:
# the clients differ in size, with sizes proportional to a power-law density
SEED = 42  # fixed seed so a fresh run reproduces the same partitions
num_clients = 10

data_A2 = data_A2.sample(frac=1, random_state=SEED).reset_index(drop=True)

num_samples = data_A2.shape[0]
# create power-law distribution: evaluate the density at one evenly spaced
# point per client and normalise the values so they sum to 1
x = np.linspace(0.1, 1.0, num_clients)
pdf = np.array(powerlaw.pdf(x, a=1.66))
pdf_norm = pdf / pdf.sum()

# hand out consecutive, non-overlapping row ranges sized by the weights;
# each size is floored, so a few trailing rows remain unassigned
sample_nonIID_sets = []
begin_client_samples = 0
for i in range(num_clients):
    client_sample_amount = math.floor(pdf_norm[i] * num_samples)
    end_client_samples = begin_client_samples + client_sample_amount
    client = data_A2.iloc[begin_client_samples:end_client_samples]
    sample_nonIID_sets.append(client)
    begin_client_samples = end_client_samples
    


In [23]:
### create non-uniform datasets, class-wise
# Each client's number of class-0 rows follows the power-law weights
# (pdf_norm, built in the sample-imbalance cell above); class-1 rows then top
# the client up to an equal share of the dataset (num_samples also comes from
# that cell).
SEED = 42  # fixed seed so a fresh run reproduces the same partitions
num_clients = 10

# split dataset in two by label for easier working, and shuffle both halves
A2_0 = (
    data_A2.loc[data_A2["label"] == 0]
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)
A2_1 = (
    data_A2.loc[data_A2["label"] == 1]
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

total_samples_0 = A2_0.shape[0]
total_samples_1 = A2_1.shape[0]

# size every client would have under an even split
target_client_size = math.floor(num_samples / num_clients)

class_nonIID_sets = []
begin_sample_0 = 0
begin_sample_1 = 0
for i in range(num_clients):
    amount_class_0 = math.floor(pdf_norm[i] * total_samples_0)
    # Clamp at 0: when the class-0 share alone exceeds the target size the
    # difference is negative, which would give a negative slice end (counted
    # from the end of the frame) and move the class-1 offset backwards.
    amount_class_1 = max(0, target_client_size - amount_class_0)
    # iloc slicing truncates silently, so a client simply gets fewer class-1
    # rows once that class is exhausted
    class_0_rows = A2_0.iloc[begin_sample_0:begin_sample_0 + amount_class_0]
    class_1_rows = A2_1.iloc[begin_sample_1:begin_sample_1 + amount_class_1]
    client = pd.concat([class_0_rows, class_1_rows])
    # mix the two classes within the client
    client = client.sample(frac=1, random_state=SEED + i).reset_index(drop=True)
    class_nonIID_sets.append(client)
    begin_sample_0 += amount_class_0
    begin_sample_1 += amount_class_1

### To CSV

In [None]:
# write every client partition to its own CSV file (without the index column);
# the file name is the prefix of the partition scheme followed by the client number
# NOTE(review): the sample-imbalanced sets are saved under a "PCA" prefix --
# confirm that this file name is intended.
for file_prefix, client_sets in (
    ("AML_A2_IID_client", IID_sets),
    ("AML_A2_PCA_client", sample_nonIID_sets),
    ("A2_class_imbalance_client", class_nonIID_sets),
):
    for i, client in enumerate(client_sets):
        client.to_csv(file_prefix + str(i) + ".csv", index=False)

## Splitting the ComBat-adjusted A2 dataset into two parts for memory reasons

In [4]:
# load the ComBat-adjusted A2 table and preview its first five rows
A2_base = pd.read_csv(filepath_or_buffer="../datasets/AML/A2/AML_comBat_adj_A2.csv")
A2_base.head(n=5)

Unnamed: 0,PAX8,CCL5,MMP14,DTX2P1-UPK3BP1-PMS2P11,BAD,PRPF8,CAPNS1,RPL35,EIF4G2,EIF3D,...,FKBP15,LRCH4,MEX3D,BCAN,ACTB,GAPDH,MIR3648-2,MIR3648-1,label,test/train
0,-8.586827,1111.732881,-56.320797,76.712481,25.808423,1705.90473,1263.031037,1495.525106,2984.474018,928.301769,...,423.047371,645.137074,18.077582,6.030958,15870.45292,6516.236566,3.667506,3.672531,1,train
1,8.316241,-450.938138,-39.999605,201.313392,82.125698,1613.556163,954.724966,1515.709655,3132.296179,978.84123,...,302.017008,203.214318,9.646187,5.211092,15952.865035,5853.793219,6.583491,6.980514,0,train
2,4.426694,730.125103,84.54987,184.233553,87.236502,1722.005514,1122.654964,1440.484227,3065.280841,1019.155945,...,95.643406,760.03739,8.561917,7.526614,15533.05369,6388.510275,5.571398,5.460379,1,train
3,29.273835,1127.967744,-59.899153,86.636787,67.461858,1393.635869,837.062262,1490.663451,3247.885716,762.49394,...,189.997359,782.614853,11.118742,6.863773,15307.755957,6461.127793,5.266336,5.835727,1,test
4,-109.829206,911.286176,-675.586967,-406.767286,-127.263859,377.944079,533.348512,545.85292,3761.241308,-51.045323,...,379.453409,36.450412,-0.828204,-1.739212,14493.750211,4902.464341,5.706161,11.664367,0,train


In [10]:
# cut A2_base into two halves by row position; when the number of rows is odd
# the second half holds the extra row
n = len(A2_base)
n1 = n // 2
n2 = n - n1

# note: this rebinds A2_1, which earlier in the notebook held the class-1 rows
A2_1 = A2_base.iloc[:n1]
A2_2 = A2_base.iloc[n1:]

In [12]:
# save both halves to their own CSV file, leaving out the index column
for half, csv_name in (
    (A2_1, "AML_comBat_A2_1.csv"),
    (A2_2, "AML_comBat_A2_2.csv"),
):
    half.to_csv(csv_name, index=False)