In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from pathlib import Path

In [2]:
# Directory containing the GSE106817 CSV files that the next cell reads.
PATH = Path("/data2/yinterian/microarray") / "ovarian_cancer"

In [3]:
# Read the gene-profile table and the output (annotation) table from PATH.
genes, outputs = (
    pd.read_csv(PATH / csv_name)
    for csv_name in ("GSE106817_gene_profile.csv", "GSE106817_output.csv")
)

In [4]:
# Give the CSV's unnamed first column the same key name that `outputs` uses.
genes = genes.rename({'Unnamed: 0': 'geo_accession'}, axis="columns")

In [5]:
# (rows, columns) of the two loaded tables, shown side by side.
(genes.shape, outputs.shape)

((4046, 2566), (4046, 33))

In [6]:
# Peek at the first five rows of the gene-profile table.
genes.head(n=5)

Unnamed: 0,geo_accession,MIMAT0000062,MIMAT0000063,MIMAT0000064,MIMAT0000065,MIMAT0000066,MIMAT0000067,MIMAT0000068,MIMAT0000069,MIMAT0000070,...,MIMAT0031893,MIMAT0032026,MIMAT0032029,MIMAT0032110,"MIMAT0032114, MIMAT0032115",MIMAT0032116,MIMAT0033692,MIMAT0035542,MIMAT0035703,MIMAT0035704
0,GSM2850709,6.153,6.347,5.429,3.935,6.08,5.903,5.307,4.864,5.225,...,4.919,1.896,6.656,0.333,0.333,7.245,7.184,0.333,0.333,2.802
1,GSM2850710,6.416,3.862,1.941,2.242,6.696,1.941,3.913,1.941,7.696,...,1.941,2.068,7.182,1.941,1.941,7.86,1.941,1.941,1.941,1.941
2,GSM2850711,4.104,4.986,4.043,1.573,0.371,0.371,1.918,0.371,4.828,...,0.371,0.371,4.896,0.371,0.371,6.964,5.696,0.371,0.371,0.371
3,GSM2850712,7.215,6.083,7.03,7.499,7.249,7.6,8.221,8.522,5.563,...,4.99,0.415,4.337,5.072,0.415,6.956,8.504,3.928,0.415,0.637
4,GSM2850713,3.689,4.564,2.666,4.634,4.434,-0.586,-0.586,-0.586,5.071,...,-0.586,-0.586,7.978,-0.586,-0.586,7.155,4.536,-0.586,-0.586,-0.586


In [85]:
# Fraction of rows whose geo_accession matches positionally between the two
# tables (1.0 means both tables list the samples in the same order).
# Compared against `outputs` directly: `y` is only created in a later cell,
# so referencing it here failed on a fresh Restart & Run All.
(genes.geo_accession == outputs.geo_accession).mean()

1.0

In [70]:
# Keep only the sample id and its description label; copy so that columns
# added to `y` later do not touch `outputs`.
y = outputs.loc[:, ["geo_accession", "description"]].copy()
y.head(n=5)

Unnamed: 0,geo_accession,description
0,GSM2850709,Breast Cancer
1,GSM2850710,Breast Cancer
2,GSM2850711,Breast Cancer
3,GSM2850712,Breast Cancer
4,GSM2850713,Breast Cancer


In [72]:
# Number of samples per description category.
desc = y["description"].value_counts()
desc

non-Cancer                  2759
Ovarian Cancer               320
Lung Cancer                  115
Sarcoma                      115
Gastric Cancer               115
Pancreatic Cancer            115
Breast Cancer                115
Colorectal Cancer            115
Esophageal Cancer             88
Hepatocellular Carcinoma      81
Borderline Ovarian Tumor      66
Benign Ovarian Disease        29
OV_others                     13
Name: description, dtype: int64

In [81]:
def stratify_sample(y, col, train_frac=0.8, seed=3):
    """Add a stratified 0/1 ``Train`` column to ``y`` in place.

    For every distinct value found in ``y[col]``, ``int(train_frac * n)`` of
    its ``n`` rows are flagged ``1`` and the remaining rows ``0``; which rows
    get which flag is decided by a random shuffle within the category.

    Parameters
    ----------
    y : pandas.DataFrame
        Table to annotate. A ``Train`` column is created (or overwritten).
    col : str
        Name of the column holding the categories to stratify on.
    train_frac : float, default 0.8
        Share of each category flagged as training rows (rounded down).
    seed : int, default 3
        Seed handed to ``np.random.seed``. Note that this reseeds NumPy's
        global random state, exactly as the original implementation did.

    Returns
    -------
    None
        The function only mutates ``y`` and prints one line per category.
    """
    np.random.seed(seed=seed)
    category_sizes = y[col].value_counts()
    y["Train"] = 0
    for cat, n in category_sizes.items():
        print(cat, n)
        n_train = int(train_frac * n)
        flags = np.array([1] * n_train + [0] * (n - n_train))
        np.random.shuffle(flags)
        # Select the rows through `col`. The previous version hard-coded
        # y["description"] here, so any other stratification column either
        # raised KeyError or split on the wrong field.
        y.loc[y[col] == cat, "Train"] = flags

In [82]:
# Flag train/test rows in `y`, stratified by description category.
stratify_sample(y, col="description")

non-Cancer 2759
Ovarian Cancer 320
Lung Cancer 115
Sarcoma 115
Gastric Cancer 115
Pancreatic Cancer 115
Breast Cancer 115
Colorectal Cancer 115
Esophageal Cancer 88
Hepatocellular Carcinoma 81
Borderline Ovarian Tumor 66
Benign Ovarian Disease 29
OV_others 13


In [83]:
# Row counts per (description, Train) pair to eyeball the split.
y.groupby(by=["description", "Train"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,geo_accession
description,Train,Unnamed: 2_level_1
Benign Ovarian Disease,0,6
Benign Ovarian Disease,1,23
Borderline Ovarian Tumor,0,14
Borderline Ovarian Tumor,1,52
Breast Cancer,0,23
Breast Cancer,1,92
Colorectal Cancer,0,23
Colorectal Cancer,1,92
Esophageal Cancer,0,18
Esophageal Cancer,1,70


In [80]:
# Realised train fraction of the "non-Cancer" class, computed from the split
# itself instead of hand-copied counts (previously 2207 / (2207 + 552)).
y.loc[y["description"] == "non-Cancer", "Train"].mean()

0.7999275099673795

In [86]:
# Attach description/Train to each gene-profile row by sample id.
# `validate` makes pandas raise MergeError if a geo_accession is duplicated
# on either side, instead of silently multiplying rows.
df = genes.merge(y, on='geo_accession', validate="one_to_one")
df.shape

(4046, 2568)

In [91]:
# Binary target: 1 for "Ovarian Cancer" rows, 0 for every other description.
# (The former `df["y"] = 0` was a dead store, overwritten by this line.)
df["y"] = np.where(df.description == "Ovarian Cancer", 1, 0)

In [95]:
# Cross-tabulate description against y: each description should appear with
# exactly one y value.
df.groupby(["description", "y"])[["Train"]].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Train
description,y,Unnamed: 2_level_1
Benign Ovarian Disease,0,29
Borderline Ovarian Tumor,0,66
Breast Cancer,0,115
Colorectal Cancer,0,115
Esophageal Cancer,0,88
Gastric Cancer,0,115
Hepatocellular Carcinoma,0,81
Lung Cancer,0,115
OV_others,0,13
Ovarian Cancer,1,320


In [96]:
# Feature columns are the ones whose name starts with 'MIMA'.
filter_col = [name for name in df.columns if name.startswith('MIMA')]
filter_col

['MIMAT0000062',
 'MIMAT0000063',
 'MIMAT0000064',
 'MIMAT0000065',
 'MIMAT0000066',
 'MIMAT0000067',
 'MIMAT0000068',
 'MIMAT0000069',
 'MIMAT0000070',
 'MIMAT0000071',
 'MIMAT0000072',
 'MIMAT0000073',
 'MIMAT0000074',
 'MIMAT0000075',
 'MIMAT0000076',
 'MIMAT0000077',
 'MIMAT0000078',
 'MIMAT0000079',
 'MIMAT0000080',
 'MIMAT0000081',
 'MIMAT0000082',
 'MIMAT0000083',
 'MIMAT0000084',
 'MIMAT0000085',
 'MIMAT0000086',
 'MIMAT0000087',
 'MIMAT0000088',
 'MIMAT0000089',
 'MIMAT0000090',
 'MIMAT0000091',
 'MIMAT0000092',
 'MIMAT0000093',
 'MIMAT0000094',
 'MIMAT0000095',
 'MIMAT0000096',
 'MIMAT0000097',
 'MIMAT0000098',
 'MIMAT0000099',
 'MIMAT0000100',
 'MIMAT0000101',
 'MIMAT0000102',
 'MIMAT0000103',
 'MIMAT0000104',
 'MIMAT0000222',
 'MIMAT0000226',
 'MIMAT0000227',
 'MIMAT0000228',
 'MIMAT0000231',
 'MIMAT0000232, MIMAT0004563',
 'MIMAT0000241',
 'MIMAT0000242',
 'MIMAT0000243',
 'MIMAT0000244',
 'MIMAT0000245',
 'MIMAT0000250',
 'MIMAT0000251',
 'MIMAT0000252',
 'MIMAT0000253',


In [98]:
# Preview the positional aliases (X0, X1, ...) assigned to the feature columns.
dict(zip(filter_col, ("X" + str(idx) for idx in range(len(filter_col)))))

{'MIMAT0000062': 'X0',
 'MIMAT0000063': 'X1',
 'MIMAT0000064': 'X2',
 'MIMAT0000065': 'X3',
 'MIMAT0000066': 'X4',
 'MIMAT0000067': 'X5',
 'MIMAT0000068': 'X6',
 'MIMAT0000069': 'X7',
 'MIMAT0000070': 'X8',
 'MIMAT0000071': 'X9',
 'MIMAT0000072': 'X10',
 'MIMAT0000073': 'X11',
 'MIMAT0000074': 'X12',
 'MIMAT0000075': 'X13',
 'MIMAT0000076': 'X14',
 'MIMAT0000077': 'X15',
 'MIMAT0000078': 'X16',
 'MIMAT0000079': 'X17',
 'MIMAT0000080': 'X18',
 'MIMAT0000081': 'X19',
 'MIMAT0000082': 'X20',
 'MIMAT0000083': 'X21',
 'MIMAT0000084': 'X22',
 'MIMAT0000085': 'X23',
 'MIMAT0000086': 'X24',
 'MIMAT0000087': 'X25',
 'MIMAT0000088': 'X26',
 'MIMAT0000089': 'X27',
 'MIMAT0000090': 'X28',
 'MIMAT0000091': 'X29',
 'MIMAT0000092': 'X30',
 'MIMAT0000093': 'X31',
 'MIMAT0000094': 'X32',
 'MIMAT0000095': 'X33',
 'MIMAT0000096': 'X34',
 'MIMAT0000097': 'X35',
 'MIMAT0000098': 'X36',
 'MIMAT0000099': 'X37',
 'MIMAT0000100': 'X38',
 'MIMAT0000101': 'X39',
 'MIMAT0000102': 'X40',
 'MIMAT0000103': 'X41',
 '

In [99]:
# Replace each feature column name with "X<position>"; all other columns keep
# their names.
df_final = df.rename(
    columns=dict((name, "X%d" % idx) for idx, name in enumerate(filter_col))
)

In [100]:
# Check the renamed columns on the first five rows.
df_final.head(n=5)

Unnamed: 0,geo_accession,X0,X1,X2,X3,X4,X5,X6,X7,X8,...,X2558,X2559,X2560,X2561,X2562,X2563,X2564,description,Train,y
0,GSM2850709,6.153,6.347,5.429,3.935,6.08,5.903,5.307,4.864,5.225,...,0.333,0.333,7.245,7.184,0.333,0.333,2.802,Breast Cancer,1,0
1,GSM2850710,6.416,3.862,1.941,2.242,6.696,1.941,3.913,1.941,7.696,...,1.941,1.941,7.86,1.941,1.941,1.941,1.941,Breast Cancer,1,0
2,GSM2850711,4.104,4.986,4.043,1.573,0.371,0.371,1.918,0.371,4.828,...,0.371,0.371,6.964,5.696,0.371,0.371,0.371,Breast Cancer,1,0
3,GSM2850712,7.215,6.083,7.03,7.499,7.249,7.6,8.221,8.522,5.563,...,5.072,0.415,6.956,8.504,3.928,0.415,0.637,Breast Cancer,0,0
4,GSM2850713,3.689,4.564,2.666,4.634,4.434,-0.586,-0.586,-0.586,5.071,...,-0.586,-0.586,7.155,4.536,-0.586,-0.586,-0.586,Breast Cancer,1,0


In [101]:
# Remove the text label column; the binary `y` column stays as the target.
df_final = df_final.drop(columns=["description"])
df_final.head(n=5)

Unnamed: 0,geo_accession,X0,X1,X2,X3,X4,X5,X6,X7,X8,...,X2557,X2558,X2559,X2560,X2561,X2562,X2563,X2564,Train,y
0,GSM2850709,6.153,6.347,5.429,3.935,6.08,5.903,5.307,4.864,5.225,...,6.656,0.333,0.333,7.245,7.184,0.333,0.333,2.802,1,0
1,GSM2850710,6.416,3.862,1.941,2.242,6.696,1.941,3.913,1.941,7.696,...,7.182,1.941,1.941,7.86,1.941,1.941,1.941,1.941,1,0
2,GSM2850711,4.104,4.986,4.043,1.573,0.371,0.371,1.918,0.371,4.828,...,4.896,0.371,0.371,6.964,5.696,0.371,0.371,0.371,1,0
3,GSM2850712,7.215,6.083,7.03,7.499,7.249,7.6,8.221,8.522,5.563,...,4.337,5.072,0.415,6.956,8.504,3.928,0.415,0.637,0,0
4,GSM2850713,3.689,4.564,2.666,4.634,4.434,-0.586,-0.586,-0.586,5.071,...,7.978,-0.586,-0.586,7.155,4.536,-0.586,-0.586,-0.586,1,0


In [102]:
# Rows flagged Train == 1 form the training split.
train = df_final.loc[df_final["Train"].eq(1)]

In [103]:
# Rows flagged Train == 0 form the held-out split.
test = df_final.loc[df_final["Train"].eq(0)]

In [105]:
# (rows, columns) of each split.
(train.shape, test.shape)

((3234, 2568), (812, 2568))

In [106]:
# The split flag has done its job; drop it from both frames.
train, test = (part.drop(columns=["Train"]) for part in (train, test))

In [107]:
# Write both splits to the working directory without the pandas index column.
train.to_csv(path_or_buf="train_ovarian_cancer.csv", index=False)
test.to_csv(path_or_buf="test_ovarian_cancer.csv", index=False)