In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.linear_model import LogisticRegression

In [2]:
# Root directory that holds every input and output of this notebook.
PATH = Path("/data2/yinterian/microarray/")
# Sub-directory with one folder per dataset, read before any ranking is applied.
PATH_data = PATH.joinpath("Datasets_before_gene_ranked")

In [112]:
def split_train_val_test(df, seed=23):
    """Randomly split the rows of `df` into train, validation and test frames.

    `n = int(0.1 * len(df))` rows go to validation, another `n` to test, and
    the remaining rows to train. Row order inside each part follows `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split row-wise.
    seed : int, default 23
        Seed of the private random generator used for shuffling. The same
        seed always yields the same split.

    Returns
    -------
    (train, valid, test) : tuple of pandas.DataFrame
        Independent copies of the selected rows. `valid` and `test` are empty
        when `df` has fewer than 10 rows.
    """
    # A private RandomState produces the same stream as np.random.seed(seed)
    # followed by np.random.shuffle, without reseeding the global generator.
    rng = np.random.RandomState(seed)
    n = int(0.1*len(df))
    N = len(df) - 2*n
    # dtype=bool keeps the masks boolean even when they are empty; an empty
    # list would otherwise become a float array and `~mask` would fail.
    train_mask = np.array(2*n*[False] + N*[True], dtype=bool)
    rng.shuffle(train_mask)
    train = df[train_mask].copy()
    test_valid = df[~train_mask].copy()
    test_mask = np.array(n*[False] + n*[True], dtype=bool)
    rng.shuffle(test_mask)
    test = test_valid[test_mask].copy()
    valid = test_valid[~test_mask].copy()
    return train, valid, test

In [113]:
def clean_target(df, target_index=978):
    """Keep the feature columns and binarize the label column.

    The column at position `target_index` is read as the label. The most
    frequent label becomes 1 and every other value (including missing
    labels) becomes 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first `target_index` columns are features and whose
        column at position `target_index` holds the labels.
    target_index : int, default 978
        Position of the label column, which is also the number of features.

    Returns
    -------
    pandas.DataFrame
        A new frame with the first `target_index` columns of `df` followed by
        an integer `target` column. `df` itself is not modified.

    Raises
    ------
    ValueError
        If the label column contains no non-null values.
    """
    labels = df.iloc[:, target_index]
    # value_counts sorts by descending frequency, so the first index entry
    # is the majority label.
    counts = labels.value_counts()
    if counts.empty:
        raise ValueError("target column has no labels to binarize")
    class_label = counts.index[0]
    out = df.iloc[:, :target_index].copy()
    out["target"] = (labels == class_label).astype(int).values
    return out

In [114]:
def univ_predictions(train, valid, test, target_index=978):
    """Score every feature column with its own one-variable logistic model.

    For each of the first `target_index` columns, a `LogisticRegression` is
    fitted on that single train column against `train.target`, and the model
    then scores the same column of all three frames.

    Returns three arrays shaped `(rows_of_frame, target_index)`, in the order
    train, valid, test. Entry `[r, c]` is the second column of
    `predict_proba` (the class-1 probability when targets are 0/1) for row
    `r` using only feature `c`.
    """
    frames = (train, valid, test)
    scores = tuple(np.zeros((frame.shape[0], target_index)) for frame in frames)
    labels = train.target.values
    for col in range(target_index):
        # sklearn expects 2-D input, hence the (n_rows, 1) reshape
        single_feature = [frame.iloc[:, col].values.reshape(-1, 1) for frame in frames]
        model = LogisticRegression(random_state=0).fit(single_feature[0], labels)
        for out, X in zip(scores, single_feature):
            out[:, col] = model.predict_proba(X)[:, 1]

    train_prob, valid_prob, test_prob = scores
    return train_prob, valid_prob, test_prob

In [115]:
def prob2rank(train_prob, valid_prob, test_prob):
    """Map probabilities onto a decile-based rank scale.

    The 0%, 10%, ..., 100% quantiles are taken over all entries of
    `train_prob` together. Each value of every input array is then replaced
    by 0.1 times its insertion position among those eleven edges, so values
    below every edge map to 0.0 and values above every edge map to 1.1.

    Returns the ranked train, valid and test arrays, each with the shape of
    its input.
    """
    decile_edges = np.quantile(train_prob, 0.1*np.arange(0,11))

    def to_rank(prob):
        # searchsorted gives, per element, the index of the first edge >= value
        return 0.1*np.searchsorted(decile_edges, prob)

    return to_rank(train_prob), to_rank(valid_prob), to_rank(test_prob)

In [116]:
def pipeline(path, target_index=978):
    """Load one dataset, normalize it, split it and rank-encode its features.

    Steps:
      1. Read the CSV at `path` and binarize its label with `clean_target`.
      2. Treat feature zeros as missing. If any feature value is negative,
         shift every row so its smallest feature becomes 1e-4. Divide each
         row by its feature median, then write missing values back as 0.
      3. Split rows with `split_train_val_test`.
      4. Fit per-feature models with `univ_predictions`, zero the
         probabilities of missing entries, and convert them to decile ranks
         with `prob2rank`.

    Parameters
    ----------
    path : str or path-like
        CSV file to read.
    target_index : int, default 978
        Number of feature columns; the label sits at this column position.

    Returns
    -------
    (train, valid, test) : tuple of pandas.DataFrame
        Each holds `target_index` rank columns followed by `target`.
    """
    # Read the file that was passed in, not a notebook-level global.
    df = pd.read_csv(path)
    df = clean_target(df, target_index=target_index)

    # All statistics below use the feature columns only, so the 0/1 target
    # never influences the per-row minimum or median.
    features = df.iloc[:, :target_index].replace([0], np.nan)
    row_min = features.min(skipna=True, axis=1)
    # Series.min skips NaN (rows with no measured feature), unlike builtin min.
    if row_min.min() < 0:
        features = features.add(-row_min + 1e-4, axis=0)
    row_median = features.median(axis=1)
    features = features.div(row_median, axis=0).replace([np.nan], 0)
    df = features.assign(target=df["target"].values)

    train, valid, test = split_train_val_test(df)
    # 1 where a feature was measured, 0 where it was missing (stored as 0)
    train_mask = (train.iloc[:, :target_index].values > 0).astype(int)
    valid_mask = (valid.iloc[:, :target_index].values > 0).astype(int)
    test_mask = (test.iloc[:, :target_index].values > 0).astype(int)

    train_prob, valid_prob, test_prob = univ_predictions(
        train, valid, test, target_index=target_index)
    train_rank, valid_rank, test_rank = prob2rank(
        train_prob*train_mask, valid_prob*valid_mask, test_prob*test_mask)
    train.iloc[:, :target_index] = train_rank
    valid.iloc[:, :target_index] = valid_rank
    test.iloc[:, :target_index] = test_rank
    return train, valid, test

In [119]:
PATH_out = PATH/"ranked_datasets/"
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first write.
PATH_out.mkdir(parents=True, exist_ok=True)
for f in PATH_data.iterdir():
    # Only dataset sub-directories contain inputs; skip stray files.
    if not f.is_dir():
        continue
    candidates = list(f.iterdir())
    if not candidates:
        # Nothing to process in an empty dataset folder.
        continue
    f_csv = candidates[0]
    # Some source names contain spaces; output names are written without them.
    f_name = f_csv.name.replace(" ", "")
    f_train = PATH_out/"train_{}".format(f_name)
    # A dataset whose train file already exists is treated as done.
    if not f_train.is_file():
        print(f_csv, f_name)
        train, valid, test = pipeline(f_csv)
        print(train.shape)
        train.to_csv(f_train)
        valid.to_csv(PATH_out/"valid_{}".format(f_name))
        test.to_csv(PATH_out/"test_{}".format(f_name))

/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE14814/GSE14814_gene_profile.csv GSE14814_gene_profile.csv
(107, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE7670/GSE7670_gene_profile.csv GSE7670_gene_profile.csv
(54, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE67916/GSE67916_gene_profile.csv GSE67916_gene_profile.csv
(16, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/ GDS2250/ GDS2250_gene_profile.csv GDS2250_gene_profile.csv
(39, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE75316/GSE75316_gene_profile.csv GSE75316_gene_profile.csv
(49, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE36076/GSE36076_gene_profile.csv GSE36076_gene_profile.csv
(22, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE37745/GSE37745_gene_profile.csv GSE37745_gene_profile.csv
(158, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE16795/GSE16795_gene_profile.csv GSE16795_gene_profile.

(13, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/ GDS4386/ GDS4386_gene_profile.csv GDS4386_gene_profile.csv
(10, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE124069/GSE124069_gene_profile.csv GSE124069_gene_profile.csv
(24, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE122306/GSE122306_gene_profile.csv GSE122306_gene_profile.csv
(10, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE82171/GSE82171_gene_profile.csv GSE82171_gene_profile.csv
(16, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE30494/GSE30494_gene_profile.csv GSE30494_gene_profile.csv
(10, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE85043/GSE85043_gene_profile.csv GSE85043_gene_profile.csv
(25, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/ GDS3716/ GDS3716_gene_profile.csv GDS3716_gene_profile.csv
(34, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE66162/GSE66162_gene_profile.csv GSE661

(12, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/ GDS2526/ GDS2526_gene_profile.csv GDS2526_gene_profile.csv
(16, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/ GDS4090/ GDS4090_gene_profile.csv GDS4090_gene_profile.csv
(17, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE31625/GSE31625_gene_profile.csv GSE31625_gene_profile.csv
(40, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE11352/GSE11352_gene_profile.csv GSE11352_gene_profile.csv
(16, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE49588/GSE49588_gene_profile.csv GSE49588_gene_profile.csv
(24, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE43365/GSE43365_gene_profile.csv GSE43365_gene_profile.csv
(89, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE100480/GSE100480_gene_profile.csv GSE100480_gene_profile.csv
(45, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE69330/GSE69330_gene_profile.csv GSE69330_

(20, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE9750/GSE9750_gene_profile.csv GSE9750_gene_profile.csv
(54, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE11121/GSE11121_gene_profile.csv GSE11121_gene_profile.csv
(160, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/ GDS4109/ GDS4109_gene_profile.csv GDS4109_gene_profile.csv
(65, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE26027/GSE26027_gene_profile.csv GSE26027_gene_profile.csv
(10, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE15852/GSE15852_gene_profile.csv GSE15852_gene_profile.csv
(70, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE25428/GSE25428_gene_profile.csv GSE25428_gene_profile.csv
(77, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/ GDS5287/ GDS5287_gene_profile.csv GDS5287_gene_profile.csv
(20, 979)
/data2/yinterian/microarray/Datasets_before_gene_ranked/GSE51373/GSE51373_gene_profile.csv GSE51373_gene_

In [126]:
PATH_data2 = PATH/"ranked_datasets/"

# Path.glob yields entries in arbitrary, filesystem-dependent order. Sorting
# makes everything derived from file order (such as the problem -> index
# mapping built from training_files) reproducible across runs.
training_files = sorted(PATH_data2.glob('train_*'))
valid_files = sorted(PATH_data2.glob('valid_*'))
test_files = sorted(PATH_data2.glob('test_*'))

In [127]:
# The problem id is the second "_"-separated token of each train file name.
problem_ids = [train_path.name.split("_")[1] for train_path in training_files]
# Position of each id in problem_ids; a repeated id keeps its last position.
problem2index = dict(zip(problem_ids, range(len(problem_ids))))
len(problem_ids)

552

In [130]:
def make_master_df(files):
    """Stack per-problem CSV files into one frame tagged with a problem index.

    Each file is read with `pd.read_csv`. Its problem id is the second
    "_"-separated token of the file name, and the matching value from the
    module-level `problem2index` mapping is stored in an `index` column.

    Parameters
    ----------
    files : iterable of pathlib.Path
        CSV files to combine.

    Returns
    -------
    pandas.DataFrame
        All rows of all files with a fresh 0..N-1 row index, without the
        `Unnamed: 0` column that an earlier `to_csv` with the index produces.

    Raises
    ------
    KeyError
        If a file's problem id is not a key of `problem2index`.
    ValueError
        From `pd.concat` if `files` is empty.
    """
    frames = []
    for f in files:
        frame = pd.read_csv(f)
        problem = f.name.split("_")[1]
        frame["index"] = problem2index[problem]
        frames.append(frame)
    # ignore_index avoids repeating each file's 0..n-1 row labels in the result.
    master = pd.concat(frames, ignore_index=True)
    # errors="ignore": files saved without their index have no such column.
    master = master.drop(columns=["Unnamed: 0"], errors="ignore")
    return master

In [131]:
# Combine every per-problem train file into a single frame.
df = make_master_df(training_files)

In [132]:
# Sanity check: (rows, columns) of the combined train frame.
df.shape

(32066, 980)

In [133]:
# Persist the combined train frame; index=False keeps row labels out of the CSV.
df.to_csv(PATH/"train_all_problems.csv", index=False)

In [134]:
# Rebind `df` to the combined validation frame and show its (rows, columns).
df = make_master_df(valid_files)
df.shape

(3725, 980)

In [135]:
# Persist the combined validation frame; index=False keeps row labels out of the CSV.
df.to_csv(PATH/"valid_all_problems.csv", index=False)

In [136]:
# Rebind `df` to the combined test frame and show its (rows, columns).
df = make_master_df(test_files)
df.shape

(3725, 980)

In [137]:
# Persist the combined test frame; index=False keeps row labels out of the CSV.
df.to_csv(PATH/"test_all_problems.csv", index=False)