In [1]:
import numpy as np
import pandas as pd 
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from preprocessing.missing_value_imputation import missing_value_imputation
from preprocessing.train_autoencoder import train_autoencoder
from preprocessing.normalization import normalize_data
from models.autoencoder import Autoencoder


In [2]:
ov_merged_data = pd.read_csv('edited dataset/merged_data/ov_merged_data.csv')
rc18flow_merged_data = pd.read_csv('edited dataset/merged_data/rc18flow_merged_data.csv')
rc18mr_merged_data = pd.read_csv('edited dataset/merged_data/rc18mr_merged_data.csv')
rc12_merged_data = pd.read_csv('edited dataset/merged_data/rc12_merged_data.csv')
pdac_merged_data = pd.read_csv('edited dataset/merged_data/pdac_merged_data.csv')
brca_terunuma_merged_data = pd.read_csv('edited dataset/merged_data/brca_terunuma_merged_data.csv')
coad_merged_data = pd.read_csv('edited dataset/merged_data/coad_merged_data.csv')
prad_2019a_merged_data = pd.read_csv('edited dataset/merged_data/prad_2019a_merged_data.csv')
prad_2019b_merged_data = pd.read_csv('edited dataset/merged_data/prad_2019b_merged_data.csv')
prad_2014_merged_data = pd.read_csv('edited dataset/merged_data/prad_2014_merged_data.csv')
hcc_merged_data = pd.read_csv('edited dataset/merged_data/hcc_merged_data.csv')
dlbcl_merged_data = pd.read_csv('edited dataset/merged_data/dlbcl_merged_data.csv')
brca_tang_merged_data = pd.read_csv('edited dataset/merged_data/brca_tang_merged_data.csv')
rc20_merged_data = pd.read_csv('edited dataset/merged_data/rc20_merged_data.csv')
gbm_merged_data = pd.read_csv('edited dataset/merged_data/gbm_merged_data.csv')
lica1_merged_data = pd.read_csv('edited dataset/merged_data/lica1_merged_data.csv')
lica2_merged_data = pd.read_csv('edited dataset/merged_data/lica2_merged_data.csv')

In [3]:
ov_merged_data.insert(0, 'Labels', ov_merged_data['TN'].apply(lambda x: 1 if x == 'Tumor' else 0))
rc18flow_merged_data.insert(0, 'Labels', rc18flow_merged_data['TN'].apply(lambda x: 1 if x == 'Tumor' else 0))
rc18mr_merged_data.insert(0, 'Labels', rc18mr_merged_data['TN'].apply(lambda x: 1 if x == 'Tumor' else 0))
rc12_merged_data.insert(0, 'Labels', rc12_merged_data['TN'].apply(lambda x: 1 if x == 'Tumor' else 0))
pdac_merged_data.insert(0, 'Labels', pdac_merged_data['TN'].apply(lambda x: 1 if x == 'Tumor' else 0))
brca_terunuma_merged_data.insert(0, 'Labels', brca_terunuma_merged_data['TN'].apply(lambda x: 1 if x == 'Tumor' else 0))
coad_merged_data.insert(0, 'Labels', coad_merged_data['TN'].apply(lambda x: 1 if x == 'Tumor' else 0))
prad_2019a_merged_data.insert(0, 'Labels', prad_2019a_merged_data['TN'].apply(lambda x: 1 if x == 'Tumor' else 0))
prad_2019b_merged_data.insert(0, 'Labels', prad_2019b_merged_data['TN'].apply(lambda x: 1 if x == 'Tumor' else 0))
prad_2014_merged_data.insert(0, 'Labels', prad_2014_merged_data['TN'].apply(lambda x: 1 if x == 'Tumor' else 0))
hcc_merged_data.insert(0, 'Labels', hcc_merged_data['TN'].apply(lambda x: 1 if x == 'Tumor' else 0))
dlbcl_merged_data.insert(0, 'Labels', dlbcl_merged_data['TN'].apply(lambda x: 1 if x == 'Tumor' else 0))
brca_tang_merged_data.insert(0, 'Labels', brca_tang_merged_data['TN'].apply(lambda x: 1 if x == 'Tumor' else 0))
rc20_merged_data.insert(0, 'Labels', rc20_merged_data['TN'].apply(lambda x: 1 if x == 'Tumor' else 0))
gbm_merged_data.insert(0, 'Labels', gbm_merged_data['TN'].apply(lambda x: 1 if x == 'Tumor' else 0))
lica1_merged_data.insert(0, 'Labels', lica1_merged_data['TN'].apply(lambda x: 1 if x == 'Tumor' else 0))
lica2_merged_data.insert(0, 'Labels', lica2_merged_data['TN'].apply(lambda x: 1 if x == 'Tumor' else 0))

In [2]:
new_merged_data = pd.read_csv('edited dataset/all_merged_data.csv')

  new_merged_data = pd.read_csv('edited dataset/all_merged_data.csv')


In [3]:
new_merged_data[['Identifier', 'SampleID']] = new_merged_data['CommonID'].str.split(':', expand=True)

In [4]:
identifier_list = list(new_merged_data['Identifier'].unique())
identifier_df = pd.DataFrame({'Identifier': identifier_list, 'Index': range(1, len(identifier_list) + 1)})
# identifier_df.to_csv('edited dataset/identifier_df.csv', index=False)

In [7]:
identifier_df = pd.read_csv('edited dataset/identifier_df.csv')

In [16]:
identifier_df
identifier_df[identifier_df['Identifier'] == 'OV']['Index'].values[0]

1

In [17]:
def get_label_value(row):
    if row['TN'] == 'Tumor':
        try:
            return identifier_df[identifier_df['Identifier'] == row['Identifier']]['Index'].values[0]
            # return identifier_list.index(row['Identifier']) + 1
        except ValueError:
            return 0  # Identifier listesinde bulunmazsa 0 döner
    else:
        return 0

new_merged_data.insert(0, 'Labels', new_merged_data.apply(get_label_value, axis=1))


In [18]:
new_merged_data

Unnamed: 0,Labels,CommonID,MetabID,RNAID,TN,RANP6,RPL35P1,RNU7-116P,CLDN6,FAM138F,...,TRIP10,RNU6-748P,RPL34P29,LOC105378930,ALKBH5,MIR331,RNA5SP42,LOC100419318,Identifier,SampleID
0,1,OV:P1,P1,GSM642933_OC01_ARN0001_s1h1s1_U133p2.CEL.gz,Tumor,0.0,0.0,0.0,4.097907,3.807501,...,7.933336,0.0,0.0,0.0,7.789885,0.000000,0.0,0.0,OV,P1
1,1,OV:P2,P2,GSM642966_OC01_ARN0049_s2h1s1_U133p2.CEL.gz,Tumor,0.0,0.0,0.0,5.786800,4.122477,...,7.598009,0.0,0.0,0.0,6.648520,0.000000,0.0,0.0,OV,P2
2,1,OV:P41,P41,GSM642979_OC01_ARN0072_s1h1s1_U133p2.CEL.gz,Tumor,0.0,0.0,0.0,3.961681,3.731335,...,7.743404,0.0,0.0,0.0,6.898033,0.000000,0.0,0.0,OV,P41
3,1,OV:P42,P42,GSM643007_OC01_ARN0118_s1h1s1_U133p2.CEL.gz,Tumor,0.0,0.0,0.0,4.237427,3.915974,...,7.118114,0.0,0.0,0.0,7.019029,0.000000,0.0,0.0,OV,P42
4,1,OV:P43,P43,GSM642949_OC01_ARN0026_s2h1s1_U133p2.CEL.gz,Tumor,0.0,0.0,0.0,3.900173,3.960303,...,7.828710,0.0,0.0,0.0,7.300812,0.000000,0.0,0.0,OV,P43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
983,10,LiCa2:LCS_683,LCS_683,GSM1979661_D6_683A.CEL.gz,Tumor,0.0,0.0,0.0,4.857242,0.000000,...,7.178290,0.0,0.0,0.0,6.948318,3.739957,0.0,0.0,LiCa2,LCS_683
984,10,LiCa2:LCS_684,LCS_684,GSM1979663_B5_684A.CEL.gz,Tumor,0.0,0.0,0.0,4.807932,0.000000,...,7.352504,0.0,0.0,0.0,7.548978,3.402346,0.0,0.0,LiCa2,LCS_684
985,10,LiCa2:LCS_685,LCS_685,GSM1979665_H4_685A.CEL.gz,Tumor,0.0,0.0,0.0,4.963260,0.000000,...,7.431223,0.0,0.0,0.0,7.293227,3.970748,0.0,0.0,LiCa2,LCS_685
986,10,LiCa2:LCS_687,LCS_687,GSM1979667_D6_687A.CEL.gz,Tumor,0.0,0.0,0.0,5.570010,0.000000,...,7.102446,0.0,0.0,0.0,7.078563,3.222240,0.0,0.0,LiCa2,LCS_687


In [19]:
merged_datas = new_merged_data.iloc[:,5:-2]
merged_labels = new_merged_data.iloc[:,:1]

# Seperate Data to Labels and Data

In [32]:
ov_data = ov_merged_data.iloc[:,5:]
ov_labels = ov_merged_data.iloc[:,:1]

rc18flow_data = rc18flow_merged_data.iloc[:,5:]
rc18flow_labels = rc18flow_merged_data.iloc[:,:1]

rc18mr_data = rc18mr_merged_data.iloc[:,5:]
rc18mr_labels = rc18mr_merged_data.iloc[:,:1]

rc12_data = rc12_merged_data.iloc[:,5:]
rc12_labels = rc12_merged_data.iloc[:,:1]

pdac_data = pdac_merged_data.iloc[:,5:]
pdac_labels = pdac_merged_data.iloc[:,:1]

brca_terunuma_data = brca_terunuma_merged_data.iloc[:,5:]
brca_terunuma_labels = brca_terunuma_merged_data.iloc[:,:1]

coad_data = coad_merged_data.iloc[:,5:]
coad_labels = coad_merged_data.iloc[:,:1]

prad_2019a_data = prad_2019a_merged_data.iloc[:,5:]
prad_2019a_labels = prad_2019a_merged_data.iloc[:,:1]

prad_2019b_data = prad_2019b_merged_data.iloc[:,5:]
prad_2019b_labels = prad_2019b_merged_data.iloc[:,:1]

prad_2014_data = prad_2014_merged_data.iloc[:,5:]
prad_2014_labels = prad_2014_merged_data.iloc[:,:1]

hcc_data = hcc_merged_data.iloc[:,5:]
hcc_labels = hcc_merged_data.iloc[:,:1]

dlbcl_data = dlbcl_merged_data.iloc[:,5:]
dlbcl_labels = dlbcl_merged_data.iloc[:,:1]

brca_tang_data = brca_tang_merged_data.iloc[:,5:]
brca_tang_labels = brca_tang_merged_data.iloc[:,:1]

rc20_data = rc20_merged_data.iloc[:,5:]
rc20_labels = rc20_merged_data.iloc[:,:1]

gbm_data = gbm_merged_data.iloc[:,5:]
gbm_labels = gbm_merged_data.iloc[:,:1]

lica1_data = lica1_merged_data.iloc[:,5:]
lica1_labels = lica1_merged_data.iloc[:,:1]

lica2_data = lica2_merged_data.iloc[:,5:]
lica2_labels = lica2_merged_data.iloc[:,:1]

# Impute Missing Values

In [43]:
ov_data_imputted = pd.DataFrame(missing_value_imputation(ov_data.values))
rc18flow_data_imputted = pd.DataFrame(missing_value_imputation(rc18flow_data.values))
rc18mr_data_imputted = pd.DataFrame(missing_value_imputation(rc18mr_data.values))
rc12_data_imputted = pd.DataFrame(missing_value_imputation(rc12_data.values))
pdac_data_imputted = pd.DataFrame(missing_value_imputation(pdac_data.values))
brca_terunuma_data_imputted = pd.DataFrame(missing_value_imputation(brca_terunuma_data.values))
coad_data_imputted = pd.DataFrame(missing_value_imputation(coad_data.values))
prad_2019a_data_imputted = pd.DataFrame(missing_value_imputation(prad_2019a_data.values))
prad_2019b_data_imputted = pd.DataFrame(missing_value_imputation(prad_2019b_data.values))
prad_2014_data_imputted = pd.DataFrame(missing_value_imputation(prad_2014_data.values))
hcc_data_imputted = pd.DataFrame(missing_value_imputation(hcc_data.values))
dlbcl_data_imputted = pd.DataFrame(missing_value_imputation(dlbcl_data.values))
brca_tang_data_imputted = pd.DataFrame(missing_value_imputation(brca_tang_data.values))
rc20_data_imputted = pd.DataFrame(missing_value_imputation(rc20_data.values))
gbm_data_imputted = pd.DataFrame(missing_value_imputation(gbm_data.values))
lica1_data_imputted = pd.DataFrame(missing_value_imputation(lica1_data.values))
lica2_data_imputted = pd.DataFrame(missing_value_imputation(lica2_data.values))


In [20]:
merged_datas_imputted = pd.DataFrame(missing_value_imputation(merged_datas.values))

# Normalize Data

In [44]:
ov_data_imputted = normalize_data(ov_data_imputted)
rc18flow_data_imputted = normalize_data(rc18flow_data_imputted)
rc18mr_data_imputted = normalize_data(rc18mr_data_imputted)
rc12_data_imputted = normalize_data(rc12_data_imputted)
pdac_data_imputted = normalize_data(pdac_data_imputted)
brca_terunuma_data_imputted = normalize_data(brca_terunuma_data_imputted)
coad_data_imputted = normalize_data(coad_data_imputted)
prad_2019a_data_imputted = normalize_data(prad_2019a_data_imputted)
prad_2019b_data_imputted = normalize_data(prad_2019b_data_imputted)
prad_2014_data_imputted = normalize_data(prad_2014_data_imputted)
hcc_data_imputted = normalize_data(hcc_data_imputted)
dlbcl_data_imputted = normalize_data(dlbcl_data_imputted)
brca_tang_data_imputted = normalize_data(brca_tang_data_imputted)
rc20_data_imputted = normalize_data(rc20_data_imputted)
gbm_data_imputted = normalize_data(gbm_data_imputted)
lica1_data_imputted = normalize_data(lica1_data_imputted)
lica2_data_imputted = normalize_data(lica2_data_imputted)

In [22]:
merged_datas_imputted = normalize_data(merged_datas_imputted)


In [23]:
merged_datas_imputted.to_csv('edited dataset/imputted_merged_data.csv', index=False)

In [24]:
merged_datas_imputted = pd.read_csv('edited dataset/imputted_merged_data.csv')

In [38]:
(merged_datas_imputted)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45796,45797,45798,45799,45800,45801,45802,45803,45804,45805
0,0.0,0.0,0.0,0.240478,0.761711,0.0,0.0,0.0,0.0,0.0,...,0.046964,0.027297,0.015477,0.0,0.0,0.0,0.016192,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.339588,0.824724,0.0,0.0,0.0,0.0,0.0,...,0.060845,0.018300,0.014762,0.0,0.0,0.0,0.010330,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.232484,0.746474,0.0,0.0,0.0,0.0,0.0,...,0.046685,0.018595,0.015072,0.0,0.0,0.0,0.011611,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.248666,0.783412,0.0,0.0,0.0,0.0,0.0,...,0.054200,0.020489,0.013739,0.0,0.0,0.0,0.012233,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.228875,0.792280,0.0,0.0,0.0,0.0,0.0,...,0.055552,0.031089,0.015254,0.0,0.0,0.0,0.013680,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
983,0.0,0.0,0.0,0.285039,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.037394,0.022220,0.013867,0.0,0.0,0.0,0.011870,0.053055,0.0,0.0
984,0.0,0.0,0.0,0.282145,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.036854,0.025946,0.014239,0.0,0.0,0.0,0.014955,0.048266,0.0,0.0
985,0.0,0.0,0.0,0.291260,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.038708,0.021953,0.014406,0.0,0.0,0.0,0.013641,0.056329,0.0,0.0
986,0.0,0.0,0.0,0.326866,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.036555,0.024778,0.013705,0.0,0.0,0.0,0.012539,0.045711,0.0,0.0


# Feature Extraction

In [36]:
def feature_extraction(data):
    X_train, X_test = train_test_split(data.values, test_size=0.2, random_state=42)

    data_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    data_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    data_tensor = torch.tensor(data.values, dtype=torch.float32)

    input_dim = len(data.values[0])
    encoding_dim = 2000 # len(data.values[0]) // 5
    autoencoder = Autoencoder(input_dim, encoding_dim)


    autoencoder = train_autoencoder(
                    model=Autoencoder(input_dim, encoding_dim), 
                    train_data=data_train_tensor,
                    test_data=data_test_tensor, 
                    optimizer=optim.Adam(autoencoder.parameters(), lr=0.001), 
                    criterion=nn.MSELoss(), 
                    n_epoch=10)


    data_dim_red = autoencoder.encoder(data_tensor)
    return data_dim_red

def remove_zero_columns(data):
    return data.loc[:, (data != 0.0).any(axis=0)]


# PCA

In [26]:
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold

def pca_feature_selection(data, feature_number):
    pca = PCA(n_components=feature_number)
    pca_result = pca.fit_transform(data)
    return pca_result

def pca_feature_selection_pytorch(data, feature_number):
    data_tensor = torch.tensor(data.values, dtype=torch.float32)

    mean = torch.mean(data_tensor, dim=0)
    data_centered = data_tensor - mean

    cov_matrix = torch.mm(data_centered.t(), data_centered) / (data_centered.size(0) - 1)

    eigenvalues, eigenvectors = torch.linalg.eig(cov_matrix)
    eigenvalues = eigenvalues.real
    eigenvectors = eigenvectors.real

    _, indices = torch.sort(eigenvalues, descending=True)
    top_k_eigenvectors = eigenvectors[:, indices[:feature_number]]

    data_pca = torch.mm(data_centered, top_k_eigenvectors)
    
    pca_data = pd.DataFrame(data_pca.numpy(), index=data.index)
    
    return pca_data


def variance_based_feature_selection(data, threshold=0.1):
    variance_selector = VarianceThreshold(threshold=threshold)
    selected_features = variance_selector.fit_transform(data)
    return selected_features


In [27]:
merged_data_pca = pca_feature_selection(merged_datas_imputted, feature_number=100)

In [11]:
merged_data_pca_pytorch = pca_feature_selection_pytorch(merged_datas_imputted, feature_number=100)

In [None]:
merged_data_pca_pytorch

In [43]:
merged_data_pca.shape

(988, 900)

In [28]:
merged_data_variance_based = variance_based_feature_selection(merged_datas_imputted, threshold=0.12)

In [29]:
merged_data_variance_based.shape

(988, 1258)

# Variance-Based Feature Selection

In [30]:
merged_data_variance_based = variance_based_feature_selection(merged_datas_imputted, threshold=0.12)

In [None]:
merged_data_variance_based.shape

# Autoencoder feature extraction

In [63]:
ov_data_dim_red = remove_zero_columns((pd.DataFrame(feature_extraction(ov_data_imputted).detach().numpy())))
rc18flow_data_dim_red = remove_zero_columns((pd.DataFrame(feature_extraction(rc18flow_data_imputted).detach().numpy())))
rc18mr_data_dim_red = remove_zero_columns((pd.DataFrame(feature_extraction(rc18mr_data_imputted).detach().numpy())))
rc12_data_dim_red = remove_zero_columns((pd.DataFrame(feature_extraction(rc12_data_imputted).detach().numpy())))
pdac_data_dim_red = remove_zero_columns((pd.DataFrame(feature_extraction(pdac_data_imputted).detach().numpy())))
brca_terunuma_data_dim_red = remove_zero_columns((pd.DataFrame(feature_extraction(brca_terunuma_data_imputted).detach().numpy())))
coad_data_dim_red = remove_zero_columns((pd.DataFrame(feature_extraction(coad_data_imputted).detach().numpy())))
prad_2019a_data_dim_red = remove_zero_columns((pd.DataFrame(feature_extraction(prad_2019a_data_imputted).detach().numpy())))
prad_2019b_data_dim_red = remove_zero_columns((pd.DataFrame(feature_extraction(prad_2019b_data_imputted).detach().numpy())))
prad_2014_data_dim_red = remove_zero_columns((pd.DataFrame(feature_extraction(prad_2014_data_imputted).detach().numpy())))
hcc_data_dim_red = remove_zero_columns((pd.DataFrame(feature_extraction(hcc_data_imputted).detach().numpy())))
dlbcl_data_dim_red = remove_zero_columns((pd.DataFrame(feature_extraction(dlbcl_data_imputted).detach().numpy())))
brca_tang_data_dim_red = remove_zero_columns((pd.DataFrame(feature_extraction(brca_tang_data_imputted).detach().numpy())))
rc20_data_dim_red = remove_zero_columns((pd.DataFrame(feature_extraction(rc20_data_imputted).detach().numpy())))
gbm_data_dim_red = remove_zero_columns((pd.DataFrame(feature_extraction(gbm_data_imputted).detach().numpy())))
lica1_data_dim_red = remove_zero_columns((pd.DataFrame(feature_extraction(lica1_data_imputted).detach().numpy())))
lica2_data_dim_red = remove_zero_columns((pd.DataFrame(feature_extraction(lica2_data_imputted).detach().numpy())))


Test Loss: 0.0665
Epoch [10/10], Loss: 0.0681
Test Loss: 0.1247
Epoch [10/10], Loss: 0.1314
Test Loss: 0.1053
Epoch [10/10], Loss: 0.1052
Test Loss: 0.1112
Epoch [10/10], Loss: 0.1089
Test Loss: 0.0730
Epoch [10/10], Loss: 0.0665
Test Loss: 0.0548
Epoch [10/10], Loss: 0.0543
Test Loss: 0.0584
Epoch [10/10], Loss: 0.0579
Test Loss: 0.0483
Epoch [10/10], Loss: 0.0518
Test Loss: 0.0667
Epoch [10/10], Loss: 0.0640
Test Loss: 0.0626
Epoch [10/10], Loss: 0.0674
Test Loss: 0.1206
Epoch [10/10], Loss: 0.1246
Test Loss: 0.1240
Epoch [10/10], Loss: 0.1266
Test Loss: 0.1374
Epoch [10/10], Loss: 0.1388
Test Loss: 0.1316
Epoch [10/10], Loss: 0.1285
Test Loss: 0.1375
Epoch [10/10], Loss: 0.1397
Test Loss: 0.0615
Epoch [10/10], Loss: 0.0681
Test Loss: 0.0621
Epoch [10/10], Loss: 0.0623


In [51]:
merged_data_dim_red = remove_zero_columns((pd.DataFrame(feature_extraction(merged_datas_imputted).detach().numpy())))

Test Loss: 0.2022
Epoch [10/10], Loss: 0.2012


In [64]:
ov_data_dim_red.to_csv('edited dataset/preproccessed_data/ov_data_dim_red.csv', index=False)
rc18flow_data_dim_red.to_csv('edited dataset/preproccessed_data/rc18flow_data_dim_red.csv', index=False)
rc18mr_data_dim_red.to_csv('edited dataset/preproccessed_data/rc18mr_data_dim_red.csv', index=False)
rc12_data_dim_red.to_csv('edited dataset/preproccessed_data/rc12_data_dim_red.csv', index=False)
pdac_data_dim_red.to_csv('edited dataset/preproccessed_data/pdac_data_dim_red.csv', index=False)
brca_terunuma_data_dim_red.to_csv('edited dataset/preproccessed_data/brca_terunuma_data_dim_red.csv', index=False)
coad_data_dim_red.to_csv('edited dataset/preproccessed_data/coad_data_dim_red.csv', index=False)
prad_2019a_data_dim_red.to_csv('edited dataset/preproccessed_data/prad_2019a_data_dim_red.csv', index=False)
prad_2019b_data_dim_red.to_csv('edited dataset/preproccessed_data/prad_2019b_data_dim_red.csv', index=False)
prad_2014_data_dim_red.to_csv('edited dataset/preproccessed_data/prad_2014_data_dim_red.csv', index=False)
hcc_data_dim_red.to_csv('edited dataset/preproccessed_data/hcc_data_dim_red.csv', index=False)
dlbcl_data_dim_red.to_csv('edited dataset/preproccessed_data/dlbcl_data_dim_red.csv', index=False)
brca_tang_data_dim_red.to_csv('edited dataset/preproccessed_data/brca_tang_data_dim_red.csv', index=False)
rc20_data_dim_red.to_csv('edited dataset/preproccessed_data/rc20_data_dim_red.csv', index=False)
gbm_data_dim_red.to_csv('edited dataset/preproccessed_data/gbm_data_dim_red.csv', index=False)
lica1_data_dim_red.to_csv('edited dataset/preproccessed_data/lica1_data_dim_red.csv', index=False)
lica2_data_dim_red.to_csv('edited dataset/preproccessed_data/lica2_data_dim_red.csv', index=False)

In [52]:
merged_data_dim_red.to_csv('edited dataset/preproccessed_data/merged_data_dim_red.csv', index=False)

In [31]:
pd.DataFrame(merged_data_pca).to_csv('edited dataset/preproccessed_data/merged_data_pca.csv', index=False)

In [32]:
pd.DataFrame(merged_data_variance_based).to_csv('edited dataset/preproccessed_data/merged_data_variance_based.csv', index=False)

In [65]:
ov_labels.to_csv('edited dataset/preproccessed_data/ov_labels.csv', index=False)
rc18flow_labels.to_csv('edited dataset/preproccessed_data/rc18flow_labels.csv', index=False)
rc18mr_labels.to_csv('edited dataset/preproccessed_data/rc18mr_labels.csv', index=False)
rc12_labels.to_csv('edited dataset/preproccessed_data/rc12_labels.csv', index=False)
pdac_labels.to_csv('edited dataset/preproccessed_data/pdac_labels.csv', index=False)
brca_terunuma_labels.to_csv('edited dataset/preproccessed_data/brca_terunuma_labels.csv', index=False)
coad_labels.to_csv('edited dataset/preproccessed_data/coad_labels.csv', index=False)
prad_2019a_labels.to_csv('edited dataset/preproccessed_data/prad_2019a_labels.csv', index=False)
prad_2019b_labels.to_csv('edited dataset/preproccessed_data/prad_2019b_labels.csv', index=False)
prad_2014_labels.to_csv('edited dataset/preproccessed_data/prad_2014_labels.csv', index=False)
hcc_labels.to_csv('edited dataset/preproccessed_data/hcc_labels.csv', index=False)
dlbcl_labels.to_csv('edited dataset/preproccessed_data/dlbcl_labels.csv', index=False)
brca_tang_labels.to_csv('edited dataset/preproccessed_data/brca_tang_labels.csv', index=False)
rc20_labels.to_csv('edited dataset/preproccessed_data/rc20_labels.csv', index=False)
gbm_labels.to_csv('edited dataset/preproccessed_data/gbm_labels.csv', index=False)
lica1_labels.to_csv('edited dataset/preproccessed_data/lica1_labels.csv', index=False)
lica2_labels.to_csv('edited dataset/preproccessed_data/lica2_labels.csv', index=False)

In [34]:
merged_labels.to_csv('edited dataset/preproccessed_data/merged_labels.csv', index=False)

In [33]:
merged_labels

Unnamed: 0,Labels
0,1
1,1
2,1
3,1
4,1
...,...
983,10
984,10
985,10
986,10
