In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Read the non-cancer samples table.
filepath = "./GEO Samples/none_cancer_merged_data_with_metadata.csv"
none_cancer = pd.read_csv(filepath)

# Read the OV samples table.
filepath = "./GEO Samples/Ov_merged_data_with_metadata.csv"
ov_samples = pd.read_csv(filepath)

# Keep the 'Stage' label as its own one-column frame, and build the feature
# frame by removing the 'ID' and 'Stage' columns.
ov_lable = ov_samples.loc[:, ['Stage']]
ov = ov_samples.drop(columns=['ID', 'Stage'])

# Number of OV samples per stage.
ov_lable['Stage'].value_counts()

Stage
3    331
1    265
2     72
4     32
Name: count, dtype: int64

In [3]:
# Draw a random subset of the non-cancer samples.
# Candidate sizes that were considered:
#   1. the same number of rows as the OV samples
#   2. twice the number of OV samples, with the OV samples duplicated to match
#
# Direction 1: number of non-cancer rows to draw.
# NOTE(review): the OV stage counts printed above sum to 700, so 300 matches
# neither option literally -- confirm the intended size.
n = 300
nc_samples = none_cancer.sample(n=n, random_state=19)

# Keep the 'Stage' label as its own one-column frame, and build the feature
# frame by removing the 'ID' and 'Stage' columns.
nc_lable = nc_samples.loc[:, ['Stage']]
nc = nc_samples.drop(columns=['ID', 'Stage'])

# Stack the OV rows first, then the sampled non-cancer rows, renumbering the index.
x = pd.concat([ov, nc], ignore_index=True)
y = pd.concat([ov_lable, nc_lable], ignore_index=True)

In [4]:
# Hold out 20% of the rows as a test set; stratifying on y keeps the label
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [7]:
# Oversample the minority stages in the training split.
#
# For every stage listed below, its rows in X_train / y_train are appended the
# given number of additional times, so the stage ends up with (1 + copies) times
# its training count. Only training rows are duplicated; X_test / y_test are not
# touched. The counts recorded in the next cell's output reflect this:
# stage 4 -> 234 rows (9 x 26) and stage 2 -> 228 rows (4 x 57).
EXTRA_COPIES_PER_STAGE = {4: 8, 2: 3}  # stage -> number of additional copies


def repeat_stage_rows(features, labels, stage, copies):
    """Return the rows whose 'Stage' equals `stage`, repeated `copies` times.

    Parameters
    ----------
    features : pd.DataFrame
        Feature rows, aligned index-for-index with `labels`.
    labels : pd.DataFrame
        Frame containing a 'Stage' column.
    stage : int
        Stage value whose rows are selected.
    copies : int
        How many times the selected rows are repeated; values <= 0 give
        empty frames instead of an error from concatenating nothing.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        The repeated feature rows and the matching label rows. The original
        index values are kept; callers renumber when concatenating.
    """
    mask = labels['Stage'] == stage
    stage_features = features[mask]
    stage_labels = labels[mask]
    if copies <= 0:
        # pd.concat([]) raises, so hand back empty frames with the same columns.
        return stage_features.iloc[0:0], stage_labels.iloc[0:0]
    return (
        pd.concat([stage_features] * copies),
        pd.concat([stage_labels] * copies),
    )


# Build the extra rows per stage in the declared order (stage 4, then stage 2)
# and append them after the original training rows.
extra_parts = [
    repeat_stage_rows(X_train, y_train, stage, copies)
    for stage, copies in EXTRA_COPIES_PER_STAGE.items()
]

X_train_oversampled = pd.concat(
    [X_train] + [feature_part for feature_part, _ in extra_parts],
    ignore_index=True,
)
y_train_oversampled = pd.concat(
    [y_train] + [label_part for _, label_part in extra_parts],
    ignore_index=True,
)

# Features and labels must stay row-aligned for the later join/shuffle.
assert len(X_train_oversampled) == len(y_train_oversampled)


In [8]:
# Per-stage row counts of the training labels after oversampling.
y_train_oversampled.loc[:, 'Stage'].value_counts()

Stage
3    265
0    240
4    234
2    228
1    212
Name: count, dtype: int64

In [17]:
# Shuffle the oversampled training rows.
# Features and labels are joined on their shared index first so that each row
# keeps its label through the permutation; the seed makes the order repeatable.
combined = X_train_oversampled.join(y_train_oversampled)
shuffled = combined.sample(frac=1, random_state=42).reset_index(drop=True)

# Split back into features and labels. From here on y_train_oversampled is the
# 'Stage' column itself (a Series), no longer a one-column frame.
X_train_oversampled = shuffled.drop('Stage', axis=1)
y_train_oversampled = shuffled.loc[:, 'Stage']
y_train_oversampled

0       2
1       0
2       4
3       2
4       2
       ..
1174    0
1175    4
1176    4
1177    2
1178    2
Name: Stage, Length: 1179, dtype: int64

In [18]:

# # 5. Normalization with Z-Score (disabled alternative, kept commented out; the Min-Max scaling in step 5.1 is what actually runs)
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train_oversampled)
# X_test_scaled = scaler.transform(X_test)

# X_train_scaled

In [19]:
# Report the smallest and largest raw value across all columns of each split.
range_report = [
    ("Train min:", X_train_oversampled.min().min()),
    ("Train max:", X_train_oversampled.max().max()),
    ("Test min:", X_test.min().min()),
    ("Test max:", X_test.max().max()),
]
for extreme_name, extreme_value in range_report:
    print(extreme_name, extreme_value)

Train min: 16.0
Train max: 65535.0
Test min: 19.0
Test max: 65535.0


In [20]:
# 5.1. Normalization with Min-Max
# MinMaxScaler is imported with the other dependencies in the first cell.
#
# The scaler learns each column's minimum and maximum from the (oversampled)
# training rows only. The test rows are then transformed with those training
# statistics, so scaled test values are not guaranteed to lie inside [0, 1].
scaler = MinMaxScaler()

# Fit on the training data and scale it, then reuse the fitted scaler for the test split.
X_train_scaled = scaler.fit_transform(X_train_oversampled)
X_test_scaled = scaler.transform(X_test)


In [21]:
# Wrap the scaled arrays in DataFrames, restoring the feature column names of x.
feature_names = x.columns
normalized_train_samples = pd.DataFrame(data=X_train_scaled, columns=feature_names)
normalized_test_samples = pd.DataFrame(data=X_test_scaled, columns=feature_names)
normalized_train_samples

Unnamed: 0,Age,MIMAT0004501,MIMAT0002844,MIMAT0002843,MIMAT0002824,MIMAT0002823,MIMAT0002807,MIMAT0002806,MIMAT0001635,MIMAT0001631,...,MIMAT0022965,MIMAT0022948,MIMAT0027678,MIMAT0027677,MIMAT0027662,MIMAT0027661,MIMAT0027646,MIMAT0027645,MIMAT0027630,MIMAT0027629
0,0.480000,0.321060,0.266433,0.165006,0.048775,0.151213,0.041253,0.298427,0.253691,0.052721,...,0.367587,0.035934,0.013204,0.343176,0.109860,0.049991,0.263645,0.131979,0.057850,0.287974
1,0.533333,0.651836,0.630531,0.316243,0.165648,0.392488,0.110860,0.592037,0.416767,0.020281,...,0.831395,0.173907,0.079024,0.782782,0.353564,0.169274,0.625933,0.337980,0.183165,0.642786
2,0.440000,0.292976,0.179838,0.146170,0.021972,0.155444,0.020483,0.252863,0.143605,0.005901,...,0.317303,0.013381,0.004096,0.342294,0.100369,0.006860,0.227782,0.034045,0.019502,0.252570
3,0.453333,0.429064,0.349010,0.172217,0.038008,0.210098,0.053645,0.463395,0.296785,0.012903,...,0.710883,0.093325,0.020720,0.800684,0.336016,0.068746,0.595334,0.211104,0.157941,0.550660
4,0.520000,0.368015,0.249077,0.166275,0.035217,0.178813,0.015068,0.316886,0.210087,0.013100,...,0.345389,0.025096,0.003049,0.376646,0.101470,0.017007,0.267681,0.038827,0.035250,0.294666
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1174,0.946667,0.481881,0.393085,0.201341,0.178634,0.194125,0.163484,0.425189,0.260895,0.010173,...,0.476980,0.180029,0.165898,0.469901,0.261573,0.174016,0.442810,0.380455,0.263986,0.442971
1175,0.573333,0.291112,0.237233,0.136013,0.027563,0.120341,0.021140,0.281499,0.178980,0.010184,...,0.267178,0.014127,0.004798,0.303132,0.080912,0.009343,0.197572,0.037733,0.016482,0.248662
1176,0.653333,0.328218,0.272279,0.149823,0.088086,0.179046,0.063104,0.271301,0.159985,0.024252,...,0.354049,0.126690,0.069371,0.330936,0.171534,0.154813,0.279304,0.158245,0.145019,0.335629
1177,0.520000,0.256050,0.215256,0.125055,0.015617,0.114777,0.014428,0.255704,0.122048,0.028619,...,0.363109,0.030488,0.009126,0.359054,0.144426,0.028518,0.236278,0.070319,0.033866,0.277698


In [29]:
# Concatenate the labels and the normalized training features side by side,
# labels first. Both indexes are reset so the rows line up by position.
train_normalized = pd.concat(
    [
        y_train_oversampled.reset_index(drop=True),
        normalized_train_samples.reset_index(drop=True),
    ],
    axis="columns",
)
train_normalized


Unnamed: 0,Stage,Age,MIMAT0004501,MIMAT0002844,MIMAT0002843,MIMAT0002824,MIMAT0002823,MIMAT0002807,MIMAT0002806,MIMAT0001635,...,MIMAT0022965,MIMAT0022948,MIMAT0027678,MIMAT0027677,MIMAT0027662,MIMAT0027661,MIMAT0027646,MIMAT0027645,MIMAT0027630,MIMAT0027629
0,2,0.480000,0.321060,0.266433,0.165006,0.048775,0.151213,0.041253,0.298427,0.253691,...,0.367587,0.035934,0.013204,0.343176,0.109860,0.049991,0.263645,0.131979,0.057850,0.287974
1,0,0.533333,0.651836,0.630531,0.316243,0.165648,0.392488,0.110860,0.592037,0.416767,...,0.831395,0.173907,0.079024,0.782782,0.353564,0.169274,0.625933,0.337980,0.183165,0.642786
2,4,0.440000,0.292976,0.179838,0.146170,0.021972,0.155444,0.020483,0.252863,0.143605,...,0.317303,0.013381,0.004096,0.342294,0.100369,0.006860,0.227782,0.034045,0.019502,0.252570
3,2,0.453333,0.429064,0.349010,0.172217,0.038008,0.210098,0.053645,0.463395,0.296785,...,0.710883,0.093325,0.020720,0.800684,0.336016,0.068746,0.595334,0.211104,0.157941,0.550660
4,2,0.520000,0.368015,0.249077,0.166275,0.035217,0.178813,0.015068,0.316886,0.210087,...,0.345389,0.025096,0.003049,0.376646,0.101470,0.017007,0.267681,0.038827,0.035250,0.294666
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1174,0,0.946667,0.481881,0.393085,0.201341,0.178634,0.194125,0.163484,0.425189,0.260895,...,0.476980,0.180029,0.165898,0.469901,0.261573,0.174016,0.442810,0.380455,0.263986,0.442971
1175,4,0.573333,0.291112,0.237233,0.136013,0.027563,0.120341,0.021140,0.281499,0.178980,...,0.267178,0.014127,0.004798,0.303132,0.080912,0.009343,0.197572,0.037733,0.016482,0.248662
1176,4,0.653333,0.328218,0.272279,0.149823,0.088086,0.179046,0.063104,0.271301,0.159985,...,0.354049,0.126690,0.069371,0.330936,0.171534,0.154813,0.279304,0.158245,0.145019,0.335629
1177,2,0.520000,0.256050,0.215256,0.125055,0.015617,0.114777,0.014428,0.255704,0.122048,...,0.363109,0.030488,0.009126,0.359054,0.144426,0.028518,0.236278,0.070319,0.033866,0.277698


In [30]:

# Write the labelled, min-max-normalized training table to CSV without the row index.
filepath = "./GEO Samples/min_max_normalized_train_samples_with_lables.csv"
train_normalized.to_csv(path_or_buf=filepath, index=False)

In [35]:
# Put the held-out labels next to the normalized test features, labels first.
# Both indexes are reset so the rows line up by position.
test_data = pd.concat(
    [
        y_test.reset_index(drop=True),
        normalized_test_samples.reset_index(drop=True),
    ],
    axis="columns",
)
test_data


Unnamed: 0,Stage,Age,MIMAT0004501,MIMAT0002844,MIMAT0002843,MIMAT0002824,MIMAT0002823,MIMAT0002807,MIMAT0002806,MIMAT0001635,...,MIMAT0022965,MIMAT0022948,MIMAT0027678,MIMAT0027677,MIMAT0027662,MIMAT0027661,MIMAT0027646,MIMAT0027645,MIMAT0027630,MIMAT0027629
0,2,0.200000,0.347844,0.273212,0.194432,0.057426,0.181447,0.029274,0.327027,0.183723,...,0.450775,0.059212,0.017303,0.492161,0.189211,0.057020,0.391952,0.139134,0.090410,0.366676
1,3,0.560000,0.331589,0.266896,0.161195,0.043484,0.202437,0.024075,0.280805,0.177688,...,0.429135,0.034552,0.009616,0.441089,0.175657,0.043684,0.315930,0.075239,0.048455,0.276955
2,3,0.653333,0.321585,0.236663,0.142793,0.052465,0.183531,0.020157,0.252259,0.167936,...,0.460743,0.049700,0.011684,0.431627,0.173091,0.049325,0.317241,0.103767,0.044932,0.284329
3,3,0.586667,0.406954,0.278946,0.168446,0.057854,0.184028,0.027316,0.332167,0.166168,...,0.461802,0.052400,0.014137,0.459020,0.163437,0.041658,0.328858,0.108697,0.074264,0.322599
4,1,0.586667,0.275841,0.194347,0.121195,0.007864,0.135893,0.018692,0.263504,0.121969,...,0.349972,0.020304,0.004881,0.359371,0.101333,0.006552,0.226799,0.034683,0.023509,0.304060
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0.533333,0.135591,0.078614,0.058614,0.035337,0.036200,0.044984,0.111415,0.068054,...,0.156048,0.041000,0.073966,0.150509,0.060803,0.056069,0.100129,0.053158,0.044713,0.104635
196,0,0.466667,0.754099,0.629188,0.319970,0.282440,0.431892,0.120748,0.675775,0.390577,...,0.765563,0.221837,0.112678,0.773918,0.396339,0.183696,0.596276,0.366675,0.209581,0.600113
197,3,0.440000,0.325287,0.254608,0.131449,0.080803,0.188087,0.037012,0.309608,0.160852,...,0.398270,0.082483,0.026052,0.390586,0.198014,0.070920,0.320527,0.159314,0.113200,0.338812
198,3,0.400000,0.328627,0.265564,0.148271,0.054078,0.177558,0.025593,0.286316,0.184556,...,0.350161,0.034935,0.007198,0.383011,0.138672,0.028514,0.279646,0.073160,0.040508,0.394572


In [36]:
# Write the labelled, min-max-normalized test table to CSV without the row index.
filepath = "./GEO Samples/min_max_normalized_test_data_with_lables.csv"
test_data.to_csv(path_or_buf=filepath, index=False)