# Data Preprocessing for multi-class Classification in the domain of TCGA-STAD (The Cancer Genome Atlas Stomach Adenocarcinoma)

In [1]:
import pandas as pd
import numpy as np
import h5py

In [None]:
import pandas as pd
import numpy as np
import h5py

## 1. Building the final NumPy dataset of X/Y values to feed into the model

### 1-1. Trim & Sort

In [2]:
# # Code to restore xy_STAD-related-only3_exam_features_removed.pkl to its initial state (just in case)
# df_init = pd.read_pickle('xy_STAD-related-only3_exam.pkl')
# df_init.to_pickle('./xy_STAD-related-only3_exam_features_removed.pkl')
# df_init.shape

### 1-2. Extracting dataset X & Y

In [3]:
# Alternative input file: 'xy_STAD-related-only3_exam.pkl'
dfs2 = pd.read_pickle('xy_STAD-related-only3_exam_features_removed.pkl')

# Features: every column except the grade label and 'submitter_id.samples'.
df_X = dfs2.drop(columns=['neoplasm_histologic_grade', 'submitter_id.samples'])

# Labels are kept as a plain 1-D vector (a Series) rather than a 2-D
# single-column frame, because the NLL loss computation needs 1-D labels.
df_Y = dfs2['neoplasm_histologic_grade']

# Report how many samples carry each grade label.
print('G1:  ', int((df_Y == 'G1').sum()))
print('G2: ', int((df_Y == 'G2').sum()))
print('G3: ', int((df_Y == 'G3').sum()))
print('GX:   ', int((df_Y == 'GX').sum()))


G1:   10
G2:  150
G3:  238
GX:    9


### 1-3. Converting data for deep learning model 

In [4]:
# X data: convert the pandas DataFrame into a NumPy array of 64-bit floats.
X = df_X.to_numpy(dtype=np.float64)
# Show the array shape together with the element dtype of the first row.
(X.shape, X[0].dtype)

((407, 1501), dtype('float64'))

In [5]:
# Y data: encode the grade strings as integers (G1 -> 1, G2 -> 2, G3 -> 3, GX -> 0).
# Series.map is used instead of Series.replace so the result does not depend on
# replace()'s implicit object -> integer downcasting, and so that any label
# outside the mapping shows up as NaN and can be reported explicitly instead
# of failing later with an opaque conversion error.
grade_to_code = {'G1': 1, 'G2': 2, 'G3': 3, 'GX': 0}
encoded = df_Y.map(grade_to_code)
if encoded.isna().any():
    unexpected = df_Y[encoded.isna()].unique().tolist()
    raise ValueError(f'Unexpected neoplasm_histologic_grade label(s): {unexpected}')
Y = encoded.to_numpy(dtype='int8')
Y.shape, Y[0].dtype

((407,), dtype('int8'))

### 1-4. Min-Max Normalization for richer transformer embedding

In [6]:
def MinMax(X):
    """Rescale X to the range [0, 1] using its overall minimum and maximum.

    The minimum and maximum are taken with ``np.min`` / ``np.max`` without an
    axis argument, so for a NumPy array a single global pair is used for every
    element (not one pair per column).

    Args:
        X: Numeric array to normalize.

    Returns:
        ``(X - min) / (max - min)``. When every element of X is identical
        (``max == min``), an all-zero array is returned instead of the NaN
        values that 0 / 0 would produce.
    """
    lowest = np.min(X)  # computed once and reused for both shift and span
    span = np.max(X) - lowest
    shifted = X - lowest
    # Only a scalar span can be compared against zero; a non-scalar reduction
    # result stays on the plain division path.
    if np.ndim(span) == 0 and span == 0:
        # Constant input: shifted is already all zeros; dividing by 1.0 only
        # promotes it to the same floating dtype the normal path returns.
        return shifted / 1.0
    return shifted / span

In [7]:
# Stretch the normalized values from MinMax by a factor of 319.
# NOTE(review): 319 is presumably tied to the value range the downstream
# embedding expects; confirm and consider naming the constant.
X_norm = 319 * MinMax(X)

### 1-5. Save dataset

In [8]:
# 2. Full X, Y dataset
# The context manager closes the HDF5 file even if writing a dataset raises,
# so a failed write cannot leave the file handle open.
with h5py.File('STAD_Dataset3_ALL_exam.h5', 'w') as hf:
    hf.create_dataset('X', data=X_norm)
    hf.create_dataset('Y', data=Y)

### 1-6. Dataset Validation Check

In [9]:
# Read the saved file back and print the shape and dtype of both datasets.
# The context manager closes the file even if a read fails.
with h5py.File('STAD_Dataset3_ALL_exam.h5', 'r') as hf:
    # hf.keys() lists the stored dataset names.
    # Indexing (unlike hf.get, which returns None) raises KeyError when a
    # dataset is missing; [...] copies the data into a NumPy array so it stays
    # usable after the file is closed.
    X = hf['X'][...]
    Y = hf['Y'][...]

print('X.shape :', X.shape, X[0].dtype)
print('Y.shape :', Y.shape, Y[0].dtype)

X.shape : (407, 1501) float64
Y.shape : (407,) int8
