In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import random
import pickle

In [2]:
from sklearn.manifold import TSNE, MDS
from sklearn.decomposition import PCA, NMF, SparsePCA, TruncatedSVD, FastICA, FactorAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from tqdm.notebook import tqdm

In [3]:
from utils import (
    train_template,
    get_metrics, 
    random_forest_train, 
    xgboost_train, 
    catboost_train
)



In [4]:
def data_loading(ga_file, fg_file, header=None, groups=None):
    """Load the GA and FG sample files and build a joint feature/label set.

    Both files are read as ';'-separated CSV. In each file the first column
    holds the class label and every remaining column is a feature.

    Labels are re-encoded to consecutive integers following the sorted order
    of the original labels: FG classes become 0..n_fg-1 and GA classes become
    n_fg..n_fg+n_ga-1. The FG rows come first in the returned arrays.

    Parameters
    ----------
    ga_file : str or path-like
        CSV file with the GA samples.
    fg_file : str or path-like
        CSV file with the FG samples.
    header : int, 'infer' or None, default None
        Forwarded unchanged to ``pandas.read_csv``.
    groups : sequence of sequences of int, optional
        ``groups[k]`` lists the re-encoded class ids that belong to group
        ``k``. Defaults to ``[[0..4], [5..9], [10, 11], [12, 13]]``, which
        assumes 10 FG classes followed by 4 GA classes. A class id that is
        not listed in any group keeps group 0.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, n_features)
        FG features stacked on top of GA features.
    y : numpy.ndarray of float, shape (n_samples, 2)
        Column 0 is the group index, column 1 the re-encoded class id.
    """
    data_ga = pd.read_csv(ga_file, sep=";", header=header)
    data_fg = pd.read_csv(fg_file, sep=";", header=header)

    # Split label / features by position so each frame uses its own first
    # column, even if the two files name that column differently.
    y_fg_raw = data_fg.iloc[:, 0].to_numpy()
    X_fg = data_fg.iloc[:, 1:].to_numpy()
    y_ga_raw = data_ga.iloc[:, 0].to_numpy()
    X_ga = data_ga.iloc[:, 1:].to_numpy()

    # np.unique builds the old -> new mapping in a single pass from the
    # untouched labels. Relabelling in place while looping over a set can
    # merge classes whenever a freshly assigned id equals an old label that
    # has not been processed yet.
    print(f"old fg classes: {set(y_fg_raw)}")
    fg_labels, y_fg = np.unique(y_fg_raw, return_inverse=True)
    print(f"new fg classes: {set(y_fg)}\n")

    print(f"old ga classes: {set(y_ga_raw)}")
    _, y_ga = np.unique(y_ga_raw, return_inverse=True)
    y_ga = y_ga + len(fg_labels)  # GA ids continue right after the FG ids
    print(f"new ga classes: {set(y_ga)}")

    if groups is None:
        groups = [
            [0, 1, 2, 3, 4],
            [5, 6, 7, 8, 9],
            [10, 11],
            [12, 13],
        ]

    def add_group(y, groups):
        """Return an (n, 2) float array holding [group index, class id] per row."""
        new_y = np.zeros((y.shape[0], 2))
        new_y[:, 1] = y
        for group_num, group in enumerate(groups):
            new_y[np.isin(y, group), 0] = group_num
        return new_y

    y = np.concatenate((y_fg, y_ga), axis=0)
    X = np.concatenate((X_fg, X_ga), axis=0)

    y = add_group(y, groups)

    return X, y

# Read both CSV exports once: X receives the stacked feature rows and
# y the matching two-column [group, class] labels.
X, y = data_loading(
    ga_file="dataSrc/ga_2_3_5_6.csv",
    fg_file="dataSrc/fg_1-5_7-11.csv",
    header='infer',
)
# print(X.shape, y.shape)

old fg classes: {1, 2, 3, 4, 5, 7, 8, 9, 10, 11}
new fg classes: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

old ga classes: {2, 3, 5, 6}
new ga classes: {10, 11, 12, 13}


In [10]:
def split_dataset(
    X=None, 
    y=None, 
    first_file="dataSrc/ga_2_3_5_6.csv", 
    second_file="dataSrc/fg_1-5_7-11.csv", 
    header=None,
    classes='all'
):
    """Split the samples into two datasets made of disjoint class ids.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features), optional
        Feature matrix. When both ``X`` and ``y`` are omitted, the data is
        loaded with ``data_loading(first_file, second_file, header)``.
    y : numpy.ndarray of shape (n_samples, 2), optional
        Labels; column 1 must hold the class id used for the split.
    first_file, second_file : str
        Passed positionally to ``data_loading`` as its ``ga_file`` and
        ``fg_file`` arguments. Only used when ``X`` and ``y`` are omitted.
    header : int, 'infer' or None, default None
        Forwarded to ``data_loading``.
    classes : {'all', 'fibrinogen', 'albumin'}, default 'all'
        Which class ids go into each dataset:

        * ``'fibrinogen'``: first = 0-4, second = 5-9
        * ``'albumin'``: first = 10, 12, second = 11, 13
        * ``'all'``: the union of the two splits above

    Returns
    -------
    first_x, first_y, second_x, second_y : numpy.ndarray
        Rows are ordered by class id in the order listed above, and by
        original position within a class. NaN entries of the feature
        arrays are replaced by 0; the inputs are not modified.

    Raises
    ------
    ValueError
        If ``classes`` is not one of the supported names, or if only one of
        ``X`` / ``y`` is given.
    """
    class_splits = {
        'fibrinogen': ([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]),
        'albumin': ([10, 12], [11, 13]),
        'all': ([0, 1, 2, 3, 4, 10, 12], [5, 6, 7, 8, 9, 11, 13]),
    }
    # Validate before any file is read; an unknown name would otherwise
    # surface much later as an unbound local variable.
    if not isinstance(classes, str) or classes not in class_splits:
        raise ValueError(
            f"classes must be one of {sorted(class_splits)}, got {classes!r}"
        )
    if (X is None) != (y is None):
        raise ValueError("X and y must be passed together or both omitted")

    if X is None:
        X, y = data_loading(first_file, second_file, header)

    first_dataset_classes, second_dataset_classes = class_splits[classes]

    def _select(wanted_classes):
        """Gather the rows of X and y whose class id is listed, in list order."""
        # The empty float seeds keep the result 2-D with a float dtype even
        # when no row matches.
        x_parts = [np.empty((0, X.shape[1]))]
        y_parts = [np.empty((0, y.shape[1]))]
        for class_id in wanted_classes:
            mask = y[:, 1] == class_id
            x_parts.append(X[mask])
            y_parts.append(y[mask])
        return np.concatenate(x_parts, axis=0), np.concatenate(y_parts, axis=0)

    first_x, first_y = _select(first_dataset_classes)
    second_x, second_y = _select(second_dataset_classes)

    # Both arrays are fresh copies produced by np.concatenate, so filling
    # the NaNs in place does not touch the caller's X.
    for array in (first_x, second_x):
        array[np.isnan(array)] = 0

    print(first_x.shape, first_y.shape, second_x.shape, second_y.shape)
    return first_x, first_y, second_x, second_y

In [7]:
# Build the two 'fibrinogen' subsets straight from the CSV files; X and y are
# left at their None defaults so split_dataset loads the data itself.
X_first, y_first, X_second, y_second = split_dataset(
    classes='fibrinogen',
    header='infer',
    first_file="dataSrc/ga_2_3_5_6.csv",
    second_file="dataSrc/fg_1-5_7-11.csv",
)

# Cell output: the shape of each returned array.
tuple(part.shape for part in (X_first, y_first, X_second, y_second))

old fg classes: {1, 2, 3, 4, 5, 7, 8, 9, 10, 11}
new fg classes: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

old ga classes: {2, 3, 5, 6}
new ga classes: {10, 11, 12, 13}
(340, 994) (340, 2) (229, 994) (229, 2)


((340, 994), (340, 2), (229, 994), (229, 2))

In [8]:
# Re-join the two subsets row-wise (first subset on top) into a single
# feature matrix and a single label matrix.
X_full, y_full = (
    np.vstack(pair)
    for pair in ((X_first, X_second), (y_first, y_second))
)

X_full.shape, y_full.shape

((569, 994), (569, 2))