## Impordid

In [21]:
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA

## Andmete töötlemine

In [22]:
def load_data(file):
    
    # Data
    data = pd.read_csv(file)
    pd.set_option('display.max_rows', 20)
    surnud= data[data["DEFINITION_ID"] == "death"]
    subject_ids = surnud["SUBJECT_ID"]
    
    # FIltreerib surnute id põhjal
    surnud = data[data["SUBJECT_ID"].isin(subject_ids)]
    surnud["TIME"] = 1 # Muudame aja väärtuse 1-ks, ehk esialgsetes andmetes "DEFINITION_ID" = TIME, nüüd selle asemel 
    # lihtsalt 1.
    elus = data[data["DEFINITION_ID"] != "death"]
    elus["TIME"] = 1

    # Sama toimub ka siin, aga elus patsientidega
    elus_filtered = elus[~elus["SUBJECT_ID"].isin(surnud["SUBJECT_ID"])]

    # Filtreeritud andmete kombineerimine üheks tabeliks
    combined_data = pd.concat([surnud, elus_filtered])
    combined_data.sort_values(by='SUBJECT_ID', inplace=True)
    combined_data.reset_index(drop=True, inplace=True)
    
    #Viimane tabel-  kus read on patsiendid, veerud protseduurid vastavalt
    # 1- protseduur tehti, 0 - patsient pole protseduuri saanud. samuti ka veerg "death": 1-surnud, 0 -"elus"
    
    pivot_combined_data = combined_data.pivot_table(index='SUBJECT_ID', columns='DEFINITION_ID', values='TIME', aggfunc='sum', fill_value=0)
    return pivot_combined_data

## Tunnuste töötlemine (PCA)

In [23]:
def identify_important_features(X, save_loadings=False):
    n_components=20
    
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    
    # DataFrame with the principal components
    pc_columns = [f'PC_{i+1}' for i in range(n_components)]
    
    # Variance ratios
    explained_variance_ratio = pca.explained_variance_ratio_ 
    
    all_pca_loadings = [loadings for loadings in pca.components_]
    
    if save_loadings:
        with pd.ExcelWriter('output.xlsx') as writer:
            for index, loadings in enumerate(all_pca_loadings):
                loadings_df = pd.DataFrame({'Feature': X.columns, f'Loading_PC{index+1}': loadings})
                loadings_df[f'Absolute_Loading_PC{index+1}'] = loadings_df[f'Loading_PC{index+1}'].abs()
                loadings_df = loadings_df.sort_values(by=f'Absolute_Loading_PC{index+1}', ascending=False)
                loadings_df.to_excel(writer, sheet_name=f'PC{index+1} - {explained_variance_ratio[index]}')
    
    loadings_pc = pca.components_[0]
    
    # saaks kätte kõige olulisemad feature nimed
    loading_df = pd.DataFrame({'Feature': X.columns, 'Loading_PC1': loadings_pc})
    
    # Sorteeritud
    loading_df['Absolute_Loading_PC1'] = loading_df['Loading_PC1'].abs()
    loading_df = loading_df.sort_values(by='Absolute_Loading_PC1', ascending=False)
    
    return pd.DataFrame(X_pca, columns=pc_columns), explained_variance_ratio, pca, loading_df

## Mudeli hindamine

In [24]:
def evaluate_model(model, X, y):
    # Mudeli hindamine 
    y_probabilities = model.predict_proba(X)[:, 1]    
    auc_roc = roc_auc_score(y, y_probabilities)
    
    return auc_roc

## RandomForestClassifieri treenimine

In [25]:
def train_rf_model(X, y, grid_search=False):
    if not grid_search:
        rf_model = RandomForestClassifier()
        rf_model.fit(X, y)
        return rf_model
    else:
        param_grid = {
            'n_estimators': [100, 200, 300, 400, 500],
            'max_depth': [None, 10, 20, 30, 40, 50],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'bootstrap': [True, False],
            'max_features': ['auto', 'sqrt', 'log2']
        }

        rf_model = RandomForestClassifier()

        grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1,  scoring='roc_auc')
        grid_search.fit(X, y)

        print(grid_search.best_params_)
        best_model = grid_search.best_estimator_
        return best_model

## GradientBoostingClassifieri treenimine

In [26]:
def train_gradient_model(X, y, grid_search=False):
    if not grid_search:
        gbc_model = GradientBoostingClassifier()
        gbc_model.fit(X, y)
        return gbc_model
    else:
        param_grid = {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'min_samples_split': [2, 10, 20],
            'min_samples_leaf': [1, 5, 15],
            'subsample': [0.6, 0.8, 1.0],
            'max_features': ['auto', None]
        }
        
        gbc_model = GradientBoostingClassifier()

        grid_search = GridSearchCV(estimator=gbc_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1, scoring='roc_auc')

        grid_search.fit(X, y)

        print(grid_search.best_params_)
        best_model = grid_search.best_estimator_
        return best_model

## Töövoo jooksutamine

In [27]:
def main(file):
    # Step 1: Load data
    data = load_data(file)

    # Step 2: important features
    X = data.drop(columns=['death'])
    y = data['death']
    important_features, explained_variance_ratio, pca, loading_df = identify_important_features(X)

    X_train, X_test, y_train, y_test = train_test_split(important_features, y, test_size=0.2, random_state=25)
#     rf_model = train_rf_model(X_train, y_train, grid_search=True)
    rf_model = train_rf_model(X_train, y_train)
#     gradient_model = train_gradient_model(X_train, y_train, grid_search=True)
    gradient_model = train_gradient_model(X_train, y_train)

    # Step 4: Evauation
    auc_roc = evaluate_model(rf_model, X_test, y_test)
    print(f'AUC-ROC rf: {auc_roc}')
    
    auc_roc = evaluate_model(gradient_model, X_test, y_test)
    print(f'AUC-ROC gradient: {auc_roc}')

    # Tagstada, mida vajalikuks peab (mudel ise, tähtsad tunnused jne)
    return rf_model, gradient_model, loading_df, data, important_features, X_train, X_test, y_train, y_test

# Kasutamine - file asendada kasutatava csv failiga, mis 1-1le samas formaadis csv failidega, mis meile saadeti
file = "synthetic_data_lung_cancer.csv" #synthetic_data_lung_cancer.csv, synthetic_data_pca.csv,jne...
rf_model, gradient_model, loading_df, pivot_table, important_features, X_train, X_test, y_train, y_test = main(file)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surnud["TIME"] = 1 # Muudame aja väärtuse 1-ks, ehk esialgsetes andmetes "DEFINITION_ID" = TIME, nüüd selle asemel
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elus["TIME"] = 1


AUC-ROC rf: 0.8534621578099838
AUC-ROC gradient: 0.8725845410628019


## Mudelite AUC-ROC skoor valideerimisandmestiku põhjal

In [28]:
file2="synthetic_data_pca.csv"
data2 = load_data(file2)
X = data2.drop("death", axis = 1)
y = data2["death"]

important_features, explained_variance_ratio, pca, loading_df_validation = identify_important_features(X, save_loadings=True)
    
evaluate_rf = evaluate_model(rf_model, important_features, y) 
print(f"ROC-AUC rf on validation data: {evaluate_rf}")

evaluate_gradient = evaluate_model(gradient_model, important_features, y) 
print(f"ROC-AUC gradient on validation data: {evaluate_gradient}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surnud["TIME"] = 1 # Muudame aja väärtuse 1-ks, ehk esialgsetes andmetes "DEFINITION_ID" = TIME, nüüd selle asemel
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elus["TIME"] = 1


ROC-AUC rf on validation data: 0.8436961165298402
ROC-AUC gradient on validation data: 0.848264148029956


## Kõige suurema varieeruvusega PCA poolt kasutatud tunnuste kaalud

In [30]:
loading_df_validation

Unnamed: 0,Feature,Loading_PC1,Absolute_Loading_PC1
3286,measurement_561,0.031397,0.031397
3762,measurement_990,0.030929,0.030929
3047,measurement_346,0.030577,0.030577
2638,measurement_1161,0.030404,0.030404
3422,measurement_684,0.030133,0.030133
...,...,...,...
183,condition_1163,0.001292,0.001292
1211,condition_2089,0.001037,0.001037
992,condition_1892,0.000871,0.000871
1818,condition_755,0.000766,0.000766


In [31]:
loading_df

Unnamed: 0,Feature,Loading_PC1,Absolute_Loading_PC1
3747,measurement_637,0.029168,0.029168
4266,observation_204,0.028583,0.028583
4288,observation_224,0.028194,0.028194
3509,measurement_422,0.028162,0.028162
3065,measurement_1221,0.028010,0.028010
...,...,...,...
1292,condition_2161,0.001545,0.001545
1550,condition_2394,0.001500,0.001500
835,condition_1750,0.001423,0.001423
1878,condition_53,0.001403,0.001403


## PCAde varieeruvused

In [32]:
explained_variance_ratio

array([0.14549493, 0.05532097, 0.05112105, 0.03989248, 0.03901181,
       0.02646274, 0.02590439, 0.02476409, 0.02135614, 0.01991345,
       0.01671356, 0.01602964, 0.01506696, 0.01324156, 0.01201901,
       0.01155921, 0.01073202, 0.00957551, 0.00796088, 0.00754529])

## Mudelite poolt kasutatud PCAde kaalud

In [33]:
rf_model.feature_importances_

array([0.13456211, 0.05047798, 0.10722995, 0.05685285, 0.03806474,
       0.03476451, 0.031982  , 0.03197815, 0.0414045 , 0.0329423 ,
       0.07081092, 0.05790852, 0.04741205, 0.0360999 , 0.0312234 ,
       0.03441241, 0.04044729, 0.04269617, 0.0426258 , 0.03610446])

In [37]:
gradient_model.feature_importances_

array([0.3329323 , 0.03261782, 0.16027045, 0.01473262, 0.03730145,
       0.01169127, 0.01223958, 0.00618075, 0.02572665, 0.01790952,
       0.07127471, 0.0699224 , 0.05095522, 0.01500434, 0.01272587,
       0.02241741, 0.02582543, 0.04920273, 0.02668819, 0.00438129])