# Töövoog, et treenida andmete põhjal mudel

In [13]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import numpy as np

# Function to load data - Võtab sisse dataseti, teeb sellest tabeli kus read on patsiendid, veerud protseduurid vastavalt
# 1- protseduur tehti, 0 - patsient pole protseduuri saanud. samuti ka veerg "death": 1-surnud, 0 -"elus"
def load_data(file):
    
    # Data
    data = pd.read_csv(file)
    pd.set_option('display.max_rows', 20)
    surnud= data[data["DEFINITION_ID"] == "death"]
    subject_ids = surnud["SUBJECT_ID"]
    
    # FIltreerib surnute id põhjal
    surnud = data[data["SUBJECT_ID"].isin(subject_ids)]
    surnud["TIME"] = 1 #Muudame aja väärtuse 1-ks, ehk esialgsetes andmetes "DEFINITION_ID" = TIME, nüüd selle asemel 
    # lihtsalt 1.
    elus = data[data["DEFINITION_ID"] != "death"]
    elus["TIME"] = 1

    # Sama toimub ka siin, aga elus patsientidega
    elus_filtered = elus[~elus["SUBJECT_ID"].isin(surnud["SUBJECT_ID"])]

    # Filtreeritud andmete kombineerimine üheks tabeliks
    combined_data = pd.concat([surnud, elus_filtered])
    combined_data.sort_values(by='SUBJECT_ID', inplace=True)
    combined_data.reset_index(drop=True, inplace=True)
    
    #Viimane tabel-  kus read on patsiendid, veerud protseduurid vastavalt
    # 1- protseduur tehti, 0 - patsient pole protseduuri saanud. samuti ka veerg "death": 1-surnud, 0 -"elus"
    
    pivot_combined_data = combined_data.pivot_table(index='SUBJECT_ID', columns='DEFINITION_ID', values='TIME', aggfunc='sum', fill_value=0)
    return pivot_combined_data

# Function to identify important features
def identify_important_features(X, n_components):
    # Siin PCA kasutamine on loodud väga 
    # PCA kasutamine, n_components ehk kui nt n_components = 2 siis pca transformib andmed 2-dimensioonilieks (jättes väärtusliku info alles)
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    
    # DataFrame with the principal components
    pc_columns = [f'PC_{i+1}' for i in range(n_components)]
    explained_variance_ratio = pca.explained_variance_ratio_ 
    
    # PC 1 loadingud, sest PC 1 on kõige tähtsam
    loadings_pc1 = pca.components_[0]

    # saaks kätte need kõige olulisemad featrue nimed millega PC1 loodi
    loading_df = pd.DataFrame({'Feature': X.columns, 'Loading_PC1': loadings_pc1})

    # Sorteeritud
    loading_df['Absolute_Loading_PC1'] = loading_df['Loading_PC1'].abs()
    loading_df = loading_df.sort_values(by='Absolute_Loading_PC1', ascending=False)
    
    return pd.DataFrame(X_pca, columns=pc_columns),explained_variance_ratio, pca, loading_df

# Function to train a model
def train_model(X, y):
    # MUdeli treenimine (Siin saab mudelit muuta)
    rf_model = RandomForestClassifier(random_state = 42)
    
    rf_model.fit(X,y)
    
    return rf_model

# Function to evaluate the model
def evaluate_model(model, X, y):
    # Mudeli hindamine 
    y_probabilities = model.predict_proba(X)[:, 1]    
    auc_roc = roc_auc_score(y,y_probabilities)
    
    return auc_roc

# Põhifunktsioon töövoo jaoks
def main(file):
    # Step 1: Load data
    data = load_data(file)

    # Step 2: important features
    X = data.drop(columns=['death'])
    y = data['death']
    important_features, explained_variance_ratio, pca, loading_df = identify_important_features(X, n_components=10)
    #n_components on siin muudetav, hetkel näiteks n_components=10

    # Step 3: Mudeli treenimine (important_features ehk saadud PCA eelnevas sammus)
    X_train, X_test, y_train, y_test = train_test_split(important_features, y, test_size=0.2, random_state=42)
    model = train_model(X_train, y_train)

    # Step 4: Evauation
    auc_roc = evaluate_model(model, X_test, y_test)
    print(f'AUC-ROC: {auc_roc}')

    # Tagstada, mida vajalikuks peab (mudel ise, tähtsad tunnused jne)
    return model, loading_df, data, important_features, X_train, X_test, y_train, y_test

# Kasutamine - file asendada kasutatava csv failiga, mis 1-1le samas formaadis csv failidega, mis meile saadeti
file = "synthetic_data_lung_cancer.csv" #synthetic_data_lung_cancer.csv, synthetic_data_pca.csv,jne...
trained_model, loading_df, pivot_table, important_features, X_train, X_test, y_train, y_test = main(file)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surnud["TIME"] = 1 #Muudame aja väärtuse 1-ks, ehk esialgsetes andmetes "DEFINITION_ID" = TIME, nüüd selle asemel
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elus["TIME"] = 1


AUC-ROC: 0.8221946263264845


In [15]:
pivot_table["death"].value_counts() # Siin tagastatud tabel load_data funktsioonist
                                    # ehk veeru death väärtused: 1 - surnud, 0 - elus 

0    464
1    263
Name: death, dtype: int64

# Siin saab jooksutada treenitud mudeli peal uusi andmeid

**Asendada tuleb file2 vastava andmestikuga mida jooksutada (!!! treenitud mudel on saadud kogu eelnevat koodi jooksutades, ehk kui nt vahetada üleval koodis "Kasutamine" all ära file = "synthetic_data_pca.csv" siis uus mudel oleks nende andmete peal treenitud !!!)**

In [23]:
file2="synthetic_data_pca.csv" # SIIA VASTAV ANDMESTIK
data2 = load_data(file2) # Jooksutame csv faili load_data funktsioonist läbi
X = data2.drop("death", axis = 1)
y = data2["death"]


# Peab ka teisel andmestikul kasutama "identify_important_features" funktsiooni, et treenitud mudel töötaks uute andmete peal

important_features, explained_variance_ratio, pca, loading_df_validation = identify_important_features(X, n_components=10)

# Samuti peab n_components olema sama mis treenitud mudelil, ehk praegu näiteks n_components=10


trained_model.predict_proba(important_features)[:, 1] #"synthetic_data_lung_cancer.csv" peal treenitud mudeli kasutamine
                                                        # synthetic_data_pca.cs andmete peal
    
evaluate = evaluate_model(trained_model, important_features, y) # jooksutada treenitud mudel evaluate_model funktsiooni peal
                                                                # Et saada kätte ROC-AUC score
print(f'ROC-AUC on validation data:',evaluate)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surnud["TIME"] = 1 #Muudame aja väärtuse 1-ks, ehk esialgsetes andmetes "DEFINITION_ID" = TIME, nüüd selle asemel
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elus["TIME"] = 1


ROC-AUC on validation data: 0.7822657436240574


In [17]:
data2["death"].value_counts() # # Siin tagastatud tabel load_data funktsioonist uute andmete peal
                                # ehk veeru death väärtused: 1 - surnud, 0 - elus 

0    637
1     61
Name: death, dtype: int64

# loading_df ehk PCA poolt "important features", mis töövoog tagastas

In [19]:
loading_df_validation # Valideerimise andmestiku tagastatud "important features"

Unnamed: 0,Feature,Loading_PC1,Absolute_Loading_PC1
3286,measurement_561,0.031397,0.031397
3762,measurement_990,0.030929,0.030929
3047,measurement_346,0.030577,0.030577
2638,measurement_1161,0.030404,0.030404
3422,measurement_684,0.030133,0.030133
...,...,...,...
183,condition_1163,0.001292,0.001292
1211,condition_2089,0.001037,0.001037
992,condition_1892,0.000871,0.000871
1818,condition_755,0.000766,0.000766


In [20]:
loading_df # Treeningu andmete tagastatud "important features"

Unnamed: 0,Feature,Loading_PC1,Absolute_Loading_PC1
3747,measurement_637,0.029168,0.029168
4266,observation_204,0.028583,0.028583
4288,observation_224,0.028194,0.028194
3509,measurement_422,0.028162,0.028162
3065,measurement_1221,0.028010,0.028010
...,...,...,...
1292,condition_2161,0.001545,0.001545
1550,condition_2394,0.001500,0.001500
835,condition_1750,0.001423,0.001423
1878,condition_53,0.001403,0.001403


In [21]:
# Valideerimise andmestiku tagastatud drug features
drug_features_validation = loading_df_validation[loading_df_validation["Feature"].str.contains("drug")] 
# Saaks kätte milliseid "drug" featureid mudel kasutas
drug_features_validation

Unnamed: 0,Feature,Loading_PC1,Absolute_Loading_PC1
2281,drug_272,0.016959,0.016959
2124,drug_130,0.016428,0.016428
2266,drug_259,0.016104,0.016104
2231,drug_227,0.016028,0.016028
2275,drug_267,0.015948,0.015948
...,...,...,...
2238,drug_233,0.009586,0.009586
2176,drug_178,0.008979,0.008979
2336,drug_321,0.008690,0.008690
2376,drug_358,0.008499,0.008499


In [22]:
#Treeningu andmete tagastatud drug features
drug_features = loading_df[loading_df["Feature"].str.contains("drug")]
# Saaks kätte milliseid "drug" featureid mudel kasutas
drug_features

Unnamed: 0,Feature,Loading_PC1,Absolute_Loading_PC1
2646,drug_321,0.011796,0.011796
2410,drug_109,0.011748,0.011748
2644,drug_32,0.011709,0.011709
2739,drug_405,0.011495,0.011495
2622,drug_30,0.011447,0.011447
...,...,...,...
2619,drug_298,0.006638,0.006638
2431,drug_128,0.006593,0.006593
2615,drug_294,0.006560,0.006560
2555,drug_24,0.006189,0.006189
