In [15]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# Mudelid
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Mutual info
from sklearn.feature_selection import mutual_info_classif

# SMOTE
from imblearn.over_sampling import SMOTE

## Andmete töötlemine

In [16]:
def load_data(file, drop_values=None):
    
    # Data
    data = pd.read_csv(file)
    pd.set_option('display.max_rows', 20)
    surnud= data[data["DEFINITION_ID"] == "death"]
    subject_ids = surnud["SUBJECT_ID"]
    
    # Filtreerib surnute id põhjal
    surnud = data[data["SUBJECT_ID"].isin(subject_ids)]
    
    # Muudame aja väärtuse 1-ks, ehk esialgsetes andmetes "DEFINITION_ID" = TIME, nüüd selle asemel lihtsalt 1.
    surnud["TIME"] = 1
    elus = data[data["DEFINITION_ID"] != "death"]
    elus["TIME"] = 1

    # Sama toimub ka siin, aga elus patsientidega
    elus_filtered = elus[~elus["SUBJECT_ID"].isin(surnud["SUBJECT_ID"])]

    # Filtreeritud andmete kombineerimine üheks tabeliks
    combined_data = pd.concat([surnud, elus_filtered])
    combined_data.sort_values(by='SUBJECT_ID', inplace=True)
    combined_data.reset_index(drop=True, inplace=True)
    
    # Tagastatav tabel - read on patsiendid, veerud on protseduurid
    # 1 = protseduur tehti, 0 = patsient pole protseduuri saanud
    # Samuti ka veerg "death": 1 = surnud, 0 = "elus"
    
    pivot_combined_data = combined_data.pivot_table(index='SUBJECT_ID', columns='DEFINITION_ID', values='TIME', aggfunc='sum', fill_value=0)
    if drop_values:
        for value in drop_values:
            if value:
                cols_to_drop = pivot_combined_data.filter(like=value).columns
                pivot_combined_data = pivot_combined_data.drop(columns=cols_to_drop)
    return pivot_combined_data

## Tunnuste töötlemine (PCA)

In [17]:
def identify_important_features(X, save_loadings=False):
    n_components=20
    
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    
    # DataFrame with the principal components
    pc_columns = [f'PC_{i+1}' for i in range(n_components)]
    
    # Variance ratios
    explained_variance_ratio = pca.explained_variance_ratio_ 
    
    all_pca_loadings = [loadings for loadings in pca.components_]
    
    if save_loadings:
        loadings_dfs = []
        for index, loadings in enumerate(all_pca_loadings):
            loadings_df = pd.DataFrame({'Feature': X.columns, f'Loading_PC{index+1}': loadings})
            loadings_df[f'Absolute_Loading_PC{index+1}'] = loadings_df[f'Loading_PC{index+1}'].abs()
            loadings_df = loadings_df.sort_values(by=f'Absolute_Loading_PC{index+1}', ascending=False)
            loadings_dfs.append(loadings_df)
        
        combined_loadings_df = pd.concat(loadings_dfs, axis=1)
        combined_loadings_df.to_csv('combined_loadings.csv', index=False, encoding="utf-8")
    
    loadings_pc = pca.components_[0]
    
    # PC1 loadings
    loading_df = pd.DataFrame({'Feature': X.columns, 'Loading_PC1': loadings_pc})
    # Sorteeritud
    loading_df['Absolute_Loading_PC1'] = loading_df['Loading_PC1'].abs()
    loading_df = loading_df.sort_values(by='Absolute_Loading_PC1', ascending=False)
    
    return pd.DataFrame(X_pca, columns=pc_columns), explained_variance_ratio, pca, loading_df

## Erinevate mudelite treenimine, valimaks parima mudeli

In [18]:
def train_model(X, y, model_type):
    svm_params = {'kernel': 'poly', 'gamma': 'scale', 'degree': 4, 'C': 100}
    knn_params = {'weights': 'distance', 'p': 2, 'n_neighbors': 16, 'algorithm': 'auto'}
    rf_params = {'bootstrap': False, 'max_depth': 50, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
    gradient_params = {'learning_rate': 0.2, 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200, 'subsample': 0.8}
    if model_type == 'RandomForest':
        model = RandomForestClassifier(random_state=42,**rf_params)
    elif model_type == 'SVM':
        model = SVC(probability=True, random_state=42,**svm_params)
    elif model_type == 'KNeighbors':
        model = KNeighborsClassifier(**knn_params)
    elif model_type == "Gradient":
        model = GradientBoostingClassifier(random_state=42,**gradient_params)
    
    else:
        raise ValueError(f'Invalid model type: {model_type}')
    
    model.fit(X, y)
    return model

## Mudeli hindamine

In [19]:
def evaluate_model(model, X, y):
    # Mudeli hindamine 
    y_probabilities = model.predict_proba(X)[:, 1]    
    auc_roc = roc_auc_score(y, y_probabilities)
    
    return auc_roc

## Töövoo jooksutamine
**PS! Vaata #STEP 2 juures kommentaar "save_loadings"**

In [22]:
def main(file):
    
    # PCde loomisel väljajäetavate interventionite kombinatsioonid
    drop_value_combinations = [
        
        # PCde loomine kõikide interventionitega
        [], 
        
        # Individuaalsete interventionite väljajätmine, et näha nende mõju AUC-ROC skoorile
        ["condition_"], 
        ["drug_"], 
        ["observation_"], 
        ["procedure_"], 
        ["measurement"],
        
        # Kombinatsioon, millega saavutasime parima AUC-ROC skoori
        ["drug_", "condition_", "procedure_"]
    ]
    
    results = {}
    for value_combination in drop_value_combinations:
        data = load_data(file, drop_values=value_combination)
        X = data.drop(columns=['death'])
        y = data['death']

        # Salvestame ainult parima kombinatsiooniga PCde loadingud
        save_loadings = True if value_combination == ["drug_", "condition_", "procedure_"] else False
        
        # "save_loadings=True" - tekib .csv fail, kust saab hiljem tunnuste osakaalu PC-de moodustamisel vaadata
        important_features, explained_variance_ratio, pca, loading_df = identify_important_features(X, save_loadings=save_loadings)
        model_types = ['RandomForest','SVM', 'KNeighbors','Gradient']

        # Mudelite treenimine (x = important_features ehk mudelid treenitakse genereeritud PCde põhjal)
        X_train, X_test, y_train, y_test = train_test_split(important_features, y, test_size=0.2, random_state=42)

        for model_type in model_types:
            model = train_model(X_train, y_train, model_type=model_type)
            auc_roc = evaluate_model(model, X_test, y_test)
            results[f"{model_type} without {value_combination}"] = auc_roc

    return results

# KASUTAMINE! - File_path = jooksutav csv fail!
file_path = "synthetic_data_lung_cancer.csv"

# Run the main script
results = main(file_path)

counter = 0

print("\nAUC-ROC Scores:")
for model_type, auc_roc in results.items():
    print(f'{model_type}: {auc_roc}')
    counter += 1
    if counter == 4:
        print()
        counter = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surnud["TIME"] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elus["TIME"] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surnud["TIME"] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the doc


AUC-ROC Scores:
RandomForest without []: 0.8177918265974261
SVM without []: 0.8776247459923233
KNeighbors without []: 0.8521110860239333
Gradient without []: 0.874689546172951

RandomForest without ['condition_']: 0.8453375479792278
SVM without ['condition_']: 0.9121697900203206
KNeighbors without ['condition_']: 0.8813501919169112
Gradient without ['condition_']: 0.9067509595845563

RandomForest without ['drug_']: 0.8600135470760895
SVM without ['drug_']: 0.8679160081282457
KNeighbors without ['drug_']: 0.8517724091216978
Gradient without ['drug_']: 0.8902686836757733

RandomForest without ['observation_']: 0.8451117633777375
SVM without ['observation_']: 0.862045608489501
KNeighbors without ['observation_']: 0.8335967487017386
Gradient without ['observation_']: 0.841724994355385

RandomForest without ['procedure_']: 0.8311131180853466
SVM without ['procedure_']: 0.9017836983517724
KNeighbors without ['procedure_']: 0.8462406863851886
Gradient without ['procedure_']: 0.84714382479114

In [23]:
loadings = pd.read_csv("combined_loadings.csv")

# Siin peaks teil loadings df avanema kui kõik töötas nii nagu pidi
loadings

Unnamed: 0,Feature,Loading_PC1,Absolute_Loading_PC1,Feature.1,Loading_PC2,Absolute_Loading_PC2,Feature.2,Loading_PC3,Absolute_Loading_PC3,Feature.3,...,Absolute_Loading_PC17,Feature.17,Loading_PC18,Absolute_Loading_PC18,Feature.18,Loading_PC19,Absolute_Loading_PC19,Feature.19,Loading_PC20,Absolute_Loading_PC20
0,measurement_637,0.031130,0.031130,measurement_637,-0.039471,0.039471,measurement_637,-0.001863,0.001863,measurement_637,...,0.047339,measurement_637,-0.020614,0.020614,measurement_637,-0.034442,0.034442,measurement_637,0.040120,0.040120
1,measurement_422,0.030101,0.030101,measurement_422,-0.038394,0.038394,measurement_422,-0.001390,0.001390,measurement_422,...,0.007411,measurement_422,0.027246,0.027246,measurement_422,0.013040,0.013040,measurement_422,-0.005919,0.005919
2,measurement_1076,0.029839,0.029839,measurement_1076,-0.038207,0.038207,measurement_1076,-0.002525,0.002525,measurement_1076,...,0.024858,measurement_1076,0.002208,0.002208,measurement_1076,-0.005690,0.005690,measurement_1076,-0.002582,0.002582
3,measurement_1221,0.029795,0.029795,measurement_1221,-0.036160,0.036160,measurement_1221,0.004351,0.004351,measurement_1221,...,0.033278,measurement_1221,0.024304,0.024304,measurement_1221,-0.013304,0.013304,measurement_1221,-0.001510,0.001510
4,measurement_1027,0.029766,0.029766,measurement_1027,-0.041584,0.041584,measurement_1027,-0.000435,0.000435,measurement_1027,...,0.030098,measurement_1027,-0.005714,0.005714,measurement_1027,0.007213,0.007213,measurement_1027,-0.008015,0.008015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1551,observation_161,0.019625,0.019625,observation_161,0.023969,0.023969,observation_161,0.045309,0.045309,observation_161,...,0.009348,observation_161,-0.020881,0.020881,observation_161,0.008554,0.008554,observation_161,0.027901,0.027901
1552,observation_185,0.019429,0.019429,observation_185,0.022792,0.022792,observation_185,0.051817,0.051817,observation_185,...,0.015765,observation_185,0.013730,0.013730,observation_185,0.047438,0.047438,observation_185,0.004049,0.004049
1553,observation_3,0.019387,0.019387,observation_3,0.024326,0.024326,observation_3,0.060742,0.060742,observation_3,...,0.011045,observation_3,0.021442,0.021442,observation_3,0.036940,0.036940,observation_3,0.030185,0.030185
1554,observation_46,0.019248,0.019248,observation_46,0.026371,0.026371,observation_46,0.053802,0.053802,observation_46,...,0.011123,observation_46,-0.010246,0.010246,observation_46,0.021217,0.021217,observation_46,0.000489,0.000489


## Korrelatsioonimaatriks
**(Proovime võimalikult palju infot tunnuste kohta hankida kui võimalik juhul kui PCA loadingutest ei piisa)**

In [8]:
# LISAGE SIIA OMA FAIL
file = "synthetic_data_lung_cancer.csv"

In [9]:
file                 
data = load_data(file)
correlation_matrix = data.corr()
correlation_with_target = correlation_matrix['death'].abs().sort_values(ascending=False)
correlation_with_target.to_csv('correlation_results.csv', header=True)
correlations = pd.read_csv("correlation_results.csv")

# Siin peaks teil correlations df avanema kui kõik töötas nii nagu pidi
correlations

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surnud["TIME"] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elus["TIME"] = 1


Unnamed: 0,DEFINITION_ID,death
0,death,1.000000
1,measurement_258,0.308031
2,measurement_601,0.307923
3,measurement_817,0.305948
4,measurement_1325,0.305563
...,...,...
4859,condition_434,0.000991
4860,condition_1834,0.000696
4861,condition_1885,0.000189
4862,condition_1685,0.000032


## Mutual information
**Mutual information measures the dependence between two random variables<br>Helps to Identify which features are more informative for predicting the target variable in a classification problem. Higher mutual information scores imply a stronger relationship between a feature and the target.**

In [10]:
# LISAGE SIIA OMA FAIL
file = "synthetic_data_lung_cancer.csv"

data = load_data(file)
X = data.drop(columns=['death'])
y = data['death']

mutual_info = mutual_info_classif(X, y, random_state=42)

# Create a DataFrame with feature names and their mutual information scores
mi_df = pd.DataFrame({'Feature': X.columns, 'Mutual_Information': mutual_info})

# Sort features by mutual information scores (descending order)
mi_df = mi_df.sort_values(by='Mutual_Information', ascending=False)

print(mi_df.head(10))

mi_df.to_csv('mutual_information_results.csv', index=False)
mutual_information = pd.read_csv("mutual_information_results.csv")
mutual_information # Siin peaks teil mutual_information df avanema kui kõik töötas nii nagu pidi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surnud["TIME"] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elus["TIME"] = 1


               Feature  Mutual_Information
3321   measurement_253            0.093112
3630   measurement_531            0.087619
2233      condition_85            0.084881
3180  measurement_1325            0.084747
3969   measurement_837            0.084734
3400   measurement_324            0.083806
3862   measurement_740            0.083265
3253   measurement_192            0.082363
3620   measurement_522            0.080798
4092   measurement_948            0.080770


Unnamed: 0,Feature,Mutual_Information
0,measurement_253,0.093112
1,measurement_531,0.087619
2,condition_85,0.084881
3,measurement_1325,0.084747
4,measurement_837,0.084734
...,...,...
4858,condition_2365,0.000000
4859,condition_2367,0.000000
4860,condition_2369,0.000000
4861,condition_237,0.000000


# SMOTE kasutamine

**SMOTE kasutamises ei ole me kindlad, kas seda on õige kasutada reaalsete andmete peal, aga treening ja valideerimise andmetel läheb auc-roc skoor kõvasti paremaks<br>Sellegipoolest ei ole me kindlad, et kas seda on mõistlik terviseandmete peal kasutada seega see on hetkel pigem lihtsalt siia kerge lisa ja kui leiate, et SMOTE kasutamine oleks okei siis integreeriksime selle oma töövoogu sisse**

In [11]:
# Function to load data
def load_data(file):
    data = pd.read_csv(file)
    surnud = data[data["DEFINITION_ID"] == "death"]
    subject_ids = surnud["SUBJECT_ID"]
    surnud = data[data["SUBJECT_ID"].isin(subject_ids)]
    surnud["TIME"] = 1
    elus = data[data["DEFINITION_ID"] != "death"]
    elus["TIME"] = 1
    elus_filtered = elus[~elus["SUBJECT_ID"].isin(surnud["SUBJECT_ID"])]
    combined_data = pd.concat([surnud, elus_filtered])
    combined_data.sort_values(by='SUBJECT_ID', inplace=True)
    combined_data.reset_index(drop=True, inplace=True)
    pivot_combined_data = combined_data.pivot_table(index='SUBJECT_ID', columns='DEFINITION_ID', values='TIME', aggfunc='sum', fill_value=0)
    
    return pivot_combined_data

# Function to identify important features
def identify_important_features(X, n_components):
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    pc_columns = [f'PC_{i+1}' for i in range(n_components)]
    explained_variance_ratio = pca.explained_variance_ratio_
    loadings_pc1 = pca.components_[0]
    loading_df = pd.DataFrame({'Feature': X.columns, 'Loading_PC1': loadings_pc1})
    loading_df['Absolute_Loading_PC1'] = loading_df['Loading_PC1'].abs()
    loading_df = loading_df.sort_values(by='Absolute_Loading_PC1', ascending=False)
    print(f'Explained Variance Ratio - PC1: {explained_variance_ratio[0]:.4f}')
    print(f'Explained Variance Ratio - PC2: {explained_variance_ratio[1]:.4f}')
    print(f'Explained Variance Ratio - PC3: {explained_variance_ratio[2]:.4f}')
    print(f'Explained Variance Ratio - PC4: {explained_variance_ratio[3]:.4f}')
    return pd.DataFrame(X_pca, columns=pc_columns), explained_variance_ratio, pca, loading_df

# Function to train a model
def train_model(X, y, model_type):
    svm_params = {'kernel': 'poly', 'gamma': 'scale', 'degree': 4, 'C': 100}
    knn_params = {'weights': 'distance', 'p': 2, 'n_neighbors': 16, 'algorithm': 'auto'}
    
    if model_type == 'RandomForest':
        model = RandomForestClassifier(random_state=42)
    elif model_type == 'SVM':
        model = SVC(probability=True, random_state=42,**svm_params)
    elif model_type == 'KNeighbors':
        model = KNeighborsClassifier(**knn_params)
    else:
        raise ValueError(f'Invalid model type: {model_type}')
    
    model.fit(X, y)
    return model

# Function to evaluate the model
def evaluate_model(model, X, y):
    y_probabilities = model.predict_proba(X)[:, 1]
    auc_roc = roc_auc_score(y, y_probabilities)
    return auc_roc

# Function to perform the entire workflow
def main(file):
    data = load_data(file)
    X = data.drop(columns=['death'])
    y = data['death']
    
    important_features, explained_variance_ratio, pca, loading_df = identify_important_features(X, n_components=25)
     
    # SMOTE kasutatakse peale PCA-d
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(important_features, y)
    
    # Resmapled df
    resampled_data = pd.DataFrame(X_resampled, columns=important_features.columns)
    resampled_data['death'] = y_resampled
    
    model_types = ['RandomForest','SVM', 'KNeighbors']
    results = {}
    X_train, X_test, y_train, y_test = train_test_split(resampled_data.drop(columns=['death']), resampled_data['death'], test_size=0.2, random_state=42)

    for model_type in model_types:
        model = train_model(X_train, y_train, model_type=model_type)
        auc_roc = evaluate_model(model, X_test, y_test)
        results[model_type] = auc_roc
        print(f'{model_type} AUC-ROC: {auc_roc}')

    return results, loading_df

# LISAGE SIIA OMA FAIL
file_path = "synthetic_data_lung_cancer.csv"

# Run the main script
results,loading_df = main(file_path)
print("\nAUC-ROC Scores:")
for model_type, auc_roc in results.items():
    print(f'{model_type}: {auc_roc}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surnud["TIME"] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elus["TIME"] = 1


Explained Variance Ratio - PC1: 0.2754
Explained Variance Ratio - PC2: 0.0501
Explained Variance Ratio - PC3: 0.0374
Explained Variance Ratio - PC4: 0.0312


NameError: name 'SMOTE' is not defined

# Sooviksime siis tagasi saada need kolm csv faili tunnustest
**'mutual_information_results.csv'<br>"correlation_results.csv"<br>"combined_loadings.csv"**