In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import mutual_info_classif
from imblearn.over_sampling import SMOTE


In [2]:
#!pip install openpyxl Juhul jui juba olemas ei ole

## Andmete töötlemine

In [3]:
def load_data(file):
    
    # Data
    data = pd.read_csv(file)
    pd.set_option('display.max_rows', 20)
    surnud= data[data["DEFINITION_ID"] == "death"]
    subject_ids = surnud["SUBJECT_ID"]
    
    # FIltreerib surnute id põhjal
    surnud = data[data["SUBJECT_ID"].isin(subject_ids)]
    surnud["TIME"] = 1 # Muudame aja väärtuse 1-ks, ehk esialgsetes andmetes "DEFINITION_ID" = TIME, nüüd selle asemel 
    # lihtsalt 1.
    elus = data[data["DEFINITION_ID"] != "death"]
    elus["TIME"] = 1

    # Sama toimub ka siin, aga elus patsientidega
    elus_filtered = elus[~elus["SUBJECT_ID"].isin(surnud["SUBJECT_ID"])]

    # Filtreeritud andmete kombineerimine üheks tabeliks
    combined_data = pd.concat([surnud, elus_filtered])
    combined_data.sort_values(by='SUBJECT_ID', inplace=True)
    combined_data.reset_index(drop=True, inplace=True)
    
    #Viimane tabel-  kus read on patsiendid, veerud protseduurid vastavalt
    # 1- protseduur tehti, 0 - patsient pole protseduuri saanud. samuti ka veerg "death": 1-surnud, 0 -"elus"
    
    pivot_combined_data = combined_data.pivot_table(index='SUBJECT_ID', columns='DEFINITION_ID', values='TIME', aggfunc='sum', fill_value=0)
    return pivot_combined_data

## Tunnuste töötlemine (PCA)

In [4]:
def identify_important_features(X, save_loadings=False):
    n_components=20
    
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    
    # DataFrame with the principal components
    pc_columns = [f'PC_{i+1}' for i in range(n_components)]
    
    # Variance ratios
    explained_variance_ratio = pca.explained_variance_ratio_ 
    
    all_pca_loadings = [loadings for loadings in pca.components_]
    
    if save_loadings:
        loadings_dfs = []
        for index, loadings in enumerate(all_pca_loadings):
            loadings_df = pd.DataFrame({'Feature': X.columns, f'Loading_PC{index+1}': loadings})
            loadings_df[f'Absolute_Loading_PC{index+1}'] = loadings_df[f'Loading_PC{index+1}'].abs()
            loadings_df = loadings_df.sort_values(by=f'Absolute_Loading_PC{index+1}', ascending=False)
            loadings_dfs.append(loadings_df)
        
        combined_loadings_df = pd.concat(loadings_dfs, axis=1)
        combined_loadings_df.to_csv('combined_loadings.csv', index=False, encoding="utf-8")
    print(f'Explained Variance Ratio - PC1: {explained_variance_ratio[0]:.4f}')
    print(f'Explained Variance Ratio - PC2: {explained_variance_ratio[1]:.4f}')
    print(f'Explained Variance Ratio - PC3: {explained_variance_ratio[2]:.4f}')
    print(f'Explained Variance Ratio - PC4: {explained_variance_ratio[3]:.4f}')
    
    loadings_pc = pca.components_[0]
    
    # saaks kätte kõige olulisemad feature nimed
    loading_df = pd.DataFrame({'Feature': X.columns, 'Loading_PC1': loadings_pc})
    
    # Sorteeritud
    loading_df['Absolute_Loading_PC1'] = loading_df['Loading_PC1'].abs()
    loading_df = loading_df.sort_values(by='Absolute_Loading_PC1', ascending=False)
    
    return pd.DataFrame(X_pca, columns=pc_columns), explained_variance_ratio, pca, loading_df

## Erinevate mudelite treenimine, valimaks parima mudeli

In [5]:
def train_model(X, y, model_type):
    svm_params = {'kernel': 'poly', 'gamma': 'scale', 'degree': 4, 'C': 100}
    knn_params = {'weights': 'distance', 'p': 2, 'n_neighbors': 16, 'algorithm': 'auto'}
    rf_params = {'bootstrap': False, 'max_depth': 50, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
    gradient_params = {'learning_rate': 0.2, 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200, 'subsample': 0.8}
    if model_type == 'RandomForest':
        model = RandomForestClassifier(random_state=42,**rf_params)
    elif model_type == 'SVM':
        model = SVC(probability=True, random_state=42,**svm_params)
    elif model_type == 'KNeighbors':
        model = KNeighborsClassifier(**knn_params)
    elif model_type == "Gradient":
        model = GradientBoostingClassifier(random_state=42,**gradient_params)
    
    else:
        raise ValueError(f'Invalid model type: {model_type}')
    
    model.fit(X, y)
    return model

## Mudeli hindamine

In [6]:
def evaluate_model(model, X, y):
    # Mudeli hindamine 
    y_probabilities = model.predict_proba(X)[:, 1]    
    auc_roc = roc_auc_score(y, y_probabilities)
    
    return auc_roc

## Töövoo jooksutamine
**PS! Vaata #STEP 2 juures kommentaar "save_loadings"**

In [7]:
def main(file):
    
    # Step 1: Load data
    data = load_data(file)
    X = data.drop(columns=['death'])
    y = data['death']
    
    # Step 2: important features
        # PANNA "save_loadings=True" et tekiks uus csv fail kust saame hiljem tunnuste osakaalu PCA-de moodustamisel vaadata
    important_features, explained_variance_ratio, pca, loading_df = identify_important_features(X,save_loadings=True)
    
    model_types = ['RandomForest','SVM', 'KNeighbors','Gradient'] #Different models to try
    results = {} #Results
    
    # Step 3: Mudelite treenimine (important_features ehk saadud PCA eelnevas sammus)
    X_train, X_test, y_train, y_test = train_test_split(important_features, y, test_size=0.2, random_state=42)

    for model_type in model_types:
        model = train_model(X_train, y_train, model_type=model_type)
        auc_roc = evaluate_model(model, X_test, y_test)
        results[model_type] = auc_roc
        print(f'{model_type} AUC-ROC: {auc_roc}')

    return results

# KASUTAMINE! - File_path = jooksutav csv fail!
file_path = #SIIA TEIE FAIL    #"synthetic_data_lung_cancer.csv"

# Run the main script
results = main(file_path)
print("\nAUC-ROC Scores:")
for model_type, auc_roc in results.items():
    print(f'{model_type}: {auc_roc}')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surnud["TIME"] = 1 # Muudame aja väärtuse 1-ks, ehk esialgsetes andmetes "DEFINITION_ID" = TIME, nüüd selle asemel
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elus["TIME"] = 1


Explained Variance Ratio - PC1: 0.2754
Explained Variance Ratio - PC2: 0.0501
Explained Variance Ratio - PC3: 0.0374
Explained Variance Ratio - PC4: 0.0312
RandomForest AUC-ROC: 0.8241137954391511
SVM AUC-ROC: 0.8778505305938135
KNeighbors AUC-ROC: 0.8516595168209529
Gradient AUC-ROC: 0.8814630842176564

AUC-ROC Scores:
RandomForest: 0.8241137954391511
SVM: 0.8778505305938135
KNeighbors: 0.8516595168209529
Gradient: 0.8814630842176564


In [8]:
loadings = pd.read_csv("combined_loadings.csv")
loadings # Siin peaks teil loadings df avanema kui kõik töötas nii nagu pidi

Unnamed: 0,Feature,Loading_PC1,Absolute_Loading_PC1,Feature.1,Loading_PC2,Absolute_Loading_PC2,Feature.2,Loading_PC3,Absolute_Loading_PC3,Feature.3,...,Absolute_Loading_PC17,Feature.17,Loading_PC18,Absolute_Loading_PC18,Feature.18,Loading_PC19,Absolute_Loading_PC19,Feature.19,Loading_PC20,Absolute_Loading_PC20
0,measurement_637,0.029168,0.029168,measurement_637,-0.017191,0.017191,measurement_637,-0.035745,0.035745,measurement_637,...,0.005995,measurement_637,0.002097,0.002097,measurement_637,0.001676,0.001676,measurement_637,-0.016075,0.016075
1,observation_204,0.028583,0.028583,observation_204,0.052606,0.052606,observation_204,-0.002151,0.002151,observation_204,...,0.040111,observation_204,0.038865,0.038865,observation_204,-0.034777,0.034777,observation_204,0.007131,0.007131
2,observation_224,0.028194,0.028194,observation_224,0.041922,0.041922,observation_224,0.004933,0.004933,observation_224,...,0.030885,observation_224,-0.063765,0.063765,observation_224,-0.026130,0.026130,observation_224,0.011379,0.011379
3,measurement_422,0.028162,0.028162,measurement_422,-0.016644,0.016644,measurement_422,-0.035846,0.035846,measurement_422,...,0.000532,measurement_422,0.008315,0.008315,measurement_422,0.011477,0.011477,measurement_422,-0.023523,0.023523
4,measurement_1221,0.028010,0.028010,measurement_1221,-0.012115,0.012115,measurement_1221,-0.034729,0.034729,measurement_1221,...,0.002710,measurement_1221,0.009182,0.009182,measurement_1221,0.017111,0.017111,measurement_1221,-0.017852,0.017852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4858,condition_2161,0.001545,0.001545,condition_2161,0.004886,0.004886,condition_2161,0.002884,0.002884,condition_2161,...,0.009180,condition_2161,0.009922,0.009922,condition_2161,-0.016060,0.016060,condition_2161,-0.019381,0.019381
4859,condition_2394,0.001500,0.001500,condition_2394,0.002474,0.002474,condition_2394,0.006338,0.006338,condition_2394,...,0.003899,condition_2394,0.009965,0.009965,condition_2394,0.006358,0.006358,condition_2394,0.001303,0.001303
4860,condition_1750,0.001423,0.001423,condition_1750,0.003596,0.003596,condition_1750,0.000244,0.000244,condition_1750,...,0.003946,condition_1750,0.007396,0.007396,condition_1750,-0.013261,0.013261,condition_1750,-0.021822,0.021822
4861,condition_53,0.001403,0.001403,condition_53,0.006980,0.006980,condition_53,-0.001001,0.001001,condition_53,...,0.015942,condition_53,0.008729,0.008729,condition_53,-0.020206,0.020206,condition_53,-0.014784,0.014784


## Korrelatsioonimaatriks
**(Proovime võimalikult palju infot tunnuste kohta hankida kui võimalik juhul kui PCA loadingutest ei piisa)**

In [9]:
file ='' # LISAGE SIIA OMA FAIL

In [10]:
file                 
data = load_data(file)
correlation_matrix = data.corr()
correlation_with_target = correlation_matrix['death'].abs().sort_values(ascending=False)
correlation_with_target.to_csv('correlation_results.csv', header=True)
correlations = pd.read_csv("correlation_results.csv")
correlations # Siin peaks teil correlations df avanema kui kõik töötas nii nagu pidi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surnud["TIME"] = 1 # Muudame aja väärtuse 1-ks, ehk esialgsetes andmetes "DEFINITION_ID" = TIME, nüüd selle asemel
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elus["TIME"] = 1


Unnamed: 0,DEFINITION_ID,death
0,death,1.000000
1,measurement_258,0.308031
2,measurement_601,0.307923
3,measurement_817,0.305948
4,measurement_1325,0.305563
...,...,...
4859,condition_434,0.000991
4860,condition_1834,0.000696
4861,condition_1885,0.000189
4862,condition_1685,0.000032


## Mutual information
**Mutual information measures the dependence between two random variables<br>Helps to Identify which features are more informative for predicting the target variable in a classification problem. Higher mutual information scores imply a stronger relationship between a feature and the target.**

In [11]:
file# = "synthetic_data_lung_cancer.csv"
data = load_data(file)
X = data.drop(columns=['death'])
y = data['death']


mutual_info = mutual_info_classif(X, y,random_state=42)

# Create a DataFrame with feature names and their mutual information scores
mi_df = pd.DataFrame({'Feature': X.columns, 'Mutual_Information': mutual_info})

# Sort features by mutual information scores (descending order)
mi_df = mi_df.sort_values(by='Mutual_Information', ascending=False)

print(mi_df.head(10))

mi_df.to_csv('mutual_information_results.csv', index=False)
mutual_information = pd.read_csv("mutual_information_results.csv")
mutual_information # Siin peaks teil mutual_information df avanema kui kõik töötas nii nagu pidi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surnud["TIME"] = 1 # Muudame aja väärtuse 1-ks, ehk esialgsetes andmetes "DEFINITION_ID" = TIME, nüüd selle asemel
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elus["TIME"] = 1


               Feature  Mutual_Information
3321   measurement_253            0.093112
3630   measurement_531            0.087619
2233      condition_85            0.084881
3180  measurement_1325            0.084747
3969   measurement_837            0.084734
3400   measurement_324            0.083806
3862   measurement_740            0.083265
3253   measurement_192            0.082363
3620   measurement_522            0.080798
4092   measurement_948            0.080770


Unnamed: 0,Feature,Mutual_Information
0,measurement_253,0.093112
1,measurement_531,0.087619
2,condition_85,0.084881
3,measurement_1325,0.084747
4,measurement_837,0.084734
...,...,...
4858,condition_2365,0.000000
4859,condition_2367,0.000000
4860,condition_2369,0.000000
4861,condition_237,0.000000


# SMOTE kasutamine

**SMOTE kasutamises ei ole me kindlad, kas seda on õige kasutada reaalsete andmete peal, aga treening ja valideerimise andmetel läheb auc-roc skoor kõvasti paremaks<br>Sellegipoolest ei ole me kindlad, et kas seda on mõistlik terviseandmete peal kasutada seega see on hetkel pigem lihtsalt siia kerge lisa ja kui leiate, et SMOTE kasutamine oleks okei siis integreeriksime selle oma töövoogu sisse**

In [12]:


# Function to load data
def load_data(file):
    data = pd.read_csv(file)
    surnud = data[data["DEFINITION_ID"] == "death"]
    subject_ids = surnud["SUBJECT_ID"]
    surnud = data[data["SUBJECT_ID"].isin(subject_ids)]
    surnud["TIME"] = 1
    elus = data[data["DEFINITION_ID"] != "death"]
    elus["TIME"] = 1
    elus_filtered = elus[~elus["SUBJECT_ID"].isin(surnud["SUBJECT_ID"])]
    combined_data = pd.concat([surnud, elus_filtered])
    combined_data.sort_values(by='SUBJECT_ID', inplace=True)
    combined_data.reset_index(drop=True, inplace=True)
    pivot_combined_data = combined_data.pivot_table(index='SUBJECT_ID', columns='DEFINITION_ID', values='TIME', aggfunc='sum', fill_value=0)
    
    return pivot_combined_data

# Function to identify important features
def identify_important_features(X, n_components):
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    pc_columns = [f'PC_{i+1}' for i in range(n_components)]
    explained_variance_ratio = pca.explained_variance_ratio_
    loadings_pc1 = pca.components_[0]
    loading_df = pd.DataFrame({'Feature': X.columns, 'Loading_PC1': loadings_pc1})
    loading_df['Absolute_Loading_PC1'] = loading_df['Loading_PC1'].abs()
    loading_df = loading_df.sort_values(by='Absolute_Loading_PC1', ascending=False)
    print(f'Explained Variance Ratio - PC1: {explained_variance_ratio[0]:.4f}')
    print(f'Explained Variance Ratio - PC2: {explained_variance_ratio[1]:.4f}')
    print(f'Explained Variance Ratio - PC3: {explained_variance_ratio[2]:.4f}')
    print(f'Explained Variance Ratio - PC4: {explained_variance_ratio[3]:.4f}')
    return pd.DataFrame(X_pca, columns=pc_columns), explained_variance_ratio, pca, loading_df

# Function to train a model
def train_model(X, y, model_type):
    svm_params = {'kernel': 'poly', 'gamma': 'scale', 'degree': 4, 'C': 100}
    knn_params = {'weights': 'distance', 'p': 2, 'n_neighbors': 16, 'algorithm': 'auto'}
    
    if model_type == 'RandomForest':
        model = RandomForestClassifier(random_state=42)
    elif model_type == 'SVM':
        model = SVC(probability=True, random_state=42,**svm_params)
    elif model_type == 'KNeighbors':
        model = KNeighborsClassifier(**knn_params)
    else:
        raise ValueError(f'Invalid model type: {model_type}')
    
    model.fit(X, y)
    return model

# Function to evaluate the model
def evaluate_model(model, X, y):
    y_probabilities = model.predict_proba(X)[:, 1]
    auc_roc = roc_auc_score(y, y_probabilities)
    return auc_roc

# Function to perform the entire workflow
def main(file):
    data = load_data(file)
    X = data.drop(columns=['death'])
    y = data['death']
    
    important_features, explained_variance_ratio, pca, loading_df = identify_important_features(X, n_components=25)
     
    # SMOTE kasutatakse peale PCA-d
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(important_features, y)
    
    # Resmapled df
    resampled_data = pd.DataFrame(X_resampled, columns=important_features.columns)
    resampled_data['death'] = y_resampled
    
    model_types = ['RandomForest','SVM', 'KNeighbors']
    results = {}
    X_train, X_test, y_train, y_test = train_test_split(resampled_data.drop(columns=['death']), resampled_data['death'], test_size=0.2, random_state=42)

    for model_type in model_types:
        model = train_model(X_train, y_train, model_type=model_type)
        auc_roc = evaluate_model(model, X_test, y_test)
        results[model_type] = auc_roc
        print(f'{model_type} AUC-ROC: {auc_roc}')

    return results, loading_df

# Specify the file path
file_path = "synthetic_data_lung_cancer.csv" #"synthetic_data_lung_cancer.csv"

# Run the main script
results,loading_df = main(file_path)
print("\nAUC-ROC Scores:")
for model_type, auc_roc in results.items():
    print(f'{model_type}: {auc_roc}')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surnud["TIME"] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elus["TIME"] = 1


Explained Variance Ratio - PC1: 0.2754
Explained Variance Ratio - PC2: 0.0501
Explained Variance Ratio - PC3: 0.0374
Explained Variance Ratio - PC4: 0.0312
RandomForest AUC-ROC: 0.9438058748403576
SVM AUC-ROC: 0.9106002554278416
KNeighbors AUC-ROC: 0.9059561128526646

AUC-ROC Scores:
RandomForest: 0.9438058748403576
SVM: 0.9106002554278416
KNeighbors: 0.9059561128526646


# Sooviksime siis tagasi saada need kolm csv faili tunnustest
**'mutual_information_results.csv'<br>"correlation_results.csv"<br>"combined_loadings.csv"**