# Töövoog, et treenida andmete põhjal mudel

In [60]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier


# Function to load data - Võtab sisse dataseti, teeb sellest tabeli kus read on patsiendid, veerud protseduurid vastavalt
# 1- protseduur tehti, 0 - patsient pole protseduuri saanud. samuti ka veerg "death": 1-surnud, 0 -"elus"
def load_data(file):
    
    # Data
    data = pd.read_csv(file)
    pd.set_option('display.max_rows', 20)
    surnud= data[data["DEFINITION_ID"] == "death"]
    subject_ids = surnud["SUBJECT_ID"]
    
    # FIltreerib surnute id põhjal
    surnud = data[data["SUBJECT_ID"].isin(subject_ids)]
    surnud["TIME"] = 1 #Muudame aja väärtuse 1-ks, ehk esialgsetes andmetes "DEFINITION_ID" = TIME, nüüd selle asemel 
    # lihtsalt 1.
    elus = data[data["DEFINITION_ID"] != "death"]
    elus["TIME"] = 1

    # Sama toimub ka siin, aga elus patsientidega
    elus_filtered = elus[~elus["SUBJECT_ID"].isin(surnud["SUBJECT_ID"])]

    # Filtreeritud andmete kombineerimine üheks tabeliks
    combined_data = pd.concat([surnud, elus_filtered])
    combined_data.sort_values(by='SUBJECT_ID', inplace=True)
    combined_data.reset_index(drop=True, inplace=True)
    
    #Viimane tabel-  kus read on patsiendid, veerud protseduurid vastavalt
    # 1- protseduur tehti, 0 - patsient pole protseduuri saanud. samuti ka veerg "death": 1-surnud, 0 -"elus"
    
    pivot_combined_data = combined_data.pivot_table(index='SUBJECT_ID', columns='DEFINITION_ID', values='TIME', aggfunc='sum', fill_value=0)
    return pivot_combined_data

# Function to identify important features
def identify_important_features(X, n_components):
    # Siin PCA kasutamine on loodud väga 
    # PCA kasutamine, n_components ehk kui nt n_components = 2 siis pca transformib andmed 2-dimensioonilieks (jättes väärtusliku info alles)
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    
    # DataFrame with the principal components
    pc_columns = [f'PC_{i+1}' for i in range(n_components)]
    explained_variance_ratio = pca.explained_variance_ratio_ 
    
    # PC 1 loadingud, sest PC 1 on kõige tähtsam
    loadings_pc1 = pca.components_[0]

    # saaks kätte need kõige olulisemad featrue nimed millega PC1 loodi
    loading_df = pd.DataFrame({'Feature': X.columns, 'Loading_PC1': loadings_pc1})

    # Sorteeritud
    loading_df['Absolute_Loading_PC1'] = loading_df['Loading_PC1'].abs()
    loading_df = loading_df.sort_values(by='Absolute_Loading_PC1', ascending=False)
    
    print(f'Explained Variance Ratio - PC1: {explained_variance_ratio[0]:.4f}')
    print(f'Explained Variance Ratio - PC2: {explained_variance_ratio[1]:.4f}')
    print(f'Explained Variance Ratio - PC3: {explained_variance_ratio[2]:.4f}')
    print(f'Explained Variance Ratio - PC4: {explained_variance_ratio[3]:.4f}')
    
    return pd.DataFrame(X_pca, columns=pc_columns),explained_variance_ratio, pca, loading_df

# Function to train a model
def train_model(X, y):
    # MUdeli treenimine (Siin saab mudelit muuta)
    model = MLPClassifier(random_state=42) #DecisionTreeClassifier(random_state=42) #KNeighborsClassifier() #SVC(probability=True, random_state=42)
    
    model.fit(X,y)
    
    return model

# Function to evaluate the model
def evaluate_model(model, X, y):
    # Mudeli hindamine 
    y_probabilities = model.predict_proba(X)[:, 1]    
    auc_roc = roc_auc_score(y,y_probabilities)
    
    return auc_roc

# Põhifunktsioon töövoo jaoks
def main(file):
    # Step 1: Load data
    data = load_data(file)

    # Step 2: important features
    X = data.drop(columns=['death'])
    y = data['death']
    important_features, explained_variance_ratio, pca, loading_df = identify_important_features(X, n_components=10)
    #n_components on siin muudetav, hetkel näiteks n_components=10
    
    # Step 3: Mudeli treenimine (important_features ehk saadud PCA eelnevas sammus)
    X_train, X_test, y_train, y_test = train_test_split(important_features, y, test_size=0.2, random_state=42)
    model = train_model(X_train, y_train)

    # Step 4: Evauation
    auc_roc = evaluate_model(model,X_test, y_test)
    print(f'AUC-ROC: {auc_roc}')


    # Tagstada, mida vajalikuks peab (mudel ise, tähtsad tunnused jne)
    return model, loading_df, data, important_features, X_train, X_test, y_train, y_test

# Kasutamine - file asendada kasutatava csv failiga, mis 1-1le samas formaadis csv failidega, mis meile saadeti
file = "synthetic_data_lung_cancer.csv" #synthetic_data_lung_cancer.csv, synthetic_data_pca.csv,jne...
trained_model, loading_df, pivot_table, important_features, X_train, X_test, y_train, y_test = main(file)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surnud["TIME"] = 1 #Muudame aja väärtuse 1-ks, ehk esialgsetes andmetes "DEFINITION_ID" = TIME, nüüd selle asemel
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elus["TIME"] = 1


Explained Variance Ratio - PC1: 0.2754
Explained Variance Ratio - PC2: 0.0501
Explained Variance Ratio - PC3: 0.0374
Explained Variance Ratio - PC4: 0.0312
AUC-ROC: 0.8096635809437797




In [12]:
pivot_table["death"].value_counts() # Siin tagastatud tabel load_data funktsioonist
                                    # ehk veeru death väärtused: 1 - surnud, 0 - elus 

0    464
1    263
Name: death, dtype: int64

# Erinevad Mudelid

**See oleks siis see kood kuhu markus saaks oma andmeid jooksutada ja võtaksime siit parima mudeli, hetkel nagu näha siis def train_model juures olen pannud svm_params ja knn_params ehk need millega ma eile erinevaid mudeleid/parameetreid testides parima tulemuse sain**

In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

# Function to load data
def load_data(file):
    data = pd.read_csv(file)
    surnud = data[data["DEFINITION_ID"] == "death"]
    subject_ids = surnud["SUBJECT_ID"]
    surnud = data[data["SUBJECT_ID"].isin(subject_ids)]
    surnud["TIME"] = 1
    elus = data[data["DEFINITION_ID"] != "death"]
    elus["TIME"] = 1
    elus_filtered = elus[~elus["SUBJECT_ID"].isin(surnud["SUBJECT_ID"])]
    combined_data = pd.concat([surnud, elus_filtered])
    combined_data.sort_values(by='SUBJECT_ID', inplace=True)
    combined_data.reset_index(drop=True, inplace=True)
    pivot_combined_data = combined_data.pivot_table(index='SUBJECT_ID', columns='DEFINITION_ID', values='TIME', aggfunc='sum', fill_value=0)
    return pivot_combined_data

# Function to identify important features
def identify_important_features(X, n_components):
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    pc_columns = [f'PC_{i+1}' for i in range(n_components)]
    explained_variance_ratio = pca.explained_variance_ratio_
    loadings_pc1 = pca.components_[0]
    loading_df = pd.DataFrame({'Feature': X.columns, 'Loading_PC1': loadings_pc1})
    loading_df['Absolute_Loading_PC1'] = loading_df['Loading_PC1'].abs()
    loading_df = loading_df.sort_values(by='Absolute_Loading_PC1', ascending=False)
    print(f'Explained Variance Ratio - PC1: {explained_variance_ratio[0]:.4f}')
    print(f'Explained Variance Ratio - PC2: {explained_variance_ratio[1]:.4f}')
    print(f'Explained Variance Ratio - PC3: {explained_variance_ratio[2]:.4f}')
    print(f'Explained Variance Ratio - PC4: {explained_variance_ratio[3]:.4f}')
    return pd.DataFrame(X_pca, columns=pc_columns), explained_variance_ratio, pca, loading_df

# Function to train a model
def train_model(X, y, model_type):
    svm_params = {'kernel': 'poly', 'gamma': 'scale', 'degree': 4, 'C': 100}
    knn_params = {'weights': 'distance', 'p': 2, 'n_neighbors': 16, 'algorithm': 'auto'}
    
    if model_type == 'RandomForest':
        model = RandomForestClassifier(random_state=42)
    elif model_type == 'SVM':
        model = SVC(probability=True, random_state=42,**svm_params)
    elif model_type == 'KNeighbors':
        model = KNeighborsClassifier(**knn_params)
    else:
        raise ValueError(f'Invalid model type: {model_type}')
    
    model.fit(X, y)
    return model

# Function to evaluate the model
def evaluate_model(model, X, y):
    y_probabilities = model.predict_proba(X)[:, 1]
    auc_roc = roc_auc_score(y, y_probabilities)
    return auc_roc

# Function to perform the entire workflow
def main(file, n_components=10):
    data = load_data(file)
    X = data.drop(columns=['death'])
    y = data['death']
    important_features, explained_variance_ratio, pca, loading_df = identify_important_features(X, n_components=20)
    
    model_types = ['RandomForest','SVM', 'KNeighbors']
    results = {}
    
    X_train, X_test, y_train, y_test = train_test_split(important_features, y, test_size=0.2, random_state=42)

    for model_type in model_types:
        model = train_model(X_train, y_train, model_type=model_type)
        auc_roc = evaluate_model(model, X_test, y_test)
        results[model_type] = auc_roc
        print(f'{model_type} AUC-ROC: {auc_roc}')

    return results

# Specify the file path
file_path = "synthetic_data_lung_cancer.csv"

# Run the main script
results = main(file_path)
print("\nAUC-ROC Scores:")
for model_type, auc_roc in results.items():
    print(f'{model_type}: {auc_roc}')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surnud["TIME"] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elus["TIME"] = 1


Explained Variance Ratio - PC1: 0.2754
Explained Variance Ratio - PC2: 0.0501
Explained Variance Ratio - PC3: 0.0374
Explained Variance Ratio - PC4: 0.0312
RandomForest AUC-ROC: 0.8243395800406412
SVM AUC-ROC: 0.8783020997967939
KNeighbors AUC-ROC: 0.8509821630164822

AUC-ROC Scores:
RandomForest: 0.8243395800406412
SVM: 0.8783020997967939
KNeighbors: 0.8509821630164822


# Hyperparams on VAL set

**Siin saab suht runnida grid searchi erinevate mudelite peal, hetkel file_path on see validation set**

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.decomposition import PCA
import numpy as np

# Function to load data
def load_data(file):
    data = pd.read_csv(file)
    surnud = data[data["DEFINITION_ID"] == "death"]
    subject_ids = surnud["SUBJECT_ID"]
    surnud = data[data["SUBJECT_ID"].isin(subject_ids)]
    surnud["TIME"] = 1
    elus = data[data["DEFINITION_ID"] != "death"]
    elus["TIME"] = 1
    elus_filtered = elus[~elus["SUBJECT_ID"].isin(surnud["SUBJECT_ID"])]
    combined_data = pd.concat([surnud, elus_filtered])
    combined_data.sort_values(by='SUBJECT_ID', inplace=True)
    combined_data.reset_index(drop=True, inplace=True)
    pivot_combined_data = combined_data.pivot_table(index='SUBJECT_ID', columns='DEFINITION_ID', values='TIME', aggfunc='sum', fill_value=0)
    return pivot_combined_data

# Function to identify important features
def identify_important_features(X, n_components):
    
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    pc_columns = [f'PC_{i+1}' for i in range(n_components)]
    explained_variance_ratio = pca.explained_variance_ratio_
    loadings_pc1 = pca.components_[0]
    loading_df = pd.DataFrame({'Feature': X.columns, 'Loading_PC1': loadings_pc1})
    loading_df['Absolute_Loading_PC1'] = loading_df['Loading_PC1'].abs()
    loading_df = loading_df.sort_values(by='Absolute_Loading_PC1', ascending=False)
    print(f'Explained Variance Ratio - PC1: {explained_variance_ratio[0]:.4f}')
    print(f'Explained Variance Ratio - PC2: {explained_variance_ratio[1]:.4f}')
    print(f'Explained Variance Ratio - PC3: {explained_variance_ratio[2]:.4f}')
    print(f'Explained Variance Ratio - PC4: {explained_variance_ratio[3]:.4f}')
    
    return pd.DataFrame(X_pca, columns=pc_columns), explained_variance_ratio, pca, loading_df

# Function to train a model with hyperparameter tuning


def train_model(X, y, model_type='RandomForest', n_estimators=None):
    if model_type == 'RandomForest':
        model = RandomForestClassifier(random_state=42)
        param_dist = {
            'n_estimators': [n_estimators] if n_estimators else [int(x) for x in np.linspace(10, 200, 10)],
            'max_features': ['auto', 'sqrt', 'log2'],
            'max_depth': [int(x) for x in np.linspace(10, 110, 11)],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'bootstrap': [True, False]
        }
    elif model_type == 'SVM':
        model = SVC(probability=True, random_state=42)
        param_dist = {
            'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'degree': [2, 3, 4, 5],
            'gamma': ['scale', 'auto']
        }
    elif model_type == 'KNeighbors':
        model = KNeighborsClassifier()
        param_dist = {
            'n_neighbors': [int(x) for x in np.linspace(1, 20, 20)],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'p': [1, 2]
        }
    else:
        raise ValueError(f'Invalid model type: {model_type}')

    randomized_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=10,  # Number of parameter settings that are sampled
        scoring='roc_auc',  # Using AUC-ROC as the metric
        cv=5,  # Number of folds in a (Stratified)KFold
        verbose=2,
        random_state=42,
        n_jobs=-1
    )

    randomized_search.fit(X, y)
    best_model = randomized_search.best_estimator_

    print(f'Best parameters for {model_type}: {randomized_search.best_params_}')
    return best_model

# Function to evaluate the model
def evaluate_model(model, X, y):
    y_probabilities = model.predict_proba(X)[:, 1]
    auc_roc = roc_auc_score(y, y_probabilities)
    return auc_roc

# Function to perform the entire workflow
def main(file, n_components=10):
    data = load_data(file)
    X = data.drop(columns=['death'])
    y = data['death']
    important_features, explained_variance_ratio, pca, loading_df = identify_important_features(X, n_components=10)

    # Specify the models you want to include
    model_types = ['RandomForest', 'SVM', 'KNeighbors']
    results = {}

    for model_type in model_types:
        X_train, X_test, y_train, y_test = train_test_split(important_features, y, test_size=0.2, random_state=42)
        model = train_model(X_train, y_train, model_type=model_type)
        auc_roc = evaluate_model(model, X_test, y_test)
        results[model_type] = auc_roc
        print(f'{model_type} AUC-ROC: {auc_roc}')

    return results

# Specify the file path
file_path = "synthetic_data_pca.csv" 

# Run the main script
results = main(file_path)
print("\nAUC-ROC Scores:")
for model_type, auc_roc in results.items():
    print(f'{model_type}: {auc_roc}')




In [75]:
results

{'RandomForest': 0.788665613005193,
 'SVM': 0.8254685030480922,
 'KNeighbors': 0.8234364416346805}

# Train set results with val params.
**Suht sama kood, mis "Erinevad Mudelid" all aga siin saab lihtsalt käsitsi ühe mudeli haaval vaadata/treenida**

In [9]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier


# Function to load data - Võtab sisse dataseti, teeb sellest tabeli kus read on patsiendid, veerud protseduurid vastavalt
# 1- protseduur tehti, 0 - patsient pole protseduuri saanud. samuti ka veerg "death": 1-surnud, 0 -"elus"
def load_data(file):
    
    # Data
    data = pd.read_csv(file)
    pd.set_option('display.max_rows', 20)
    surnud= data[data["DEFINITION_ID"] == "death"]
    subject_ids = surnud["SUBJECT_ID"]
    
    # FIltreerib surnute id põhjal
    surnud = data[data["SUBJECT_ID"].isin(subject_ids)]
    surnud["TIME"] = 1 #Muudame aja väärtuse 1-ks, ehk esialgsetes andmetes "DEFINITION_ID" = TIME, nüüd selle asemel 
    # lihtsalt 1.
    elus = data[data["DEFINITION_ID"] != "death"]
    elus["TIME"] = 1

    # Sama toimub ka siin, aga elus patsientidega
    elus_filtered = elus[~elus["SUBJECT_ID"].isin(surnud["SUBJECT_ID"])]

    # Filtreeritud andmete kombineerimine üheks tabeliks
    combined_data = pd.concat([surnud, elus_filtered])
    combined_data.sort_values(by='SUBJECT_ID', inplace=True)
    combined_data.reset_index(drop=True, inplace=True)
    
    #Viimane tabel-  kus read on patsiendid, veerud protseduurid vastavalt
    # 1- protseduur tehti, 0 - patsient pole protseduuri saanud. samuti ka veerg "death": 1-surnud, 0 -"elus"
    
    pivot_combined_data = combined_data.pivot_table(index='SUBJECT_ID', columns='DEFINITION_ID', values='TIME', aggfunc='sum', fill_value=0)
    return pivot_combined_data

# Function to identify important features
def identify_important_features(X, n_components):
    # Siin PCA kasutamine on loodud väga 
    # PCA kasutamine, n_components ehk kui nt n_components = 2 siis pca transformib andmed 2-dimensioonilieks (jättes väärtusliku info alles)
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    
    # DataFrame with the principal components
    pc_columns = [f'PC_{i+1}' for i in range(n_components)]
    explained_variance_ratio = pca.explained_variance_ratio_ 
    
    # PC 1 loadingud, sest PC 1 on kõige tähtsam
    loadings_pc1 = pca.components_[0]

    # saaks kätte need kõige olulisemad featrue nimed millega PC1 loodi
    loading_df = pd.DataFrame({'Feature': X.columns, 'Loading_PC1': loadings_pc1})

    # Sorteeritud
    loading_df['Absolute_Loading_PC1'] = loading_df['Loading_PC1'].abs()
    loading_df = loading_df.sort_values(by='Absolute_Loading_PC1', ascending=False)
    
    print(f'Explained Variance Ratio - PC1: {explained_variance_ratio[0]:.4f}')
    print(f'Explained Variance Ratio - PC2: {explained_variance_ratio[1]:.4f}')
    print(f'Explained Variance Ratio - PC3: {explained_variance_ratio[2]:.4f}')
    print(f'Explained Variance Ratio - PC4: {explained_variance_ratio[3]:.4f}')
    
    return pd.DataFrame(X_pca, columns=pc_columns),explained_variance_ratio, pca, loading_df

# Function to train a model
def train_model(X, y):
    # MUdeli treenimine (Siin saab mudelit muuta)
    svm_params = {'kernel': 'poly', 'gamma': 'scale', 'degree': 4, 'C': 100}
    knn_params = {'weights': 'distance', 'p': 2, 'n_neighbors': 16, 'algorithm': 'auto'}
    model = model = SVC(probability=True,random_state=42,**svm_params)#DecisionTreeClassifier(random_state=42) #KNeighborsClassifier() #SVC(probability=True, random_state=42)
    
    model.fit(X,y)
    
    return model

# Function to evaluate the model
def evaluate_model(model, X, y):
    # Mudeli hindamine 
    y_probabilities = model.predict_proba(X)[:, 1]    
    auc_roc = roc_auc_score(y,y_probabilities)
    
    return auc_roc


# Põhifunktsioon töövoo jaoks
def main(file):
    # Step 1: Load data
    data = load_data(file)

    # Step 2: important features
    X = data.drop(columns=['death'])
    y = data['death']
    important_features, explained_variance_ratio, pca, loading_df = identify_important_features(X, n_components=20)
    #n_components on siin muudetav, hetkel näiteks n_components=10
    
    # Step 3: Mudeli treenimine (important_features ehk saadud PCA eelnevas sammus)
    X_train, X_test, y_train, y_test = train_test_split(important_features, y, test_size=0.2, random_state=42)
    model = train_model(X_train, y_train)

    # Step 4: Evauation
    auc_roc = evaluate_model(model,X_test, y_test)
    print(f'AUC-ROC: {auc_roc}')
    


    # Tagstada, mida vajalikuks peab (mudel ise, tähtsad tunnused jne)
    return model, loading_df, data, important_features, X_train, X_test, y_train, y_test

# Kasutamine - file asendada kasutatava csv failiga, mis 1-1le samas formaadis csv failidega, mis meile saadeti
file = "synthetic_data_lung_cancer.csv" #synthetic_data_lung_cancer.csv, synthetic_data_pca.csv,jne...
trained_model, loading_df, pivot_table, important_features, X_train, X_test, y_train, y_test = main(file)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surnud["TIME"] = 1 #Muudame aja väärtuse 1-ks, ehk esialgsetes andmetes "DEFINITION_ID" = TIME, nüüd selle asemel
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elus["TIME"] = 1


Explained Variance Ratio - PC1: 0.2754
Explained Variance Ratio - PC2: 0.0501
Explained Variance Ratio - PC3: 0.0374
Explained Variance Ratio - PC4: 0.0312
AUC-ROC: 0.8780763151953036


In [112]:
#Ei mäleta täpselt millise dataseti peal siin need mudelid treenitud on aga idee selles et siia saab lih lisad/saveda
# mudelid kui vaja
pd.set_option('display.max_colwidth', None)
models_df = pd.DataFrame(columns=['Model Type', 'AUC-ROC'])
models_df.loc[0] = ["RandomForestClassifier(random_state=42)","0.8212914879205238"]
models_df.loc[1] = ["KNeighborsClassifier'weights': 'distance', 'p': 2, 'n_neighbors': 16, 'algorithm': 'auto'","0.8241137954391511"]
models_df.loc[2] = ["SVC(probability=True,random_state=42'kernel': 'poly', 'gamma': 'scale', 'degree': 4, 'C': 100","0.0.825468503048092"]
models_df

Unnamed: 0,Model Type,AUC-ROC
0,RandomForestClassifier(random_state=42),0.8212914879205238
1,"KNeighborsClassifier'weights': 'distance', 'p': 2, 'n_neighbors': 16, 'algorithm': 'auto'",0.8241137954391511
2,"SVC(probability=True,random_state=42'kernel': 'poly', 'gamma': 'scale', 'degree': 4, 'C': 100",0.0.825468503048092


# Siin saab jooksutada treenitud mudeli peal uusi andmeid

**Asendada tuleb file2 vastava andmestikuga mida jooksutada (!!! treenitud mudel on saadud kogu eelnevat koodi jooksutades, ehk kui nt vahetada üleval koodis "Kasutamine" all ära file = "synthetic_data_pca.csv" siis uus mudel oleks nende andmete peal treenitud !!!)**

In [58]:
file2="synthetic_data_pca.csv" # SIIA VASTAV ANDMESTIK
data2 = load_data(file2) # Jooksutame csv faili load_data funktsioonist läbi
X = data2.drop("death", axis = 1)
y = data2["death"]


# Peab ka teisel andmestikul kasutama "identify_important_features" funktsiooni, et treenitud mudel töötaks uute andmete peal

important_features, explained_variance_ratio, pca, loading_df_validation = identify_important_features(X, n_components=10)

# Samuti peab n_components olema sama mis treenitud mudelil, ehk praegu näiteks n_components=10


trained_model.predict_proba(important_features)[:, 1] #"synthetic_data_lung_cancer.csv" peal treenitud mudeli kasutamine
                                                        # synthetic_data_pca.cs andmete peal
    
evaluate = evaluate_model(trained_model, important_features, y) # jooksutada treenitud mudel evaluate_model funktsiooni peal
                                                                # Et saada kätte ROC-AUC score
print(f'ROC-AUC on validation data:',evaluate)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surnud["TIME"] = 1 #Muudame aja väärtuse 1-ks, ehk esialgsetes andmetes "DEFINITION_ID" = TIME, nüüd selle asemel
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elus["TIME"] = 1


Explained Variance Ratio - PC1: 0.1455
Explained Variance Ratio - PC2: 0.0553
Explained Variance Ratio - PC3: 0.0511
Explained Variance Ratio - PC4: 0.0399
ROC-AUC on validation data: 0.7488226059654631


In [14]:
data2["death"].value_counts() # # Siin tagastatud tabel load_data funktsioonist uute andmete peal
                                # ehk veeru death väärtused: 1 - surnud, 0 - elus 

0    637
1     61
Name: death, dtype: int64

# loading_df ehk PCA poolt "important features", mis töövoog tagastas

In [39]:
#loading_df_validation # Valideerimise andmestiku tagastatud "important features"

In [35]:
pd.set_option('display.max_rows', 90)
loading_df.nlargest(90,'Absolute_Loading_PC1') # Treeningu andmete tagastatud "important features"

Unnamed: 0,Feature,Loading_PC1,Absolute_Loading_PC1
3747,measurement_637,0.029168,0.029168
4266,observation_204,0.028583,0.028583
4288,observation_224,0.028194,0.028194
3509,measurement_422,0.028162,0.028162
3065,measurement_1221,0.02801,0.02801
2903,measurement_1076,0.027903,0.027903
2849,measurement_1027,0.027786,0.027786
2941,measurement_111,0.027614,0.027614
4192,observation_138,0.027611,0.027611
3840,measurement_720,0.027608,0.027608


In [37]:
# Valideerimise andmestiku tagastatud drug features
drug_features_validation = loading_df_validation[loading_df_validation["Feature"].str.contains("drug")] 
# Saaks kätte milliseid "drug" featureid mudel kasutas
#drug_features_validation

In [36]:
#Treeningu andmete tagastatud drug features
drug_features = loading_df[loading_df["Feature"].str.contains("drug")]
# Saaks kätte milliseid "drug" featureid mudel kasutas
drug_features.nlargest(90,'Absolute_Loading_PC1')

Unnamed: 0,Feature,Loading_PC1,Absolute_Loading_PC1
2646,drug_321,0.011796,0.011796
2410,drug_109,0.011748,0.011748
2644,drug_32,0.011709,0.011709
2739,drug_405,0.011495,0.011495
2622,drug_30,0.011447,0.011447
2658,drug_332,0.011435,0.011435
2643,drug_319,0.011434,0.011434
2744,drug_41,0.011342,0.011342
2620,drug_299,0.011328,0.011328
2556,drug_240,0.011316,0.011316
