In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

path_to_data = r'C:\Users\roee hilel\Computer\Documents\vs-code\apllied_statistics\Brain-MRI\kaggle_3m\data.csv'
data = pd.read_csv(path_to_data)
data.head()

Unnamed: 0,Patient,RNASeqCluster,MethylationCluster,miRNACluster,CNCluster,RPPACluster,OncosignCluster,COCCluster,histological_type,neoplasm_histologic_grade,tumor_tissue_site,laterality,tumor_location,gender,age_at_initial_pathologic,race,ethnicity,death01
0,TCGA_CS_4941,2.0,4.0,2,2.0,,3.0,2,1.0,2.0,1.0,3.0,2.0,2.0,67.0,3.0,2.0,1.0
1,TCGA_CS_4942,1.0,5.0,2,1.0,1.0,2.0,1,1.0,2.0,1.0,3.0,2.0,1.0,44.0,2.0,,1.0
2,TCGA_CS_4943,1.0,5.0,2,1.0,2.0,2.0,1,1.0,2.0,1.0,1.0,2.0,2.0,37.0,3.0,,0.0
3,TCGA_CS_4944,,5.0,2,1.0,2.0,1.0,1,1.0,1.0,1.0,3.0,6.0,2.0,50.0,3.0,,0.0
4,TCGA_CS_5393,4.0,5.0,2,1.0,2.0,3.0,1,1.0,2.0,1.0,1.0,6.0,2.0,39.0,3.0,,0.0


In [11]:
data = data.drop(columns='Patient')
df = data.dropna(subset=['death01'])
df.head()

Unnamed: 0,RNASeqCluster,MethylationCluster,miRNACluster,CNCluster,RPPACluster,OncosignCluster,COCCluster,histological_type,neoplasm_histologic_grade,tumor_tissue_site,laterality,tumor_location,gender,age_at_initial_pathologic,race,ethnicity,death01
0,2.0,4.0,2,2.0,,3.0,2,1.0,2.0,1.0,3.0,2.0,2.0,67.0,3.0,2.0,1.0
1,1.0,5.0,2,1.0,1.0,2.0,1,1.0,2.0,1.0,3.0,2.0,1.0,44.0,2.0,,1.0
2,1.0,5.0,2,1.0,2.0,2.0,1,1.0,2.0,1.0,1.0,2.0,2.0,37.0,3.0,,0.0
3,,5.0,2,1.0,2.0,1.0,1,1.0,1.0,1.0,3.0,6.0,2.0,50.0,3.0,,0.0
4,4.0,5.0,2,1.0,2.0,3.0,1,1.0,2.0,1.0,1.0,6.0,2.0,39.0,3.0,,0.0


MISSING ELEMENTS

In [12]:
for col in df.columns:
    print(f'\nUnique values in {col}: {df[col].unique()}\n,valuecounts:\n{df[col].value_counts()}\nNAN:{df[col].isna().sum()}')


Unique values in RNASeqCluster: [ 2.  1. nan  4.  3.]
,valuecounts:
RNASeqCluster
2.0    28
4.0    27
1.0    25
3.0    12
Name: count, dtype: int64
NAN:17

Unique values in MethylationCluster: [ 4.  5.  3.  2.  1. nan]
,valuecounts:
MethylationCluster
5.0    36
3.0    34
4.0    21
2.0    13
1.0     4
Name: count, dtype: int64
NAN:1

Unique values in miRNACluster: [2 1 3 4]
,valuecounts:
miRNACluster
2    61
1    33
3     8
4     7
Name: count, dtype: int64
NAN:0

Unique values in CNCluster: [ 2.  1.  3. nan]
,valuecounts:
CNCluster
1.0    58
3.0    29
2.0    20
Name: count, dtype: int64
NAN:2

Unique values in RPPACluster: [nan  1.  2.  3.  4.]
,valuecounts:
RPPACluster
2.0    31
1.0    27
4.0    23
3.0    17
Name: count, dtype: int64
NAN:11

Unique values in OncosignCluster: [ 3.  2.  1. nan]
,valuecounts:
OncosignCluster
2.0    57
1.0    29
3.0    18
Name: count, dtype: int64
NAN:5

Unique values in COCCluster: [2 1 3]
,valuecounts:
COCCluster
1    55
3    30
2    24
Name: count, dt

as we can see there are not a lpt pf Nan, lets replace them with new cluster -1

In [13]:
df = df.fillna(-1)

In [14]:
df = df.replace(['missing', 'NA', '?'], np.nan) #first we will replace the string missing into NAN than we will find the missing values
def find_missing(df):
    missing_values = df.isnull()
    missing_locations = [(index, col) for index, row in missing_values.iterrows() for col in row[row].index]
    return missing_locations

missing_val = find_missing(df)
print(missing_val)
for row, column in missing_val:
    print(f'Missing values found at Row: {row}, Column: {column}')

[]


In [15]:
y = df['death01']
X=df.drop(columns='death01')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:
# Define models
catboost_model = CatBoostClassifier(random_state=42, verbose=False)
lgbm_model = LGBMClassifier(random_state=42)
xgb_model = XGBClassifier(random_state=42)

models = {
    'CatBoost': catboost_model,
    'LightGBM': lgbm_model,
    'XGBoost': xgb_model
}



param_grid = {
    'learning_rate':[0.01,0.1,0.2,0.3],
    'max_depth':[3,4,5,6,7]
}
grid_search_xgb = GridSearchCV(estimator=xgb_model,param_grid=param_grid,cv=5)
grid_search_xgb.fit(X_train,y_train)

grid_search_lgbtm = GridSearchCV(estimator=lgbm_model,param_grid=param_grid,cv=5)
grid_search_lgbtm.fit(X_train,y_train)


grid_search_cat = GridSearchCV(estimator=catboost_model,param_grid=param_grid,cv=5)
grid_search_cat.fit(X_train,y_train)


[LightGBM] [Info] Number of positive: 15, number of negative: 54
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 69, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217391 -> initscore=-1.280934
[LightGBM] [Info] Start training from score -1.280934
[LightGBM] [Info] Number of positive: 15, number of negative: 54
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75
[LightGBM] [Info] Number of data points in the train set: 69, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.217391 -> initscore=-1.280934
[LightGBM] [Info] Start training from score -1.280934
[LightGBM] [Info] Number of positi

In [17]:
from sklearn.metrics import accuracy_score

y_pred_xgb = grid_search_xgb.best_estimator_.predict(X_test)
val_accuracy_xgb = accuracy_score(y_test,y_pred_xgb)

y_pred_lgbtm = grid_search_lgbtm.best_estimator_.predict(X_test)
val_accuracy_lgbtm = accuracy_score(y_test,y_pred_lgbtm)

y_pred_cat = grid_search_cat.best_estimator_.predict(X_test)
val_accuracy_cat = accuracy_score(y_test,y_pred_cat)






print(f"val accuracy xgb: {val_accuracy_xgb:.5f},val accuracy lgbtm: {val_accuracy_lgbtm:.5f},val accuracy cat: {val_accuracy_cat:.5f}")



val accuracy xgb: 0.59091,val accuracy lgbtm: 0.54545,val accuracy cat: 0.63636
