In [111]:
import glob
import os
import random

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [94]:
# Load the cleaned dataset and drop three columns before any further processing.
df = (
    pd.read_csv("../clean_dataset.csv")
    .drop(columns=["tbp_tile_type", "tbp_lv_location_simple", "iddx_1"])
)

## One-Hot Encoding de Colunas Categóricas

In [95]:
# Disabled alternative: eager one-hot encoding of 'sex' and 'anatom_site_general' with pandas.
# The ColumnTransformer defined further down applies OneHotEncoder to the non-numeric columns instead.
#df = pd.get_dummies(df, columns=['sex', 'anatom_site_general'], dtype=int)

In [96]:
# Total number of distinct patients in the dataset
df.loc[:, "patient_id"].nunique()

1042

## Garantindo Aleatoriedade de Sample

In [97]:
# IDs of patients that have at least one row labelled target == 1
pacientes_cancer = df.loc[df["target"].eq(1), "patient_id"].unique()
print(len(pacientes_cancer))

259


In [98]:
# IDs of the remaining patients, i.e. those with no target == 1 row at all
pacientes_not_cancer = df.loc[
    ~df["patient_id"].isin(pacientes_cancer), "patient_id"
].unique()
print(len(pacientes_not_cancer))

783


In [99]:
# Undersample the cancer-free patients to 200 IDs.
# A dedicated generator with a fixed seed makes the draw reproducible across
# kernel restarts; the unseeded module-level random.sample picked a different
# cohort on every run, so every downstream count and score changed with it.
# Sorting first fixes the population order the generator sees.
pacientes_not_cancer = random.Random(100).sample(sorted(pacientes_not_cancer), k=200)

In [100]:
# Keep only rows that belong to a sampled cancer-free patient or to any cancer patient
df = df[df["patient_id"].isin([*pacientes_not_cancer, *pacientes_cancer])]

## Estratificação de Grupos de Forma que Preserve o Balanço da Classe Alvo

In [101]:
# Number of rows labelled with the target class
int(df["target"].eq(1).sum())

393

In [102]:
# Remove the identifier columns so every row is treated as an independent sample,
# ignoring which patient it came from.
# NOTE(review): the cells that follow cross-validate with a plain StratifiedKFold,
# so rows from the same patient can land in both training and validation folds —
# confirm this is intended.
df = df.drop(columns=["patient_id", "isic_id"])


In [103]:
# Renumber the rows from 0 and discard the row labels left over from filtering
df = df.reset_index(
    drop=True,
)

In [104]:
# Display the prepared frame (pandas elides the middle rows in the rendered output)
df

Unnamed: 0,target,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,tbp_lv_A,tbp_lv_Aext,tbp_lv_B,tbp_lv_Bext,tbp_lv_C,tbp_lv_Cext,tbp_lv_H,tbp_lv_Hext,tbp_lv_L,tbp_lv_Lext,tbp_lv_areaMM2,tbp_lv_area_perim_ratio,tbp_lv_color_std_mean,tbp_lv_deltaA,tbp_lv_deltaB,tbp_lv_deltaL,tbp_lv_deltaLB,tbp_lv_deltaLBnorm,tbp_lv_eccentricity,tbp_lv_minorAxisMM,tbp_lv_nevi_confidence,tbp_lv_norm_border,tbp_lv_norm_color,tbp_lv_perimeterMM,tbp_lv_radial_color_std_max,tbp_lv_stdL,tbp_lv_stdLExt,tbp_lv_symm_2axis,tbp_lv_symm_2axis_angle,tbp_lv_x,tbp_lv_y,tbp_lv_z,tbp_lv_dnn_lesion_confidence
0,0,60.0,male,head/neck,1.10,31.712570,25.364740,26.331000,24.549290,41.219030,35.299260,39.702910,44.064040,48.861520,55.362360,0.919497,12.235290,0.000000,6.347830,1.781713,-6.500838,6.839008,4.987244,0.639885,0.821918,1.334303e-07,2.116402,0.000000,3.354148,0.000000,0.853227,3.912844,0.285714,55,-0.078308,1575.687000,57.174500,3.141455
1,0,60.0,male,posterior torso,3.40,22.575830,17.128170,37.970460,33.485410,44.174920,37.611800,59.265850,62.909730,53.961180,61.670520,3.265153,24.184620,0.000000,5.447655,4.485044,-7.709336,9.092376,6.290359,0.932147,1.194905,2.959177e-04,4.798335,0.000000,8.886309,0.000000,1.743651,1.950777,0.361905,105,123.649700,1472.010000,232.908900,99.804040
2,0,65.0,male,anterior torso,3.22,14.242329,12.164757,21.448144,21.121356,25.746200,24.374023,56.414429,60.060388,18.649518,23.314841,6.079940,14.889242,0.514520,2.077572,0.326788,-4.665323,4.783413,6.400196,0.654458,2.481328,2.198945e+01,1.975874,1.771705,9.514499,0.664690,1.258541,1.573733,0.209581,130,-141.024780,1442.185791,58.359802,99.989998
3,0,55.0,male,anterior torso,2.73,24.725520,20.057470,26.464900,25.710460,36.217980,32.608740,46.946070,52.041180,46.276310,54.855740,2.101708,19.902560,0.000000,4.668053,0.754434,-8.579431,9.148495,6.531302,0.946448,0.929916,1.378832e-03,3.658854,0.000000,6.467562,0.000000,2.085409,2.480509,0.313433,20,-72.315640,1488.720000,21.428960,70.442510
4,0,75.0,female,head/neck,2.54,22.129183,19.911424,24.106614,23.649642,32.723533,30.915536,47.448956,49.904853,38.814170,43.913206,2.964909,20.893506,0.000000,2.217759,0.456972,-5.099036,5.292604,4.518609,0.810116,1.728243,7.528896e-02,5.361201,0.000000,7.870664,0.000000,0.773993,2.790337,0.518868,55,-11.959961,1436.027466,101.334656,99.619603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214326,0,55.0,male,anterior torso,6.33,19.489550,10.815540,25.631740,25.398830,32.199820,27.605740,52.751800,66.934330,37.303120,54.084020,22.799770,20.922360,2.511755,8.674015,0.232904,-16.780900,16.944260,13.364340,0.681945,4.696272,9.590226e+01,3.413541,7.402617,21.840900,2.429209,8.024342,2.464136,0.255224,35,183.882800,1390.055000,-58.013670,99.999990
214327,0,70.0,male,posterior torso,3.12,21.434500,15.039810,35.298110,30.623150,41.296420,34.117050,58.732110,63.843170,49.537300,57.206530,4.353537,19.754680,0.622947,6.394694,4.674955,-7.669223,9.107529,6.758377,0.851768,1.855944,3.196843e+01,3.652224,1.573159,9.273765,0.430509,1.486522,1.765237,0.316177,165,-111.353700,1056.274000,141.926100,99.999850
214328,0,65.0,male,anterior torso,9.47,17.954859,14.184649,26.891501,24.584631,32.334653,28.383240,56.269842,60.016248,54.805237,61.536188,28.879715,39.543633,1.196745,3.770211,2.306870,-6.730951,7.267322,5.073752,0.828994,5.256093,1.447149e-03,8.130620,3.131946,33.793621,0.898869,1.280199,3.318665,0.433862,5,129.817780,1521.300659,9.833496,98.780584
214329,0,65.0,female,anterior torso,2.05,17.332567,12.364397,29.845326,26.500073,34.513206,29.242644,59.854275,64.987196,39.413663,47.840501,2.758491,13.050010,0.498197,4.968169,3.345253,-8.426838,9.267366,7.842607,0.649815,1.595713,5.994798e+01,1.444499,1.660411,5.999862,0.607554,1.702824,2.205272,0.183099,40,54.622246,1065.263672,-106.833740,99.999416


## Transformação de Cada Tipo de Coluna

In [105]:
# Split the feature columns by dtype: every non-numeric column is treated as
# categorical; every numeric column except the label is standardised.
colunas_categoricas = df.select_dtypes(exclude='number').columns.tolist()
colunas_numericas = df.drop("target", axis=1).select_dtypes(include='number').columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), colunas_numericas),
        # handle_unknown='ignore' encodes a category that was absent at fit time
        # as all zeros instead of raising. Without it, a rare category missing
        # from a training fold makes that fold fail, and GridSearchCV records
        # the failure as a NaN score.
        ('cat', OneHotEncoder(handle_unknown='ignore'), colunas_categoricas)
    ]
)

In [108]:
# Pipeline: shared preprocessing followed by a decision tree with a fixed seed
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("tree", DecisionTreeClassifier(random_state=100)),
    ]
)

# Hyper-parameter grid; the "tree__" prefix routes each entry to the 'tree' step
param_grid = dict(
    tree__criterion=["gini", "entropy"],
    tree__max_depth=[None, 3, 5, 10, 50, 100],
    tree__min_samples_split=[2, 5, 10, 20],
)

# 3-fold stratified splitter, shuffled with a fixed seed
cv_stratified = StratifiedKFold(shuffle=True, n_splits=3, random_state=100)

## Treinamento do Modelo Básico

In [None]:
# Features are every column except the label.
X = df.drop("target", axis=1)
y = df["target"]

# Exhaustive search over param_grid, scored by macro-averaged F1 on the
# stratified folds, using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv_stratified,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X, y)

# Best pipeline found by the search (GridSearchCV refits it by default).
melhor_modelo = grid_search.best_estimator_

# The fitted classifier is the 'tree' step of the best pipeline. This name was
# previously read without ever being assigned, which raised NameError.
modelo_arvore = melhor_modelo.named_steps['tree']

# Read the transformed column names straight from the fitted step instead of
# rebinding the global `preprocessor`, which later pipelines reuse as their
# (unfitted) preprocessing step.
feature_names = melhor_modelo.named_steps['preprocessor'].get_feature_names_out()

# One row per transformed feature, most important first.
df_importancia = pd.DataFrame({
    'feature': feature_names,
    'importance': modelo_arvore.feature_importances_
})

df_importancia = df_importancia.sort_values(by='importance', ascending=False)

df_importancia

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [None]:
# Mean and standard deviation of the CV score for the best parameter combination
media, desvio = (
    grid_search.cv_results_[chave][grid_search.best_index_]
    for chave in ('mean_test_score', 'std_test_score')
)

print(f"Score: {media:.4f} ± {desvio:.4f}")

In [None]:
# Horizontal bar chart of the 15 most important features of the fitted decision tree.
# df_importancia is already sorted in descending order, so head(15) is the top 15.
top_15 = df_importancia.head(15)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_15['feature'], top_15['importance'], color='skyblue')
ax.invert_yaxis()  # put the most important feature at the top
ax.set_xlabel('Importância (Gini)')
# The model plotted here is the decision tree, not a random forest.
ax.set_title('Feature Importance - Decision Tree')
plt.show()

In [None]:
# Random Forest pipeline: same preprocessing step, classifier registered as 'rf'.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('rf', RandomForestClassifier(random_state=100))
])

# Hyper-parameter grid; class_weight is fixed to 'balanced' for every candidate.
param_grid = {
    'rf__n_estimators': [50, 100, 200, 300],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5, 10, 15],
    'rf__class_weight': ['balanced']
}

# 3-fold stratified splitter, shuffled with a fixed seed.
cv_stratified = StratifiedKFold(n_splits=3, shuffle=True, random_state=100)

# Features are every column except the label.
X = df.drop("target", axis=1)
y = df["target"]

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv_stratified,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1
)

print("Iniciando treinamento do Random Forest...")
grid_search.fit(X, y)

print(f"Melhor F1 Macro: {grid_search.best_score_:.4f}")
print(f"Melhores Parâmetros: {grid_search.best_params_}")

# Pull the fitted forest and the transformed column names out of the best pipeline.
best_model = grid_search.best_estimator_
rf_model = best_model.named_steps['rf']
feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()

# One row per transformed feature, most important first.
df_importancia = pd.DataFrame({
    'feature': feature_names,
    'importance': rf_model.feature_importances_
})
df_importancia = df_importancia.sort_values(by='importance', ascending=False)

print("\n--- Top 10 Features Mais Importantes ---")
print(df_importancia.head(10))

# Horizontal bar chart of the 15 most important features.
top_15 = df_importancia.head(15)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_15['feature'], top_15['importance'], color='skyblue')
ax.invert_yaxis()  # put the most important feature at the top
ax.set_xlabel('Importância (Gini)')
ax.set_title('Feature Importance - Random Forest')
plt.show()

In [None]:
# Show the importance table, sorted from most to least important
df_importancia

In [114]:
# SVM pipeline: same preprocessing step, classifier registered as 'svm'.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # probability=True was dropped for the search: it makes every fit run an
    # extra internal 5-fold cross-validation to calibrate probabilities, while
    # 'f1_macro' only uses predict(), so the CV scores are unaffected.
    # NOTE(review): if predict_proba is needed afterwards, refit the winning
    # configuration once with probability=True.
    ('svm', SVC(random_state=100))
])

# Hyper-parameter grid: 5 C values x 2 gamma settings x 2 class_weight settings, RBF kernel only.
param_grid = {
    'svm__C': [0.1, 1, 10, 50, 100],
    'svm__kernel': ['rbf'],
    'svm__gamma': ['scale', 'auto'],
    'svm__class_weight': ['balanced', None]
}

# 3-fold stratified splitter, shuffled with a fixed seed.
cv_stratified = StratifiedKFold(n_splits=3, shuffle=True, random_state=100)

# Features are every column except the label.
X = df.drop("target", axis=1)
y = df["target"]

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv_stratified,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X, y)



Fitting 3 folds for each of 20 candidates, totalling 60 fits


KeyboardInterrupt: 

In [None]:
# Mean and standard deviation of the CV score at the best candidate's index
media, desvio = [
    grid_search.cv_results_[nome][grid_search.best_index_]
    for nome in ('mean_test_score', 'std_test_score')
]

print(f"Score: {media:.4f} ± {desvio:.4f}")