In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")



#### Try Oversampling

In [40]:
# Oversampling experiment: fit a random forest on a SMOTE-balanced training
# split, report hold-out metrics on the untouched test split, then estimate
# ROC AUC by cross-validation on the training split.
from imblearn.pipeline import Pipeline  # pipeline variant that accepts samplers as steps
from sklearn.base import clone

data_path = '../../../dataset/lungcancerdataset.csv'
df = pd.read_csv(data_path)
df = df.drop(columns=['followup-time', 'serno'])
# Complete-case analysis: every row with a missing value in any remaining
# column is discarded (SMOTE below cannot interpolate through NaNs).
df = df.dropna()

X = df.drop(columns=['lung cancer'])
y = df['lung cancer']

# Stratify so the rare positive class keeps the same share in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling statistics come from the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Only the training split is oversampled; the test split keeps its real
# class distribution.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train_resampled, y_train_resampled)

y_pred = rf.predict(X_test_scaled)
# Column 1 holds the probability of the second entry in rf.classes_
# (the positive class for a 0/1 target).
y_pred_proba = rf.predict_proba(X_test_scaled)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nROC AUC Score:", roc_auc_score(y_test, y_pred_proba))

feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

# Cross-validate on the ORIGINAL training split with scaling and SMOTE inside
# the pipeline, so both are re-fit on the training folds of every split.
# Scoring the already-resampled data instead would let synthetic points
# derived from a fold's held-out rows leak into its training folds and would
# score artificially balanced validation folds, inflating the estimate.
cv_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('rf', clone(rf)),  # unfitted copy with the same hyper-parameters
])
cv_scores = cross_val_score(
    cv_pipeline, X_train, y_train,
    cv=5, scoring='roc_auc'
)
print("\nCross-validation ROC AUC scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())
print("CV Score Std:", cv_scores.std())


Classification Report:
              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98      3428
         1.0       0.04      0.05      0.04        65

    accuracy                           0.96      3493
   macro avg       0.51      0.51      0.51      3493
weighted avg       0.96      0.96      0.96      3493


Confusion Matrix:
[[3348   80]
 [  62    3]]

ROC AUC Score: 0.7164258145588367

Top 10 Most Important Features:
                            feature  importance
7   smoke_never(0)_ex(1)_current(2)    0.158827
11                             amed    0.122612
4                     age_interview    0.103831
3                               sex    0.069159
12                             dash    0.067220
19                        pgs000721    0.056407
18                        pgs000070    0.048630
14                              DBP    0.047128
13                              SBP    0.044395
1                               pc2    0.042384

Cross-

#### Try Downsampling

In [41]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Downsampling experiment: fit a random forest on a training split whose
# classes were balanced by random undersampling, report hold-out metrics on
# the untouched test split, then estimate ROC AUC by cross-validation on the
# training split.
from imblearn.pipeline import Pipeline  # pipeline variant that accepts samplers as steps
from sklearn.base import clone

data_path = '../../../dataset/lungcancerdataset.csv'
df = pd.read_csv(data_path)

df = df.drop(columns=['serno', 'followup-time', 'pc1', 'pc2', 'pc3', 'amed', 'dash'])
# Only rows with a missing target are removed; missing feature values are kept.
df = df.dropna(subset=['lung cancer'])

X = df.drop(columns=['lung cancer'])
y = df['lung cancer']

print("Original class distribution:")
print(pd.Series(y).value_counts())

# Stratify so the rare positive class keeps the same share in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling statistics come from the training split only. `scaler` must be
# reused on any data later passed to `rf`.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Only the training split is undersampled; the test split keeps its real
# class distribution.
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train_scaled, y_train)

print("\nResampled class distribution:")
print(pd.Series(y_train_resampled).value_counts())

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train_resampled, y_train_resampled)

y_pred = rf.predict(X_test_scaled)
# Column 1 holds the probability of the second entry in rf.classes_
# (the positive class for a 0/1 target).
y_pred_proba = rf.predict_proba(X_test_scaled)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nROC AUC Score:", roc_auc_score(y_test, y_pred_proba))

feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

# Cross-validate on the ORIGINAL training split with scaling and undersampling
# inside the pipeline: both are re-fit on the training folds of every split,
# and each validation fold keeps every real row at the true class ratio.
# Scoring the pre-resampled data instead evaluates only the small balanced
# subset, standardised with statistics computed across all folds.
cv_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('undersample', RandomUnderSampler(random_state=42)),
    ('rf', clone(rf)),  # unfitted copy with the same hyper-parameters
])
cv_scores = cross_val_score(
    cv_pipeline, X_train, y_train,
    cv=5, scoring='roc_auc'
)
print("\nCross-validation ROC AUC scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())
print("CV Score Std:", cv_scores.std())

Original class distribution:
lung cancer
0.0    22452
1.0      449
Name: count, dtype: int64

Resampled class distribution:
lung cancer
0.0    359
1.0    359
Name: count, dtype: int64

Classification Report:
              precision    recall  f1-score   support

         0.0       0.99      0.70      0.82      4491
         1.0       0.04      0.61      0.07        90

    accuracy                           0.70      4581
   macro avg       0.51      0.65      0.45      4581
weighted avg       0.97      0.70      0.80      4581


Confusion Matrix:
[[3131 1360]
 [  35   55]]

ROC AUC Score: 0.7286944258888147

Top 10 Most Important Features:
                            feature  importance
1                     age_interview    0.155004
4   smoke_never(0)_ex(1)_current(2)    0.124856
14                        pgs000721    0.102951
13                        pgs000070    0.100348
3                   telomere length    0.094330
7                     ahei2010score    0.085702
2              

#### Output Probabilities

In [42]:
# Export the class probabilities predicted by the most recently fitted forest
# (`rf`) for every row of `X`, next to the original (unscaled) feature values.
# NOTE: when run after the downsampling cell, `X` contains the rows `rf` was
# trained on, so the exported probabilities are not out-of-sample estimates
# for those rows.
output_path = 'lung_cancer_predicted_probabilities.xlsx'

# `rf` was fitted on StandardScaler output, so the same fitted `scaler` has to
# be applied before predicting; feeding raw feature values would compare them
# against split thresholds learned on standardised data.
predicted_probabilities = rf.predict_proba(scaler.transform(X))

# One column per class, in the order of rf.classes_.
prob_df = pd.DataFrame(
    predicted_probabilities,
    columns=[f'Prob_Class_{i}' for i in range(predicted_probabilities.shape[1])]
)

# Reset both indexes so rows are aligned by position rather than by the
# original DataFrame index.
result_df = pd.concat([X.reset_index(drop=True), prob_df.reset_index(drop=True)], axis=1)

result_df.to_excel(output_path, index=False)

# Report the path that was actually written.
print(f"Predicted probabilities exported successfully to '{output_path}'.")


Predicted probabilities exported successfully to 'lung_cancer_probabilities.xlsx'.
