In [6]:
# import required libraries 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import ADASYN
import joblib

## Data Preprocessing

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN
import joblib

# Load the dataset
file_path = '../data/survey_lung_cancer.csv'
df = pd.read_csv(file_path)

# Convert column names to lowercase and remove leading/trailing spaces
df.columns = df.columns.str.strip().str.lower()

# Replace 'YES'/'NO' with 1/0 in the 'lung_cancer' column
df['lung_cancer'] = df['lung_cancer'].map({'YES': 1, 'NO': 0})

# Encode categorical 'gender' and other columns using LabelEncoder
le = LabelEncoder()
categorical_columns = ['gender', 'smoking', 'yellow_fingers', 'anxiety', 'peer_pressure',
                       'chronic_disease', 'fatigue', 'allergy', 'wheezing', 'alcohol_consuming',
                       'shortness_of_breath', 'swallowing_difficulty', 'chest_pain']

for col in categorical_columns:
    df[col] = le.fit_transform(df[col])

# Separate features (X) and target (y)
X = df.drop(columns=['lung_cancer'])
y = df['lung_cancer']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Handle class imbalance using ADASYN
ada = ADASYN(sampling_strategy='minority')
X_train_resampled, y_train_resampled = ada.fit_resample(X_train, y_train)

# Standardize the features (important for certain models)
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Save the scaler object and feature names as a .pkl file using joblib
scaler_path = '../artifacts/scaler.pkl'
joblib.dump(scaler, scaler_path)

# Save feature names to ensure consistent scaling later
feature_names = X.columns.tolist()
joblib.dump(feature_names, '../artifacts/feature_names.pkl')


['../artifacts/feature_names.pkl']

In [13]:
df.columns

Index(['gender', 'age', 'smoking', 'yellow_fingers', 'anxiety',
       'peer_pressure', 'chronic_disease', 'fatigue', 'allergy', 'wheezing',
       'alcohol_consuming', 'coughing', 'shortness_of_breath',
       'swallowing_difficulty', 'chest_pain', 'lung_cancer'],
      dtype='object')

## Model Building and Training

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [9]:
# Train Logistic Regression model
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_resampled, y_train_resampled)

# Train Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)

# Model Evaluation on Test Data
y_pred_log_reg = log_reg.predict(X_test)
y_pred_rf = rf_model.predict(X_test)

# Print accuracy and other evaluation metrics
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report for Logistic Regression:\n", classification_report(y_test, y_pred_log_reg))
print("\nClassification Report for Random Forest:\n", classification_report(y_test, y_pred_rf))

# Confusion Matrix for Random Forest
print("\nConfusion Matrix for Random Forest:\n", confusion_matrix(y_test, y_pred_rf))

Logistic Regression Accuracy: 0.9032258064516129
Random Forest Accuracy: 0.9032258064516129

Classification Report for Logistic Regression:
               precision    recall  f1-score   support

           0       0.62      0.62      0.62         8
           1       0.94      0.94      0.94        54

    accuracy                           0.90        62
   macro avg       0.78      0.78      0.78        62
weighted avg       0.90      0.90      0.90        62


Classification Report for Random Forest:
               precision    recall  f1-score   support

           0       0.58      0.88      0.70         8
           1       0.98      0.91      0.94        54

    accuracy                           0.90        62
   macro avg       0.78      0.89      0.82        62
weighted avg       0.93      0.90      0.91        62


Confusion Matrix for Random Forest:
 [[ 7  1]
 [ 5 49]]


## Model Tuning

In [10]:
from sklearn.model_selection import GridSearchCV

In [11]:
# Example: Tuning hyperparameters for Random Forest
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_resampled, y_train_resampled)

print("Best Parameters:", grid_search.best_params_)
best_rf_model = grid_search.best_estimator_

Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}


In [12]:
best_rf_model

## Save the Model for Production

In [13]:
import joblib

In [15]:
# Save the best model
joblib.dump(best_rf_model, '../artifacts/lung_cancer_model.pkl')

['../artifacts/lung_cancer_model.pkl']

## Model Evaluation 

In [16]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score

In [17]:
# Evaluate AUC, Precision, Recall for the best model
y_pred = best_rf_model.predict(X_test)
print("AUC Score:", roc_auc_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))

AUC Score: 0.7222222222222222
Precision: 0.9272727272727272
Recall: 0.9444444444444444
