In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

data = pd.read_csv("../data/lungcancer.csv")

# -----------------------------------
# 2. FIX: Use SEPARATE encoders
# -----------------------------------
gender_encoder = LabelEncoder()
target_encoder = LabelEncoder()

data['GENDER'] = gender_encoder.fit_transform(data['GENDER'])
data['LUNG_CANCER'] = target_encoder.fit_transform(data['LUNG_CANCER'])

# -----------------------------------
# 3. Split features & target
# -----------------------------------
X = data.drop('LUNG_CANCER', axis=1)
y = data['LUNG_CANCER']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
  # keeps class balance → accuracy preserved
)

# -----------------------------------
# 4. Feature scaling
# -----------------------------------
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# -----------------------------------
# 5. MODEL OPTIMIZATION (unchanged)
# -----------------------------------
k_values = range(1, 21)
accuracy_list = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k, weights='distance')
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracy_list.append(acc)

best_k = k_values[np.argmax(accuracy_list)]
print("Best Accuracy:", max(accuracy_list) * 100)

# Final optimized model
knn_final = KNeighborsClassifier(
    n_neighbors=best_k,
    weights='distance'
)

knn_final.fit(X_train, y_train)
y_pred_final = knn_final.predict(X_test)

final_accuracy = accuracy_score(y_test, y_pred_final)
print("Final Improved Accuracy:", final_accuracy * 100)

print(classification_report(y_test, y_pred_final))

# -----------------------------------
# 8. SAVE FILES (FIXED)
# -----------------------------------
joblib.dump(knn_final, "lung_cancer_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(gender_encoder, "gender_encoder.pkl")
joblib.dump(target_encoder, "target_encoder.pkl")

print("✅ Model, Scaler & Encoders saved successfully!")

Best Accuracy: 96.7741935483871
Final Improved Accuracy: 96.7741935483871
              precision    recall  f1-score   support

           0       0.50      0.50      0.50         2
           1       0.98      0.98      0.98        60

    accuracy                           0.97        62
   macro avg       0.74      0.74      0.74        62
weighted avg       0.97      0.97      0.97        62

✅ Model, Scaler & Encoders saved successfully!
