In [15]:
# Step 1: Install and Import Necessary Libraries
print("Step 1: Importing necessary libraries...")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print("Libraries imported successfully.\n")

# Step 2: Load the Dataset
print("Step 2: Loading the dataset...")
df = pd.read_csv("LPD.csv", encoding='latin-1')
print("Dataset loaded successfully!")
print("First five rows of the dataset:\n", df.head(), "\n")

# Step 3: Check for Missing Values
print("Step 3: Checking for missing values...")
print("Missing values per column:\n", df.isnull().sum(), "\n")

# Step 5.1: Clean Feature Names
print("\nStep 5.1: Cleaning feature names...")
df.columns = df.columns.str.strip().str.replace('\xa0', '', regex=False)
print("Feature names cleaned successfully!")
print("Updated feature names:\n", df.columns.tolist(), "\n")

# Step 4: Clean 'Gender of the patient'
print("Step 4: Cleaning 'Gender of the patient' column...")
print("Unique values before cleaning:", df['Gender of the patient'].unique())
df['Gender of the patient'] = df['Gender of the patient'].apply(lambda x: 1 if x == 1 else 0).astype(int)
print("Unique values after cleaning:", df['Gender of the patient'].unique())
print("\nSample of cleaned data:")
print(df[['Gender of the patient']].head())

# Step 5: Handling Missing Values
print("\nStep 5: Handling missing values...")
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_columns:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].mean(), inplace=True)
print("Missing values handled using mean imputation.")
print("Current dataset shape:", df.shape, "\n")

# Step 6: Split Features and Target Variable
print("Step 6: Splitting features and target variable...")
X = df.drop('Result', axis=1)  # Features
y = df['Result']  # Target variable
print("Features and target split successfully.\n")

# Step 7: Train-Test Split
print("Step 7: Splitting the data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("Train-test split completed.\n")

# Step 8: Handle Imbalanced Dataset Using SMOTE on Training Data
print("Step 8: Handling class imbalance using SMOTE on training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print("SMOTE applied. Training data balanced.\n")

# Step 9: Feature Scaling on Training Data Only
print("Step 9: Scaling features for training data...")
scaler = StandardScaler().fit(X_train_resampled)
X_train_resampled_scaled = scaler.transform(X_train_resampled)
print("Feature scaling applied to training data.\n")

# Step 10: Hyperparameter Tuning and Cross-Validation on Training Data
print("Step 10: Hyperparameter tuning and cross-validation for XGBoost...")

xgb_model = XGBClassifier(
    max_depth=6,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    learning_rate=0.01,
    n_estimators=500,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42
)

cv_scores_xgb = cross_val_score(xgb_model, X_train_resampled_scaled, y_train_resampled, cv=5, scoring='accuracy')
print(f"XGBoost Cross-validation scores: {cv_scores_xgb}")
print(f"Mean XGBoost cross-validation score: {cv_scores_xgb.mean()}\n")

# Step 11: Train XGBoost Model on Resampled and Scaled Training Data
print("Step 11: Training XGBoost model on training data...")
xgb_model.fit(X_train_resampled_scaled, y_train_resampled)
print("XGBoost model trained successfully.\n")

# Step 12: Feature Importance Analysis
print("Step 12: Analyzing feature importance...")
feature_importances = xgb_model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)
print("Feature importance calculated. Top features:")
print(feature_importance_df.head())

# Step 13: Select Important Features for Prediction
print("\nStep 13: Selecting important features for prediction...")
# Select the top 5 important features (adjust based on your analysis)
important_features = feature_importance_df.head(5)['Feature'].tolist()
print(f"Selected important features: {important_features}")



# Create a new dataset with only the important features
X_important = df[important_features]  # Use only the selected features
print("Dataset reduced to important features.\n")

# Step 14: Split Features and Target Variable
print("Step 14: Splitting features and target variable for important features...")
X_train_imp, X_test_imp, y_train_imp, y_test_imp = train_test_split(
    X_important, y, test_size=0.2, random_state=42, stratify=y
)
print("Train-test split completed.\n")



Step 1: Importing necessary libraries...
Libraries imported successfully.

Step 2: Loading the dataset...
Dataset loaded successfully!
First five rows of the dataset:
    Age of the patient  Gender of the patient  Total Bilirubin  \
0                65.0                    1.0              0.7   
1                62.0                    0.0             10.9   
2                62.0                    0.0              7.3   
3                58.0                    0.0              1.0   
4                72.0                    0.0              3.9   

   Direct Bilirubin   Alkphos Alkaline Phosphotase  \
0               0.1                          187.0   
1               5.5                          699.0   
2               4.1                          490.0   
3               0.4                          182.0   
4               2.0                          195.0   

    Sgpt Alamine Aminotransferase  Sgot Aspartate Aminotransferase  \
0                            16.0             

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


XGBoost Cross-validation scores: [0.94613083 0.94385065 0.9468358  0.94940137 0.94811859]
Mean XGBoost cross-validation score: 0.946867446366204

Step 11: Training XGBoost model on training data...
XGBoost model trained successfully.

Step 12: Analyzing feature importance...
Feature importance calculated. Top features:
                                Feature  Importance
3                      Direct Bilirubin    0.240747
1                 Gender of the patient    0.130164
5         Sgpt Alamine Aminotransferase    0.100998
4          Alkphos Alkaline Phosphotase    0.092869
9  A/G Ratio Albumin and Globulin Ratio    0.090229

Step 13: Selecting important features for prediction...
Selected important features: ['Direct Bilirubin', 'Gender of the patient', 'Sgpt Alamine Aminotransferase', 'Alkphos Alkaline Phosphotase', 'A/G Ratio Albumin and Globulin Ratio']
Dataset reduced to important features.

Step 14: Splitting features and target variable for important features...
Train-test split

In [16]:

# Step 15: Train XGBoost Model with Important Features
print("Step 15: Training XGBoost model with important features...")
xgb_model_imp = XGBClassifier(
    max_depth=6,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    learning_rate=0.01,
    n_estimators=500,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42
)

# Train the model on the training data with important features
xgb_model_imp.fit(X_train_imp, y_train_imp)
print("XGBoost model trained successfully with important features.\n")

# Step 16: Make Predictions with Important Features
print("Step 16: Making predictions with important features...")
y_pred_imp = xgb_model_imp.predict(X_test_imp)



# Step 17: Evaluate Model with Important Features
print("\nStep 17: Evaluating model performance with important features...")
accuracy_imp = accuracy_score(y_test_imp, y_pred_imp)
print(f"Accuracy with important features: {accuracy_imp}")
print("Classification Report:\n", classification_report(y_test_imp, y_pred_imp))
print("Confusion Matrix:\n", confusion_matrix(y_test_imp, y_pred_imp))
print("Model evaluation with important features completed.")


Step 15: Training XGBoost model with important features...
XGBoost model trained successfully with important features.

Step 16: Making predictions with important features...

Step 17: Evaluating model performance with important features...
Accuracy with important features: 0.9287925696594427
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.99      0.95      4385
           1       0.97      0.78      0.86      1752

    accuracy                           0.93      6137
   macro avg       0.94      0.88      0.91      6137
weighted avg       0.93      0.93      0.93      6137

Confusion Matrix:
 [[4339   46]
 [ 391 1361]]
Model evaluation with important features completed.


In [17]:
import pickle
with open ('mlmodel.pkl','wb') as files:
    pickle.dump(xgb_model_imp,files)