In [20]:
# Step 1: Install and Import Necessary Libraries
print("Step 1: Importing necessary libraries...")
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
print("Libraries imported successfully.\n")

# Step 2: Load the Dataset
df = pd.read_csv("LPD.csv", encoding='latin-1')
print("Dataset loaded successfully!")
print("First five rows of the dataset:\n", df.head(), "\n")

# Step 3: Check for Missing Values
print("Step 3: Checking for missing values...")
print("Missing values per column:\n", df.isnull().sum(), "\n")

# Check and convert values in 'Gender of the patient'
print("Unique values before cleaning:", df['Gender of the patient'].unique())

# Convert any other value to 0 or 1 (example: treat invalid values as 0)
df['Gender of the patient'] = df['Gender of the patient'].apply(lambda x: 1 if x == 1 else 0)

# Ensure the column is of integer type
df['Gender of the patient'] = df['Gender of the patient'].astype(int)

print("\nUnique values after cleaning:", df['Gender of the patient'].unique())
print("\nSample of cleaned data:")
print(df[['Gender of the patient']].head())

# Step 4: Handling Missing Values
print("Step 4: Handling missing values...")

# Separate numerical and categorical columns
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Replace missing values for numerical columns with mean
for col in numerical_columns:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].mean(), inplace=True)

print("Missing values handled using imputation.")
print("Current dataset shape:", df.shape, "\n")

# Step 5: Identify outliers using IQR
Q1 = df[numerical_columns].quantile(0.25)
Q3 = df[numerical_columns].quantile(0.75)
IQR = Q3 - Q1

# Define outlier boundaries
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Find rows where any numerical column has values outside the bounds
outliers = ((df[numerical_columns] < lower_bound) | (df[numerical_columns] > upper_bound))

# Step 6: Handle outliers (Imputation with median)
df_imputed = df.copy()
for col in numerical_columns:
    median_value = df[col].median()
    df_imputed[col] = np.where((df[col] < lower_bound[col]) | (df[col] > upper_bound[col]), median_value, df[col])

# Print confirmation message
print("Outliers handled:")
print(f"Original shape: {df.shape}")
print(f"Imputed shape (outliers imputed): {df_imputed.shape}")

# Step 7: Split Features and Target Variable
print("Step 7: Splitting features and target variable...")
X = df.drop('Result', axis=1)  # Features
y = df['Result']  # Target variable
print("Features and target split successfully.\n")

# Step 8: Handle Imbalanced Dataset using SMOTE (Synthetic Minority Over-sampling Technique)
print("Step 8: Handling class imbalance with SMOTE...")

from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)  # Random seed for reproducibility
X_resampled, y_resampled = smote.fit_resample(X, y)

print("SMOTE applied. The dataset has been balanced.\n")

# Step 9: Feature Scaling
print("Step 9: Scaling features...")

from sklearn.preprocessing import StandardScaler

# Scale features (important for algorithms sensitive to feature scale like Decision Tree)
scaler = StandardScaler().fit(X_resampled)  # Fit the scaler on the resampled data
X_resampled_scaled = scaler.transform(X_resampled)  # Scale the features
print("Feature scaling applied to resampled data.\n")

# Step 10: Train-Test Split
print("Step 10: Splitting the data into training and test sets...")

from sklearn.model_selection import train_test_split

# Split the resampled data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_resampled_scaled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

print("Train-test split completed.\n")

# Step 11: Hyperparameter Tuning and Model Training (Decision Tree)
print("Step 11: Hyperparameter tuning and training the Decision Tree model...")



from sklearn.tree import DecisionTreeClassifier
# Initialize the Decision Tree model
dt_model = DecisionTreeClassifier(
    max_depth=10,              # Maximum depth of the tree
    random_state=42,           # Random seed for reproducibility
    class_weight='balanced'    # Handle imbalance by adjusting class weights
)

# Perform 5-fold cross-validation on the training data


from sklearn.model_selection import cross_val_score
cv_scores_dt = cross_val_score(dt_model, X_train, y_train, cv=5, scoring='accuracy')

# Print cross-validation results
print(f"Decision Tree Cross-validation scores: {cv_scores_dt}")
print(f"Mean Decision Tree cross-validation score: {cv_scores_dt.mean()}")

# Train the Decision Tree model on the resampled and scaled training data
dt_model.fit(X_train, y_train)

print("Decision Tree model trained successfully on the balanced dataset.\n")

# Step 12: Making Predictions Without Adjusting the Threshold (Decision Tree)
print("Step 12: Making predictions without adjusting the threshold for Decision Tree...")

# Predict the classes directly (using the default threshold of 0.5)
y_pred_dt = dt_model.predict(X_test)

# Evaluate the Decision Tree model performance with the default threshold

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print("Decision Tree Accuracy without adjusted threshold:", accuracy_score(y_test, y_pred_dt))
print("Decision Tree Classification Report without adjusted threshold:\n", classification_report(y_test, y_pred_dt))
print("Decision Tree Confusion Matrix without adjusted threshold:\n", confusion_matrix(y_test, y_pred_dt))

print("Decision Tree model evaluation completed.")

Step 1: Importing necessary libraries...
Libraries imported successfully.

Dataset loaded successfully!
First five rows of the dataset:
    Age of the patient  Gender of the patient  Total Bilirubin  \
0                65.0                    1.0              0.7   
1                62.0                    0.0             10.9   
2                62.0                    0.0              7.3   
3                58.0                    0.0              1.0   
4                72.0                    0.0              3.9   

   Direct Bilirubin   Alkphos Alkaline Phosphotase  \
0               0.1                          187.0   
1               5.5                          699.0   
2               4.1                          490.0   
3               0.4                          182.0   
4               2.0                          195.0   

    Sgpt Alamine Aminotransferase  Sgot Aspartate Aminotransferase  \
0                            16.0                             18.0   
1      

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


Decision Tree Cross-validation scores: [0.87059997 0.86031927 0.8868301  0.86958381 0.84720639]
Mean Decision Tree cross-validation score: 0.8669079076404932
Decision Tree model trained successfully on the balanced dataset.

Step 12: Making predictions without adjusting the threshold for Decision Tree...
Decision Tree Accuracy without adjusted threshold: 0.8801732983696272
Decision Tree Classification Report without adjusted threshold:
               precision    recall  f1-score   support

           0       0.92      0.84      0.87      4386
           1       0.85      0.92      0.89      4385

    accuracy                           0.88      8771
   macro avg       0.88      0.88      0.88      8771
weighted avg       0.88      0.88      0.88      8771

Decision Tree Confusion Matrix without adjusted threshold:
 [[3673  713]
 [ 338 4047]]
Decision Tree model evaluation completed.


In [21]:
with open ('dt_model.pkl','wb') as files:
    pickle.dump(dt_model,files)