# Step 1: Install and Import Necessary Libraries

In [66]:
#Step 1: Install and Import Necessary Libraries
print("Step 1: Importing necessary libraries...")
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
print("Libraries imported successfully.\n")

Step 1: Importing necessary libraries...
Libraries imported successfully.



# Step 2: Load the Dataset

In [67]:
# Step 2: Load the Dataset
# The CSV is decoded as latin-1 instead of pandas' default UTF-8.
df = pd.read_csv("LPD.csv", encoding="latin-1")
print("Dataset loaded successfully!")
preview = df.head()  # first five rows, used only for the printout below
print("First five rows of the dataset:\n", preview, "\n")

Dataset loaded successfully!
First five rows of the dataset:
    Age of the patient  Gender of the patient  Total Bilirubin  \
0                65.0                    1.0              0.7   
1                62.0                    0.0             10.9   
2                62.0                    0.0              7.3   
3                58.0                    0.0              1.0   
4                72.0                    0.0              3.9   

   Direct Bilirubin   Alkphos Alkaline Phosphotase  \
0               0.1                          187.0   
1               5.5                          699.0   
2               4.1                          490.0   
3               0.4                          182.0   
4               2.0                          195.0   

    Sgpt Alamine Aminotransferase  Sgot Aspartate Aminotransferase  \
0                            16.0                             18.0   
1                            64.0                            100.0   
2           

In [68]:
# Remove rows that are exact duplicates of an earlier row (the first occurrence is kept).
df = df.drop_duplicates()

Strip the leading whitespace from the feature (column) names.

In [69]:
# Strip leading whitespace from every column label (trailing whitespace is left as-is).
df.columns = df.columns.map(str.lstrip)

In [70]:
# Step 3: Check for Missing Values
print("Step 3: Checking for missing values...")
# isna() marks missing cells; summing the boolean frame gives one count per column.
missing_per_column = df.isna().sum()
print("Missing values per column:\n", missing_per_column, "\n")

Step 3: Checking for missing values...
Missing values per column:
 Age of the patient                        1
Gender of the patient                   694
Total Bilirubin                         521
Direct Bilirubin                        447
Alkphos Alkaline Phosphotase            608
Sgpt Alamine Aminotransferase           419
Sgot Aspartate Aminotransferase         366
Total Protiens                          338
ALB Albumin                             381
A/G Ratio Albumin and Globulin Ratio    411
Result                                    0
dtype: int64 



In [71]:

# Inspect the raw codes stored in the gender column before normalising them.
gender_col = 'Gender of the patient'
print("Unique values before cleaning:", df[gender_col].unique())

# Collapse the column to a strict 0/1 integer flag: only a value equal to 1 stays 1.
# Every other value -- including NaN, i.e. a missing gender -- becomes 0.
df[gender_col] = df[gender_col].eq(1).astype(int)

print("\nUnique values after cleaning:", df[gender_col].unique())
print("\nSample of cleaned data:")
print(df[[gender_col]].head())


Unique values before cleaning: [ 1.  0. nan]

Unique values after cleaning: [1 0]

Sample of cleaned data:
   Gender of the patient
0                      1
1                      0
2                      0
3                      0
4                      0


In [1]:
# Step 4: Handling Missing Values
print("Step 4: Handling missing values...")

# Separate numerical and categorical columns.
# Only the numerical columns are imputed in this cell; categorical_columns is
# collected but not otherwise used here.
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Replace missing values in each numerical column with that column's mean.
# The filled Series is assigned back to the frame: calling
# `df[col].fillna(..., inplace=True)` operates on a temporary object, which
# raises a FutureWarning on pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is active.
for col in numerical_columns:
    if df[col].isna().any():
        df[col] = df[col].fillna(df[col].mean())


print("Missing values handled using imputation.")
print("Current dataset shape:", df.shape, "\n")


Step 4: Handling missing values...


NameError: name 'df' is not defined

In [73]:


# Outlier screening with the 1.5 * IQR rule, evaluated per numerical column.
numeric_view = df[numerical_columns]
Q1 = numeric_view.quantile(0.25)
Q3 = numeric_view.quantile(0.75)
IQR = Q3 - Q1

# Fences sit 1.5 * IQR below the first quartile and above the third.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Boolean frame: True wherever a value falls outside its own column's fences.
outliers = numeric_view.lt(lower_bound) | numeric_view.gt(upper_bound)

# Build a copy in which each flagged value is replaced by the median of its
# column; the median is taken from the original `df`, and `df` itself is not modified.
# NOTE(review): the cells that follow build X and y from `df`, so `df_imputed`
# is not consumed by the visible pipeline -- confirm whether that is intended.
df_imputed = df.copy()
for col in numerical_columns:
    df_imputed[col] = np.where(outliers[col], df[col].median(), df[col])

# Both shapes are reported; imputation replaces values and never drops rows.
print("Outliers handled:")
print(f"Original shape: {df.shape}")
print(f"Imputed shape (outliers imputed): {df_imputed.shape}")


Outliers handled:
Original shape: (19039, 11)
Imputed shape (outliers imputed): (19039, 11)


In [74]:
# Step 6: Split Features and Target Variable
print("Step 6: Splitting features and target variable...")
target_name = 'Result'
y = df[target_name]                # target variable
X = df.drop(columns=target_name)   # every remaining column is used as a feature
print("Features and target split successfully.\n")



Step 6: Splitting features and target variable...
Features and target split successfully.



In [75]:

# Step 7: Feature Scaling
print("Step 7: Scaling the features...")
# Learn each feature's mean and standard deviation from X, then standardise X with them.
# NOTE(review): the scaler is fit on every row of X, before any train/test
# split is made -- confirm this is acceptable for the evaluation that follows.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
print("Feature scaling complete. Scaled features shape:", X_scaled.shape, "\n")

Step 7: Scaling the features...
Feature scaling complete. Scaled features shape: (19039, 10) 



In [77]:
# Step 8: Split into Train and Test Sets
print("Step 8: Splitting data into training and testing sets...")

# Split the unscaled features first. Splitting an array that was standardised
# on the full dataset lets test-set statistics (mean / standard deviation)
# influence the training data, i.e. leaks information into preprocessing.
# The test fraction and random_state are unchanged from the previous version.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same transform to
# both partitions. `scaler` now holds training-set statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Data split successfully!")
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape, "\n")

print("Preprocessing completed successfully! You can now use this data for machine learning models.")

Step 8: Splitting data into training and testing sets...
Data split successfully!
Training set shape: (15231, 10)
Testing set shape: (3808, 10) 

Preprocessing completed successfully! You can now use this data for machine learning models.


In [78]:
# Step 1: Import Libraries for Models
print("Step 1: Importing libraries for models...")

from sklearn.tree import DecisionTreeClassifier  # Import the Decision Tree classifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

print("Libraries imported successfully.\n")

Step 1: Importing libraries for models...
Libraries imported successfully.



In [79]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fixed hyperparameters for the Random Forest, gathered in one place.
rf_params = dict(
    n_estimators=100,        # number of trees in the forest
    max_depth=10,            # maximum depth of each tree
    min_samples_split=10,    # minimum samples required to split an internal node
    min_samples_leaf=4,      # minimum samples required at a leaf
    max_features='sqrt',     # features considered when searching for the best split
    random_state=42,         # seed for reproducibility
)
rf_model = RandomForestClassifier(**rf_params)

# 5-fold cross-validated accuracy, computed on the training partition only.
cv_scores = cross_val_score(rf_model, X_train, y_train, scoring='accuracy', cv=5)

print("Cross-Validation Scores on Training Data:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())
print("Standard Deviation of CV Accuracy:", cv_scores.std(), "\n")

# Fit on the full training partition, then score the held-out test partition.
print("Step: Training the model on the full training set and evaluating on the test set...")
rf_model.fit(X_train, y_train)
y_test_predictions = rf_model.predict(X_test)

# Compute the three test-set summaries before printing them.
test_accuracy = accuracy_score(y_test, y_test_predictions)
test_report = classification_report(y_test, y_test_predictions)
test_confusion = confusion_matrix(y_test, y_test_predictions)

print("Test Set Accuracy:", test_accuracy)
print("Classification Report on Test Set:\n", test_report)
print("Confusion Matrix on Test Set:\n", test_confusion, "\n")

Cross-Validation Scores on Training Data: [0.98555957 0.97866054 0.98752462 0.98686802 0.98522653]
Mean CV Accuracy: 0.9847678555767047
Standard Deviation of CV Accuracy: 0.0031669221086723922 

Step: Training the model on the full training set and evaluating on the test set...
Test Set Accuracy: 0.9842436974789915
Classification Report on Test Set:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      2733
           1       0.99      0.96      0.97      1075

    accuracy                           0.98      3808
   macro avg       0.99      0.98      0.98      3808
weighted avg       0.98      0.98      0.98      3808

Confusion Matrix on Test Set:
 [[2720   13]
 [  47 1028]] 



In [81]:
# Serialise `rf_model` to disk with pickle.
# NOTE(review): the file is named 'dt_model.pkl' although the object written
# is the Random Forest -- confirm the name against whatever loads this file.
with open('dt_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

In [82]:
# Display the last 20 rows of `df` as the cell's output.
df.tail(20)

Unnamed: 0,Age of the patient,Gender of the patient,Total Bilirubin,Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,Total Protiens,ALB Albumin,A/G Ratio Albumin and Globulin Ratio,Result
29361,61.0,1,10.2,4.2,232.0,58.0,140.0,7.0,2.7,0.6,0
29440,58.0,0,2.9,1.3,482.0,22.0,34.0,7.0,2.4,0.5,0
29442,28.0,0,1.9,1.0,231.0,16.0,55.0,4.3,1.6,0.6,0
29897,35.0,1,3.3,1.5,214.0,54.0,152.0,5.1,1.8,0.5,0
29901,14.0,0,1.4,0.4,298.0,509.0,623.0,3.6,1.0,0.3,0
30097,50.0,0,2.2,1.0,610.0,17.0,28.0,7.3,2.6,0.55,0
30099,54.0,0,6.8,3.0,542.0,116.0,66.0,6.4,3.1,0.9,0
30100,48.0,1,1.9,1.0,231.0,16.0,55.0,4.3,1.6,0.6,0
30108,26.0,1,0.9,0.2,154.0,16.0,12.0,7.0,3.5,1.0,0
30139,46.0,1,14.2,7.8,374.0,38.0,77.0,4.3,2.0,0.8,0
