In [2]:
!pip install catboost





[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: C:\Users\HP\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the dataset
# NOTE: this is an absolute, machine-specific path; point it at your local copy.
file_path = 'D:/medical report summarization/dataset/Liver Disease/indian_liver_patient.csv'  # Update with your actual file path
df = pd.read_csv(file_path)

# Fix the train/test membership of every row BEFORE any statistic is learned
# from the data. The split depends only on the number of rows, test_size and
# random_state, so these positions select exactly the rows that
# train_test_split(X_scaled, y, test_size=0.2, random_state=42) would select.
row_positions = np.arange(len(df))
train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=42)

# Handle missing values: the imputation mean is learned from training rows only
# (fitting on all rows would leak test-set information), then applied to every row.
imputer = SimpleImputer(strategy='mean')
ratio_column = ['Albumin_and_Globulin_Ratio']
imputer.fit(df.iloc[train_pos][ratio_column])
df[ratio_column] = imputer.transform(df[ratio_column])

# Encode the categorical 'Gender' column as integer codes
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])

# Split data into features and target
X = df.drop('Dataset', axis=1)
y = df['Dataset']

# Standardize the features: mean/std are learned from training rows only, then
# applied to all rows. Fitting on a DataFrame keeps the column names on the scaler.
scaler = StandardScaler()
scaler.fit(X.iloc[train_pos])
X_scaled = scaler.transform(X)

# Materialize the training and testing sets from the positions chosen above
X_train, X_test = X_scaled[train_pos], X_scaled[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# Define parameter grids for Random Forest and Gradient Boosting
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform GridSearchCV for Random Forest (5-fold CV on the training set only)
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5, n_jobs=-1, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)

# Perform GridSearchCV for Gradient Boosting (5-fold CV on the training set only)
grid_search_gb = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid_gb, cv=5, n_jobs=-1, scoring='accuracy')
grid_search_gb.fit(X_train, y_train)

# Get the best models from GridSearchCV
best_rf = grid_search_rf.best_estimator_
best_gb = grid_search_gb.best_estimator_

# Define the meta-model that combines the base models' predictions
meta_model = LogisticRegression()

# Define the stacking model with the tuned base models
stacking_model_tuned = StackingClassifier(
    estimators=[('rf', best_rf), ('gb', best_gb)],
    final_estimator=meta_model,
    cv=5
)

# Train the tuned stacking model
stacking_model_tuned.fit(X_train, y_train)

# Make predictions on the held-out rows and calculate the accuracy
y_pred_tuned = stacking_model_tuned.predict(X_test)
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)

print(f"Tuned Model Accuracy: {accuracy_tuned:.4f}")


Tuned Model Accuracy: 0.7607


In [5]:
# Prediction Code
def predict_liver_disease(input_data, model=None, feature_scaler=None):
    """Classify a single patient record as 'Liver Disease' or 'No Liver Disease'.

    Parameters
    ----------
    input_data : sequence of numbers
        One record's feature values, in the column order the scaler was
        fitted on. It is reshaped into a single row.
    model : object with ``predict``, optional
        Classifier to use. Defaults to the notebook-level ``stacking_model_tuned``.
    feature_scaler : object with ``transform``, optional
        Fitted scaler to use. Defaults to the notebook-level ``scaler``.

    Returns
    -------
    str
        'Liver Disease' when the predicted label equals 1, otherwise
        'No Liver Disease'.

    Raises
    ------
    ValueError
        If the number of values does not match the scaler's ``n_features_in_``.
    """
    if model is None:
        model = stacking_model_tuned
    if feature_scaler is None:
        feature_scaler = scaler

    # Convert input data to a single-row 2-D array
    features = np.asarray(input_data).reshape(1, -1)

    # Fail early, with a clear message, on a wrong number of features
    expected = getattr(feature_scaler, 'n_features_in_', None)
    if expected is not None and features.shape[1] != expected:
        raise ValueError(
            f"Expected {expected} feature values, got {features.shape[1]}"
        )

    # If the scaler was fitted on a DataFrame, hand it a frame with the same
    # column names; a bare array makes scikit-learn warn about missing names.
    names = getattr(feature_scaler, 'feature_names_in_', None)
    if names is not None:
        features = pd.DataFrame(features, columns=list(names))

    # Standardize the input data
    scaled = feature_scaler.transform(features)

    # Predict using the (stacking) model
    prediction = model.predict(scaled)

    # Return the prediction
    return 'Liver Disease' if prediction[0] == 1 else 'No Liver Disease'

# Example usage.
# Values must be supplied in this order:
#   Age, Gender (0 for Female, 1 for Male), Total_Bilirubin, Direct_Bilirubin,
#   Alkaline_Phosphotase, Alamine_Aminotransferase, Aspartate_Aminotransferase,
#   Total_Protiens, Albumin, Albumin_and_Globulin_Ratio
example_input = [
    30,     # Age
    1,      # Gender
    0.20,   # Total_Bilirubin
    0.10,   # Direct_Bilirubin
    150,    # Alkaline_Phosphotase
    21.0,   # Alamine_Aminotransferase
    11.0,   # Aspartate_Aminotransferase
    7.00,   # Total_Protiens
    4.00,   # Albumin
    1.33,   # Albumin_and_Globulin_Ratio
]

# Run the single-record prediction and show the resulting label
prediction = predict_liver_disease(example_input)
print(prediction)

Liver Disease


