In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import joblib
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the three raw datasets.
# The directory is defined once so the notebook can be pointed at another
# location without editing every path ('/content' is presumably the Colab
# working directory — adjust when running elsewhere).
from pathlib import Path

DATA_DIR = Path('/content')

kidney_df = pd.read_csv(DATA_DIR / 'kidney_disease - kidney_disease.csv')
liver_df = pd.read_csv(DATA_DIR / 'indian_liver_patient - indian_liver_patient.csv')
parkinsons_df = pd.read_csv(DATA_DIR / 'parkinsons - parkinsons.csv')

In [None]:
# First look at the liver data: its dimensions, then the top ten records.
liver_shape = liver_df.shape
print("Initial shape:", liver_shape)
liver_df.head(n=10)

In [None]:
# Number of rows for each distinct value of the 'Dataset' column.
liver_df['Dataset'].value_counts()


In [None]:

# Count rows that exactly repeat an earlier row.
n_duplicate_rows = liver_df.duplicated().sum()
print("\n duplicate values:", n_duplicate_rows)

In [None]:

# Check for duplicates: count of rows that exactly repeat an earlier row.
# NOTE(review): this cell repeats the duplicate count printed by the cell
# before it; liver_df is not modified in between, so the number is the same.
print("\n duplicate values:", liver_df.duplicated().sum())

In [None]:
# (rows, columns) of the liver frame; only the shape is displayed here.
liver_df.shape

In [None]:



# Number of missing (NaN) values in each column.
liver_df.isna().sum()

In [None]:

# Total number of missing values in the frame (sum of the per-column counts).
liver_df.isna().sum().sum()


In [None]:

# Show the rows in which every column is NaN (an empty result means none exist).
liver_df[liver_df.isna().all(axis=1)]

In [None]:


# Count of rows in which every column is NaN.
liver_df.isna().all(axis=1).sum()

In [None]:

# Remove rows that contain no data at all (every column missing).
liver_df = liver_df.dropna(axis="index", how='all')

In [None]:

# Inspect the Albumin/Globulin ratio column: how many values are missing and
# how the observed values are distributed.
# Only a cell's last expression is rendered, so the missing count is printed
# explicitly instead of being silently discarded.
agr = liver_df['Albumin_and_Globulin_Ratio']
print("Missing Albumin_and_Globulin_Ratio values:", agr.isna().sum())
agr.describe()

In [None]:
# Fill missing Albumin/Globulin ratios with the column median.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a column selection modifies an intermediate object and is not guaranteed to
# update liver_df (it has no effect under pandas copy-on-write).
agr_median = liver_df['Albumin_and_Globulin_Ratio'].median()
liver_df['Albumin_and_Globulin_Ratio'] = liver_df['Albumin_and_Globulin_Ratio'].fillna(agr_median)

In [None]:
# Number of rows that exactly repeat an earlier row.
liver_df.duplicated().sum()

In [None]:
# Remove repeated rows, keeping the first occurrence of each.
liver_df = liver_df.drop_duplicates(keep='first')

In [None]:
# (rows, columns) after the empty-row and duplicate-row removal above.
liver_df.shape

In [None]:


# Data type of every column.
column_dtypes = liver_df.dtypes
print(column_dtypes)


In [None]:
# First ten rows of the liver frame in its current state.
liver_df.head(10)

In [None]:

# Preprocessing: label-encode the object-typed 'Gender' column.
# The fitted encoder is kept under its own name because `le` is reassigned by
# the kidney encoding loop later in the notebook, which would otherwise
# discard the Gender mapping.
gender_encoder = LabelEncoder()
liver_df['Gender'] = gender_encoder.fit_transform(liver_df['Gender'])
le = gender_encoder  # backward-compatible alias for the original variable name

# Show which original value maps to which integer (position in classes_).
print(f"Label Encoder for Gender: {gender_encoder.classes_}")

In [None]:
# Class distribution of the 'Dataset' target after the cleaning steps above.
liver_df['Dataset'].value_counts()

In [None]:
# Recode the target to 0/1:
#   2 (no disease)  -> 0
#   1 (has disease) -> stays 1
liver_df['Dataset'] = liver_df['Dataset'].replace(to_replace=2, value=0)


In [None]:
# Persist the cleaned, encoded liver data (row index is not written).
LIVER_CLEAN_CSV = 'liver_disease_clean.csv'
liver_df.to_csv(LIVER_CLEAN_CSV, index=False)

# **Kidney dataset preprocessing**

In [None]:
# Dimensions of the raw kidney frame and missing-value count per column.
kidney_shape = kidney_df.shape
kidney_missing_per_column = kidney_df.isnull().sum()
print("intial shape", kidney_shape)
print("missing value", kidney_missing_per_column)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Boolean frame: True wherever a value is missing.
kidney_missing_mask = kidney_df.isnull()

# Per-column missing counts (not rendered: this is not the cell's last expression).
kidney_missing_mask.sum()

# Heatmap of the mask with the colour bar hidden.
sns.heatmap(kidney_missing_mask, cbar=False)
plt.show()

In [None]:

# Count of rows in which every column is NaN.
kidney_df.isna().all(axis=1).sum()

In [None]:
# Remove rows that contain no data at all (every column missing).
kidney_df = kidney_df.dropna(axis="index", how='all')

In [None]:

# Total number of missing values in the frame (sum of the per-column counts).
kidney_df.isna().sum().sum()


In [None]:

import numpy as np

# Treat '?' placeholders as missing values.
# The pattern also matches a '?' wrapped in whitespace (e.g. a leading tab),
# which an exact match on '?' would leave behind as a string.
# NOTE(review): confirm whether the raw CSV actually contains such variants.
# The frame is reassigned rather than mutated in place.
kidney_df = kidney_df.replace(r'^\s*\?\s*$', np.nan, regex=True)

In [None]:
# Force these three columns to a numeric dtype; values that cannot be parsed
# become NaN (errors='coerce').
for count_col in ('pcv', 'wc', 'rc'):
    kidney_df[count_col] = pd.to_numeric(kidney_df[count_col], errors='coerce')

In [None]:
# Column groups used by the kidney preprocessing cells.

# Columns holding numeric measurements.
numeric_cols = [
    'age', 'bp', 'sg', 'al', 'su',
    'bgr', 'bu', 'sc', 'sod', 'pot',
    'hemo', 'pcv', 'wc', 'rc',
]

# Two-level categorical columns (yes/no, good/poor etc.).
binary_categoricals = [
    'rbc', 'pc', 'pcc', 'ba', 'htn',
    'dm', 'cad', 'appet', 'pe', 'ane',
]

# Name of the label column.
target_col = 'classification'

In [None]:
# Label-encode the two-level categorical columns.
# Missing values are kept as NaN instead of being handed to LabelEncoder:
# depending on the scikit-learn version a mixed str/NaN column either raises
# or has NaN encoded as an extra class, which would hide the missing values
# from the later null checks and imputation.
label_encoders = {}

for col in binary_categoricals:
    values = kidney_df[col]
    if not pd.api.types.is_numeric_dtype(values):
        # Defensive: collapse labels that differ only by surrounding
        # whitespace. NOTE(review): confirm whether the raw CSV contains any.
        values = values.str.strip()
    observed = values.notna()

    le = LabelEncoder()
    # Float column so that unobserved rows can stay NaN.
    encoded = pd.Series(np.nan, index=kidney_df.index, dtype='float64')
    encoded.loc[observed] = le.fit_transform(values[observed])

    kidney_df[col] = encoded
    label_encoders[col] = le
    print(f"Label Encoder for {col}: {le.classes_}")

In [None]:
# Encode the target labels as integers.
target_labels = kidney_df[target_col]
if not pd.api.types.is_numeric_dtype(target_labels):
    # Defensive: strip surrounding whitespace so that label variants (for
    # example a trailing tab) do not become separate classes.
    # NOTE(review): confirm against the raw CSV whether such variants exist.
    target_labels = target_labels.str.strip()

target_encoder = LabelEncoder()
kidney_df[target_col] = target_encoder.fit_transform(target_labels)

# Show which original label maps to which integer (position in classes_).
print(f"Label Encoder for {target_col}: {target_encoder.classes_}")

In [None]:

# Remaining missing values per column.
# NOTE(review): these are not expected to all be 0 — nothing above fills the
# numeric columns; imputation only happens in the modelling cell after the
# cleaned CSV is reloaded.
kidney_df.isnull().sum()

In [None]:

# First five rows of the kidney frame after encoding.
kidney_df.head()

In [None]:
# Class distribution of the encoded 'classification' target.
kidney_df['classification'].value_counts()

In [None]:

# Persist the preprocessed kidney data (row index is not written).
KIDNEY_CLEAN_CSV = 'kidney_disease_clean.csv'
kidney_df.to_csv(KIDNEY_CLEAN_CSV, index=False)

In [None]:
# Parkinson's disease dataset: first look.
# The shape is taken from parkinsons_df, the frame being previewed here.
print("Initial shape:", parkinsons_df.shape)
parkinsons_df.head()

In [None]:
# Count rows that exactly repeat an earlier row.
parkinsons_duplicates = parkinsons_df.duplicated().sum()
print("\n duplicate values:", parkinsons_duplicates)

In [None]:

# Total number of missing values in the frame (sum of the per-column counts).
parkinsons_df.isna().sum().sum()


In [None]:
# Missing-value count for each column.
parkinsons_missing_per_column = parkinsons_df.isnull().sum()
print("missing values:\n", parkinsons_missing_per_column)

In [None]:
# The author notes that this dataset has no NaN, missing or duplicate values,
# so the only cleaning step is removing the non-feature 'name' column.
parkinsons_df = parkinsons_df.drop(columns=['name'])


In [None]:
# Persist the cleaned Parkinson's data (row index is not written).
PARKINSONS_CLEAN_CSV = 'parkinsons_disease_clean.csv'
parkinsons_df.to_csv(PARKINSONS_CLEAN_CSV, index=False)

# **kidney disease model**

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

In [None]:
# Kidney disease: logistic regression on median-imputed, min-max scaled features.
df_kidney = pd.read_csv("kidney_disease_clean.csv")

# Features & Target ('id' is dropped together with the label column;
# presumably it is a row identifier rather than a predictor).
X_kidney = df_kidney.drop(['id', 'classification'], axis=1)
y_kidney = df_kidney['classification']

# Train-test split FIRST, so that the imputer medians and the scaler range are
# learned from the training rows only and no test information leaks into them.
X_train_raw_k, X_test_raw_k, y_train_k, y_test_k = train_test_split(
    X_kidney, y_kidney, test_size=0.2, random_state=42
)

# Handle missing values: fit on train, apply the same medians to test.
imputer = SimpleImputer(strategy='median')   # or 'mean'
X_train_k = imputer.fit_transform(X_train_raw_k)
X_test_k = imputer.transform(X_test_raw_k)

# Full imputed matrix kept under its original name for any code that still
# refers to it; it now uses the train-fitted medians.
X_kidney_imputed = imputer.transform(X_kidney)

# Scaling (fit on train only)
scaler = MinMaxScaler()
X_train_s = scaler.fit_transform(X_train_k)
X_test_s = scaler.transform(X_test_k)

# Train Model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_s, y_train_k)

# Predictions
y_pred_k = model.predict(X_test_s)

# Evaluation
print("Kidney Disease Model")
print("Accuracy:", accuracy_score(y_test_k, y_pred_k))
print(classification_report(y_test_k, y_pred_k))


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

# Confusion matrix of the kidney predictions on the held-out split, with
# labels taken in the order stored on the fitted model.
class_order = model.classes_
cm = confusion_matrix(y_test_k, y_pred_k, labels=class_order)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_order)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix for kidney disease")
plt.show()

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Imputation and scaling are pipeline steps, so cross_val_score refits them
# together with the classifier for every fold.
cv_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler()),
    ("model", LogisticRegression(max_iter=1000)),
]
pipeline = Pipeline(steps=cv_steps)

# 5-fold cross-validation on the un-imputed feature frame.
scores = cross_val_score(pipeline, X_kidney, y_kidney, cv=5)
mean_score = scores.mean()

print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", mean_score)


In [None]:
from sklearn.metrics import roc_curve, auc

# A ROC curve needs continuous scores: with hard class predictions it
# collapses to a single operating point and the AUC no longer reflects how
# well the model ranks the samples. Use the predicted probability of class 1
# (second column of predict_proba) from the kidney model fitted above.
# Assumes a binary 0/1 target, as roc_curve itself requires.
y_score_k = model.predict_proba(X_test_s)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test_k, y_score_k)
roc_auc = auc(fpr, tpr)
print("ROC AUC:", roc_auc)
print("Thresholds:", thresholds)


In [None]:
import joblib

# Persist the fitted kidney classifier to disk.
KIDNEY_MODEL_PATH = 'kidney_model.joblib'
joblib.dump(model, KIDNEY_MODEL_PATH)

In [None]:

import joblib

# Persist the fitted kidney MinMaxScaler to disk.
# NOTE(review): the SimpleImputer fitted in the training cell is not saved
# anywhere, so inference code cannot reproduce the imputation step.
KIDNEY_SCALER_PATH = 'minmax_scaler_kidney.joblib'
joblib.dump(scaler, KIDNEY_SCALER_PATH)

# **LIVER DISEASE**

In [None]:
# Liver disease: random-forest baseline on min-max scaled features.

# Load dataset
df_liver = pd.read_csv("liver_disease_clean.csv")

# Features & Target
X_liver = df_liver.drop('Dataset', axis=1)
y_liver = df_liver['Dataset']

# Train-Test Split (stratified so both splits keep the class ratio)
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(
    X_liver, y_liver, test_size=0.2, random_state=42, stratify=y_liver
)

# Scaling: fit the scaler on the training rows only, so the test rows do not
# influence the learned min/max, then apply the same transform to both splits.
scaler = MinMaxScaler()
X_train_ls = scaler.fit_transform(X_train_l)
X_test_ls = scaler.transform(X_test_l)

# Full scaled matrix kept under its original name for any code that still
# refers to it; it now uses the train-fitted scaler.
X_liver_scaled = scaler.transform(X_liver)

# Train & Evaluate (seeded so the forest, and the reported scores, are
# reproducible from run to run)
model = RandomForestClassifier(random_state=42)
model.fit(X_train_ls, y_train_l)
y_pred_l = model.predict(X_test_ls)

print("Liver Disease Model")
print("Accuracy:", accuracy_score(y_test_l, y_pred_l))
print(classification_report(y_test_l, y_pred_l))


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the current `model` on the unscaled liver features.
scores = cross_val_score(estimator=model, X=X_liver, y=y_liver, cv=5)
mean_score = scores.mean()

print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", mean_score)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

# Confusion matrix of the liver predictions on the held-out split, with
# labels taken in the order stored on the fitted model.
cm = confusion_matrix(y_test_l, y_pred_l, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot(cmap=plt.cm.Blues)
# Title and explicit show, matching the kidney confusion-matrix cell, so the
# figure is identifiable and no object repr is left as cell output.
plt.title("Confusion Matrix for liver disease")
plt.show()

As the liver dataset is imbalanced, let's try XGBoost; feature scaling is not needed for this model.

In [None]:

!pip install xgboost scikit-learn

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_auc_score,
)
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# 1) Reload the cleaned liver data; the target is cast to int.
df_liver = pd.read_csv("liver_disease_clean.csv")
y_liver = df_liver['Dataset'].astype(int)
X_liver = df_liver.drop('Dataset', axis=1)

# 2) Stratified 80/20 split.
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(
    X_liver,
    y_liver,
    test_size=0.2,
    random_state=42,
    stratify=y_liver,
)

# 3) Class weighting: ratio of negative to positive rows in the training
#    split, falling back to 1.0 when there are no positives.
pos = (y_train_l == 1).sum()
neg = (y_train_l == 0).sum()
if pos > 0:
    spw = neg / pos
else:
    spw = 1.0

# 4) Gradient-boosted trees on the raw (unscaled) features.
model = XGBClassifier(
    n_estimators=400,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight=spw,      # weight applied to the positive class
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
)
model.fit(X_train_l, y_train_l)

# 5) Hard predictions and class-1 probabilities for the test rows.
y_pred_l = model.predict(X_test_l)
y_proba_l = model.predict_proba(X_test_l)[:, 1]

# 6) Text metrics.
print("Liver Disease Model")
print("Accuracy:", accuracy_score(y_test_l, y_pred_l))
print(classification_report(y_test_l, y_pred_l, target_names=["Healthy(0)","Disease(1)"]))
print("ROC AUC:", roc_auc_score(y_test_l, y_proba_l))

# 7) Confusion matrix with fixed label order 0, 1.
cm = confusion_matrix(y_test_l, y_pred_l, labels=[0,1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Healthy","Disease"])
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix — Liver")
plt.show()

# 8) ROC curve from the probabilities, with the diagonal as reference.
fpr, tpr, thr = roc_curve(y_test_l, y_proba_l)
plt.plot(fpr, tpr, lw=2)
plt.plot([0,1],[0,1],'--', lw=1)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC — Liver")
plt.show()



In [None]:

import joblib

# Persist the current `model` (the liver classifier) to disk.
LIVER_MODEL_PATH = 'liver_model.joblib'
joblib.dump(model, LIVER_MODEL_PATH)

# **Parkinsons disease**

In [None]:
# Parkinson's disease: XGBoost classifier on the cleaned data.

# Load the cleaned data; the 'status' target is cast to int.
df_parkinson = pd.read_csv("parkinsons_disease_clean.csv")
y_park = df_parkinson['status'].astype(int)
X_park = df_parkinson.drop('status', axis=1)

# Stratified 80/20 split.
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_park,
    y_park,
    test_size=0.2,
    random_state=42,
    stratify=y_park,
)

# Class weighting: ratio of negative to positive rows in the training split,
# falling back to 1.0 when there are no positives.
pos = (y_train_p == 1).sum()
neg = (y_train_p == 0).sum()
if pos > 0:
    spw = neg / pos
else:
    spw = 1.0

# Train & evaluate.
model = XGBClassifier(
    n_estimators=400,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,
    reg_lambda=0.1,
    scale_pos_weight=spw,      # weight applied to the positive class
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
)
model.fit(X_train_p, y_train_p)

y_pred_p = model.predict(X_test_p)

print("Parkinson’s Disease Model")
print("Accuracy:", accuracy_score(y_test_p, y_pred_p))
print(classification_report(y_test_p, y_pred_p))


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

# Confusion matrix of the Parkinson's predictions on the held-out split, with
# labels taken in the order stored on the fitted model.
cm = confusion_matrix(y_test_p, y_pred_p, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot(cmap=plt.cm.Blues)
# Title and explicit show, matching the kidney confusion-matrix cell, so the
# figure is identifiable and no object repr is left as cell output.
plt.title("Confusion Matrix for parkinsons disease")
plt.show()

In [None]:
import joblib

# Persist the current `model` (the Parkinson's classifier) to disk.
PARKINSONS_MODEL_PATH = 'parkinsons_model.joblib'
joblib.dump(model, PARKINSONS_MODEL_PATH)