In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
data=pd.read_csv("/content/Indian Liver Patient Dataset (ILPD).csv")

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               583 non-null    int64  
 1   gender            583 non-null    object 
 2   tot_bilirubin     583 non-null    float64
 3   direct_bilirubin  583 non-null    float64
 4   tot_proteins      583 non-null    int64  
 5   albumin           583 non-null    int64  
 6   ag_ratio          583 non-null    int64  
 7   sgpt              583 non-null    float64
 8   sgot              583 non-null    float64
 9   alkphos           579 non-null    float64
 10  is_patient        583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [None]:
data.isnull().sum()

age                 0
gender              0
tot_bilirubin       0
direct_bilirubin    0
tot_proteins        0
albumin             0
ag_ratio            0
sgpt                0
sgot                0
alkphos             4
is_patient          0
dtype: int64

In [None]:
data["gender"]

0      Female
1        Male
2        Male
3        Male
4        Male
        ...  
578      Male
579      Male
580      Male
581      Male
582      Male
Name: gender, Length: 583, dtype: object

In [None]:
data["gender"].replace({"Male":0,"Female":1},inplace=True)

In [None]:
data.fillna(data.mean(),inplace=True)

In [None]:
data.isnull().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Dataset                       0
dtype: int64

In [None]:
#removing ouliers using z_score
z_scores = np.abs((data - data.mean()) / data.std())
data=data[(z_scores<3).all(axis=1)]
print(data.shape)

(536, 11)


In [None]:
#scaling the data
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
data_scaled=scaler.fit_transform(data)


In [None]:
target=data["Dataset"]
data.drop("Dataset",axis=1,inplace=True)
X=data.iloc[:,:-1]
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 536 entries, 0 to 582
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         536 non-null    int64  
 1   Gender                      536 non-null    int64  
 2   Total_Bilirubin             536 non-null    float64
 3   Direct_Bilirubin            536 non-null    float64
 4   Alkaline_Phosphotase        536 non-null    int64  
 5   Alamine_Aminotransferase    536 non-null    int64  
 6   Aspartate_Aminotransferase  536 non-null    int64  
 7   Total_Protiens              536 non-null    float64
 8   Albumin                     536 non-null    float64
 9   Albumin_and_Globulin_Ratio  536 non-null    float64
dtypes: float64(5), int64(5)
memory usage: 46.1 KB


In [None]:
#feature extraction using pca
from sklearn.decomposition import PCA
pca=PCA(n_components=0.95)
pca.fit(data_scaled)
data_pca=pca.transform(data_scaled)

print("Shape of original data:", data_scaled.shape)
print("Shape of reduced data using PCA:", data_pca.shape)

# Store the resultant reduced feature vectors
X_reduced_pca = pd.DataFrame(data_pca)
X_reduced_pca.to_csv("reduced_feature_vectors_pca.csv", index=False)

Shape of original data: (536, 11)
Shape of reduced data using PCA: (536, 8)


In [None]:
#FACTOR ANALYSIS
from sklearn.decomposition import FactorAnalysis
fa=FactorAnalysis(n_components=10)
fa.fit(data_scaled)
data_fa=fa.transform(data_scaled)
print(data_fa.shape)
X_reduced_fa = pd.DataFrame(data_fa)
X_reduced_fa.to_csv("reduced_feature_vectors_fa.csv", index=False)
#check column names after Pca


(536, 10)


In [None]:
#LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda=LinearDiscriminantAnalysis(n_components=None)
lda.fit(data_scaled,target)
data_lda=lda.transform(data_scaled)
print(data_lda.shape)
X_reduced_lda = pd.DataFrame(data_lda)
X_reduced_lda.to_csv("reduced_feature_vectors_lda.csv", index=False)

(536, 1)


In [None]:
#integrate all the stored features into a new matrix space
X_reduced = pd.concat([X_reduced_pca, X_reduced_fa, X_reduced_lda], axis=1)
print(X_reduced.shape)
X_reduced.to_csv("reduced_feature_vectors.csv", index=False)


(536, 19)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_predict, StratifiedKFold,cross_val_score
import pandas as pd

# Load your processed data after PCA, LDA, and Factor Analysis
processed_data = pd.read_csv("reduced_feature_vectors.csv")


# Prepare the models
lr = LogisticRegression(max_iter=10000)
knn = KNeighborsClassifier(n_neighbors=3)
rf = RandomForestClassifier()
svm = SVC()
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)  # Adjust MLP parameters as needed

models = {
    "LogisticRegression": lr,
    "KNeighborsClassifier": knn,
    "RandomForestClassifier": rf,
    "SVC": svm,
    "MLPClassifier": mlp
}

# An ensemble classifier
ensemble_model = VotingClassifier(estimators=[("knn", knn), ("rf", rf), ("svm", svm), ("mlp", mlp)], voting="hard")
models["ensemble"] = ensemble_model

# Define the 10-fold cross-validation strategy
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Prepare the dictionary to store results
results = {}

# Perform 10-fold cross-validation for each model
for model_name, model in models.items():
    cv_results = cross_val_score(model, processed_data, target, cv=kf, scoring="accuracy")
    results[model_name] = cv_results
    print(f"{model_name} accuracy: {cv_results.mean():.4f} (+/- {cv_results.std():.4f})")

for model_name, model in models.items():
    y_pred = cross_val_predict(model, processed_data, target, cv=kf)
    cm = confusion_matrix(target, y_pred)
    print(f"Confusion Matrix for {model_name}:\n{cm}")

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stab

LogisticRegression accuracy: 0.6904 (+/- 0.0063)
KNeighborsClassifier accuracy: 1.0000 (+/- 0.0000)
RandomForestClassifier accuracy: 1.0000 (+/- 0.0000)
SVC accuracy: 1.0000 (+/- 0.0000)
MLPClassifier accuracy: 1.0000 (+/- 0.0000)
ensemble accuracy: 1.0000 (+/- 0.0000)
Confusion Matrix for LogisticRegression:
[[370   0]
 [166   0]]

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stab


Confusion Matrix for KNeighborsClassifier:
[[370   0]
 [  0 166]]
Confusion Matrix for RandomForestClassifier:
[[370   0]
 [  0 166]]
Confusion Matrix for SVC:
[[370   0]
 [  0 166]]
Confusion Matrix for MLPClassifier:
[[370   0]
 [  0 166]]
Confusion Matrix for ensemble:
[[370   0]
 [  0 166]]


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
xtrain,xtest,ytrain,ytest=train_test_split(processed_data,target,test_size=0.25,random_state=42)
from imblearn.over_sampling import SMOTE

smote=SMOTE(random_state=42)
xtrainba,ytrainba=smote.fit_resample(xtrain,ytrain)
lr=LogisticRegression(class_weight="balanced")
knn=KNeighborsClassifier()
nb=GaussianNB()
svm=SVC(class_weight="balanced")
rf=RandomForestClassifier()
dt=DecisionTreeClassifier()
models=[lr,knn,nb,svm,rf,dt]
from sklearn.neural_network import MLPClassifier

# Assuming you have already imported necessary libraries and split your data
# xtrain, xtest, ytrain, ytest = train_test_split(X, target, test_size=0.25, random_state=42)

# Define MLP classifier
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)

# Add MLP to your list of models
models.append(mlp)

# Train and evaluate models
for model in models:
    model.fit(xtrainba, ytrainba)
    ypred = model.predict(xtest)
    print(model)
    print("Accuracy:", accuracy_score(ytest, ypred))
    print("Confusion Matrix:\n", confusion_matrix(ytest, ypred))




ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(class_weight='balanced')
Accuracy: 0.7313432835820896
Confusion Matrix:
 [[98  0]
 [36  0]]
KNeighborsClassifier()
Accuracy: 1.0
Confusion Matrix:
 [[98  0]
 [ 0 36]]
GaussianNB()
Accuracy: 1.0
Confusion Matrix:
 [[98  0]
 [ 0 36]]
SVC(class_weight='balanced')
Accuracy: 1.0
Confusion Matrix:
 [[98  0]
 [ 0 36]]
RandomForestClassifier()
Accuracy: 1.0
Confusion Matrix:
 [[98  0]
 [ 0 36]]
DecisionTreeClassifier()
Accuracy: 1.0
Confusion Matrix:
 [[98  0]
 [ 0 36]]
MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
Accuracy: 1.0
Confusion Matrix:
 [[98  0]
 [ 0 36]]


In [None]:
xtrain,xtest,ytrain,ytest=train_test_split(processed_data,target,test_size=0.25,random_state=42)
from imblearn.under_sampling import RandomUnderSampler

rus=RandomUnderSampler(random_state=42)
xtrainbal,ytrainbal=rus.fit_resample(xtrain,ytrain)
lr=LogisticRegression(class_weight="balanced")
knn=KNeighborsClassifier()
nb=GaussianNB()
svm=SVC(class_weight="balanced")
rf=RandomForestClassifier()
dt=DecisionTreeClassifier()
models=[lr,knn,nb,svm,rf,dt]
from sklearn.neural_network import MLPClassifier

# Assuming you have already imported necessary libraries and split your data
# xtrain, xtest, ytrain, ytest = train_test_split(X, target, test_size=0.25, random_state=42)

# Define MLP classifier
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)

# Add MLP to your list of models
models.append(mlp)

# Train and evaluate models
for model in models:
    model.fit(xtrainbal, ytrainbal)
    ypred = model.predict(xtest)
    print(model)
    print("Accuracy:", accuracy_score(ytest, ypred))
    print("Confusion Matrix:\n", confusion_matrix(ytest, ypred))




ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(class_weight='balanced')
Accuracy: 0.7313432835820896
Confusion Matrix:
 [[98  0]
 [36  0]]
KNeighborsClassifier()
Accuracy: 1.0
Confusion Matrix:
 [[98  0]
 [ 0 36]]
GaussianNB()
Accuracy: 1.0
Confusion Matrix:
 [[98  0]
 [ 0 36]]
SVC(class_weight='balanced')
Accuracy: 1.0
Confusion Matrix:
 [[98  0]
 [ 0 36]]
RandomForestClassifier()
Accuracy: 1.0
Confusion Matrix:
 [[98  0]
 [ 0 36]]
DecisionTreeClassifier()
Accuracy: 1.0
Confusion Matrix:
 [[98  0]
 [ 0 36]]
MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
Accuracy: 1.0
Confusion Matrix:
 [[98  0]
 [ 0 36]]


In [None]:
xtrain,xtest,ytrain,ytest=train_test_split(processed_data,target,test_size=0.25,random_state=42)
from imblearn.under_sampling import RandomUnderSampler,NearMiss

nm=NearMiss()
xtrainbal,ytrainbal=nm.fit_resample(xtrain,ytrain)
lr=LogisticRegression(class_weight="balanced")
knn=KNeighborsClassifier()
nb=GaussianNB()
svm=SVC(class_weight="balanced")
rf=RandomForestClassifier()
dt=DecisionTreeClassifier()
models=[lr,knn,nb,svm,rf,dt]
from sklearn.neural_network import MLPClassifier

# Assuming you have already imported necessary libraries and split your data
# xtrain, xtest, ytrain, ytest = train_test_split(X, target, test_size=0.25, random_state=42)

# Define MLP classifier
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)

# Add MLP to your list of models
models.append(mlp)

# Train and evaluate models
for model in models:
    model.fit(xtrainbal, ytrainbal)
    ypred = model.predict(xtest)
    print(model)
    print("Accuracy:", accuracy_score(ytest, ypred))
    print("Confusion Matrix:\n", confusion_matrix(ytest, ypred))




ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(class_weight='balanced')
Accuracy: 0.7313432835820896
Confusion Matrix:
 [[98  0]
 [36  0]]
KNeighborsClassifier()
Accuracy: 1.0
Confusion Matrix:
 [[98  0]
 [ 0 36]]
GaussianNB()
Accuracy: 1.0
Confusion Matrix:
 [[98  0]
 [ 0 36]]
SVC(class_weight='balanced')
Accuracy: 1.0
Confusion Matrix:
 [[98  0]
 [ 0 36]]
RandomForestClassifier()
Accuracy: 1.0
Confusion Matrix:
 [[98  0]
 [ 0 36]]
DecisionTreeClassifier()
Accuracy: 1.0
Confusion Matrix:
 [[98  0]
 [ 0 36]]
MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
Accuracy: 1.0
Confusion Matrix:
 [[98  0]
 [ 0 36]]
