# Liver disease prediction

In [18]:
## Libraries
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
## Dataset
# Read the liver-patient CSV into a DataFrame. The path is relative, so the
# file must be in the directory the notebook kernel is running from.
data = pd.read_csv('indian_liver_patient.csv')

In [3]:
# Target vector: the 'Dataset' column.
y = data['Dataset']
# Feature matrix: every remaining column (a new frame; `data` is left unchanged).
X = data.drop(columns=['Dataset'])

In [23]:
## Data preprocessing
# Column groups: 'Gender' is the only categorical feature; every other column
# of X is treated as numerical.
categorical_cols = ['Gender']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Numerical columns: fill missing values with the column mean, then standardise.
# The imputer lives inside the preprocessor rather than being applied to X up
# front, so its means are learned only from the rows passed to
# `fit` / `fit_transform` (the training split) and no test-set statistics
# leak into the training features. X itself is left unmodified.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_cols),
        # One-hot encode Gender. handle_unknown='ignore' encodes a category
        # that was absent during fitting as all zeros instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

In [25]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
# NOTE(review): the split is not stratified on y; with imbalanced classes,
# consider stratify=y (this would change every downstream metric).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Preprocess data
# Fit the preprocessor on the training split only, then reuse the fitted
# transformer on the test split so both are transformed with the same
# training-derived parameters.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [27]:
## KNN model
# k-nearest-neighbours classifier trained on the preprocessed training split.
k = 5  # Number of neighbours consulted for each prediction
knn_model = KNeighborsClassifier(n_neighbors=k)
knn_model.fit(X_train_processed, y_train)

In [28]:
# Predict class labels for the preprocessed test split with the fitted KNN model
y_pred = knn_model.predict(X_test_processed)


In [29]:
# KNN accuracy: share of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.6581196581196581


In [30]:
# Per-class precision, recall, F1 and support for the KNN predictions
print(
    classification_report(y_true=y_test, y_pred=y_pred)
)

              precision    recall  f1-score   support

           1       0.80      0.72      0.76        87
           2       0.37      0.47      0.41        30

    accuracy                           0.66       117
   macro avg       0.58      0.60      0.59       117
weighted avg       0.69      0.66      0.67       117



In [31]:
# Confusion matrix for the KNN predictions (rows: true class, columns: predicted class)
print("Confusion Matrix:")
print(
    confusion_matrix(y_true=y_test, y_pred=y_pred)
)

Confusion Matrix:
[[63 24]
 [16 14]]


### Comparing with Logistic regression

In [32]:
from sklearn.linear_model import LogisticRegression

In [33]:
# Logistic-regression baseline with scikit-learn's default settings, trained
# on the same preprocessed training split as the KNN model.
logistic_model = LogisticRegression()
logistic_model.fit(X_train_processed, y_train)


In [34]:
# Make predictions on the preprocessed test split with the logistic model.
# NOTE: this rebinds `y_pred`, replacing the KNN predictions from the earlier
# cell; the evaluation cells below therefore describe the logistic model.
y_pred = logistic_model.predict(X_test_processed)

In [35]:
# Logistic-regression accuracy: share of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7606837606837606


In [36]:
# Per-class precision, recall, F1 and support for the logistic-regression predictions
print(
    classification_report(y_true=y_test, y_pred=y_pred)
)

              precision    recall  f1-score   support

           1       0.79      0.92      0.85        87
           2       0.56      0.30      0.39        30

    accuracy                           0.76       117
   macro avg       0.68      0.61      0.62       117
weighted avg       0.73      0.76      0.73       117



In [37]:
# Confusion matrix for the logistic-regression predictions (rows: true class, columns: predicted class)
print("Confusion Matrix:")
print(
    confusion_matrix(y_true=y_test, y_pred=y_pred)
)

Confusion Matrix:
[[80  7]
 [21  9]]
