In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, mean_absolute_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier,BaggingClassifier,GradientBoostingClassifier
# import xgboost as xgb
%matplotlib inline

In [None]:
# Load the liver-patient CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/indian-liver-patient-records/indian_liver_patient.csv'
)

In [None]:
# Preview the first ten rows.
data.head(n=10)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

In [None]:
# Column dtypes and non-null counts; used here to spot columns with missing values.
data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

# Replace missing values with the mean & remove infinite values

In [None]:
# Convert infinities to NaN *first* so that (a) they cannot distort the column
# mean and (b) the mean-fill below also covers them. Doing it after the fill
# would leave those cells as NaN.
data = data.replace([np.inf, -np.inf], np.nan)

# Assign the filled column back explicitly. Calling fillna(inplace=True) on the
# attribute accessor is a chained in-place update that acts on a temporary
# object under pandas Copy-on-Write and would leave `data` unchanged.
ratio_mean = data['Albumin_and_Globulin_Ratio'].mean()  # NaN values are skipped
data['Albumin_and_Globulin_Ratio'] = data['Albumin_and_Globulin_Ratio'].fillna(ratio_mean)

In [None]:
# Re-check the non-null counts after the missing-value clean-up.
data.info()

# Encode Gender

In [None]:
# Binary-encode Gender: 'Male' becomes 1, every other value becomes 0.
data['Gender'] = data['Gender'].eq('Male').astype('int64')

# Recode the target: 0 = no liver disease, 1 = liver disease

In [None]:
# Recode the target column: raw value 2 becomes 0 and raw value 1 stays 1.
# Any value outside this mapping becomes NaN.
data = data.assign(Dataset=data['Dataset'].map({1: 1, 2: 0}))

# Visualizations

In [None]:
# Bar chart of the number of rows per encoded Gender value.
data['Gender'].value_counts().plot(kind='bar', color='peachpuff')

In [None]:
# Bar chart of the number of rows per target class.
data['Dataset'].value_counts().plot(kind='bar', color='blue')

In [None]:
# NOTE: this changes the global default figure size, so it also applies to
# figures created in later cells that do not set their own size.
plt.rcParams.update({'figure.figsize': (10, 10)})

# Pairwise relationships between all columns, coloured by encoded Gender.
sns.pairplot(data=data, hue='Gender')

In [None]:
# Scatter of Albumin against Albumin_and_Globulin_Ratio on a dedicated 8x6 figure.
f, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=data,
    x="Albumin",
    y="Albumin_and_Globulin_Ratio",
    color='mediumspringgreen',
    ax=ax,  # same axes the implicit current-axes lookup would pick
);
plt.show()

In [None]:
# Sum of Total_Protiens per encoded Gender, drawn on a fresh 8x6 figure.
# The column is selected before aggregating so only that column is summed.
plt.figure(figsize=(8, 6))
data.groupby('Gender')["Total_Protiens"].sum().plot(kind='bar', color='coral')

In [None]:
# Sum of Albumin per encoded Gender, drawn on a fresh 8x6 figure.
# The column is selected before aggregating so only that column is summed.
plt.figure(figsize=(8, 6))
data.groupby('Gender')['Albumin'].sum().plot(kind='bar', color='midnightblue')

In [None]:
# Sum of Total_Bilirubin per encoded Gender, drawn on a fresh 8x6 figure.
# The column is selected before aggregating so only that column is summed.
plt.figure(figsize=(8, 6))
data.groupby('Gender')['Total_Bilirubin'].sum().plot(kind='bar', color='fuchsia')

In [None]:
# Pairwise correlation matrix of all columns, rendered as an annotated heatmap.
corr = data.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(data=corr, annot=True, cmap="Blues")

# split data

In [None]:
# Features: every column except the target. The target is dropped by name
# (matching how `y` is selected) rather than by position, so the label cannot
# end up inside X if the column order of the frame ever changes.
X = data.drop(columns=['Dataset'])

# Target: the recoded 'Dataset' column.
y = data['Dataset']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the data & apply PCA

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [None]:
#Determine the number of components to retain
pca = PCA()
pca.fit(X_train_std)
explained_variances = pca.explained_variance_ratio_
cumulative_variances = np.cumsum(explained_variances)
threshold = 0.95
num_components = np.argmax(cumulative_variances >= threshold) + 1
#Applying PCA 
pca = PCA(n_components=num_components)  
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)


# LogisticRegression with PCA

In [None]:
# Default logistic regression trained on the PCA-projected training split.
logreg = LogisticRegression().fit(X_train_pca, y_train)

# Accuracy on the PCA-projected test split.
predictions1 = logreg.predict(X_test_pca)
logreg_acc = accuracy_score(y_true=y_test, y_pred=predictions1)
print("Accuracy of the Logistic Regression Model is: ", logreg_acc)

# LogisticRegression without PCA

In [None]:
# Logistic regression on the original (unscaled, non-PCA) training features,
# with an iteration cap of 900.
logreg = LogisticRegression(max_iter=900).fit(X_train, y_train)

# Predict on the test split and report accuracy.
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

# Support Vector Machine

In [None]:
# SVMs are not scale invariant, so the model is trained and evaluated on the
# standardized splits (scaler statistics learned from the training split only)
# instead of the raw columns, whose ranges differ widely.
svm_model = SVC(kernel='linear')

# Train the SVM model on the standardized training data
svm_model.fit(X_train_std, y_train)

# Make predictions on the standardized test data
y_pred = svm_model.predict(X_test_std)

# Calculate the accuracy of the SVM model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Gradient Boosting Classifier

In [None]:
# Seeded gradient-boosting classifier trained on the PCA-projected training split.
gb_classifier = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train_pca, y_train)

# Predict on the PCA-projected test split and report accuracy.
y_pred = gb_classifier.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

# Decision Tree Classifier

In [None]:
# Seeded depth-3 decision tree trained on the PCA-projected training split.
dt_classifier = DecisionTreeClassifier(
    max_depth=3,
    random_state=42,
).fit(X_train_pca, y_train)

# Predict on the PCA-projected test split and report accuracy.
y_pred = dt_classifier.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

# Random Forest Classifier without PCA

In [None]:
# Random forest on the original (non-PCA) features. The seed matches the other
# seeded models in this notebook so the reported accuracy is reproducible.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# train the model using the training data
model.fit(X_train, y_train)

# test the model using the testing data
accuracy = model.score(X_test, y_test)
print(f"Model accuracy: {accuracy}")

# Random Forest Classifier with PCA

In [None]:
# Random forest on the PCA-projected features. The seed matches the other
# seeded models in this notebook so the reported accuracy is reproducible.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# train the model using the training data
model.fit(X_train_pca, y_train)

# test the model using the testing data
accuracy = model.score(X_test_pca, y_test)
print(f"Model accuracy: {accuracy}")

# Random Forest Classifier with Feature Selection 

In [None]:
from sklearn.feature_selection import SelectFromModel #filter method - select from model

In [None]:
# Dimensions of the full feature matrix before any feature selection.
X.shape

In [None]:
# Rank features with a small, seeded random forest. The selector is fitted on
# the training split only, so the held-out rows do not influence which
# features are kept (fitting on the full X/y would leak test information).
sel = SelectFromModel(RandomForestClassifier(n_estimators=20, random_state=42))
sel.fit(X_train, y_train)

# Reduce the full feature matrix to the kept columns (used for the shape check).
selected_features = sel.transform(X)

# Boolean mask over the columns of X: True where the feature was kept.
sel.get_support()

In [None]:
# Shape after selection; the column count is the number of features the selector kept.
selected_features.shape

# Random Forest Classifier with PCA & Feature Selection

In [None]:
# Reduce both splits to the columns kept by the fitted selector `sel`.
X_train_sel = sel.transform(X_train)
X_test_sel = sel.transform(X_test)

# Standardize the selected features, then keep enough principal components to
# reach the same cumulative-variance `threshold` used for the earlier PCA.
# Both transformers are fitted on the training split only.
sel_scaler = StandardScaler()
X_train_sel_std = sel_scaler.fit_transform(X_train_sel)
X_test_sel_std = sel_scaler.transform(X_test_sel)

sel_pca = PCA(n_components=threshold, svd_solver='full')
X_train_sel_pca = sel_pca.fit_transform(X_train_sel_std)
X_test_sel_pca = sel_pca.transform(X_test_sel_std)

# Seeded so the reported accuracy is reproducible.
model = RandomForestClassifier(random_state=42)

# train the model using the training data only
model.fit(X_train_sel_pca, y_train)

# test the model on the held-out data; scoring on the rows it was fitted on
# would report training accuracy, not generalization
accuracy = model.score(X_test_sel_pca, y_test)
print(f"Model accuracy: {accuracy}")