<a href="https://colab.research.google.com/github/sumanankur108/eda-theory-da/blob/main/module_2_21bds0097.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Shell escape: list the contents of the current working directory.
!ls

In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv("liver_data.csv")

# Display basic information.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

# Display the first few rows (a bare last expression gets the rich table view)
df.head()

In [None]:
# Ensure column names are stripped of leading/trailing spaces
df.columns = df.columns.str.strip()

# Fill the gaps in every numeric column with that column's mean.
# NOTE(review): numerical_cols covers *all* numeric columns, so a missing value
# in the 'Result' label column would also be mean-filled - confirm that is intended.
numerical_cols = df.select_dtypes(include=['number']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

# Drop rows with missing categorical values.
# Reassigning (instead of inplace=True) keeps the step explicit and chainable.
df = df.dropna(subset=['Gender of the patient'])

# Display updated dataset information.
# DataFrame.info() prints its own report and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
df.info()

In [None]:
# Short aliases for the verbose lab-test headers; labels not listed here are kept as they are
column_aliases = {
    'Alkphos Alkaline Phosphotase': 'Alkaline_Phosphatase',
    'Sgpt Alamine Aminotransferase': 'SGPT',
    'Sgot Aspartate Aminotransferase': 'SGOT',
    'Total Protiens': 'Total_Proteins',
    'ALB Albumin': 'Albumin',
    'A/G Ratio Albumin and Globulin Ratio': 'A_G_Ratio',
}

# Trim surrounding whitespace from every header, then apply the alias where one exists
df.columns = [column_aliases.get(label, label) for label in df.columns.str.strip()]

# Show the resulting column index
print(df.columns)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot histogram of Total Bilirubin (with a KDE overlay)
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['Total Bilirubin'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Total Bilirubin')
plt.show()

# Convert categorical column ('Gender of the patient') into numeric values (Male = 1, Female = 0).
# Series.map() turns every value that is not a key of the mapping into NaN, so
# running it on an already-encoded column would blank the whole column. The
# dtype guard makes this cell safe to re-run.
gender_col = 'Gender of the patient'
if not pd.api.types.is_numeric_dtype(df[gender_col]):
    df[gender_col] = df[gender_col].map({'Male': 1, 'Female': 0})

# Keep only the numeric columns: corr() is computed on numeric data
df_numeric = df.select_dtypes(include=['number'])

# Correlation heatmap of all numeric features, annotated to two decimals
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df_numeric.corr(), annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()


In [None]:
# Convert Gender column to numerical (Male=1, Female=0).
# The plotting cell above already applies this mapping. Series.map() replaces
# every value that is not a key of the dict with NaN, so mapping the encoded
# 0/1 values a second time would turn the entire column into NaN. Encode only
# while the column is still non-numeric.
gender_col = 'Gender of the patient'
if not pd.api.types.is_numeric_dtype(df[gender_col]):
    df[gender_col] = df[gender_col].map({'Male': 1, 'Female': 0})

# Check unique values after conversion
print(df[gender_col].unique())


In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column (y) from the remaining predictor columns (X)
target_column = 'Result'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set: {X_train.shape}, Testing set: {X_test.shape}")


In [None]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("liver_data.csv")

# Ensure column names are stripped of leading/trailing spaces
df.columns = df.columns.str.strip()

# Convert categorical column ('Gender of the patient') to numeric
df['Gender of the patient'] = df['Gender of the patient'].map({'Male': 1, 'Female': 0})

# Select only numerical columns (kept as-is; later cells read numerical_cols)
numerical_cols = df.select_dtypes(include=['number']).columns

# IQR fences are only meaningful for continuous measurements. The encoded
# gender flag and the 'Result' label are categorical codes: when more than 75%
# of the rows share one code, Q1 == Q3, the IQR is 0, and every row carrying the
# other code would be discarded as an "outlier". Exclude both from the fences.
outlier_cols = numerical_cols.drop(['Gender of the patient', 'Result'], errors='ignore')

# Compute IQR
Q1 = df[outlier_cols].quantile(0.25)
Q3 = df[outlier_cols].quantile(0.75)
IQR = Q3 - Q1

# Define outlier bounds (Tukey fences at 1.5 * IQR)
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out outliers: drop a row if any continuous column falls outside its fences.
# Comparisons against NaN are False, so missing values never flag a row.
is_outlier = (df[outlier_cols] < lower_bound) | (df[outlier_cols] > upper_bound)
df_filtered = df[~is_outlier.any(axis=1)]

# Display dataset shape before and after removing outliers
print(f"Original dataset size: {df.shape}")
print(f"Filtered dataset size: {df_filtered.shape}")


In [None]:
from scipy.stats import zscore

# Compute Z-scores for numerical columns.
# nan_policy='omit' makes scipy ignore missing values when computing each
# column's mean/std. With the default ('propagate') a single NaN makes every
# score in that column NaN, and the filter below would then reject every row.
z_scores = df[numerical_cols].apply(zscore, nan_policy='omit')

# Define threshold for outliers (commonly 3)
threshold = 3

# Filter dataset by removing outliers.
# Use the absolute score so values far *below* the mean are caught as well as
# values far above it. A NaN score compares False, so a missing value on its
# own does not cause the row to be dropped.
is_outlier = z_scores.abs() >= threshold
df_filtered_z = df[~is_outlier.any(axis=1)]

# Display dataset shape before and after removing outliers
print(f"Original dataset size: {df.shape}")
print(f"Filtered dataset size: {df_filtered_z.shape}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One box per numeric column on a single 12x8 axes, tick labels rotated 45 degrees
fig, ax = plt.subplots(figsize=(12, 8))
df[numerical_cols].boxplot(rot=45, ax=ax)
ax.set_title("Boxplot for Outlier Detection")
plt.show()

In [None]:
from scipy.stats.mstats import winsorize

# Apply Winsorization (Cap at 5th and 95th percentile).
# Work on a copy so the un-capped frame stays available.
df_winsorized = df.copy()
for col in numerical_cols:
    # Limits: 5% lower & upper.
    # nan_policy='omit' leaves missing values untouched. With the default
    # ('propagate') NaNs sort to the top of the column and may be overwritten
    # by the upper cap value, silently imputing them.
    df_winsorized[col] = winsorize(df[col], limits=[0.05, 0.05], nan_policy='omit')

print(df_winsorized.describe())

In [None]:
import pandas as pd

# Read the CSV afresh and trim stray whitespace around every header name
df = pd.read_csv("liver_data.csv").rename(columns=str.strip)

# Derived feature. Despite the column name 'Bilirubin Ratio', the value stored
# is the product Total Bilirubin x Direct Bilirubin, not a quotient.
total_bilirubin = df['Total Bilirubin']
direct_bilirubin = df['Direct Bilirubin']
df['Bilirubin Ratio'] = total_bilirubin.mul(direct_bilirubin)

# Preview the first rows, including the new column
df.head()

In [None]:
import numpy as np

# Cut-off above which a 'Bilirubin Ratio' value is treated as extreme
threshold = 10

# Boolean mask of the rows that exceed the cut-off, then the matching rows
exceeds_cutoff = df['Bilirubin Ratio'].gt(threshold)
outliers = df.loc[exceeds_cutoff]

# Show the flagged rows
outliers


In [None]:
# Rows whose 'Bilirubin Ratio' is strictly greater than 15
df.loc[df['Bilirubin Ratio'].gt(15)]

In [None]:
# Derived feature: Liver Enzyme Ratio = SGOT divided by SGPT, row by row
sgot = df['Sgot Aspartate Aminotransferase']
sgpt = df['Sgpt Alamine Aminotransferase']
df['Liver Enzyme Ratio'] = sgot.div(sgpt)

# Preview the first rows, including the new column
df.head()

In [None]:
# Select the rows whose Liver Enzyme Ratio is strictly above 2.0
ratio_above_two = df['Liver Enzyme Ratio'].gt(2.0)
high_enzyme_cases = df.loc[ratio_above_two]

# Show the selected rows
high_enzyme_cases

In [None]:
# Rows whose Liver Enzyme Ratio is strictly greater than 3.0
df.loc[df['Liver Enzyme Ratio'].gt(3.0)]

In [None]:
import pandas as pd

# Load dataset
df = pd.read_csv("liver_data.csv")

# Ensure column names are stripped of leading/trailing spaces
df.columns = df.columns.str.strip()

# Define age bins and labels.
# The last edge is open-ended so that 'Above 70' really covers every age above
# 70; with a hard upper edge of 100, any larger value would fall outside the
# bins and become NaN.
age_bins = [0, 30, 40, 50, 60, 70, float('inf')]  # Define bins
age_labels = ['Below 30', '30-40', '40-50', '50-60', '60-70', 'Above 70']  # Define category labels

# Apply binning.
# Intervals are right-closed, e.g. (30, 40]; include_lowest=True also closes the
# first interval on the left so an age of exactly 0 lands in 'Below 30' instead
# of becoming NaN.
df['Age Group'] = pd.cut(
    df['Age of the patient'],
    bins=age_bins,
    labels=age_labels,
    include_lowest=True,
)

# Display the dataset with new binning column
df[['Age of the patient', 'Age Group']].head()

In [None]:
# Define Bilirubin bins and labels.
# The top edge is open-ended: with a hard upper edge of 50.0, any reading above
# 50 would fall outside every bin and be labelled NaN instead of 'Critical'.
bilirubin_bins = [0, 0.5, 1.2, 3.0, 10.0, float('inf')]  # Define ranges
bilirubin_labels = ['Normal', 'Mild', 'Moderate', 'High', 'Critical']  # Define labels

# Apply binning.
# Intervals are right-closed, e.g. (0.5, 1.2]; include_lowest=True also closes
# the first interval on the left so a reading of exactly 0 is labelled 'Normal'
# rather than NaN.
df['Bilirubin Level'] = pd.cut(
    df['Total Bilirubin'],
    bins=bilirubin_bins,
    labels=bilirubin_labels,
    include_lowest=True,
)

# Display dataset with new binning column
df[['Total Bilirubin', 'Bilirubin Level']].head()
