In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import kagglehub
import os
import warnings
import torch
from torch import nn
from sklearn.model_selection import train_test_split

In [49]:
warnings.filterwarnings("ignore")

## Load Dataset

In [50]:
path = kagglehub.dataset_download("fedesoriano/cirrhosis-prediction-dataset")

path



'/root/.cache/kagglehub/datasets/fedesoriano/cirrhosis-prediction-dataset/versions/2'

In [51]:
os.listdir(path)[0]

'cirrhosis.csv'

## Data PreProcessing

In [52]:
df = pd.read_csv(f'{path}/{os.listdir(path)[0]}')

df.head()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


In [53]:
# check for missing values

def missing_values(df):
  missing_values = df[df.columns[1:]].isnull()
  return missing_values.sum(), len(df.columns)

missing_values(df)

(N_Days             0
 Status             0
 Drug             106
 Age                0
 Sex                0
 Ascites          106
 Hepatomegaly     106
 Spiders          106
 Edema              0
 Bilirubin          0
 Cholesterol      134
 Albumin            0
 Copper           108
 Alk_Phos         106
 SGOT             106
 Tryglicerides    136
 Platelets         11
 Prothrombin        2
 Stage              6
 dtype: int64,
 20)

In [54]:
# drop columns with missing values more than threshold

threshold = 0.8 * len(df)

df = df.dropna(thresh = threshold, axis = 1)

df.head()

Unnamed: 0,ID,N_Days,Status,Age,Sex,Edema,Bilirubin,Albumin,Platelets,Prothrombin,Stage
0,1,400,D,21464,F,Y,14.5,2.6,190.0,12.2,4.0
1,2,4500,C,20617,F,N,1.1,4.14,221.0,10.6,3.0
2,3,1012,D,25594,M,S,1.4,3.48,151.0,12.0,4.0
3,4,1925,D,19994,F,S,1.8,2.54,183.0,10.3,4.0
4,5,1504,CL,13918,F,N,3.4,3.53,136.0,10.9,3.0


In [55]:
missing_values(df)

(N_Days          0
 Status          0
 Age             0
 Sex             0
 Edema           0
 Bilirubin       0
 Albumin         0
 Platelets      11
 Prothrombin     2
 Stage           6
 dtype: int64,
 11)

In [56]:
# fill missing values

df['Platelets'] = df['Platelets'].fillna(df['Platelets'].median())
df['Prothrombin'] = df['Prothrombin'].fillna(df['Prothrombin'].mean())
df['Stage'] = df['Stage'].fillna(df['Stage'].median())

missing_values(df)

(N_Days         0
 Status         0
 Age            0
 Sex            0
 Edema          0
 Bilirubin      0
 Albumin        0
 Platelets      0
 Prothrombin    0
 Stage          0
 dtype: int64,
 11)

In [57]:
print(df['Sex'].unique())
print(df['Edema'].unique())
print(df['Status'].unique())

['F' 'M']
['Y' 'N' 'S']
['D' 'C' 'CL']


In [58]:
# converting str values in int

df['Sex'] = df['Sex'].map({'M': 0, 'F': 1})
df['Edema'] = df['Edema'].map({'N': 0, 'Y': 1, 'S': 0})
df['Status'] = df['Status'].map({'D': 0, 'C': 1, 'CL': 2})

df.head()

Unnamed: 0,ID,N_Days,Status,Age,Sex,Edema,Bilirubin,Albumin,Platelets,Prothrombin,Stage
0,1,400,0,21464,1,1,14.5,2.6,190.0,12.2,4.0
1,2,4500,1,20617,1,0,1.1,4.14,221.0,10.6,3.0
2,3,1012,0,25594,0,0,1.4,3.48,151.0,12.0,4.0
3,4,1925,0,19994,1,0,1.8,2.54,183.0,10.3,4.0
4,5,1504,2,13918,1,0,3.4,3.53,136.0,10.9,3.0


In [59]:
df = df.drop('ID', axis = 1)

df.head()

Unnamed: 0,N_Days,Status,Age,Sex,Edema,Bilirubin,Albumin,Platelets,Prothrombin,Stage
0,400,0,21464,1,1,14.5,2.6,190.0,12.2,4.0
1,4500,1,20617,1,0,1.1,4.14,221.0,10.6,3.0
2,1012,0,25594,0,0,1.4,3.48,151.0,12.0,4.0
3,1925,0,19994,1,0,1.8,2.54,183.0,10.3,4.0
4,1504,2,13918,1,0,3.4,3.53,136.0,10.9,3.0


In [60]:
print(f"We have to classify the data in {len(df['Stage'].unique())} classes ==> {df['Stage'].unique()}")

We have to classify the data in 4 classes ==> [4. 3. 2. 1.]


## Data Scaling

In [61]:
# function to apply standard scaler
def standard_scaler(df):

  new_df = pd.DataFrame()

  for c in df.columns:
    if c == 'Stage':
      new_df[c] = df[c]
      continue
    mean = df[c].mean()
    std = df[c].std()
    new_df[c] = (df[c] - mean) / std

  return new_df

df = standard_scaler(df)

## Split and Convert Data

In [62]:
# split the data in test train
X = df.drop(['Stage'], axis=1)
y = df['Stage']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

In [63]:
# convert data to tensors
X_train = torch.from_numpy(X_train.values).type(torch.float)
X_test = torch.from_numpy(X_test.values).type(torch.float)
y_train = torch.from_numpy(y_train.values).type(torch.LongTensor)
y_test = torch.from_numpy(y_test.values).type(torch.LongTensor)

In [64]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

(torch.Size([334, 9]),
 torch.Size([84, 9]),
 torch.Size([334]),
 torch.Size([84]))

In [67]:
torch.unique(y_train)

tensor([1, 2, 3, 4])

## Creating the Neural Network

In [68]:
# device agnostic code
device = "cuda" if torch.cuda.is_available() else "cpu"

device

'cpu'

In [21]:
# accuracy function
def accuracy_f(y_test, y_pred):
  correct = torch.eq(y_test, y_pred).sum().item()
  acc = (correct / len(y_pred)) * 100
  return acc

In [69]:
class MyModel(nn.Module):
  def __init__(self, input_features, output_features, hidden_units):

    super().__init__()
    self.linear_layer_stack = nn.Sequential(
        nn.Linear(in_features=input_features, out_features=hidden_units),
        nn.ReLU(),
        nn.Linear(in_features=hidden_units, out_features=hidden_units),
        nn.ReLU(),
        nn.Linear(in_features=hidden_units, out_features=output_features)
    )

  def forward(self, x):
    self.linear_layer_stack(x)

In [72]:
my_model = MyModel(input_features=X_train.shape[1], output_features= len(torch.unique(y_train)), hidden_units=8).to(device)
my_model

MyModel(
  (linear_layer_stack): Sequential(
    (0): Linear(in_features=9, out_features=8, bias=True)
    (1): ReLU()
    (2): Linear(in_features=8, out_features=8, bias=True)
    (3): ReLU()
    (4): Linear(in_features=8, out_features=4, bias=True)
  )
)

In [None]:
# loss function
loss_fn = nn.CrossEntropyLoss()

In [None]:
# optimizer
optimizer = torch.optim.Adam(params=my_model.parameters(), lr=0.01)