In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import tensorflow as tf

# Data

In [2]:
raw_train_data = pd.read_csv("/kaggle/input/playground-series-s3e26/train.csv")
raw_test_data = pd.read_csv("/kaggle/input/playground-series-s3e26/test.csv")

In [3]:
raw_train_data

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,999,D-penicillamine,21532,M,N,N,N,N,2.3,316.0,3.35,172.0,1601.0,179.80,63.0,394.0,9.7,3.0,D
1,1,2574,Placebo,19237,F,N,N,N,N,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,C
2,2,3428,Placebo,13727,F,N,Y,Y,Y,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,D
3,3,2576,Placebo,18460,F,N,N,N,N,0.6,256.0,3.50,58.0,1653.0,71.30,96.0,269.0,10.7,3.0,C
4,4,788,Placebo,16658,F,N,Y,N,N,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7900,7900,1166,D-penicillamine,16839,F,N,N,N,N,0.8,309.0,3.56,38.0,1629.0,79.05,224.0,344.0,9.9,2.0,C
7901,7901,1492,Placebo,17031,F,N,Y,N,N,0.9,260.0,3.43,62.0,1440.0,142.00,78.0,277.0,10.0,4.0,C
7902,7902,1576,D-penicillamine,25873,F,N,N,Y,S,2.0,225.0,3.19,51.0,933.0,69.75,62.0,200.0,12.7,2.0,D
7903,7903,3584,D-penicillamine,22960,M,N,Y,N,N,0.7,248.0,2.75,32.0,1003.0,57.35,118.0,221.0,10.6,4.0,D


In [4]:
raw_train_data.isna().sum()

id               0
N_Days           0
Drug             0
Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            0
Status           0
dtype: int64

In [5]:
y_train = raw_train_data["Status"]
X_train = raw_train_data.drop(columns=["id","Status"])

In [6]:
def convert_cat(x, cats):
    data = x.copy()
    for col in cats:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
    return data

In [7]:
le_cats = ["Drug","Sex","Ascites","Hepatomegaly","Spiders","Edema"]
X_train = convert_cat(X_train, le_cats)

In [8]:
X_train

Unnamed: 0,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,999,0,21532,1,0,0,0,0,2.3,316.0,3.35,172.0,1601.0,179.80,63.0,394.0,9.7,3.0
1,2574,1,19237,0,0,0,0,0,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0
2,3428,1,13727,0,0,1,1,2,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0
3,2576,1,18460,0,0,0,0,0,0.6,256.0,3.50,58.0,1653.0,71.30,96.0,269.0,10.7,3.0
4,788,1,16658,0,0,1,0,0,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7900,1166,0,16839,0,0,0,0,0,0.8,309.0,3.56,38.0,1629.0,79.05,224.0,344.0,9.9,2.0
7901,1492,1,17031,0,0,1,0,0,0.9,260.0,3.43,62.0,1440.0,142.00,78.0,277.0,10.0,4.0
7902,1576,0,25873,0,0,0,1,1,2.0,225.0,3.19,51.0,933.0,69.75,62.0,200.0,12.7,2.0
7903,3584,0,22960,1,0,1,0,0,0.7,248.0,2.75,32.0,1003.0,57.35,118.0,221.0,10.6,4.0


In [9]:
ss_cats = ["Bilirubin","Cholesterol","Albumin","Copper","Alk_Phos","SGOT","Tryglicerides","Platelets","Prothrombin"]
ss = StandardScaler()
X_train[ss_cats] = ss.fit_transform(X_train[ss_cats])

le = LabelEncoder()
y_train = pd.DataFrame(le.fit_transform(y_train))

# Split Data

In [10]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train)

# XGBoost Classifer

In [11]:
param_grid = {
    "eta":[0.01, 0.02, 0.05, 0.1, 0.2],
    "max_depth":list(range(3,11)),
    "colsample_bytree":[0.5,1],
    "objective":["multi:softprob"]
}
xgb_grid = GridSearchCV(XGBClassifier(), param_grid, n_jobs=-1)
xgb_grid.fit(X_train, y_train)
xgb_pred = pd.DataFrame(xgb_grid.predict_proba(X_valid))
print(xgb_pred)

             0         1         2
0     0.989372  0.000735  0.009892
1     0.833941  0.103677  0.062382
2     0.986767  0.000460  0.012772
3     0.994630  0.000939  0.004431
4     0.012459  0.002849  0.984691
...        ...       ...       ...
1576  0.400094  0.512257  0.087649
1577  0.984211  0.000567  0.015222
1578  0.743720  0.019092  0.237188
1579  0.966413  0.002615  0.030972
1580  0.739873  0.049127  0.211000

[1581 rows x 3 columns]


# Tensorflow

In [12]:
def train_model_split(X_train, y_train, num_nodes, dropout_prob, learning_rate, batch_size, epochs):
    neuralnet = tf.keras.Sequential([
        tf.keras.layers.Dense(num_nodes, activation="relu", input_shape=(18,)),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(num_nodes, activation="relu"),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])
    neuralnet.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
                     loss="binary_crossentropy",
                     metrics=["accuracy"])
    history = neuralnet.fit(X_train,
                            y_train,
                            epochs=epochs,
                            batch_size=batch_size,
                            validation_split=0.2,
                            verbose=0
                           )
    return neuralnet, history

In [13]:
def train_model_data(X_train, y_train, num_nodes, dropout_prob, learning_rate, batch_size, epochs):
    neuralnet = tf.keras.Sequential([
        tf.keras.layers.Dense(num_nodes, activation="relu", input_shape=(18,)),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(num_nodes, activation="relu"),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])
    neuralnet.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
                     loss="binary_crossentropy",
                     metrics=["accuracy"])
    history = neuralnet.fit(X_train,
                            y_train,
                            epochs=epochs,
                            batch_size=batch_size,
                            validation_data=(X_valid, y_valid),
                            verbose=0
                           )
    return neuralnet, history

In [14]:
least_validation_loss_split = float("inf")
least_validation_loss_data = float("inf")
least_loss_model_split = None
least_loss_model_data = None

epochs = 100
for num_nodes in [16,32,64]:
  for dropout_prob in [0,0.2]:
    for learning_rate in [0.01,0.005,0.001]:
      for batch_size in [32,64,128]:
        print(f"nodes: {num_nodes}, dropout: {dropout_prob}, learning rate: {learning_rate}, batch size: {batch_size}")
        print("validation_split = 0.2")
        model,history = train_model_split(X_train,y_train,num_nodes,dropout_prob,learning_rate,batch_size,epochs)
        val_loss, val_accuracy = model.evaluate(X_valid,y_valid)
        if val_loss < least_validation_loss_split:
          least_validation_loss_split = val_loss
          least_loss_model_split = model

        print("validation_data = (X_valid,y_valid)")
        model,history = train_model_data(X_train,y_train,num_nodes,dropout_prob,learning_rate,batch_size,epochs)
        val_loss, val_accuracy = model.evaluate(X_valid,y_valid)
        if val_loss < least_validation_loss_data:
          least_validation_loss_data = val_loss
          least_loss_model_data = model

nodes: 16, dropout: 0, learning rate: 0.01, batch size: 32
validation_split = 0.2
validation_data = (X_valid,y_valid)
nodes: 16, dropout: 0, learning rate: 0.01, batch size: 64
validation_split = 0.2
validation_data = (X_valid,y_valid)
nodes: 16, dropout: 0, learning rate: 0.01, batch size: 128
validation_split = 0.2
validation_data = (X_valid,y_valid)
nodes: 16, dropout: 0, learning rate: 0.005, batch size: 32
validation_split = 0.2
validation_data = (X_valid,y_valid)
nodes: 16, dropout: 0, learning rate: 0.005, batch size: 64
validation_split = 0.2
validation_data = (X_valid,y_valid)


KeyboardInterrupt: 

Idk why Tensorflow is performing so poorly and I'm kind of lazy to debug 🤔.

# Test Data

In [15]:
raw_test_data

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,7905,3839,D-penicillamine,19724,F,N,Y,N,N,1.2,546.0,3.37,65.0,1636.0,151.90,90.0,430.0,10.6,2.0
1,7906,2468,D-penicillamine,14975,F,N,N,N,N,1.1,660.0,4.22,94.0,1257.0,151.90,155.0,227.0,10.0,2.0
2,7907,51,Placebo,13149,F,N,Y,N,Y,2.0,151.0,2.96,46.0,961.0,69.75,101.0,213.0,13.0,4.0
3,7908,2330,D-penicillamine,20510,F,N,N,N,N,0.6,293.0,3.85,40.0,554.0,125.55,56.0,270.0,10.6,2.0
4,7909,1615,D-penicillamine,21904,F,N,Y,N,N,1.4,277.0,2.97,121.0,1110.0,125.00,126.0,221.0,9.8,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,13171,2870,Placebo,12279,F,N,N,N,N,1.3,302.0,3.43,75.0,1345.0,145.00,44.0,181.0,10.6,3.0
5267,13172,1770,Placebo,24803,F,N,N,N,N,0.5,219.0,4.09,121.0,663.0,79.05,94.0,311.0,9.7,3.0
5268,13173,3707,D-penicillamine,16990,F,N,Y,N,N,0.8,315.0,4.09,13.0,1637.0,170.50,70.0,426.0,10.9,3.0
5269,13174,1216,Placebo,11773,F,N,N,N,N,0.7,329.0,3.80,52.0,678.0,57.00,126.0,306.0,10.2,1.0


In [16]:
test_data = convert_cat(raw_test_data.drop(columns=["id"]), le_cats)
ss = StandardScaler()
test_data[ss_cats] = ss.fit_transform(test_data[ss_cats])

In [17]:
y_pred = pd.DataFrame(xgb_grid.predict_proba(test_data))
y_pred.rename(columns = {0:"Status_C", 1:"Status_CL", 2:"Status_D"}, inplace=True)
print(y_pred)

      Status_C  Status_CL  Status_D
0     0.581837   0.012583  0.405579
1     0.794302   0.135331  0.070368
2     0.021444   0.012666  0.965890
3     0.983463   0.002006  0.014530
4     0.937892   0.014538  0.047570
...        ...        ...       ...
5266  0.974090   0.011070  0.014840
5267  0.986944   0.002436  0.010620
5268  0.953538   0.002780  0.043683
5269  0.995082   0.002298  0.002620
5270  0.204976   0.005953  0.789071

[5271 rows x 3 columns]


In [19]:
final = pd.concat([raw_test_data["id"], y_pred], axis=1)
final.to_csv("submission.csv", index=False)