In [None]:
## This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Load the competition's training and test sets, one frame per statement.
train = pd.read_csv("/kaggle/input/playground-series-s3e22/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s3e22/test.csv")

In [None]:
# Display summary statistics of the training data.
train.describe()

In [None]:
# Number of distinct hospital_number values in the training data.
train["hospital_number"].unique().size

In [None]:
# Remove the id and hospital_number columns from the training frame.
# errors="ignore" keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
train = train.drop(columns=["id", "hospital_number"], errors="ignore")

In [None]:
# Display the training frame after the identifier columns were dropped.
train

Convert categorical columns from object dtype to the pandas categorical dtype, then to numeric category codes

In [None]:
def cat_to_numeric(df):
    """Return a copy of ``df`` with text columns replaced by integer category codes.

    Every column whose dtype is ``object`` or a pandas string dtype is cast to
    the ``category`` dtype and then replaced by its ``.cat.codes``. The codes
    are derived from the values present in the frame passed in, so two frames
    encoded separately only agree if they contain the same set of values per
    column. Missing values are encoded as ``-1`` (the ``.cat.codes`` convention).

    The input frame is not modified; the work is done on a copy so the function
    is safe to call on a column slice of another frame and safe to re-run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain text columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, text columns replaced by integer codes.
    """
    encoded = df.copy()
    for col_name in encoded.columns:
        col_dtype = encoded[col_name].dtype
        # Text columns may be stored as plain object dtype or as a dedicated
        # string dtype depending on the pandas version/configuration.
        is_text = (pd.api.types.is_object_dtype(col_dtype)
                   or pd.api.types.is_string_dtype(col_dtype))
        if is_text:
            encoded[col_name] = encoded[col_name].astype('category').cat.codes
    return encoded

In [None]:
# Replace every object-typed column of the training frame with integer category codes.
train = cat_to_numeric(train)

Check for correlation

In [None]:
# Pairwise correlation between the columns of the encoded training frame.
corr_matrix = train.corr()

In [None]:
# Display the correlation matrix as a table.
corr_matrix

In [None]:
# Draw the correlation heatmap on an explicit, larger Axes so the many column
# labels stay legible, and give the figure a title so it stands on its own.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, ax=ax)
ax.set_title("Correlation matrix of the encoded training data")
plt.show()

In [None]:
# Histograms of the surgery and outcome columns.
train.hist(column=["surgery", "outcome"])
plt.show()

In [None]:
# rectal_temp, mucous_membrane, pain, nasogastric_tube, nasogastric_reflux, nasogastric_reflux_ph, abdomo_protein, lesion_1,	lesion_2,	lesion_3,	cp_data

# Naive approach: XGBoost on the full data

In [None]:
# Columns used as model inputs; the target column is "outcome".
train_feature_columns = [
    "surgery",
    "age",
    "respiratory_rate",
    "peripheral_pulse",
    "peristalsis",
    "abdominal_distention",
    "rectal_exam_feces",
    "abdomen",
    "rectal_temp",
    "mucous_membrane",
    "pain",
    "nasogastric_tube",
    "nasogastric_reflux",
    "nasogastric_reflux_ph",
    "abdomo_protein",
    "total_protein",
    "lesion_1",
    "lesion_2",
    "lesion_3",
    "cp_data",
]
X_train = train[train_feature_columns]
y_train = train["outcome"]

In [None]:
# XGBoost classifier with default hyperparameters and a fixed random seed.
clf = XGBClassifier(random_state=42)

In [None]:
# Fit on the full training set; this fitted model is the one passed to create_submission.
clf.fit(X_train, y_train)

In [None]:
# 5-fold cross-validated micro-averaged F1 of the classifier on the training data.
cv_scores = cross_val_score(
    clf,
    X_train,
    y_train,
    scoring="f1_micro",
    cv=5,
    verbose=4,
)
print(cv_scores)

# Create submission

In [None]:
def create_submission(model, test_df):
    """Predict outcomes for ``test_df`` and write them to ``submission.csv``.

    The feature columns are selected from ``test_df``, text columns are turned
    into integer codes with ``cat_to_numeric``, ``model.predict`` is called on
    the result, and the predicted class codes are translated back to outcome
    labels. The file has two columns, ``id`` and ``outcome``, and no index.

    Parameters
    ----------
    model : fitted estimator
        Object exposing ``predict(features)`` that returns class codes
        0, 1 or 2 for each row.
    test_df : pandas.DataFrame
        Frame containing an ``id`` column and every feature column listed below.

    Returns
    -------
    None
        The result is written to ``submission.csv`` in the working directory.
    """
    feature_columns = [
        "surgery",
        "age",
        "respiratory_rate",
        "peripheral_pulse",
        "peristalsis",
        "abdominal_distention",
        "rectal_exam_feces",
        "abdomen",
        "rectal_temp",
        "mucous_membrane",
        "pain",
        "nasogastric_tube",
        "nasogastric_reflux",
        "nasogastric_reflux_ph",
        "abdomo_protein",
        "total_protein",
        "lesion_1",
        "lesion_2",
        "lesion_3",
        "cp_data",
    ]
    # Assumes the model was trained on outcome codes in this order — confirm
    # against how the training target was encoded.
    label_by_code = {0: "died", 1: "euthanized", 2: "lived"}

    ids = test_df["id"]

    # Take an explicit copy of the column subset so the encoding step never
    # writes into a slice of the caller's frame (avoids SettingWithCopyWarning
    # and leaves the caller's test frame untouched).
    features = test_df[feature_columns].copy()
    # NOTE(review): cat_to_numeric derives codes from the values present in the
    # frame it is given, so these codes only match the training encoding if
    # both frames contain the same set of values per column — verify.
    features = cat_to_numeric(features)

    # Translate codes to labels with a single mapping instead of writing
    # strings into an integer Series through boolean masks, which is an
    # incompatible-dtype assignment in recent pandas.
    pred = pd.Series(model.predict(features)).map(label_by_code)

    # Pair ids and predictions by position, not by index label, so a test
    # frame with a non-default index cannot misalign rows or introduce NaNs.
    submission_df = pd.DataFrame({
        "id": ids.to_numpy(),
        "outcome": pred.to_numpy(),
    })

    submission_df.to_csv("submission.csv", index=False)

In [None]:
# Write submission.csv using the fitted classifier and the raw test frame.
create_submission(clf, test)