In [0]:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
import seaborn as sns

In [0]:
# Location of the raw CSV file, named once so it is easy to find and repoint.
DATA_PATH = '/Volumes/workspace/default/rawdata/data.csv'
df = pd.read_csv(DATA_PATH)

In [0]:
# Render the freshly loaded DataFrame for a first visual inspection.
display(df)

Data set description

In [0]:
# Summarise the DataFrame: column names, non-null counts, dtypes and memory use.
# `DataFrame.info()` prints its report itself and returns None, so it is called
# directly; wrapping it in `display(...)` would only add a stray `None`.
df.info()

In [0]:
# Summary statistics for the DataFrame; include='all' covers non-numeric columns too.
df.describe(include='all')

Data analysis

In [0]:
# Count the missing entries in each column
missing_per_column = df.isna().sum()
display(missing_per_column)

In [0]:
# Show the dtype pandas assigned to each column.
display(df.dtypes)

In [0]:
# Report how many distinct (non-null) values each column holds
for col, n_unique in df.nunique().items():
    print(f"{col}: {n_unique} unique values")

In [0]:
# Plot one histogram per numeric column.
# `numeric_cols` is kept as a notebook-level name because the correlation
# cell reads it as well.
numeric_cols = df.select_dtypes(include=np.number).columns
numeric_df = df[numeric_cols]
numeric_df.hist(figsize=(12, 8))
plt.tight_layout()
plt.show()

Correlations

In [0]:
# Pairwise correlations between the numeric columns
corr_matrix = df.loc[:, numeric_cols].corr()
display(corr_matrix)

In [0]:
# Draw the numeric-column correlation matrix as an annotated heatmap,
# using an explicit figure/axes pair instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Feature Correlation Heatmap")
fig.tight_layout()
plt.show()

Data pre-processing

In [0]:
# Keep only the rows whose id is present
has_id = df['id'].notna()
df = df[has_id]

In [0]:
# Remove the six symptom columns (symptom1 ... symptom6), which are not used further
symptom_cols = [f'symptom{i}' for i in range(1, 7)]
df = df.drop(columns=symptom_cols)

In [0]:
# One-hot encode the categorical columns: location, country, gender
categorical_cols = ['location', 'country', 'gender']
df = pd.get_dummies(df, columns=categorical_cols)

In [0]:
# Parse the two date columns as datetimes; entries that cannot be parsed
# become NaT because of errors='coerce'.
for date_col in ('sym_on', 'hosp_vis'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

In [0]:
# Cast every boolean column to integers (True -> 1, False -> 0)
bool_cols = df.select_dtypes(include='bool').columns
df = df.astype({name: int for name in bool_cols})

In [0]:
# Convert the datetime columns to integer columns.
# The target dtype is spelled 'int64' explicitly: pandas only supports casting
# datetime64 data to 64-bit integers, whereas the builtin `int` can resolve to
# a narrower, platform-dependent integer type on some setups.
# NOTE(review): rows that are NaT at this point (the dates are parsed with
# errors='coerce') have no real timestamp to convert — confirm how missing
# dates should be represented before modelling.
datetime_cols = df.select_dtypes(include=['datetime64[ns]']).columns.tolist()
df[datetime_cols] = df[datetime_cols].astype('int64')

Split data into train and test

In [0]:
# Separate the target column ('death') from the feature columns.
# train_test_split is already imported in the notebook's import cell.
# NOTE(review): every remaining column, including 'id', is used as a feature —
# confirm that this is intended.
TARGET_COL = 'death'
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

For each model, I will include all three of these steps:
e. Evaluation metrics, including accuracy, precision, recall and F1 score;
f. Results;
g. Hyperparameter optimisation.

-- Random forest

In [0]:
# Fit a baseline random forest with default hyperparameters.
# RandomForestClassifier is already imported in the notebook's import cell;
# the fixed random_state keeps the fitted forest repeatable.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

In [0]:
# Score the baseline random forest on the held-out test set
y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# The three class-wise metrics share the same call signature, so compute them together.
precision, recall, f1 = [
    scorer(y_test, y_pred, average='binary')
    for scorer in (precision_score, recall_score, f1_score)
]

In [0]:
# Results: print each metric rounded to four decimal places
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{label}: {value:.4f}")

In [0]:
# Bar chart of the baseline random forest metrics, with a black line tracing
# the same four values across the bars.
# matplotlib.pyplot is already imported as `plt` in the notebook's import cell.
metric_labels = ['Accuracy', 'F1 Score', 'Recall Score', 'Precision Score']
metric_values = [accuracy, f1, recall, precision]

plt.bar(
    metric_labels,
    metric_values,
    color=['red', 'green', 'purple', 'orange']
)
plt.plot(
    metric_values,
    color='black'
)
# The metrics plotted here come from the random forest, so the title names it
# (it previously read "Decision Tree").
plt.title('Evaluation Metrics for Random Forest')
plt.show()

In [0]:
# Hyperparameter optimisation for the random forest.

# Candidate values for each hyperparameter; the grid search tries every
# combination (3 * 4 * 3 * 3 * 2 = 216 candidates).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Fresh, unfitted estimator for the search to clone
rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated search scored by F1, run on all available cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Estimator refitted with the best-scoring hyperparameter combination
best_rf = grid_search.best_estimator_

In [0]:
# Score the fine-tuned random forest on the held-out test set
y_pred = best_rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# The three class-wise metrics share the same call signature, so compute them together.
precision, recall, f1 = [
    scorer(y_test, y_pred, average='binary')
    for scorer in (precision_score, recall_score, f1_score)
]

In [0]:
# Results: print each metric rounded to four decimal places
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{label}: {value:.4f}")

In [0]:
# Bar chart of the fine-tuned random forest metrics, with a black line tracing
# the same four values across the bars.
# matplotlib.pyplot is already imported as `plt` in the notebook's import cell.
metric_labels = ['Accuracy', 'F1 Score', 'Recall Score', 'Precision Score']
metric_values = [accuracy, f1, recall, precision]

plt.bar(
    metric_labels,
    metric_values,
    color=['red', 'green', 'purple', 'orange']
)
plt.plot(
    metric_values,
    color='black'
)
# The metrics plotted here come from the tuned random forest, so the title
# names it (it previously read "Decision Tree").
plt.title('Evaluation Metrics for Tuned Random Forest')
plt.show()

Next, I will repeat the same steps with other models to compare their performance.

-- XGBoost

In [0]:
%pip install xgboost
from xgboost import XGBClassifier

xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)

In [0]:
# Score the baseline XGBoost model on the held-out test set
y_pred = xgb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# The three class-wise metrics share the same call signature, so compute them together.
precision, recall, f1 = [
    scorer(y_test, y_pred, average='binary')
    for scorer in (precision_score, recall_score, f1_score)
]

In [0]:
# Results: print each metric rounded to four decimal places
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{label}: {value:.4f}")

In [0]:
# Bar chart of the baseline XGBoost metrics, with a black line tracing the
# same four values across the bars.
# matplotlib.pyplot is already imported as `plt` in the notebook's import cell.
metric_labels = ['Accuracy', 'F1 Score', 'Recall Score', 'Precision Score']
metric_values = [accuracy, f1, recall, precision]

plt.bar(
    metric_labels,
    metric_values,
    color=['red', 'green', 'purple', 'orange']
)
plt.plot(
    metric_values,
    color='black'
)
# The metrics plotted here come from XGBoost, so the title names it
# (it previously read "Decision Tree").
plt.title('Evaluation Metrics for XGBoost')
plt.show()

In [0]:
# Hyperparameter optimisation for XGBoost.

# Candidate values for each hyperparameter; the grid search tries every
# combination (3 * 3 * 3 * 2 * 2 = 108 candidates).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Fresh, unfitted estimator for the search to clone.
# `use_label_encoder` is no longer passed: the option is deprecated in XGBoost
# and has no effect in current releases beyond triggering a warning.
xgb = XGBClassifier(random_state=42, eval_metric='logloss')

# 5-fold cross-validated search scored by F1, run on all available cores
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)

# Estimator refitted with the best-scoring hyperparameter combination
best_xgb = grid_search.best_estimator_

In [0]:
# Evaluate the fine-tuned XGBoost model on the held-out test set.
# Without this step the values printed below would still be the metrics of
# the untuned XGBoost model computed in an earlier cell.
y_pred = best_xgb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

# Results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

In [0]:
# Bar chart of the metrics for the XGBoost hyperparameter-optimisation step,
# with a black line tracing the same four values across the bars.
# matplotlib.pyplot is already imported as `plt` in the notebook's import cell.
metric_labels = ['Accuracy', 'F1 Score', 'Recall Score', 'Precision Score']
metric_values = [accuracy, f1, recall, precision]

plt.bar(
    metric_labels,
    metric_values,
    color=['red', 'green', 'purple', 'orange']
)
plt.plot(
    metric_values,
    color='black'
)
# This chart belongs to the tuned XGBoost section, so the title names that
# model (it previously read "Decision Tree").
plt.title('Evaluation Metrics for Tuned XGBoost')
plt.show()

-- Decision tree classifier

In [0]:
from sklearn.tree import DecisionTreeClassifier

# Fit a baseline decision tree with default hyperparameters.
# NOTE(review): the name `dt` rebinds the `datetime as dt` alias from the
# import cell; it is kept because the evaluation cell reads `dt`, but a more
# specific name would avoid the clash.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

In [0]:
# Score the decision tree on the held-out test set
y_pred = dt.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# The three class-wise metrics share the same call signature, so compute them together.
precision, recall, f1 = [
    scorer(y_test, y_pred, average='binary')
    for scorer in (precision_score, recall_score, f1_score)
]

In [0]:
# Results: print each metric rounded to four decimal places
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{label}: {value:.4f}")

In [0]:
# Bar chart of the decision tree metrics, with a black line tracing the same
# four values across the bars. Drawn through an explicit figure/axes pair;
# matplotlib.pyplot is already imported as `plt` in the notebook's import cell.
metric_labels = ['Accuracy', 'F1 Score', 'Recall Score', 'Precision Score']
metric_values = [accuracy, f1, recall, precision]

fig, ax = plt.subplots()
ax.bar(metric_labels, metric_values, color=['red', 'green', 'purple', 'orange'])
ax.plot(metric_values, color='black')
ax.set_title('Evaluation Metrics for Decision Tree')
plt.show()