In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s6e2/sample_submission.csv
/kaggle/input/playground-series-s6e2/train.csv
/kaggle/input/playground-series-s6e2/test.csv


In [2]:
# Speed switch: the plotting calls wrapped in `if not fast:` are skipped while this is True.
fast = True

In [3]:
# Display the first two rows of the sample submission file.
pd.read_csv("/kaggle/input/playground-series-s6e2/sample_submission.csv").head(2)

Unnamed: 0,id,Heart Disease
0,630000,0
1,630001,0


In [4]:
# Read the competition's training and test CSV files into DataFrames.
train, test = map(
    pd.read_csv,
    (
        "/kaggle/input/playground-series-s6e2/train.csv",
        "/kaggle/input/playground-series-s6e2/test.csv",
    ),
)

In [5]:
# Display the first two rows of the training frame.
train.head(2)

Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,Presence
1,1,52,1,1,125,325,0,2,171,0,0.0,1,0,3,Absence


In [6]:
# Every numeric column of the test frame, in column order.
num_feats = list(test.select_dtypes(include='number').columns)

# Numeric columns with exactly two distinct values are treated as boolean flags.
bool_feats = [name for name in num_feats if test[name].nunique() == 2]
# num_feats = [feat for feat in num_feats if feat not in bool_feats]

# Every non-numeric column of the test frame is treated as categorical.
cat_feats = list(test.select_dtypes(exclude='number').columns)

In [7]:
target = "Heart Disease"
# Encode the string labels as integers (Presence -> 1, Absence -> 0).
# The guard keeps this cell safe to re-run: mapping an already-encoded column
# would turn every 0/1 into NaN, because neither value is a key of the mapping.
if train[target].isin(["Presence", "Absence"]).any():
    train[target] = train[target].map({"Presence": 1, "Absence": 0})

### Distributions

In [8]:
def plot_num_distribution_grid(df, num_feats=num_feats, grouper=target, bins=50, ncols=3):
    """
    Plot a KDE + boxplot pair for each numerical feature in a grid layout.

    Each feature gets two stacked subplots (density on top, boxplot below) with the
    boxplot 1/3 the height of the density plot. Both plots share the same x-axis and
    are split by the ``grouper`` column.

    Note that the default values of ``num_feats`` and ``grouper`` are captured when
    this definition is executed; pass them explicitly to use values computed later.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the ``grouper`` column. It is not
        modified; the categorical conversion happens on a copy.
    num_feats : list of str
        Columns to plot, one grid cell each. Nothing is drawn when it is empty.
    grouper : str
        Column used as ``hue`` for the density plot and as the y-axis of the boxplot.
    bins : int
        Currently unused (the density plot takes no bin count); kept so existing
        calls that pass it keep working.
    ncols : int
        Number of grid columns.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    if len(num_feats) == 0:
        # A grid with zero rows cannot be created, so there is nothing to draw.
        return

    # Make the grouping column categorical on a copy so the caller's frame keeps
    # its original dtype (the boxplot uses this column as its categorical axis).
    plot_df = df.assign(**{grouper: df[grouper].astype('category')})
    nrows = int(np.ceil(len(num_feats) / ncols))

    fig = plt.figure(figsize=(ncols*6, nrows*4))
    gs = fig.add_gridspec(nrows=nrows, ncols=ncols, hspace=0.6)

    for i, feat in enumerate(num_feats):
        row = i // ncols
        col = i % ncols

        # Sub-grid with height ratio 3:1 and shared x-axis
        sub_gs = gs[row, col].subgridspec(2, 1, height_ratios=[3, 1])
        ax_hist = fig.add_subplot(sub_gs[0])
        ax_box = fig.add_subplot(sub_gs[1], sharex=ax_hist)

        # Density plot, one filled curve per group
        sns.kdeplot(data=plot_df, x=feat, hue=grouper, fill=True, ax=ax_hist)
        ax_hist.set_title(f"{feat} Distribution", fontsize=10, color='white', backgroundcolor='black')
        ax_hist.set_xlabel('')  # Remove x-label from the density plot
        ax_hist.tick_params(axis='x', labelbottom=False)  # Hide x-tick labels on the density plot

        # Boxplot (horizontal to match x-axis)
        sns.boxplot(data=plot_df, x=feat, y=grouper,
                    ax=ax_box,
                    flierprops=dict(marker='o', markerfacecolor='red',
                                    markeredgecolor='black', markersize=3))
        # Clear the label on the boxplot axes explicitly instead of relying on
        # whichever axes pyplot currently considers active.
        ax_box.set_ylabel('')

    plt.show()

In [9]:
import matplotlib.pyplot as plt
import seaborn as sns
# Draw the distribution grid for the training frame; skipped while `fast` is True.
if not fast:
    plot_num_distribution_grid(train)

# Feature Engineering

In [10]:
# Display the first two rows of the training frame again.
train.head(2)

Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,1
1,1,52,1,1,125,325,0,2,171,0,0.0,1,0,3,0


In [13]:
# Add the same hand-crafted columns to both frames; each frame is modified in place.
for df in (test, train):
    # Threshold flags, and the sum of those flags with the "FBS over 120" column.
    df["high_bp"] = df["BP"].ge(140).astype(int)
    df["high_chol"] = df["Cholesterol"].ge(240).astype(int)
    df["high_risk_count"] = df["high_bp"].add(df["high_chol"]).add(df["FBS over 120"])

    # 1 when the ST depression value is above zero.
    df["ST_pos"] = df["ST depression"].gt(0).astype(int)

    # Predicted maximum heart rate (220 minus age) and how the measured value compares.
    df["MaxHR_pred"] = df["Age"].rsub(220)
    # A zero prediction is replaced by NaN so the ratio never divides by zero.
    df["HR_ratio"] = df["Max HR"].div(df["MaxHR_pred"].replace(0, np.nan))
    df["HR_deficit"] = df["MaxHR_pred"].sub(df["Max HR"])

    # Vessel-count flags: at least one, and at least two.
    df["vessels_any"] = df["Number of vessels fluro"].gt(0).astype(int)
    df["vessels_2plus"] = df["Number of vessels fluro"].ge(2).astype(int)

    # Indicator columns for chest pain type 4 and for thallium codes 3, 6 and 7.
    df["cp_asymptomatic"] = df["Chest pain type"].eq(4).astype(int)
    df["thal_normal"] = df["Thallium"].eq(3).astype(int)
    df["thal_fixed"] = df["Thallium"].eq(6).astype(int)
    df["thal_reversible"] = df["Thallium"].eq(7).astype(int)

    # Pairwise products of raw columns.
    df["exang_x_st"] = df["Exercise angina"].mul(df["ST depression"])
    df["vessels_x_thal"] = df["Number of vessels fluro"].mul(df["Thallium"])

    # Flag for a positive exang_x_st; per the author, a 1 here is a very likely
    # indicator of heart disease.
    df["exang_x_st_bool"] = df["exang_x_st"].gt(0).astype(int)

    # Flag for age strictly above 50.
    df["age_over_50"] = df["Age"].gt(50).astype(int)
    # Flag for chest pain type 4 (same expression as cp_asymptomatic above).
    df["chest_pain_4"] = df["Chest pain type"].eq(4).astype(int)

In [15]:
# Rebuild the feature lists from the test frame's current columns.
# Every numeric column, in column order.
num_feats = list(test.select_dtypes(include='number').columns)

# Numeric columns with exactly two distinct values are treated as boolean flags.
bool_feats = [name for name in num_feats if test[name].nunique() == 2]
# num_feats = [feat for feat in num_feats if feat not in bool_feats]

# Every non-numeric column is treated as categorical.
cat_feats = list(test.select_dtypes(exclude='number').columns)

In [None]:
# Draw the distribution grid for the training frame using the current `num_feats`
# list (passed explicitly rather than relying on the function's default);
# skipped while `fast` is True.
if not fast:
    plot_num_distribution_grid(train, num_feats=num_feats)

In [None]:
# Pull one feature together with the target into an independent two-column copy
# for ad-hoc inspection, then show its shape.
feature = "Slope of ST"
experiment = train.loc[:, [feature, target]].copy()
# experiment = experiment[experiment[feature] <= 2][[feature, target]]
experiment.shape

In [None]:
# Plot only the selected feature, grouped by the target, from the two-column copy.
# NOTE(review): unlike the other plotting calls, this one is not wrapped in
# `if not fast:` — confirm that running it in fast mode is intended.
plot_num_distribution_grid(experiment, num_feats=[feature])

# Training