# Simple dataset review (raw dataset, before any preprocessing)

In [None]:
import os

import pandas as pd

from src.config import get_data_cfg
from src.constants import PROJECT_ROOT

## Read and describe the data

**Set the path to the data config in the `CONFIG_PATH` environment variable below** (the path must be relative to `PROJECT_ROOT`). If the variable is not set, the default config is used.

In [None]:
# Export the data-config location as the CONFIG_PATH environment variable.
# Keep any comment on its own line: text after the value would become part of it.
%env CONFIG_PATH=configs/mlp_heart_data_config.yaml

In [None]:
# Load the data config named by the `CONFIG_PATH` environment variable, i.e. the
# variable this notebook sets via `%env`. The previous lookup used a different
# variable name, so the configured path was never picked up.
# `os.getenv` returns None when the variable is unset; `get_data_cfg` is then
# expected to fall back to the default config (TODO confirm in src.config).
cfg = get_data_cfg(os.getenv('CONFIG_PATH'))

# The CSV location stored in the config is resolved against PROJECT_ROOT.
csv_path = cfg.raw_csv_path
df = pd.read_csv(PROJECT_ROOT / csv_path)

In [None]:
# Show the raw frame exactly as it was read from the CSV.
df

In [None]:
# Summary statistics of the raw frame, before any rows are dropped.
df.describe()

## Find and drop missing values

In [None]:
# Count the explicitly missing (NA) entries in every column of the raw frame.
df.isna().sum()

Values of 0 in `Cholesterol` and `RestingBP` can be treated as missing and should therefore be removed or imputed.

In [None]:
# Rows in which at least one of the two measurements is recorded as exactly 0.
df.loc[df['Cholesterol'].eq(0) | df['RestingBP'].eq(0)]

Drop the rows that contain these missing values.

In [None]:
# Keep only the rows where both measurements are strictly positive; the raw
# `df` is left untouched and the filtered result gets its own name.
df_no_na = df.loc[df['Cholesterol'].gt(0) & df['RestingBP'].gt(0)]
df_no_na

In [None]:
# Summary statistics after the zero-valued rows were filtered out.
df_no_na.describe()

## Categorical variables

In [None]:
def count_cats(col: pd.Series) -> pd.Series:
    """Return the relative frequency of each distinct value in ``col``.

    Args:
        col: Series whose values are tallied.

    Returns:
        A Series indexed by the distinct values of ``col`` (in the order that
        ``value_counts`` yields them), holding each value's share of the total
        rounded to two decimal places with the built-in ``round``.
    """
    def _to_two_decimals(freq):
        return round(freq, 2)

    shares = col.value_counts(normalize=True)
    return shares.apply(_to_two_decimals)

In [None]:
# Columns treated as categorical features in this review.
categorical_cols = [
    'Sex',
    'ChestPainType',
    'FastingBS',
    'RestingECG',
    'ExerciseAngina',
    'ST_Slope',
]

# Print the rounded value shares of every categorical column, each table
# followed by one blank line.
for col in categorical_cols:
    print(count_cats(df_no_na[col]), end='\n\n')

## Target class balance

In [None]:
# Share of each `HeartDisease` value among the rows kept after filtering.
count_cats(df_no_na['HeartDisease'])