## Data pre-processing

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.model_selection import train_test_split

In [2]:
# Relative path of the BRFSS 2015 (binary target) parquet file.
path_data = "./data/BRFSS-2015_binary.parquet"

# df0 holds the data exactly as read; later cells derive `df` from it.
df0 = pd.read_parquet(path=path_data)

### Select rows

In [3]:
# Drop respondents younger than 30. Age is stored as a category code and,
# per the original note on this cell, codes 1 and 2 are the under-30 groups.
conditions = ~df0["Age"].isin([1, 2])

# Take an explicit copy so that `df` is independent of `df0`; this cell can
# then be re-run at any time to rebuild `df` from the untouched source frame.
df = df0.loc[conditions].copy()

In [4]:
# Sanity check: (rows, columns) of `df` after the age filter above.
df.shape

(238501, 22)

### Feature engineering

In [5]:
# --- BMI: standardise into a new column --------------------------------------
# NOTE(review): the scaler is fit on the full frame here, before the
# train/validation/test split performed further down, so its mean/std also
# include rows that end up in the validation and test partitions.
scaler = StandardScaler(with_std=True)
bmi_standardised = scaler.fit_transform(df[["BMI"]])
df["BMI_scaled"] = bmi_standardised.flatten()

# --- PhysHlth / MentHlth: bucket the day counts, then encode the buckets ------
# Bin edges are right-closed (pd.cut default): (-1, 0] -> '0', (0, 7] -> '1-7',
# ... (28, 30] -> '29-30'.
bins = [-1, 0, 7, 14, 21, 28, 30]
labels = ['0', '1-7', '8-14', '15-21', '22-28', '29-30']

# source column -> (bucket column, encoded column)
day_count_columns = {
    "PhysHlth": ("PhysHlth_group", "PhysHlth_encoded"),
    "MentHlth": ("MentHlth_group", "MentHlth_encoded"),
}

# First pass: add both bucket columns.
for source_col, (group_col, _) in day_count_columns.items():
    df[group_col] = pd.cut(df[source_col], bins=bins, labels=labels)

# Second pass: map each bucket to its position in `labels`. The encoder gets
# the bucket labels as its explicit category order and is re-fit per column.
ordinal_encoder = OrdinalEncoder(categories=[list(labels)])
for group_col, encoded_col in day_count_columns.values():
    df[encoded_col] = ordinal_encoder.fit_transform(df[[group_col]])

In [6]:
# The raw BMI / MentHlth / PhysHlth columns and the intermediate *_group
# buckets are superseded by BMI_scaled and the *_encoded columns.
superseded_columns = ["BMI", "MentHlth", "PhysHlth", "PhysHlth_group", "MentHlth_group"]
df = df.drop(columns=superseded_columns)

In [7]:
# Show the columns that remain after the drop above.
df.columns

Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'DiffWalk', 'Sex', 'Age', 'Education', 'Income', 'BMI_scaled',
       'PhysHlth_encoded', 'MentHlth_encoded'],
      dtype='object')

### Save data

In [8]:
# Persist the filtered, feature-engineered frame before it is partitioned.
path_clean = "./data/BRFSS-2015_binary_clean.parquet"
df.to_parquet(path=path_clean)

### Data partition

In [9]:
# The first column is the label; every remaining column is a predictor.
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

# Step 1: hold out 30% of the rows, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Step 2: split the held-out 30% in half into validation and test,
# again stratified on the label.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

In [10]:
# Report feature-matrix and label shapes for each partition, one per line.
print(
    f"Training set: {X_train.shape}, {y_train.shape}",
    f"Validation set: {X_val.shape}, {y_val.shape}",
    f"Test set: {X_test.shape}, {y_test.shape}",
    sep="\n",
)

Training set: (166950, 21), (166950,)
Validation set: (35775, 21), (35775,)
Test set: (35776, 21), (35776,)


### Borderline SMOTE

In [11]:
# Oversample on the training partition only; the validation and test
# partitions are not passed to the sampler.
borderline_smote = BorderlineSMOTE(
    k_neighbors=5,
    m_neighbors=10,
    random_state=42,
)
resampled_pair = borderline_smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_pair

In [12]:
# Per-class row counts of the training labels before and after resampling.
counts_before = y_train.value_counts()
counts_after = pd.Series(y_train_resampled).value_counts()

print("Original class distribution:")
print(counts_before)
print("\nClass distribution after Borderline SMOTE:")
print(counts_after)

Original class distribution:
Diabetes_binary
0.0    139171
1.0     27779
Name: count, dtype: int64

Class distribution after Borderline SMOTE:
Diabetes_binary
0.0    139171
1.0    139171
Name: count, dtype: int64


### Export data for modelling

In [15]:
def _with_target_first(features, target, target_name):
    """Return a copy of `features` with `target` inserted as column 0.

    The input frame is left unmodified; `target_name` becomes the name of
    the inserted column.
    """
    combined = features.copy()
    combined.insert(0, target_name, target)
    return combined


# Every exported file uses the same label column name, taken from `y`.
target_name = y.name

# Rebuild one frame per partition with the label as the first column.
df_train_resampled = _with_target_first(X_train_resampled, y_train_resampled, target_name)
df_val = _with_target_first(X_val, y_val, target_name)
df_test = _with_target_first(X_test, y_test, target_name)

# Write the three partitions for the modelling notebooks.
df_train_resampled.to_parquet("./data/data_train_borderline_smote.parquet")
df_val.to_parquet("./data/data_val.parquet")
df_test.to_parquet("./data/data_test.parquet")