In [None]:
# IMPORT LIBRARIES
##################

import pandas as pd
import numpy as np
import warnings
import os
import matplotlib.pyplot as plt

# NOTE(review): with no category/module filter this silences every warning
# for the rest of the session, including pandas/sklearn deprecation and
# convergence warnings. Consider narrowing it once the pipeline is stable.
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from scipy.stats import skew
from collections import defaultdict
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# DATA LOADING
##############
# Read the four survey CSV files from the current working directory and
# report the (rows, columns) shape of each resulting frame.

print("LOADING DATASETS", "="*50, sep="\n")

# Files are read in the same order as the names on the left-hand side.
anthro_data, clinical_data, dietary_data, socio_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'anthro_data.csv',
        'clinical_data.csv',
        'dietary_data.csv',
        'socio_data.csv',
    )
)

print(
    f"1. Anthropometric data: {anthro_data.shape}",
    f"2. Clinical data: {clinical_data.shape}",
    f"3. Dietary data: {dietary_data.shape}",
    f"4. Socioeconomic data: {socio_data.shape}",
    sep="\n",
)

# MERGING DATASETS
##################
# Combine the four frames into one table with successive inner joins on the
# (hhnum, member_code) key pair; only keys present in every frame survive.

print("\nMERGING DATASETS", "="*50, sep="\n")

# Both key columns are used for every join.
merge_keys = ['hhnum', 'member_code']

# Clinical is the left-most frame; columns it shares with the anthropometric
# frame are disambiguated with a suffix on both sides.
merged_data = clinical_data.merge(
    anthro_data,
    on=merge_keys,
    how='inner',
    suffixes=('_clinical', '_anthro'),
)

# For the remaining frames only the incoming (right-hand) column is suffixed
# when a name clashes with one already in the merged table.
for extra_frame, right_suffix in (
    (dietary_data, '_dietary'),
    (socio_data, '_socio'),
):
    merged_data = merged_data.merge(
        extra_frame,
        on=merge_keys,
        how='inner',
        suffixes=('', right_suffix),
    )

print(f"Merged Data: {merged_data.shape}")

LOADING DATASETS
1. Anthropometric data: (444400, 18)
2. Clinical data: (351701, 29)
3. Dietary data: (234293, 75)
4. Socioeconomic data: (654425, 20)

MERGING DATASETS
Merged Data: (175899, 136)

DROPPING ROWS WITH NO FBS VALUES
Before Cleaning
   Total rows: 175899
   Missing FBS values: 101895

After Cleaning
   Total rows: 74004
   Missing FBS values: 0

REMOVING DUPLICATE COLUMNS
Administrative/Duplicate Columns to be removed (26):
                           
0                     hhnum
1               member_code
2          regcode_clinical
3          provhuc_clinical
4        enns_year_clinical
5   fwgti_natl_var_clinical
6       fwgti_prov_clinical
7         rep_natl_clinical
8         rep_prov_clinical
9       ms_psucode_clinical
10           regcode_anthro
11           provhuc_anthro
12         enns_year_anthro
13    fwgti_natl_var_anthro
14        fwgti_prov_anthro
15          rep_natl_anthro
16          rep_prov_anthro
17        ms_psucode_anthro
18  fwgti_natl2_var_dietary