In [7]:
# Centralized data loader (run this cell once before running FS cells)
import pandas as pd
from pathlib import Path

ROOT = Path(r"d:\HỌC KỲ V\Chuẩn bị dữ liệu và Visualization\Project_final")
DATA_DIR = ROOT / 'Data' / 'processed'
TRAIN_FP = DATA_DIR / 'train_final.csv'
TEST_FP  = DATA_DIR / 'test_final.csv'

if not TRAIN_FP.exists() or not TEST_FP.exists():
    raise FileNotFoundError(f"train_final.csv or test_final.csv not found in {DATA_DIR}")

train = pd.read_csv(TRAIN_FP)
test = pd.read_csv(TEST_FP)

TARGET = 'sii'
if TARGET not in train.columns:
    raise RuntimeError(f"Target column '{TARGET}' not found in train_final.csv")

# split
y = train[TARGET].copy()
X = train.drop(columns=[TARGET]).copy()
if TARGET in test.columns:
    test = test.drop(columns=[TARGET])

# align test columns to X
for c in X.columns:
    if c not in test.columns:
        test[c] = 0
extra = [c for c in test.columns if c not in X.columns]
if extra:
    test = test.drop(columns=extra)
test = test.reindex(columns=X.columns)

# Remove PCIAT cols from X and test (per your request)
pciat_cols = [c for c in X.columns if c.startswith('PCIAT-') or c.startswith('PCIAT_')]
if pciat_cols:
    X = X.drop(columns=pciat_cols)
    test = test.drop(columns=[c for c in test.columns if c in pciat_cols], errors='ignore')
    test = test.reindex(columns=X.columns)
    print(f"Removed {len(pciat_cols)} PCIAT columns: {pciat_cols[:10]}{'...' if len(pciat_cols)>10 else ''}")

# multiclass target
y_multi = y.astype(int)

# parameters
TOP_K = 100
K = min(TOP_K, X.shape[1])

print('Loaded data. Features:', X.shape[1], 'Selected K=', K)
print('Target classes:', sorted(y_multi.unique()))


Removed 24 PCIAT columns: ['PCIAT-PCIAT_01', 'PCIAT-PCIAT_02', 'PCIAT-PCIAT_03', 'PCIAT-PCIAT_04', 'PCIAT-PCIAT_05', 'PCIAT-PCIAT_06', 'PCIAT-PCIAT_07', 'PCIAT-PCIAT_08', 'PCIAT-PCIAT_09', 'PCIAT-PCIAT_10']...
Loaded data. Features: 221 Selected K= 100
Target classes: [np.int64(0), np.int64(1), np.int64(2), np.int64(3)]


In [15]:
# Feature selection - Mutual Information (classification, multiclass)
# Uses centralized X and y_multi loaded by the loader cell
import pandas as pd
from pathlib import Path
from sklearn.feature_selection import mutual_info_classif

DATA_DIR = Path(r"d:\HỌC KỲ V\Chuẩn bị dữ liệu và Visualization\Project_final\Data\processed")
# require centralized loader
if 'X' not in globals() or 'y_multi' not in globals():
    raise RuntimeError('Please run the centralized loader cell (top of this notebook) before running this cell')

# work on a local view to avoid accidental mutation
X_local = X
K = min(TOP_K, X_local.shape[1])

try:
    mi = mutual_info_classif(X_local.values, y_multi, discrete_features='auto', random_state=0)
    mi_series = pd.Series(mi, index=X_local.columns).sort_values(ascending=False)
    mi_top = mi_series.head(K)
    mi_top.to_csv(DATA_DIR / 'mi_top.csv')
    print('Mutual Information (multiclass) — top features:')
    print(mi_top.to_string())
    selected_mutual_info = mi_top.index.tolist()
    print('X_local shape:', X_local.shape)
    print('Mutual Information - selected count:', len(selected_mutual_info))
except Exception as e:
    print('Mutual information failed:', e)
    selected_mutual_info = []


Mutual Information (multiclass) — top features:
Physical-Height                                   0.079800
Physical-Weight                                   0.078392
Basic_Demos-Age                                   0.071618
PAQ_A-PAQ_A_Total                                 0.064174
PreInt_EduHx-computerinternet_hoursday            0.060780
BIA-BIA_LDM                                       0.059892
FGC-FGC_CU                                        0.058470
BIA-BIA_ICW                                       0.055314
internet_x_sedentary                              0.049652
FGC-FGC_GSD                                       0.047272
Physical-Waist_Circumference                      0.046569
FGC-FGC_GSND_Zone                                 0.046017
FGC-FGC_GSND                                      0.043455
Physical-BMI                                      0.043295
BIA-BIA_LST                                       0.043259
BIA-BIA_FFM                                       0.040815
BIA-BIA_

In [16]:
# Feature selection - Permutation importance (RandomForest) for multiclass
# Loads train_final/test_final and runs RandomForest+permutation importance. No SimpleImputer used; fillna(0).
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

DATA_DIR = Path(r"d:\HỌC KỲ V\Chuẩn bị dữ liệu và Visualization\Project_final\Data\processed")
# require centralized loader
if 'X' not in globals() or 'y_multi' not in globals():
    raise RuntimeError('Please run the centralized loader cell (top of this notebook) before running this cell')

X_local = X
K = min(TOP_K, X_local.shape[1])

try:
    rf_perm = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=0)
    rf_perm.fit(X_local.values, y_multi)
    perm = permutation_importance(rf_perm, X_local.values, y_multi, n_repeats=20, random_state=0, n_jobs=-1, scoring='f1_macro')
    perm_mean = pd.Series(perm.importances_mean, index=X_local.columns).sort_values(ascending=False)
    perm_top = perm_mean.head(K)
    perm_top.to_csv(DATA_DIR / 'perm_top.csv')
    print('Permutation importance (multiclass) — top features:')
    print(perm_top.to_string())
    selected_permutation = perm_top.index.tolist()
    print('X_local shape:', X_local.shape)
    print('Permutation - selected count:', len(selected_permutation))
except Exception as e:
    print('Permutation importance failed:', e)
    selected_permutation = []


Permutation importance (multiclass) — top features:
PreInt_EduHx-computerinternet_hoursday    0.004807
internet_x_sedentary                      0.004157
SDS-SDS_Total_T                           0.002173
SDS-SDS_Total_Raw                         0.002026
FGC-FGC_CU                                0.001217
Basic_Demos-Age                           0.000977
Physical-Height                           0.000619
SDSxCGAS_raw                              0.000515
Physical-HeartRate                        0.000480
Physical-Waist_Circumference              0.000469
Physical-Weight                           0.000344
CGAS-CGAS_Score                           0.000281
PAQ_A-PAQ_A_Total                         0.000250
Physical-Diastolic_BP                     0.000234
FGC-Season_spring                         0.000219
FGC-FGC_GSND                              0.000154
Fitness_Endurance-Time_Sec                0.000000
FGC-FGC_GSD                               0.000000
Fitness_Endurance-Time_Mins   

In [17]:
# Feature selection - RFE (RandomForest wrapper) for multiclass
# Loads data and runs RFE selecting exactly K features (fill missing with 0)
import pandas as pd
from pathlib import Path
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

DATA_DIR = Path(r"d:\HỌC KỲ V\Chuẩn bị dữ liệu và Visualization\Project_final\Data\processed")
DATA_DIR = Path(r"d:\HỌC KỲ V\Chuẩn bị dữ liệu và Visualization\Project_final\Data\processed")
# require centralized loader
if 'X' not in globals() or 'y_multi' not in globals():
    raise RuntimeError('Please run the centralized loader cell (top of this notebook) before running this cell')

X_local = X
K = min(TOP_K, X_local.shape[1])

try:
    estimator = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=0)
    rfe = RFE(estimator=estimator, n_features_to_select=K, step=0.1)
    rfe.fit(X_local.values, y_multi)
    rfe_support = pd.Series(rfe.support_, index=X_local.columns)
    rfe_top = rfe_support[rfe_support].index.tolist()
    if len(rfe_top) != K:
        rfe_ranking = pd.Series(rfe.ranking_, index=X_local.columns)
        rfe_top = rfe_ranking.sort_values().head(K).index.tolist()
    pd.Series(rfe_top).to_csv(DATA_DIR / 'rfe_top.csv', index=False)
    print('RFE — selected features:')
    print('\n'.join(rfe_top))
    selected_rfe = rfe_top
    print('X_local shape:', X_local.shape)
    print('RFE - selected count:', len(selected_rfe))
except Exception as e:
    print('RFE failed:', e)
    selected_rfe = []


RFE — selected features:
Basic_Demos-Age
Basic_Demos-Sex
CGAS-CGAS_Score
Physical-BMI
Physical-Height
Physical-Weight
Physical-Waist_Circumference
Physical-Diastolic_BP
Physical-HeartRate
Physical-Systolic_BP
Fitness_Endurance-Time_Mins
Fitness_Endurance-Time_Sec
FGC-FGC_CU
FGC-FGC_CU_Zone
FGC-FGC_GSND
FGC-FGC_GSND_Zone
FGC-FGC_GSD
FGC-FGC_GSD_Zone
FGC-FGC_PU
FGC-FGC_PU_Zone
FGC-FGC_SRL
FGC-FGC_SRL_Zone
FGC-FGC_SRR
FGC-FGC_SRR_Zone
FGC-FGC_TL
FGC-FGC_TL_Zone
BIA-BIA_Activity_Level_num
BIA-BIA_BMC
BIA-BIA_BMI
BIA-BIA_BMR
BIA-BIA_DEE
BIA-BIA_ECW
BIA-BIA_FFM
BIA-BIA_FFMI
BIA-BIA_FMI
BIA-BIA_Fat
BIA-BIA_ICW
BIA-BIA_LDM
BIA-BIA_LST
BIA-BIA_SMM
BIA-BIA_TBW
PAQ_A-PAQ_A_Total
PAQ_C-PAQ_C_Total
SDS-SDS_Total_Raw
SDS-SDS_Total_T
PreInt_EduHx-computerinternet_hoursday
sedentary_por
light_por
moderate_por
X_mean
X_std
X_25%
X_50%
X_75%
X_max
Y_mean
Y_std
Y_min
Y_75%
Y_max
Z_std
Z_50%
Z_75%
Z_max
enmo_50%
enmo_75%
light_count
light_mean
light_std
time_of_day_mean
time_of_day_std
time_of_day_25%
wee

In [18]:
# Feature selection - RFECV (RandomForest + CV) for multiclass, pick top K by ranking
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
import pandas as pd
from pathlib import Path

DATA_DIR = Path(r"d:\HỌC KỲ V\Chuẩn bị dữ liệu và Visualization\Project_final\Data\processed")
# require centralized loader
if 'X' not in globals() or 'y_multi' not in globals():
    raise RuntimeError('Please run the centralized loader cell (top of this notebook) before running this cell')

X_local = X
K = min(100, X_local.shape[1])

try:
    rfecv = RFECV(estimator=RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0), step=0.1, cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=0), scoring='f1_macro', n_jobs=-1)
    rfecv.fit(X_local.values, y_multi)
    rfecv_ranking = pd.Series(rfecv.ranking_, index=X_local.columns).sort_values()
    rfecv_top = rfecv_ranking.head(K).index.tolist()
    pd.Series(rfecv_top).to_csv(DATA_DIR / 'rfecv_top.csv', index=False)
    print('RFECV — top features (by ranking):')
    print('\n'.join(rfecv_top))
    selected_rfecv = rfecv_top
    print('X_local shape:', X_local.shape)
    print('RFECV - selected count:', len(selected_rfecv))
except Exception as e:
    print('RFECV failed:', e)
    selected_rfecv = []


RFECV — top features (by ranking):
Basic_Demos-Age
Basic_Demos-Sex
CGAS-CGAS_Score
Physical-BMI
Physical-Height
Physical-Weight
Physical-Waist_Circumference
Physical-Diastolic_BP
Physical-HeartRate
Physical-Systolic_BP
Fitness_Endurance-Max_Stage
Fitness_Endurance-Time_Mins
Fitness_Endurance-Time_Sec
FGC-FGC_CU
FGC-FGC_CU_Zone
FGC-FGC_GSND
FGC-FGC_GSND_Zone
FGC-FGC_GSD
FGC-FGC_GSD_Zone
FGC-FGC_PU
FGC-FGC_PU_Zone
FGC-FGC_SRL
FGC-FGC_SRL_Zone
FGC-FGC_SRR
FGC-FGC_SRR_Zone
FGC-FGC_TL
FGC-FGC_TL_Zone
BIA-BIA_Activity_Level_num
BIA-BIA_BMC
BIA-BIA_BMI
BIA-BIA_BMR
BIA-BIA_DEE
BIA-BIA_ECW
BIA-BIA_FFM
BIA-BIA_FFMI
BIA-BIA_FMI
BIA-BIA_Fat
BIA-BIA_Frame_num
BIA-BIA_ICW
BIA-BIA_LDM
BIA-BIA_LST
BIA-BIA_SMM
BIA-BIA_TBW
PAQ_A-PAQ_A_Total
PAQ_C-PAQ_C_Total
SDS-SDS_Total_Raw
SDS-SDS_Total_T
PreInt_EduHx-computerinternet_hoursday
sedentary_por
light_por
moderate_por
X_count
X_mean
X_std
X_25%
X_50%
Y_mean
X_75%
X_max
Y_count
Y_min
Y_std
Y_75%
Y_50%
weekday_25%
time_of_day_75%
Y_max
Z_count
Z_std
Z_max
Z

In [19]:
# Feature selection - XGBoost feature importance (multiclass)
import pandas as pd
from pathlib import Path

try:
    import xgboost as xgb
    from sklearn.preprocessing import LabelEncoder
except Exception as e:
    print('XGBoost import failed:', e)

DATA_DIR = Path(r"d:\HỌC KỲ V\Chuẩn bị dữ liệu và Visualization\Project_final\Data\processed")
# require centralized loader
if 'X' not in globals() or 'y_multi' not in globals():
    raise RuntimeError('Please run the centralized loader cell (top of this notebook) before running this cell')

X_local = X
K = min(100, X_local.shape[1])

try:
    n_classes = int(y_multi.nunique())
    xgb_clf = xgb.XGBClassifier(n_estimators=200, use_label_encoder=False, objective='multi:softprob', eval_metric='mlogloss', n_jobs=-1, random_state=0)
    xgb_clf.fit(X_local, y_multi)
    try:
        booster = xgb_clf.get_booster()
        importance = booster.get_score(importance_type='gain')
        # convert to series and map fN to column names
        imp_series = pd.Series({k: importance.get(k, 0) for k in importance})
        def key_to_col(k):
            try:
                if k.startswith('f'):
                    idx = int(k[1:])
                    return X_local.columns[idx]
            except Exception:
                return k
            return k
        imp_series.index = [key_to_col(k) for k in imp_series.index]
        imp_series = imp_series.reindex(X_local.columns).fillna(0)
    except Exception:
        imp_series = pd.Series(xgb_clf.feature_importances_, index=X_local.columns)
except Exception as e:
    print('XGBoost failed:', e)
    selected_xgb = []


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [21]:
# Final summary cell: combine selections from each method saved into CSVs (exclude RFE)
import pandas as pd
from pathlib import Path
OUT_DIR = Path(r"d:\HỌC KỲ V\Chuẩn bị dữ liệu và Visualization\Project_final\Data\processed")

def read_feature_list(fp: Path):
    if not fp.exists():
        return []
    # Attempt to read Series saved with feature names as index (importance values as data)
    try:
        s = pd.read_csv(fp, index_col=0, squeeze=True)
        if isinstance(s, pd.Series):
            # If the series values are numeric (importances), return the index (feature names)
            if pd.api.types.is_numeric_dtype(s.dtype):
                return s.index.astype(str).tolist()
            # Otherwise the series likely already contains feature names as values
            return s.dropna().astype(str).tolist()
    except Exception:
        pass
    # Fallback: read without index and extract first column or first non-numeric column
    try:
        df = pd.read_csv(fp, header=None)
        if df.shape[1] == 1:
            return df.iloc[:, 0].dropna().astype(str).tolist()
        # If second column looks numeric (importance), take first column as feature names
        if df.shape[1] >= 2 and pd.api.types.is_numeric_dtype(df.iloc[:, 1]):
            return df.iloc[:, 0].dropna().astype(str).tolist()
        # Otherwise flatten and return unique strings
        vals = []
        for col in df.columns:
            vals.extend(df[col].dropna().astype(str).tolist())
        return list(dict.fromkeys(vals))
    except Exception:
        return []

# Read selections: mutual_info, permutation, rfecv, xgboost (exclude RFE per request)
selected = {}
selected['mutual_info'] = read_feature_list(OUT_DIR / 'mi_top.csv') or globals().get('selected_mutual_info', [])
selected['permutation'] = read_feature_list(OUT_DIR / 'perm_top.csv') or globals().get('selected_permutation', [])
selected['rfecv'] = read_feature_list(OUT_DIR / 'rfecv_top.csv') or globals().get('selected_rfecv', [])
selected['xgboost'] = read_feature_list(OUT_DIR / 'xgb_top.csv') or globals().get('selected_xgb', [])

# build summary dataframe
all_features = list(X.columns)
summary = pd.DataFrame(index=all_features)
for method, feats in selected.items():
    summary[method] = summary.index.isin(feats).astype(int)
summary['selected_count'] = summary[list(selected.keys())].sum(axis=1)

# distribution and top features by consensus
vc = summary['selected_count'].value_counts().sort_index()
print('selected_count distribution (value: frequency):')
print(vc.to_string())
n_ge1 = int((summary['selected_count'] >= 1).sum())
print(f'Features with selected_count >= 1: {n_ge1} of {len(summary)}')
print('Combined summary (top rows):')
print(summary.sort_values('selected_count', ascending=False).head(200).to_string())
summary.reset_index().rename(columns={'index':'feature'}).to_csv(OUT_DIR / 'feature_selection_combined_summary.csv', index=False)
print('Saved combined summary to', OUT_DIR / 'feature_selection_combined_summary.csv')


selected_count distribution (value: frequency):
selected_count
0    42
1    59
2    48
3    43
4    29
Features with selected_count >= 1: 179 of 221
Combined summary (top rows):
                                                mutual_info  permutation  rfecv  xgboost  selected_count
Basic_Demos-Age                                           1            1      1        1               4
Physical-Height                                           1            1      1        1               4
Physical-BMI                                              1            1      1        1               4
Fitness_Endurance-Time_Sec                                1            1      1        1               4
Physical-Weight                                           1            1      1        1               4
FGC-FGC_GSD_Zone                                          1            1      1        1               4
FGC-FGC_PU                                                1            1      1        