In [14]:
import pandas as pd
import numpy as np
import os
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from scipy.optimize import minimize
from IPython.display import clear_output
from colorama import Fore, Style

# Load CSV files from local Data/raw directory
train = pd.read_csv('../Data/raw/train.csv')
test = pd.read_csv('../Data/raw/test.csv')
sample = pd.read_csv('../Data/raw/sample_submission.csv')

def process_file(filename, dirname):
    # Đọc tệp Parquet
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    
    # Tính toán các thống kê mô tả
    desc_df = df.describe().T # .T để chuyển vị, đưa các cột thành chỉ mục và các thống kê thành cột
    
    # Tạo tên cột mới bằng cách kết hợp Tên cột gốc và Tên thống kê (ví dụ: 'Age_mean')
    # Các thống kê mặc định: ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
    new_cols = []
    values = []
    
    # Duyệt qua từng hàng (là từng cột gốc) của DataFrame thống kê
    for col in desc_df.index:
        for stat in desc_df.columns:
            # Tạo tên mới: {Tên_cột_gốc}_{Tên_thống_kê}
            new_name = f"{col}_{stat}"
            new_cols.append(new_name)
            
            # Lấy giá trị thống kê tương ứng
            values.append(desc_df.loc[col, stat])
            
    # Trả về các giá trị thống kê (dạng mảng 1D) và danh sách tên cột
    # Đồng thời trả về ID
    return values, new_cols, filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    
    with ThreadPoolExecutor() as executor:
        # Sử dụng map như trước, kết quả sẽ là list các tuples: (values, new_cols, index)
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    # Tách kết quả thành 3 lists riêng biệt
    stats_values, col_names_list, indexes = zip(*results)
    
    # Lấy danh sách tên cột từ mẫu đầu tiên (giả định tất cả các tệp có cùng cấu trúc cột)
    if col_names_list:
        new_columns = col_names_list[0]
    else:
        # Xử lý trường hợp không có tệp nào
        return pd.DataFrame()
        
    # Tạo DataFrame mới sử dụng các giá trị thống kê và tên cột đã tạo
    df = pd.DataFrame(list(stats_values), columns=new_columns)
    df['id'] = indexes
    return df

# Load time series data from local Data/raw directory        
train_ts = load_time_series("../Data/raw/series_train.parquet")
test_ts = load_time_series("../Data/raw/series_test.parquet")

time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")

train = pd.merge(train, train_ts, how="left", on='id')
test = pd.merge(test, test_ts, how="left", on='id')

train = train.drop('id', axis=1)
test = test.drop('id', axis=1)   

featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday', 'sii']

featuresCols += time_series_cols

train = train[featuresCols]
train = train.dropna(subset='sii')

cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

100%|██████████| 996/996 [01:02<00:00, 16.02it/s]
  0%|          | 0/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<00:00,  7.41it/s]



In [15]:
featuresCols

['Basic_Demos-Enroll_Season',
 'Basic_Demos-Age',
 'Basic_Demos-Sex',
 'CGAS-Season',
 'CGAS-CGAS_Score',
 'Physical-Season',
 'Physical-BMI',
 'Physical-Height',
 'Physical-Weight',
 'Physical-Waist_Circumference',
 'Physical-Diastolic_BP',
 'Physical-HeartRate',
 'Physical-Systolic_BP',
 'Fitness_Endurance-Season',
 'Fitness_Endurance-Max_Stage',
 'Fitness_Endurance-Time_Mins',
 'Fitness_Endurance-Time_Sec',
 'FGC-Season',
 'FGC-FGC_CU',
 'FGC-FGC_CU_Zone',
 'FGC-FGC_GSND',
 'FGC-FGC_GSND_Zone',
 'FGC-FGC_GSD',
 'FGC-FGC_GSD_Zone',
 'FGC-FGC_PU',
 'FGC-FGC_PU_Zone',
 'FGC-FGC_SRL',
 'FGC-FGC_SRL_Zone',
 'FGC-FGC_SRR',
 'FGC-FGC_SRR_Zone',
 'FGC-FGC_TL',
 'FGC-FGC_TL_Zone',
 'BIA-Season',
 'BIA-BIA_Activity_Level_num',
 'BIA-BIA_BMC',
 'BIA-BIA_BMI',
 'BIA-BIA_BMR',
 'BIA-BIA_DEE',
 'BIA-BIA_ECW',
 'BIA-BIA_FFM',
 'BIA-BIA_FFMI',
 'BIA-BIA_FMI',
 'BIA-BIA_Fat',
 'BIA-BIA_Frame_num',
 'BIA-BIA_ICW',
 'BIA-BIA_LDM',
 'BIA-BIA_LST',
 'BIA-BIA_SMM',
 'BIA-BIA_TBW',
 'PAQ_A-Season',
 'PAQ_

In [16]:
# Extract ENMO features (activity patterns) from time series data
def extract_enmo(df_source, id=None):
    """
    Extract activity patterns based on ENMO (Euclidean Norm Minus One) values
    Categories: sedentary, light, moderate
    """
    df = df_source.copy()
    
    # Filter out non-wear periods
    df = df[df['non-wear_flag'] == 0]
    df.drop('non-wear_flag', axis=1, inplace=True)
    
    # Classify activity types based on ENMO thresholds
    df.loc[:, 'Type_activity'] = 'Non-assigned'
    df.loc[(df['enmo'] < 10*1e-3), 'Type_activity'] = 'sedentary'
    df.loc[(df['enmo'] >= 10*1e-3) & (df['enmo'] < 100*1e-3), 'Type_activity'] = 'light'
    df.loc[(df['enmo'] >= 100*1e-3), 'Type_activity'] = 'moderate'
    
    # Calculate total wear time
    total_wear = df['step'].count()
    
    # Calculate proportion of each activity type
    if total_wear > 0:
        sedentary_perall = df[df['Type_activity'] == 'sedentary']['step'].count() / total_wear
        light_perall = df[df['Type_activity'] == 'light']['step'].count() / total_wear
        moderate_perall = df[df['Type_activity'] == 'moderate']['step'].count() / total_wear
    else:
        sedentary_perall = light_perall = moderate_perall = 0
    
    return pd.DataFrame({
        'id': [id], 
        'sedentary_por': [sedentary_perall], 
        'light_por': [light_perall],
        'moderate_por': [moderate_perall]
    })

def getEnmo(ts_path):
    """
    Extract ENMO features from all time series files in a directory
    """
    listdir = os.listdir(ts_path)
    res_df = None
    
    for dir in tqdm(listdir, desc="Extracting ENMO features"):
        dft = pd.read_parquet(os.path.join(ts_path, dir, "part-0.parquet"))
        
        # Extract ID from directory name (format: id=xxxxx)
        id = dir.split('=')[1] if '=' in dir else dir[3:]
        ex_df = extract_enmo(dft, id=id)
        
        if res_df is None:
            res_df = ex_df
        else:
            res_df = pd.concat([res_df, ex_df], ignore_index=True)
    
    return res_df

# Extract ENMO features from train and test time series
print("Extracting ENMO features from training data...")
train_enmo = getEnmo("../Data/raw/series_train.parquet")

print("\nExtracting ENMO features from test data...")
test_enmo = getEnmo("../Data/raw/series_test.parquet")

Extracting ENMO features from training data...


Extracting ENMO features:   0%|          | 0/996 [00:00<?, ?it/s]

Extracting ENMO features: 100%|██████████| 996/996 [02:00<00:00,  8.28it/s]
Extracting ENMO features: 100%|██████████| 996/996 [02:00<00:00,  8.28it/s]



Extracting ENMO features from test data...


Extracting ENMO features: 100%|██████████| 2/2 [00:00<00:00, 13.51it/s]
Extracting ENMO features: 100%|██████████| 2/2 [00:00<00:00, 13.51it/s]


In [20]:
# Reload the original train and test data with IDs for merging
train_with_id = pd.read_csv('../Data/raw/train.csv')
test_with_id = pd.read_csv('../Data/raw/test.csv')

# Merge ENMO features with train and test data
train_with_id = pd.merge(train_with_id, train_enmo, how="left", on='id')
test_with_id = pd.merge(test_with_id, test_enmo, how="left", on='id')

# Merge with existing time series features
train_with_id = pd.merge(train_with_id, train_ts, how="left", on='id')
test_with_id = pd.merge(test_with_id, test_ts, how="left", on='id')

# Merge all features together (no preprocessing, just merging)
train_final = train_with_id.copy()
test_final = test_with_id.copy()

train_final = train_final.dropna(subset='sii')

print(f"\n✓ Dataset đã được ghép thành công!")
print(f"  - Train shape: {train_final.shape}")
print(f"  - Test shape: {test_final.shape}")
print(f"  - Columns: {train_final.columns.tolist()[:10]}... (showing first 10)")
print(f"\nFeatures included:")
print(f"  - Original features: {len(train.columns)}")
print(f"  - Time series features: {len(time_series_cols)}")
print(f"  - ENMO features: 3 (sedentary_por, light_por, moderate_por)")
print(f"  - Total: {train_final.shape[1]} columns")


✓ Dataset đã được ghép thành công!
  - Train shape: (2736, 181)
  - Test shape: (20, 158)
  - Columns: ['id', 'Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex', 'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI', 'Physical-Height', 'Physical-Weight']... (showing first 10)

Features included:
  - Original features: 155
  - Time series features: 96
  - ENMO features: 3 (sedentary_por, light_por, moderate_por)
  - Total: 181 columns


In [22]:
# Create processed directory if it doesn't exist
os.makedirs('../Data/processed', exist_ok=True)

# Save processed train and test data (with ENMO features)
train_final.to_csv('../Data/processed/train_processed.csv', index=False)
test_final.to_csv('../Data/processed/test_processed.csv', index=False)

# Save sample submission (for reference)
sample.to_csv('../Data/processed/sample_submission.csv', index=False)

print("✓ Dữ liệu đã được lưu thành công!")
print(f"  - train_processed.csv: {train_final.shape}")
print(f"  - test_processed.csv: {test_final.shape}")
print(f"  - train_ts_features.csv: {train_ts.shape}")
print(f"  - test_ts_features.csv: {test_ts.shape}")
print(f"  - train_enmo_features.csv: {train_enmo.shape}")
print(f"  - test_enmo_features.csv: {test_enmo.shape}")
print(f"  - sample_submission.csv: {sample.shape}")

✓ Dữ liệu đã được lưu thành công!
  - train_processed.csv: (2736, 181)
  - test_processed.csv: (20, 158)
  - train_ts_features.csv: (996, 97)
  - test_ts_features.csv: (2, 97)
  - train_enmo_features.csv: (996, 4)
  - test_enmo_features.csv: (2, 4)
  - sample_submission.csv: (20, 2)


In [19]:
import pandas as pd

# Đọc file Parquet
df = pd.read_parquet("../Data/raw/series_train.parquet/id=0a418b57/part-0.parquet")

# Hiển thị thông tin về dữ liệu trong Parquet
print(df.head())  # Xem 5 dòng đầu tiên của dữ liệu
print(df.info())  # Thông tin chi tiết về cấu trúc DataFrame (số lượng các cột, kiểu dữ liệu, v.v.)
print(df.columns)  # Hiển thị tên các cột trong DataFrame

   step         X         Y         Z      enmo     anglez  non-wear_flag  \
0     0 -0.075242 -0.256743 -0.973791  0.038081 -72.952141            0.0   
1     1 -0.265893 -0.270508 -0.765470  0.077430 -52.849220            0.0   
2     2  0.334517 -0.548602 -0.588596  0.039162 -44.118084            0.0   
3     3  0.000193 -0.021069 -0.999681  0.001450 -88.759613            0.0   
4     4 -0.000685 -0.020681 -0.997677  0.000491 -88.756958            0.0   

   light  battery_voltage     time_of_day  weekday  quarter  \
0    5.0      4202.000000  51250000000000        2        4   
1    0.5      4185.333496  51255000000000        2        4   
2   11.5      4185.500000  51260000000000        2        4   
3    0.0      4185.666504  51265000000000        2        4   
4    8.5      4185.833496  51270000000000        2        4   

   relative_date_PCIAT  
0                 -9.0  
1                 -9.0  
2                 -9.0  
3                 -9.0  
4                 -9.0  
<class '