In [1]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import warnings
import os
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [2]:
def process_file(filename, dirname):
    data = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    data.drop('step', axis=1, inplace=True)
    return data.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname):
    ids = os.listdir(dirname)
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    stats, indexes = zip(*results)
    data = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    data['id'] = indexes
    return data

In [3]:
# train_parquet = load_time_series("../data/input/series_train.parquet")
# test_parquet = load_time_series("../data/input/series_test.parquet")

In [4]:


train_df = pd.read_csv("../data/input/train.csv")
test_df = pd.read_csv("../data/input/test.csv")

In [5]:
train_df.head()

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,...,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,


In [6]:
train_df.shape, test_df.shape

((3960, 82), (20, 59))

In [7]:
BASE_FEATURES = test_df.drop('id', axis=1).columns
TEST_IDS = test_df['id']

# analysis

In [8]:


train_missing_values = train_df.isnull().sum().sort_values(ascending=False)
test_missing_values = test_df.isnull().sum().sort_values(ascending=False)
train_missing_values

PAQ_A-PAQ_A_Total              3485
PAQ_A-Season                   3485
Fitness_Endurance-Time_Sec     3220
Fitness_Endurance-Time_Mins    3220
Fitness_Endurance-Max_Stage    3217
                               ... 
PreInt_EduHx-Season             420
Basic_Demos-Enroll_Season         0
Basic_Demos-Sex                   0
Basic_Demos-Age                   0
id                                0
Length: 82, dtype: int64

In [9]:
train_df.dtypes.groupby(train_df.dtypes).size()

int64       2
float64    68
object     12
dtype: int64

In [10]:
train_df.dtypes.groupby(train_df[BASE_FEATURES].dtypes).size()

int64       2
float64    46
object     10
dtype: int64

In [11]:
test_df.dtypes.groupby(test_df[BASE_FEATURES].dtypes).size()

int64       2
float64    46
object     10
dtype: int64

In [12]:
CATEGORICAL_FEATURES = train_df.select_dtypes(include='object').columns
NUMERICAL_FEATURES = train_df.select_dtypes(exclude='object').columns
RANDOM_STATE = 42
CATEGORICAL_FEATURES, NUMERICAL_FEATURES

(Index(['id', 'Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season',
        'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 'PAQ_A-Season',
        'PAQ_C-Season', 'PCIAT-Season', 'SDS-Season', 'PreInt_EduHx-Season'],
       dtype='object'),
 Index(['Basic_Demos-Age', 'Basic_Demos-Sex', 'CGAS-CGAS_Score', 'Physical-BMI',
        'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
        'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
        'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins',
        'Fitness_Endurance-Time_Sec', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone',
        'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone',
        'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone',
        'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
        'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
        'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_

In [13]:

# categorical features with missing values
train_df[CATEGORICAL_FEATURES].isnull().sum().sort_values(ascending=False)

PAQ_A-Season                 3485
Fitness_Endurance-Season     2652
PAQ_C-Season                 2239
BIA-Season                   1815
CGAS-Season                  1405
SDS-Season                   1342
PCIAT-Season                 1224
Physical-Season               650
FGC-Season                    614
PreInt_EduHx-Season           420
id                              0
Basic_Demos-Enroll_Season       0
dtype: int64

In [14]:
train_df[CATEGORICAL_FEATURES] = train_df[CATEGORICAL_FEATURES].fillna('missing')
test_df[[col for col in CATEGORICAL_FEATURES if col in test_df.columns]] = test_df[[col for col in CATEGORICAL_FEATURES if col in test_df.columns]].fillna('missing')

In [15]:
train_df[NUMERICAL_FEATURES].isnull().sum().sort_values(ascending=False)

PAQ_A-PAQ_A_Total                         3485
Fitness_Endurance-Time_Sec                3220
Fitness_Endurance-Time_Mins               3220
Fitness_Endurance-Max_Stage               3217
Physical-Waist_Circumference              3062
                                          ... 
Physical-Height                            933
Physical-Weight                            884
PreInt_EduHx-computerinternet_hoursday     659
Basic_Demos-Sex                              0
Basic_Demos-Age                              0
Length: 70, dtype: int64

In [16]:
from sklearn.impute import KNNImputer
def fill_numerical_features(df):
    for feature in NUMERICAL_FEATURES:
        if feature in df.columns and df[feature].isnull().sum() > 0:
            df[f"{feature}_mean"] = df[feature].fillna(df[feature].mean())
            df["{feature}_median"] = df[feature].fillna(df[feature].median())
            df["{feature}_knn"] = KNNImputer(n_neighbors=5).fit_transform(df[feature].values.reshape(-1, 1))
    return df
train_df = fill_numerical_features(train_df)
test_df = fill_numerical_features(test_df)
train_df.isnull().sum(), test_df.isnull().sum()

(id                                             0
 Basic_Demos-Enroll_Season                      0
 Basic_Demos-Age                                0
 Basic_Demos-Sex                                0
 CGAS-Season                                    0
                                               ..
 PCIAT-PCIAT_Total_mean                         0
 SDS-SDS_Total_Raw_mean                         0
 SDS-SDS_Total_T_mean                           0
 PreInt_EduHx-computerinternet_hoursday_mean    0
 sii_mean                                       0
 Length: 152, dtype: int64,
 id                                             0
 Basic_Demos-Enroll_Season                      0
 Basic_Demos-Age                                0
 Basic_Demos-Sex                                0
 CGAS-Season                                    0
                                               ..
 PAQ_A-PAQ_A_Total_mean                         0
 PAQ_C-PAQ_C_Total_mean                         0
 SDS-SDS_Total_Raw_mea

In [17]:
from sklearn.preprocessing import LabelEncoder
train_df.to_csv("../data/input/train_original_imputed.csv", index=False)
test_df.to_csv("../data/input/test_original_imputed.csv", index=False)

def encode_categorical_features(feature, df, tdf):
    le = LabelEncoder()
    combined_data = df[feature].tolist()
    if feature in tdf.columns:
        combined_data += tdf[feature].tolist()
    le.fit(combined_data)

    df[feature] = le.transform(df[feature]).astype(int)
    if feature in tdf.columns:
        tdf[feature] = le.transform(tdf[feature]).astype(int)
    else:
        print(f"{feature} not in test data")
    return df, tdf


for feature in CATEGORICAL_FEATURES:
    train_df, test_df = encode_categorical_features(feature, train_df, test_df)




PCIAT-Season not in test data


In [18]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
numerical_base_features = [f for f in [f"{feature}_mean" for feature in BASE_FEATURES] + [f"{feature}_median" for feature in BASE_FEATURES] + [f"{feature}_knn" for feature in BASE_FEATURES] if f in test_df.columns]
train_df[numerical_base_features] = scaler.fit_transform(train_df[numerical_base_features])
test_df[numerical_base_features] = scaler.transform(test_df[numerical_base_features])
train_df.to_csv("../data/features/train_encoded.csv", index=False)
test_df.to_csv("../data/features/test_encoded.csv", index=False)
