In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from concurrent.futures import ThreadPoolExecutor
import warnings
from scipy.optimize import minimize
import os
from tqdm import tqdm
# Install a global filter that hides every warning category from here on.
# NOTE(review): this also hides pandas deprecation/dtype warnings — confirm that is intended.
warnings.filterwarnings("ignore")

In [2]:
def process_file(filename, dirname):
    """Summarise one series file as a flat vector of descriptive statistics.

    Reads ``<dirname>/<filename>/part-0.parquet``, discards its ``step``
    column and flattens the ``describe()`` table of the remaining columns.

    Args:
        filename: Name of the entry inside ``dirname``. It must contain ``=``;
            the piece right after the first ``=`` is returned as the identifier.
        dirname: Directory that holds ``filename``.

    Returns:
        tuple: ``(stats, identifier)`` where ``stats`` is the 1-D array obtained
        by flattening the ``describe()`` values row by row, and ``identifier``
        is ``filename.split('=')[1]``.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    series = pd.read_parquet(parquet_path)
    summary = series.drop(columns='step').describe()
    identifier = filename.split('=')[1]
    return summary.to_numpy().ravel(), identifier

def load_time_series(dirname):
    """Build a table of per-series summary statistics for every entry in ``dirname``.

    Every directory entry is passed to ``process_file`` on a thread pool; the
    flat statistic vectors become the rows of the returned frame.

    Args:
        dirname: Directory whose entries are each handed to ``process_file``.

    Returns:
        pandas.DataFrame: One row per entry, with columns ``stat_0`` ..
        ``stat_{k-1}`` (``k`` is the length of the first statistics vector)
        followed by an ``id`` column holding the identifiers returned by
        ``process_file``. Rows follow the order returned by ``os.listdir``.

    Raises:
        ValueError: If ``dirname`` contains no entries.
    """
    ids = os.listdir(dirname)
    if not ids:
        # Without this guard the failure only shows up further down as an
        # opaque "not enough values to unpack" ValueError from zip(*results).
        raise ValueError(f"No series found in directory: {dirname}")

    def summarise(fname):
        # Bind the directory so executor.map only has to supply the entry name.
        return process_file(fname, dirname)

    with ThreadPoolExecutor() as executor:
        # executor.map yields results in input order, so stats and ids stay paired.
        results = list(tqdm(executor.map(summarise, ids), total=len(ids)))

    stats, indexes = zip(*results)
    # Column count is taken from the first vector.
    # NOTE(review): assumes every file yields a vector of the same length — confirm.
    data = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    data['id'] = indexes
    return data

In [3]:
# train_parquet = load_time_series("../data/input/series_train.parquet")
# test_parquet = load_time_series("../data/input/series_test.parquet")

In [4]:


# Load the tabular data; training rows without an `sii` value are discarded.
# The original row index is kept (no reset), so gaps remain where rows were dropped.
train_df = pd.read_csv("../data/input/train.csv")
train_df = train_df.dropna(subset=['sii'])
test_df = pd.read_csv("../data/input/test.csv")

In [5]:
# Peek at the first rows of the training frame.
train_df.head()

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,...,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0
5,001f3379,Spring,13,1,Winter,50.0,Summer,22.279952,59.5,112.2,...,1.0,2.0,1.0,34.0,Summer,40.0,56.0,Spring,0.0,1.0


In [6]:
# (rows, columns) of the training and test frames, in that order.
train_df.shape, test_df.shape

((2736, 82), (20, 59))

In [7]:
# Feature columns are every test-set column except the row identifier.
BASE_FEATURES = test_df.columns.drop('id')
# Keep the test identifiers on their own.
TEST_IDS = test_df.loc[:, 'id']

# analysis

In [8]:


# Per-column count of missing cells in each frame, largest first.
train_missing_values = (
    train_df.isna()
    .sum()
    .sort_values(ascending=False)
)
test_missing_values = (
    test_df.isna()
    .sum()
    .sort_values(ascending=False)
)
train_missing_values

PAQ_A-PAQ_A_Total               2373
PAQ_A-Season                    2373
Physical-Waist_Circumference    2253
Fitness_Endurance-Time_Sec      2008
Fitness_Endurance-Time_Mins     2008
                                ... 
PCIAT-Season                       0
Basic_Demos-Enroll_Season          0
Basic_Demos-Sex                    0
Basic_Demos-Age                    0
sii                                0
Length: 82, dtype: int64

In [9]:
# Number of train_df columns per dtype.
train_df.dtypes.groupby(train_df.dtypes).size()

int64       2
float64    68
object     12
dtype: int64

In [10]:
# Number of BASE_FEATURES columns per dtype in the training frame.
# The subset's own dtypes are grouped so the grouped Series and its key share
# one index, rather than relying on index alignment to drop the columns that
# are outside BASE_FEATURES.
train_df[BASE_FEATURES].dtypes.groupby(train_df[BASE_FEATURES].dtypes).size()

int64       2
float64    46
object     10
dtype: int64

In [11]:
# Number of BASE_FEATURES columns per dtype in the test frame.
# The subset's own dtypes are grouped so the grouped Series and its key share
# one index, rather than relying on index alignment to drop the columns that
# are outside BASE_FEATURES.
test_df[BASE_FEATURES].dtypes.groupby(test_df[BASE_FEATURES].dtypes).size()

int64       2
float64    46
object     10
dtype: int64

In [12]:
# Fixed seed value; presumably consumed by later stochastic steps (not used in the cells shown here).
RANDOM_STATE = 42

# Split the base features by dtype: object columns are categorical, all others numerical.
CATEGORICAL_FEATURES = train_df[BASE_FEATURES].select_dtypes(include=['object']).columns
NUMERICAL_FEATURES = train_df[BASE_FEATURES].select_dtypes(exclude=['object']).columns

(CATEGORICAL_FEATURES, NUMERICAL_FEATURES)

(Index(['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season',
        'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 'PAQ_A-Season',
        'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season'],
       dtype='object'),
 Index(['Basic_Demos-Age', 'Basic_Demos-Sex', 'CGAS-CGAS_Score', 'Physical-BMI',
        'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
        'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
        'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins',
        'Fitness_Endurance-Time_Sec', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone',
        'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone',
        'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone',
        'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
        'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
        'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
        'BIA-BIA

In [13]:

# Missing-value count for each categorical feature in the training frame, largest first.
(
    train_df[CATEGORICAL_FEATURES]
    .isna()
    .sum()
    .sort_values(ascending=False)
)

PAQ_A-Season                 2373
Fitness_Endurance-Season     1476
PAQ_C-Season                 1296
BIA-Season                    892
CGAS-Season                   394
SDS-Season                    209
Physical-Season               141
FGC-Season                     89
PreInt_EduHx-Season            17
Basic_Demos-Enroll_Season       0
dtype: int64

In [14]:
# Replace missing categorical values with the literal label 'missing' in both
# frames, so an absent value becomes a category of its own.
for frame in (train_df, test_df):
    frame[CATEGORICAL_FEATURES] = frame[CATEGORICAL_FEATURES].fillna('missing')