In [283]:
# %pip install pandas
# %pip install numpy
# %pip install scikit-learn
# %pip install tqdm

In [284]:
import pandas as pd
import numpy as np
import os
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

from sklearn.ensemble import RandomForestRegressor

# Preprocessing data


In [285]:
df_data_dictionary = pd.read_csv(
    "Dataset/child-mind-institute-problematic-internet-use/data_dictionary.csv"
)

## Load time series data


In [286]:
def process_parquet_file(path: str) -> list:
    """
    Process data in a parquet file of an id:
        - Read parquet file.
        - Drop column "step".
        - Calculate statistics for all columns then flatten it. This is used as
        additional features along with ones in train.csv.
        - Extract ID from the path and append to the end of column list above.

    Parameters:
        path (str): path to the parquet file.

    Returns:
        A list contains statistical columns as additional features and ID at the end.
    """

    df = pd.read_parquet(path)
    df.drop("step", axis=1, inplace=True)
    res = df.describe().values.flatten().tolist()
    id = path.split("=")[-1].split("\\")[0]
    res.append(id)

    return res

In [287]:
def load_parquet_files(dir: str) -> pd.DataFrame:
    """
    Load all parquet file ands process each one in 4 threads.

    Parameters:
        dir (str): path the to the directory which contains parquet files.

    Returns:
        A list contains results of processed parquet files.
    """

    all_ids = [os.listdir(dir)[0]]
    all_parquet_files = [os.path.join(dir, id, "part-0.parquet") for id in all_ids]

    with ThreadPoolExecutor(max_workers=4) as excuter:
        results = list(
            tqdm(
                excuter.map(process_parquet_file, all_parquet_files), total=len(all_ids)
            )
        )

    return results

In [288]:
def create_df_parquet(dir: str) -> pd.DataFrame:
    """
    Create a Dataframe using restuls of processed parquet files.

    Parameters:
        dir (str): path the to the directory which contains parquet files.

    Returns:
        A Dataframe contains statistical time series data of all IDs.
    """

    time_series_data = load_parquet_files(dir)
    df = pd.DataFrame(
        data=time_series_data,
        columns=["Stat_" + str(i) for i in range(len(time_series_data[0]))],
    )

    df.rename(columns={df.columns[-1]: "id"}, inplace=True)

    return df

In [289]:
df_train_parquet = create_df_parquet(
    dir="Dataset/child-mind-institute-problematic-internet-use/series_train.parquet"
)

df_test_parquet = create_df_parquet(
    dir="Dataset/child-mind-institute-problematic-internet-use/series_test.parquet"
)

100%|██████████| 1/1 [00:00<00:00, 48.41it/s]
100%|██████████| 1/1 [00:00<00:00, 50.26it/s]


## Load csv data


In [290]:
df_train_csv = pd.read_csv(
    "Dataset/child-mind-institute-problematic-internet-use/train.csv"
)

df_test_csv = pd.read_csv(
    "Dataset/child-mind-institute-problematic-internet-use/test.csv"
)

## Merge time series and csv data


In [291]:
df_train = pd.merge(df_train_csv, df_train_parquet, how="left", on="id")
df_test = pd.merge(df_test_csv, df_test_parquet, how="left", on="id")

In [292]:
test_id = df_test["id"]

In [293]:
df_train.drop("id", axis=1, inplace=True)
df_test.drop("id", axis=1, inplace=True)

## Drop PCIAT columns


In [294]:
df_train = df_train[[col for col in df_train.columns if "PCIAT" not in col]]
df_train = df_train[[col for col in df_train if col != "sii"] + ["sii"]]
df_train.dropna(subset="sii", inplace=True)

## Mapping string data to numeric


In [295]:
season_columns = [
    "Basic_Demos-Enroll_Season",
    "CGAS-Season",
    "Physical-Season",
    "Fitness_Endurance-Season",
    "FGC-Season",
    "BIA-Season",
    "PAQ_A-Season",
    "PAQ_C-Season",
    "SDS-Season",
    "PreInt_EduHx-Season",
]

season_mapping = {"Summer": 0, "Winter": 1, "Spring": 2, "Fall": 3, "Missing": 4}

In [296]:
def season_to_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill in missing data of season-related columns and convert
    to numeric category.

    Parameters:
        df (pandas.Dataframe): a Dataframe to be processed.

    Returns:
        A processed Dataframe (just filling and mapping season-related columns).
    """

    for col in season_columns:
        df[col] = df[col].fillna("Missing")
        df[col] = df[col].map(season_mapping)
        df[col] = df[col].astype(int)

    return df

In [297]:
df_train = season_to_numeric(df_train)
df_test = season_to_numeric(df_test)

## Fill in missing cells using mean


In [298]:
df_train = df_train.fillna(df_train.mean())
df_test = df_test.fillna(df_test.mean())

In [299]:
df_test[df_test.columns.tolist()]

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,...,Stat_86,Stat_87,Stat_88,Stat_89,Stat_90,Stat_91,Stat_92,Stat_93,Stat_94,Stat_95
0,3,5,0,1,51.0,3,16.877316,46.0,50.8,25.4,...,1.546979,4.004276,89.751656,0.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0
1,0,9,0,4,62.5,3,14.03559,48.0,46.0,22.0,...,1.546979,4.004276,89.751656,0.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0
2,0,10,1,3,71.0,3,16.648696,56.5,75.6,25.4,...,1.546979,4.004276,89.751656,0.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0
3,1,9,0,3,71.0,0,18.292347,56.0,81.6,25.4,...,1.546979,4.004276,89.751656,0.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0
4,2,18,1,0,62.5,4,19.835939,52.961538,79.2,25.4,...,1.546979,4.004276,89.751656,0.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0
5,2,13,1,1,50.0,0,22.279952,59.5,112.2,25.4,...,1.546979,4.004276,89.751656,0.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0
6,3,10,0,4,62.5,3,19.66076,55.0,84.6,25.4,...,1.546979,4.004276,89.751656,0.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0
7,3,10,1,4,62.5,3,16.861286,59.25,84.2,27.0,...,1.546979,4.004276,89.751656,0.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0
8,0,15,0,4,62.5,2,19.835939,52.961538,79.2,25.4,...,1.546979,4.004276,89.751656,0.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0
9,0,19,1,0,62.5,4,19.835939,52.961538,79.2,25.4,...,1.546979,4.004276,89.751656,0.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0


# Train model


In [300]:
x_train = df_train.drop(columns=["sii"], axis=1)
y_train = df_train["sii"]

In [None]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(x_train, y_train)

In [302]:
y_pred = model.predict(df_test)
rounded_pred = np.round(y_pred).astype(int)

In [303]:
submission = pd.DataFrame({"id": test_id, "sii": rounded_pred})

In [304]:
submission.to_csv("submission.csv", index=False)

# Submission
