In [None]:
import numpy as np
import pandas as pd
import os
import re
import copy
import pickle
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import polars as pl
import polars.selectors as cs
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
import seaborn as sns

from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline

In [None]:
import numpy as np
import pandas as pd

# Load the competition's tabular train/test tables and the data dictionary.
train = pd.read_csv("/kaggle/input/child-mind-institute-problematic-internet-use/train.csv")
test = pd.read_csv("/kaggle/input/child-mind-institute-problematic-internet-use/test.csv")
dictionary = pd.read_csv("/kaggle/input/child-mind-institute-problematic-internet-use/data_dictionary.csv")
# sample_submission = pd.read_csv("/kaggle/input/dataset/sample_submission.csv")

# Preview the first rows of the training table.
train.head()

In [None]:
# Keep the test ids as an array now: `convert` drops the 'id' column from `test`
# later on, and the submission cell reads the ids from here.
id_to_test = np.array(test['id'])

In [None]:
def process_file(filename, dirname):
    """Summarise one participant's accelerometer series.

    Parameters
    ----------
    filename : str
        Partition directory name of the form ``"id=<participant id>"``.
    dirname : str
        Directory that contains the partition directories.

    Returns
    -------
    tuple
        ``(stats, participant_id)`` where ``stats`` is the flattened
        ``DataFrame.describe()`` table of the X, Y, Z, enmo and anglez columns
        and ``participant_id`` is the text after ``=`` in ``filename``.
    """
    signal_columns = ['X', 'Y', 'Z', 'enmo', 'anglez']
    # Only the five signal columns are summarised, so ask the parquet reader
    # for just those instead of loading everything and dropping 'step'.
    df = pd.read_parquet(
        os.path.join(dirname, filename, 'part-0.parquet'),
        columns=signal_columns,
    )
    # Re-select so the column order of the statistics never depends on the reader.
    df = df.loc[:, signal_columns]
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    """Build one row of summary statistics per series found under ``dirname``.

    Every entry of ``dirname`` is handed to ``process_file`` on a thread pool.
    The returned frame has columns ``stat_0 .. stat_{k-1}`` followed by ``id``.
    """
    entries = os.listdir(dirname)

    def _summarise(entry):
        return process_file(entry, dirname)

    with ThreadPoolExecutor() as pool:
        progress = tqdm(pool.map(_summarise, entries), total=len(entries))
        collected = list(progress)

    # Split the (stats, id) pairs into two parallel tuples.
    stats, indexes = zip(*collected)

    n_stats = len(stats[0])
    frame = pd.DataFrame(stats, columns=[f"stat_{k}" for k in range(n_stats)])
    frame['id'] = indexes
    return frame


class AutoEncoder(nn.Module):
    """Fully connected autoencoder used to compress tabular feature vectors.

    Parameters
    ----------
    input_dim : int
        Number of input (and reconstructed) features.
    encoding_dim : int
        Size of the bottleneck produced by ``self.encoder``.

    Notes
    -----
    The decoder ends in a plain linear layer. It previously ended in a
    ``Sigmoid``, which limits reconstructions to (0, 1); the only caller in
    this notebook trains against StandardScaler-standardised inputs (zero
    mean, unbounded), so those targets were unreachable. Layer indices that
    carry parameters (0, 2, 4) are unchanged.
    """

    def __init__(self, input_dim, encoding_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim*3),
            nn.ReLU(),
            nn.Linear(encoding_dim*3, encoding_dim*2),
            nn.ReLU(),
            nn.Linear(encoding_dim*2, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, input_dim*2),
            nn.ReLU(),
            nn.Linear(input_dim*2, input_dim*3),
            nn.ReLU(),
            # Linear output: reconstructions must be able to take any real value.
            nn.Linear(input_dim*3, input_dim)
        )

    def forward(self, x):
        """Return the reconstruction of ``x`` (encode, then decode)."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


def perform_autoencoder(df, encoding_dim=50, epochs=50, batch_size=32):
    """Train an ``AutoEncoder`` on ``df`` and return the encoded rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature table, one row per sample.
    encoding_dim : int
        Bottleneck size, i.e. the number of ``Enc_*`` columns returned.
    epochs : int
        Number of passes over the data.
    batch_size : int
        Number of consecutive rows per optimisation step (rows are not shuffled).

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``Enc_1 .. Enc_<encoding_dim>`` and a fresh
        ``RangeIndex``, in the same row order as ``df``.
    """
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)
    # A single NaN would make the loss NaN and, after one optimiser step,
    # every weight NaN. After standardisation 0 is the column mean, so use it
    # as the neutral fill value for missing entries.
    df_scaled = np.where(np.isnan(df_scaled), 0.0, df_scaled)

    data_tensor = torch.FloatTensor(df_scaled)

    input_dim = data_tensor.shape[1]
    autoencoder = AutoEncoder(input_dim, encoding_dim)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())

    for epoch in range(epochs):
        for i in range(0, len(data_tensor), batch_size):
            batch = data_tensor[i : i + batch_size]
            optimizer.zero_grad()
            reconstructed = autoencoder(batch)
            loss = criterion(reconstructed, batch)
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 10 == 0:
            # `loss` is the loss of the last mini-batch of this epoch.
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}')

    with torch.no_grad():
        encoded_data = autoencoder.encoder(data_tensor).numpy()

    df_encoded = pd.DataFrame(encoded_data, columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])])

    return df_encoded

def feature_engineering(df):
    """Drop every ``*Season*`` column and append 16 interaction features.

    Each derived column is either the product or the quotient of two existing
    columns. The input frame is left untouched; a new frame is returned with
    the derived columns appended in the order listed below.
    """
    # (new column, left operand, operator, right operand)
    recipes = (
        ('BMI_Age', 'Physical-BMI', '*', 'Basic_Demos-Age'),
        ('Internet_Hours_Age', 'PreInt_EduHx-computerinternet_hoursday', '*', 'Basic_Demos-Age'),
        ('BMI_Internet_Hours', 'Physical-BMI', '*', 'PreInt_EduHx-computerinternet_hoursday'),
        ('BFP_BMI', 'BIA-BIA_Fat', '/', 'BIA-BIA_BMI'),
        ('FFMI_BFP', 'BIA-BIA_FFMI', '/', 'BIA-BIA_Fat'),
        ('FMI_BFP', 'BIA-BIA_FMI', '/', 'BIA-BIA_Fat'),
        ('LST_TBW', 'BIA-BIA_LST', '/', 'BIA-BIA_TBW'),
        ('BFP_BMR', 'BIA-BIA_Fat', '*', 'BIA-BIA_BMR'),
        ('BFP_DEE', 'BIA-BIA_Fat', '*', 'BIA-BIA_DEE'),
        ('BMR_Weight', 'BIA-BIA_BMR', '/', 'Physical-Weight'),
        ('DEE_Weight', 'BIA-BIA_DEE', '/', 'Physical-Weight'),
        ('SMM_Height', 'BIA-BIA_SMM', '/', 'Physical-Height'),
        ('Muscle_to_Fat', 'BIA-BIA_SMM', '/', 'BIA-BIA_FMI'),
        ('Hydration_Status', 'BIA-BIA_TBW', '/', 'Physical-Weight'),
        ('ICW_TBW', 'BIA-BIA_ICW', '/', 'BIA-BIA_TBW'),
        ('BMI_PHR', 'Physical-BMI', '*', 'Physical-HeartRate'),
    )

    seasonal = [name for name in df.columns if 'Season' in name]
    out = df.drop(columns=seasonal)

    for target, left, op, right in recipes:
        if op == '*':
            out[target] = out[left] * out[right]
        else:
            out[target] = out[left] / out[right]

    return out

In [None]:
import os 
from tqdm import tqdm 
def extract_enmo(df_source, id=None):
    """Compute the share of worn time spent at each ENMO intensity level.

    Rows with ``non-wear_flag != 0`` are ignored. Among the remaining rows
    with a non-null ``step``, each sample is classified by its ``enmo`` value:

    * sedentary: ``enmo < 0.010``
    * light:     ``0.010 <= enmo < 0.100``
    * moderate:  ``enmo >= 0.100``

    Samples whose ``enmo`` is NaN count towards the total but towards none of
    the three classes.

    Parameters
    ----------
    df_source : pandas.DataFrame
        Series with at least ``non-wear_flag``, ``enmo`` and ``step`` columns.
        It is not modified.
    id : str, optional
        Identifier stored in the ``id`` column of the result.

    Returns
    -------
    pandas.DataFrame
        A single row with columns ``id``, ``sedentary_por``, ``light_por`` and
        ``moderate_por``. The proportions are NaN when there is no worn sample.
    """
    low_cut = 10*1e-3
    high_cut = 100*1e-3

    # Work on the two needed columns of the worn rows only; the source frame is
    # never copied in full or mutated.
    worn = df_source.loc[df_source['non-wear_flag'] == 0, ['enmo', 'step']]
    enmo = worn['enmo']
    counted = worn['step'].notna()  # same rows that Series.count() would count
    total_wear = counted.sum()

    def _share(mask):
        # Explicit NaN instead of a 0/0 RuntimeWarning when nothing was worn.
        if total_wear == 0:
            return float('nan')
        return (mask & counted).sum() / total_wear

    sedentary_perall = _share(enmo < low_cut)
    light_perall = _share((enmo >= low_cut) & (enmo < high_cut))
    moderate_perall = _share(enmo >= high_cut)

    return pd.DataFrame({'id': [id],
                         'sedentary_por': [sedentary_perall],
                         'light_por': [light_perall],
                         'moderate_por': [moderate_perall]}
                       )

def getEnmo(ts_path):
    """Run ``extract_enmo`` on every series under ``ts_path``.

    Parameters
    ----------
    ts_path : str
        Directory whose entries are ``id=<participant id>`` folders, each
        holding a ``part-0.parquet`` file.

    Returns
    -------
    pandas.DataFrame or None
        One row per participant (``id`` plus the three activity proportions)
        with a fresh ``RangeIndex``, or ``None`` when ``ts_path`` is empty.
    """
    per_participant = []
    for entry in tqdm(os.listdir(ts_path)):
        series = pd.read_parquet(os.path.join(ts_path, entry, "part-0.parquet"))
        participant_id = entry[3:]  # strip the leading "id="
        per_participant.append(extract_enmo(series, id=participant_id))

    if not per_participant:
        return None
    # Concatenate once: growing a frame inside the loop is quadratic and left
    # every row with index label 0.
    return pd.concat(per_participant, ignore_index=True)

# The frame returned by getEnmo (`res_df`) is the result.

In [None]:
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from torch.utils.data import DataLoader
import torch
from torch import nn
from torch.nn import functional as F
from sklearn.preprocessing import StandardScaler

class CNN(nn.Module):
    """1-D convolutional classifier over 5-channel time series.

    Five strided ``Conv1d`` layers (the first four each followed by max
    pooling), global average pooling over time, then
    ``fc1 -> fc15 -> fc2`` producing a softmax over 4 classes.

    Inputs may be unbatched ``(5, L)`` or batched ``(N, 5, L)``. Outputs keep
    the batch axis when one is given: ``(4,)`` / ``(N, 4)`` for the class
    probabilities and ``(64,)`` / ``(N, 64)`` for the embedding.

    Attribute names are kept as they were (including ``avgpool*``, which are
    max-pooling layers) so previously saved state dicts still load.
    """

    def __init__(self):
        super().__init__()
        self.cur_epoches = 0
        # input: 5 channels along dim -2, time along dim -1
        self.conv1 = nn.Conv1d(5, 64, kernel_size=3, stride=2, padding='valid')
        self.avgpool1 = nn.MaxPool1d(kernel_size=2, stride=2)

        self.conv2 = nn.Conv1d(64, 128, kernel_size=3, stride=2, padding='valid')
        self.avgpool2 = nn.MaxPool1d(kernel_size=2, stride=2)

        self.conv3 = nn.Conv1d(128, 128, kernel_size=3, stride=2, padding='valid')
        self.avgpool3 = nn.MaxPool1d(kernel_size=2, stride=2)

        self.conv4 = nn.Conv1d(128, 256, kernel_size=3, stride=2, padding='valid')
        self.avgpool4 = nn.MaxPool1d(kernel_size=2, stride=2)

        self.conv5 = nn.Conv1d(256, 256, kernel_size=3, stride=2, padding='valid')

        self.fc1 = nn.Linear(256, 128)
        self.fc15 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, 4)

        self.dropout1 = nn.Dropout(0.3, inplace=False)
        self.dropout2 = nn.Dropout(0.3, inplace=False)

        self.act = nn.LeakyReLU(0.1)
        self.act1 = nn.Sigmoid()

    def _embed(self, x, debug=False):
        """Run the conv stack, global pooling, fc1 and fc15; return the 64-d embedding."""
        stages = (
            (self.conv1, self.avgpool1),
            (self.conv2, self.avgpool2),
            (self.conv3, self.avgpool3),
            (self.conv4, self.avgpool4),
            (self.conv5, None),
        )
        for conv, pool in stages:
            x = self.act(conv(x))
            if debug: print(x.shape)
            if pool is not None:
                x = pool(x)
                if debug: print(x.shape)

        # Squeeze only the pooled time axis; a bare squeeze() would also drop
        # the batch axis of a batch of size one.
        x = F.adaptive_avg_pool1d(x, 1).squeeze(-1)
        if debug: print(x.shape)

        x = self.act1(self.fc1(x))
        if debug: print(x.shape)
        x = self.dropout1(x)

        return self.act1(self.fc15(x))

    def _classify(self, embedding, debug=False):
        """Map a 64-d embedding to class probabilities (dropout, fc2, softmax)."""
        logits = self.fc2(self.dropout2(embedding))
        if debug: print(logits.shape)
        # Normalise over the class axis; dim=0 would normalise across the
        # batch for batched input.
        return F.softmax(logits, dim=-1)

    def forward(self, x, debug=False):
        """Return class probabilities for ``x``; ``debug=True`` prints intermediate shapes."""
        return self._classify(self._embed(x, debug=debug), debug=debug)

    def feature_extract(self, x, debug=False):
        """Return ``(embedding, probabilities)`` for ``x``.

        The embedding is the activated ``fc15`` output taken before the second
        dropout; the probabilities are what ``forward`` returns.
        """
        embedding = self._embed(x, debug=debug)
        return embedding, self._classify(embedding, debug=debug)
        

In [None]:
class Dataset:
    """In-memory dataset of standardised accelerometer series.

    Parameters
    ----------
    device : str or torch.device
        Device that batches are moved to in ``collate``.
    path_tabu : str
        CSV with at least ``id`` and, for labelled splits, ``sii``.
    path_ts : str
        Directory of ``id=<participant id>/part-0.parquet`` series.
    preload : bool
        When true, all series (and labels) are loaded in the constructor.
        ``collate`` and ``dataloader`` read the preloaded data.
    type : {'train', 'val', 'test'}
        ``'train'`` keeps the first 80% of the sorted ids, ``'val'`` the last
        20%, and any other value keeps all ids. ``'test'`` loads no labels.
    """

    def __init__(self, device, path_tabu, path_ts, preload=True, type='train'):
        self.device = device
        self.path_ts = path_ts
        self.tabu_data = pd.read_csv(path_tabu)
        self.ids = [x[3:] for x in os.listdir(path_ts)]  # strip the "id=" prefix
        self.filter()
        self.ids.sort()
        self.type = type
        if type == 'train':
            n = len(self.ids)
            self.ids = self.ids[:int(n*0.8)]
        elif type == 'val':
            n = len(self.ids)
            self.ids = self.ids[int(n*0.8):]
        self.preload = preload
        if self.preload:
            self.ts_data_X, self.ts_data_Y = self.load_all_data()

    def filter(self):
        """Keep only the ids whose series has at least 900 rows."""
        temp_ids = []
        for id in tqdm(self.ids):
            # Only the row count matters here, so read a single column.
            df = pd.read_parquet(
                os.path.join(self.path_ts, "id=" + id, "part-0.parquet"),
                columns=['step'],
            )
            if df.shape[0] >= 900:
                temp_ids.append(id)
        self.ids = temp_ids

    def collate(self, index):
        """Assemble a batch from sample positions.

        Returns a dict with ``'X'``, a list of per-sample tensors on
        ``self.device`` (series lengths differ, so they are not stacked), and
        ``'Y'``, a float32 one-hot tensor with 4 classes, or ``None`` when the
        dataset has no labels.
        """
        X = [self.ts_data_X[i].to(self.device) for i in index]
        Y = None
        if self.ts_data_Y is not None:
            Y = [self.ts_data_Y[i] for i in index]
            Y = torch.tensor(Y, dtype=torch.int64)
            Y = torch.nn.functional.one_hot(Y, num_classes=4).to(self.device).to(torch.float32)
        return {'X': X,
                'Y': Y}

    def dataloader(self, batch_size=1):
        """Return a ``DataLoader`` over the preloaded samples.

        ``batch_size=-1`` yields a single batch with every sample. Only the
        ``'train'`` split is shuffled; ``'val'`` and ``'test'`` are iterated in
        the order of ``self.ids`` so outputs can be matched back to ids.
        """
        size = len(self.ts_data_X)
        batch_size = size if batch_size == -1 else batch_size
        loader = DataLoader(
            list(range(size)),
            batch_size=batch_size,
            collate_fn=self.collate,
            shuffle=self.type == 'train',
            num_workers=0,
        )
        return loader

    def load_all_data(self):
        """Load every series in ``self.ids``.

        Each series is reduced to the X, Y, Z, enmo and anglez columns,
        standardised per series, and stored as a float32 tensor with the five
        channels on dim 0 and time on dim 1.

        Returns ``(inputs, labels)``; ``labels`` is ``None`` for the ``'test'``
        split, otherwise the ``sii`` value looked up in the tabular data for
        each id.
        """
        inputs = []
        labels = None if self.type == 'test' else list()
        for id in tqdm(self.ids):
            df = pd.read_parquet(os.path.join(self.path_ts, "id=" + id, "part-0.parquet"))
            df = df.loc[:, ['X', 'Y', 'Z', 'enmo', 'anglez']]
            # normalize the signals
            scaler = StandardScaler()
            df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

            # Pin float32 so the tensors always match the model's parameters.
            input = torch.tensor(df.values, dtype=torch.float32)
            input = input.T
            inputs.append(input)
            # NOTE(review): a missing `sii` would produce a NaN label here, which
            # `collate` cannot turn into a class index - confirm that labelled
            # splits only contain ids with a known `sii`.
            if self.type != 'test': labels.append(self.tabu_data[self.tabu_data['id'] == id]['sii'].values[0])

        return inputs, labels
        
# train_tabu_path="/kaggle/input/child-mind-institute-problematic-internet-use/train.csv"
# train_ts_path='/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet'
# dataset = Dataset('cpu', path_tabu=train_tabu_path, path_ts=train_ts_path, type='train')
# dataloader = dataset.dataloader()
# model.zero_grad()
# for data in dataloader:
#     print(data['X'][0])
#     print(data['Y'])

In [None]:
def get_feature_64_4(tabu_path, ts_path, merge=False,
                     model_path="/kaggle/input/cnn-feature_extractor/pytorch/default/5/model-50 (1).pth"):
    """Extract CNN features for every series under ``ts_path``.

    Parameters
    ----------
    tabu_path : str
        Tabular CSV passed to ``Dataset``.
    ts_path : str
        Directory of ``id=<participant id>`` series folders.
    merge : bool
        When true, return one frame with 68 ``feature_*`` columns (64-d
        embedding followed by the 4 class probabilities); otherwise return the
        two frames separately.
    model_path : str
        Path of the saved ``CNN`` state dict.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        ``merge=True``: a single frame with ``id`` + ``feature_*`` columns.
        ``merge=False``: ``(features64_frame, features4_frame)``, each with an
        ``id`` column first.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CNN()
    # map_location lets weights saved on a GPU load on a CPU-only machine.
    model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))
    model.to(device)
    model.eval()

    trainset = Dataset(device, path_tabu=tabu_path, path_ts=ts_path, type='test')

    features64 = list()
    features4 = list()
    with torch.no_grad():
        # Walk the preloaded series directly, in the order of `trainset.ids`.
        # Going through the data loader is unsafe here: if it shuffles, the
        # features no longer line up with the ids attached below.
        for series in trainset.ts_data_X:
            x_64, x_4 = model.feature_extract(series.to(device))
            features64.append(x_64.cpu())
            features4.append(x_4.cpu())

    # torch.stack rejects an empty list; fall back to correctly shaped empties.
    features64 = (torch.stack(features64, dim=0) if features64 else torch.empty((0, 64))).numpy()
    features4 = (torch.stack(features4, dim=0) if features4 else torch.empty((0, 4))).numpy()
    ids = np.array(trainset.ids)
    if merge:
        features68 = np.concatenate([features64, features4], axis=-1)
        ts_features_df = pd.DataFrame(features68, columns=[f'feature_{i}' for i in range(features68.shape[1])])
        ts_features_df.insert(0, 'id', ids)
        return ts_features_df
    else:
        ts_features_df64 = pd.DataFrame(features64, columns=[f'feature64_{i}' for i in range(features64.shape[1])])
        ts_features_df64.insert(0, 'id', ids)

        ts_features_df4 = pd.DataFrame(features4, columns=[f'feature4_{i}' for i in range(features4.shape[1])])
        ts_features_df4.insert(0, 'id', ids)
        return ts_features_df64, ts_features_df4

In [None]:
# Locations of the tabular CSVs and of the per-participant series directories.
tabu_path_train = '/kaggle/input/child-mind-institute-problematic-internet-use/train.csv'
ts_path_train = '/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet'

tabu_path_test = '/kaggle/input/child-mind-institute-problematic-internet-use/test.csv'
ts_path_test = '/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet'

# CNN features per participant: the 64-column and the 4-column frames separately.
train_64, train_4 = get_feature_64_4(tabu_path_train, ts_path_train, merge=False)
test_64, test_4 = get_feature_64_4(tabu_path_test, ts_path_test, merge=False)

# Activity-intensity proportions per participant.
train_enmo = getEnmo(ts_path_train)
test_enmo = getEnmo(ts_path_test)

# 3 ENMO proportions + 4 CNN outputs per id. The left join keeps every id that
# has ENMO proportions, even when it has no CNN features.
train_ts_7 = train_enmo.merge(train_4, on ='id', how='left')
test_ts_7 = test_enmo.merge(test_4, on='id', how='left')

In [None]:

# Per-participant summary statistics of the raw series.
train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

# Encode train and test with ONE scaler and ONE autoencoder. Training a
# separate autoencoder per split puts the `Enc_*` columns of train and test in
# unrelated latent spaces, so a model fitted on the train encodings could not
# be applied meaningfully to the test encodings.
n_train_rows = len(df_train)
df_all = pd.concat([df_train, df_test], ignore_index=True)
all_ts_encoded = perform_autoencoder(df_all, encoding_dim=60, epochs=100, batch_size=32)

train_ts_encoded = all_ts_encoded.iloc[:n_train_rows].reset_index(drop=True)
test_ts_encoded = all_ts_encoded.iloc[n_train_rows:].reset_index(drop=True)

# Rows are in the same order as `train_ts` / `test_ts`, so attach ids by position.
train_ts_encoded["id"] = train_ts["id"].to_numpy()
test_ts_encoded['id'] = test_ts["id"].to_numpy()

train_ts_encoded = train_ts_encoded.merge(train_ts_7, on='id', how='left')
test_ts_encoded = test_ts_encoded.merge(test_ts_7, on='id', how='left')

# train_ts_encoded and test_ts_encoded are the final time-series features.

In [None]:
# Attach the time-series features to the tabular rows. The left join keeps
# participants without a series; their time-series columns are NaN.
# Note: this cell rebinds `train` and `test`, so re-running it merges again.
train = train.merge(train_ts_encoded, on='id', how='left')
test = test.merge(test_ts_encoded, on='id', how='left')
train

In [None]:
from sklearn import preprocessing

# Module-level scikit-learn LabelEncoder instance.
encoding = preprocessing.LabelEncoder()

# Per-column value -> integer code tables shared by every `convert` call, so the
# same category gets the same code in every dataset.
_CATEGORY_CODES = {}


def convert(dataset, codes=None):
    """Drop the ``id`` column and integer-encode every non-numeric column.

    Codes are assigned per column name. The first dataset converted for a
    column defines its codes (its values in sorted order, starting at 0);
    later datasets reuse those codes, and values not seen before are given
    the next free integers. Missing values are encoded as the category
    ``"Missing"``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to convert. It is not modified.
    codes : dict, optional
        ``{column: {value: code}}`` registry to read and extend. Defaults to
        a module-level registry shared by all calls.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``id`` whose non-numeric columns are ``int64``.
    """
    registry = _CATEGORY_CODES if codes is None else codes

    if 'id' in dataset.columns:
        dataset = dataset.drop(columns='id')
    else:
        dataset = dataset.copy()  # never write into the caller's frame

    for col in dataset.select_dtypes(exclude="number").columns:
        raw = dataset[col].astype(object)
        values = raw.where(raw.notna(), "Missing")

        mapping = registry.setdefault(col, {})
        unseen = set(values.unique()).difference(mapping)
        for value in sorted(unseen, key=str):
            mapping[value] = len(mapping)

        dataset[col] = values.map(mapping).astype("int64")
    return dataset

In [None]:
def reduction(dataset):
    """Drop, in place, every column (except ``sii``) that is more than 70% missing.

    The same ``dataset`` object is returned after the drop.
    """
    too_sparse = [
        name
        for name in dataset.columns
        if name != 'sii' and dataset[name].isna().sum() > 0.7 * len(dataset[name])
    ]
    dataset.drop(columns=too_sparse, errors="ignore", inplace=True)
    return dataset

In [None]:
def fillCategory(dataset):
    """Turn every object-dtype column into a categorical, in place.

    Missing entries are first replaced by the literal ``"Missing"``. The same
    ``dataset`` object is returned.
    """
    for name in dataset.select_dtypes("object").columns:
        filled = dataset[name].fillna("Missing")
        dataset[name] = filled.astype("category")
    return dataset

#def fillNum(dataset):
    #cols = dataset.select_dtypes("number").columns
    #dataset[cols] = dataset[cols].fillna(dataset[cols].median())
#    return dataset

In [None]:
# Restrict the training table to the columns that also exist in the test table.
common_columns = train.columns.intersection(test.columns)
final_train = pd.DataFrame(train[common_columns])

In [None]:
# Re-attach the target and keep only the rows where it is known.
final_train['sii'] = train['sii'].reset_index(drop=True)
final_train = final_train.dropna(subset='sii')

# Train: drop mostly-missing columns, make object columns categorical, then
# drop 'id' and integer-encode the non-numeric columns.
final_train = reduction(final_train)
final_train = fillCategory(final_train)
final_train = convert(final_train)
# Test: `convert` only (no `reduction` / `fillCategory`). This rebinds `test`
# to a frame without the 'id' column.
test = convert(test)
final_train

In [None]:
# Display the converted test table.
test

In [None]:
#final_train = fillCategory(final_train)
#final_train

In [None]:
# Take the shared columns in the order of `final_train`, not of `test`: the
# modelling cells turn both tables into bare numpy arrays, so features are
# matched by position and the two column orders must be identical.
new_common_columns = pd.Index([col for col in final_train.columns if col in test.columns])
final_test = pd.DataFrame(test[new_common_columns])
final_test = convert(final_test)

# Every training feature (all columns except the target) must be present in
# the test table, in the same order.
assert list(final_test.columns) == [col for col in final_train.columns if col != 'sii'], \
    "test features are not aligned with the training features"
len(final_test.columns)

In [None]:
def rounding(y):
    """Map continuous predictions to the classes 0-3 with cut points 0.5 / 1.5 / 2.5.

    Values below 0.5 become 0, below 1.5 become 1, below 2.5 become 2 and
    everything else (including NaN) becomes 3.
    """
    thresholds = [y < 0.5, y < 1.5, y < 2.5]
    # The first matching threshold wins; 3 is the fallback class.
    return np.select(thresholds, [0, 1, 2], default=3)

In [None]:
from sklearn.model_selection import train_test_split
# Split features and target and convert both to numpy arrays (column names are
# lost from here on, so features are identified by position only).
feature_train = final_train.drop(columns = 'sii')
X = np.array(feature_train)
y = np.array(final_train["sii"])
# Ordered hold-out: with shuffle=False the last 20% of the rows form the test part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

# Competition test features, used for the final predictions.
X_to_test = np.array(final_test)
# NOTE(review): these are the first 20 *training* labels, not labels of the
# competition test rows; the name is not referenced again in the visible cells.
y_to_test = y[:20]


In [None]:
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
# Hyper-parameter grid sampled by the randomized search below.
param_distributions = {
    'iterations': [1, 5, 10, 50, 100, 200],
    'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5, 0.8],
    'depth': [1, 2, 3, 5, 10, 15],
}
# Despite the variable name this is a regressor; its continuous predictions are
# turned into classes by `rounding` in the next cell.
# task_type='GPU' is hard-coded here - NOTE(review): presumably this requires a
# GPU runtime; confirm before running on CPU.
classifier = CatBoostRegressor(verbose=0, task_type = 'GPU', gpu_ram_part = 0.5, early_stopping_rounds=20)

# 100 random configurations, 3-fold CV, selected by R^2.
CatBoost = RandomizedSearchCV(
    estimator=classifier,
    param_distributions=param_distributions,
    n_iter=100, 
    cv=3,
    scoring='r2', 
    random_state=42,
    n_jobs=1
)
# Only the first 80% of the rows (X_train) are used for the search;
# `use_best_model=False` is forwarded to the estimator's fit().
CatBoost.fit(X_train, y_train, use_best_model=False)
prediction1 = CatBoost.predict(X_to_test)

In [None]:
# Flatten to 1-D and map the continuous predictions to the classes 0-3.
prediction1 = prediction1.flatten()
prediction1 = rounding(prediction1)
prediction1.shape

In [None]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
# Hyper-parameter grid sampled by the randomized search below.
param_distributions = {
    "n_estimators": [5, 10, 25, 50, 100, 200],
    "learning_rate": [0.005, 0.01, 0.05, 0.1, 0.5, 0.8],
    "lambda_l1": [0.1, 1, 5, 10, 25, 50],
    "lambda_l2": [0.1, 1, 5, 10, 25, 50],
}
# `verbose` was misspelled `verboss`, so the verbosity setting never took effect.
# NOTE(review): this is the only classifier among the three models; the others
# are regressors scored with R^2 and rounded afterwards. Confirm whether
# LGBMRegressor was intended here.
classifier = lgb.LGBMClassifier(verbose=0, force_col_wise=True)

# 100 random configurations, 3-fold CV, selected by R^2, using all cores.
LightGBM = RandomizedSearchCV(
    estimator=classifier,
    param_distributions=param_distributions,
    n_iter=100,
    cv=3,
    scoring='r2',
    random_state=42,
    n_jobs=-1
)

LightGBM.fit(X_train, y_train)
prediction2 = LightGBM.predict(X_to_test)

In [None]:
# Map the LightGBM predictions to the classes 0-3.
prediction2 = rounding(prediction2)

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, KFold

from sklearn.metrics import cohen_kappa_score
# Hyper-parameter grid sampled by the randomized search below.
param_distributions = {
    "n_estimators": [5, 10, 25, 50, 100, 500],
    "max_depth" : [1, 5, 10, 25, 50, 100],
    "learning_rate": [0.005, 0.01, 0.05, 0.1, 0.5, 0.8],
    "min_child_weight": [1, 5, 10, 20, 30, 50],
    "colsample_bytree": [0.01, 0.05, 0.1, 0.5, 0.8, 1],
    "subsample": [0.01, 0.05, 0.1, 0.5, 0.8, 1],
    "reg_alpha": [0.1, 1, 5, 10, 25, 50],
    "reg_lambda": [0.1, 1, 5, 10, 25, 50],
}
# Despite the variable name this is a regressor; predictions are rounded to
# classes in the next cell.
classifier = XGBRegressor()
# 100 random configurations, 3-fold CV, selected by R^2, using all cores.
XGBoost = RandomizedSearchCV (
    estimator=classifier,
    param_distributions=param_distributions,
    n_iter=100, 
    cv=3,
    scoring='r2', 
    random_state=42,
    n_jobs=-1
)
XGBoost.fit(X_train, y_train)    
prediction3 = XGBoost.predict(X_to_test)

In [None]:
# Map the XGBoost predictions to the classes 0-3.
prediction3 = rounding(prediction3)

In [None]:
# Wrap the three rounded prediction arrays as pandas Series.
prediction1 = pd.Series(prediction1)
prediction2 = pd.Series(prediction2)
prediction3 = pd.Series(prediction3)
#prediction = pd.DataFrame(prediction1, prediction2, prediction3)

In [None]:
# Build the submission from the CatBoost predictions only; prediction2 and
# prediction3 are not used here.
submission = pd.DataFrame({
    'id': id_to_test,
    'sii': prediction1
})
submission

In [None]:
# Write the submission file without the index column.
submission.to_csv("submission.csv", index=False)