In [None]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb




## Overview

In this task you will have to solve a standard problem on tabular data. However, the final solution must be obtained using the function **clf_train**.

## Metric

Square root of [RMSE](https://en.wikipedia.org/wiki/Root_mean_square_deviation), i.e. the fourth root of the mean squared error:
$$ SCORE = (\sum_{i=1}^{n}{(true_i - predict_i)^{2}}/n)^{1/4} $$
* **true** - the real value of the target
* **predict** - your predicted value
* **n** - the number of samples (length of the target)

## Restriction

You cannot change the code of the **clf_train** function. You can only use submissions produced by this function. This function takes the following as input: the training and test datasets with the features preprocessed by you, the training target, the sample weights for training, the id column for generating the submission file, and a function for inverting the target transformation.

## Data

* **train_tables.csv** - train dataset with 9 numeric features, 3 datetime features and target
* **test_tables.csv** - test dataset with 9 numeric features, 3 datetime features and id for submission
* **sample_submission.csv** - example of a submission file with the id column and the target column that needs to be predicted.

read train and test dataframes

In [2]:
# Load the competition's train and test tables from the Kaggle input directory.
# Both files are read with pandas' default CSV settings.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/neoai-2025-tricy-table-data/train_tables.csv',
        '/kaggle/input/neoai-2025-tricy-table-data/test_tables.csv',
    )
)

Inference function

**You cannot change this function.**

In [None]:
def clf_train(train, test, target, weight_col, id_col, name_file = 'sub.csv', func_inv=None):
    """Train a fixed-configuration LightGBM model and write a submission CSV.

    The competition rules stated in this notebook forbid changing this
    function's code; only comments and this docstring have been added.

    Parameters
    ----------
    train : training feature table, passed to ``lgb.Dataset`` as the data.
    test : test feature table, passed to ``bst.predict``.
    target : training labels, passed to ``lgb.Dataset`` as the label.
    weight_col : per-row training weights, passed as ``weight=``.
    id_col : values written to the ``id`` column of the submission.
    name_file : path of the CSV file to write (default ``'sub.csv'``).
    func_inv : optional callable applied to the raw predictions before they
        are written; skipped when falsy.

    Returns
    -------
    None. The function has no ``return`` statement; its only result is the
    CSV written to ``name_file`` with the columns ``id`` and ``target``.
    """

    # Fixed hyper-parameters; the objective is LightGBM's 'mae' (L1 regression).
    param = {
    'learning_rate': 0.1,
    'num_leaves': 48,
    'lambda_l1' : 1,
    'lambda_l2' : 1,
    'min_data_in_leaf' : 100,
    'objective': 'mae',
    'verbosity':-1,
    }
    
    # Placeholder only: this array is overwritten by bst.predict below.
    predict_test = np.zeros(len(test))

    # Single fit on the full training data for a fixed 500 boosting rounds
    # (no validation set or early stopping is passed to lgb.train).
    tr = lgb.Dataset(train, target, weight=weight_col)
    bst = lgb.train(param, tr, num_boost_round=500)
    predict_test = bst.predict(test)
    # Map predictions back to the original target scale when an inverse
    # transformation was supplied by the caller.
    if func_inv:
        predict_test = func_inv(predict_test)

    # Assemble the two-column submission and write it to disk.
    sub = pd.DataFrame()
    sub['id'] = id_col
    sub['target'] = predict_test
    # index = None is falsy, so no row-index column is expected in the file.
    sub.to_csv(name_file, index = None)
  
    
    

Function to change target if you need it

In [None]:
def func_inv(x):
    """Undo the fourth-root transformation applied to the training target.

    The model in this notebook is fitted on ``train['target'] ** 0.25``; the
    author's stated rationale is that the competition score is itself a
    fourth root (of the mean squared error). Raising the model output to the
    fourth power returns predictions to the original target scale.

    Parameters
    ----------
    x : scalar or array of predictions in fourth-root space.

    Returns
    -------
    ``x ** 4``, with the same type/shape semantics as the ``**`` operator.
    """
    return x ** 4

Train and inference. 
You should use **clf_train** for generating submission

In [None]:

# Impute missing `feat_1` and `feat_8` values in the test data with the mean of the
# same feature inside the row's (day, hour) time window. The window means are computed
# on the rows whose day and hour are both known, then merged back onto the full test
# frame to fill the gaps.

test_clean = test.dropna(subset=['day', 'hour']).copy()
# Mean of feat_1 and feat_8 per (day, hour). The result has one row per window, so the
# left merge below yields exactly one output row per test row.
group_test_means = test_clean.groupby(['day', 'hour'])[['feat_1', 'feat_8']].mean().reset_index()
test = test.merge(group_test_means, on=['day', 'hour'], how='left', suffixes=('', '_time_mean'))
test['feat_1'] = test['feat_1'].fillna(test['feat_1_time_mean'])
test['feat_8'] = test['feat_8'].fillna(test['feat_8_time_mean'])
# DataFrame.drop returns a new frame, so the result must be re-bound; otherwise the
# helper columns stay in `test` and a second run of this cell collides with them.
test = test.drop(columns=['feat_1_time_mean', 'feat_8_time_mean'])

# Every train column except the target is used as a model feature; the same column
# list is selected from the test frame so both frames share one feature layout.
drop_cols = ['target']
train_cols = [c for c in train.columns if c not in drop_cols]
# Uniform sample weights.
weight = np.ones(len(train))
# The model is trained on the fourth root of the target and func_inv maps predictions
# back. clf_train writes 'submission.csv' and has no return statement, so `test_sub`
# is None.
test_sub = clf_train(train[train_cols], test[train_cols], train['target']** 0.25 , weight, test['id'].tolist(), 'submission.csv', func_inv = func_inv)


In [None]:
# To better understand how feature distributions evolve over time, compute the
# relative standard deviation of each feature within every (day, hour) time window:
# the per-window std divided by the std over the whole (cleaned) dataset.
# Train and test statistics are computed separately, each from its own rows.

feat_cols = [f'feat_{i}' for i in range(9)]
rel_std_cols = [f'{feat}_rel_std' for feat in feat_cols]

# Drop rows where day or hour is NaN
train_clean = train.dropna(subset=['day', 'hour']).copy()
test_clean = test.dropna(subset=['day', 'hour']).copy()

# Compute overall std for each feature (across whole dataset)
overall_std = train_clean[feat_cols].std()
overall_test_std = test_clean[feat_cols].std()

# Compute group std per (day, hour)
group_std = train_clean.groupby(['day', 'hour'])[feat_cols].std().reset_index()
group_test_std = test_clean.groupby(['day', 'hour'])[feat_cols].std().reset_index()

# Calculate relative std (group std / overall std) for both datasets
for feat in feat_cols:
    group_std[f'{feat}_rel_std'] = group_std[feat] / overall_std[feat]
    group_test_std[f'{feat}_rel_std'] = group_test_std[feat] / overall_test_std[feat]

# Attach the relative std of each row's time window as new feature columns.
# Any *_rel_std columns left over from a previous run of this cell are dropped first,
# so re-executing it does not produce duplicated `_x` / `_y` columns.
train = train.drop(columns=rel_std_cols, errors='ignore').merge(
    group_std[['day', 'hour'] + rel_std_cols],
    on=['day', 'hour'], how='left')

# The test frame gets the statistics computed from the test rows (group_test_std),
# mirroring what is done for train with group_std.
test = test.drop(columns=rel_std_cols, errors='ignore').merge(
    group_test_std[['day', 'hour'] + rel_std_cols],
    on=['day', 'hour'], how='left')


print(group_test_std)


    day  hour     feat_0      feat_1    feat_2    feat_3     feat_4    feat_5  \
0   1.0   3.0  68.318174   35.154312  0.161525  1.645722  24.300528  5.032904   
1   1.0   4.0  32.708355   29.140498  0.163607  1.940780  18.415738  3.206455   
2   1.0   5.0  55.050122  149.877779  0.164393  4.026098  26.455074  4.527512   
3   1.0   6.0  65.085840   45.206753  0.161610  3.438532  24.638165  4.585673   
4   1.0   7.0  72.455214   19.098902  0.153890  4.968696  18.054194  5.117336   
5   1.0   8.0  49.415961   20.245195  0.158357  3.234293  17.899657  3.861229   
6   1.0   9.0  47.534170  111.305035  0.153543  3.402877  21.417155  3.510116   
7   1.0  10.0  50.198807   27.179672  0.156781  3.483692  18.005464  3.585000   
8   1.0  11.0  37.262027   23.611157  0.151215  2.644291  18.356149  2.642663   
9   1.0  12.0  41.815335   21.798794  0.154236  2.988080  17.801661  3.129901   
10  1.0  13.0  35.620418   34.256741  0.159014  2.364941  17.869114  2.510433   
11  1.0  14.0  54.024655   2