In [12]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [13]:
# Read the training and test tables from their Parquet files (paths are
# relative to the notebook's working directory).
df_train, df_test = map(
    pd.read_parquet,
    ('sell_in_train.parquet', 'sell_in_test.parquet'),
)

In [18]:
# Training split: every column except the last is a feature; the last column
# is the target.
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]

# Test split: same positional feature slice, but the target is read by name.
# NOTE(review): 'tn_norm' is selected by name here while the training target is
# positional — confirm both refer to the same quantity. The column listing
# printed further down suggests 'tn_norm' is also one of the X_test feature
# columns, which would make any score computed against y_test misleading.
X_test, y_test = df_test.iloc[:, :-1], df_test['tn_norm']

In [19]:
# Report the dimensions of each split, one entry per output line.
print(
    f"X_train: {X_train.shape}",
    f"X_test: {X_test.shape}",
    f"y_train: {y_train.shape}",
    f"y_test: {y_test.shape}",
    sep="\n",
)

X_train: (20789, 18)
X_test: (780, 18)
y_train: (20789,)
y_test: (780,)


In [20]:
# Wrap both splits as LightGBM datasets. LightGBM asks that a validation
# Dataset use the training Dataset as its `reference`, so state that explicitly.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Gradient-boosted decision trees, L2 regression objective, RMSE as the metric.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# NOTE(review): no early-stopping callback is registered, so the validation
# sets are only evaluated; they do not choose the number of boosting rounds.
model = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, test_data],
)

# Without early stopping there is no meaningful "best iteration". The default
# `num_iteration=None` already falls back to the best iteration when one
# exists and to all trees otherwise, so it is not passed explicitly.
y_pred = model.predict(X_test)

# Sanity check: score the first test row on its own.
example_row = X_test.iloc[0]
# A list indexer keeps a one-row DataFrame, so column names and dtypes reach
# LightGBM unchanged instead of being flattened into a bare ndarray.
example_pred = model.predict(X_test.iloc[[0]])
print(f'Prediction for example row: {example_pred[0]}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001373 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3669
[LightGBM] [Info] Number of data points in the train set: 20789, number of used features: 17
[LightGBM] [Info] Start training from score -0.020332
Prediction for example row: -0.35463766147436404


In [21]:
# Build the results frame as a *new* DataFrame. Plain `final = X_test` would
# only alias the feature matrix, so adding columns would also add them to
# X_test and break any re-run of the training / prediction cell.
# `tn_futuro` is index-aligned with X_test (y_test is a Series); `tn_prediccion`
# is positional (y_pred is an array with one value per X_test row).
final = X_test.assign(tn_futuro=y_test, tn_prediccion=y_pred)

In [23]:
# Per-product statistics; only the join key and the two statistics are needed.
prod_stats = pd.read_parquet(
    'prod_stats.parquet',
    columns=['product_id', 'average_tn', 'std_dev_tn'],
)

# Attach the statistics to every prediction row.
# * Dropping any stat columns left by a previous run keeps this cell idempotent;
#   otherwise a second execution would yield `_x` / `_y` suffixed duplicates.
# * `validate='many_to_one'` raises if `product_id` is duplicated in the stats
#   table instead of silently multiplying prediction rows.
final = (
    final
    .drop(columns=['average_tn', 'std_dev_tn'], errors='ignore')
    .merge(prod_stats, how='left', on='product_id', validate='many_to_one')
)
final.columns

Index(['product_id', 'periodo', 'primer_periodo', 'ultimo_periodo', 'tn_norm',
       'periodo_dt', 'tn_lag_1', 'tn_lag_2', 'tn_lag_3', 'tn_lag_4',
       'tn_lag_5', 'tn_lag_6', 'tn_lag_7', 'tn_lag_8', 'tn_lag_9', 'tn_lag_10',
       'tn_lag_11', 'tn_lag_12', 'tn_futuro', 'tn_prediccion', 'average_tn',
       'std_dev_tn'],
      dtype='object')

In [24]:
# Rescale the target and the prediction with each product's statistics:
# value * std_dev_tn + average_tn.
# NOTE(review): presumably this reverses a per-product standardisation applied
# upstream — confirm against the code that produced `tn_norm`.
final = final.assign(
    tn_futuro_real=lambda frame: (
        frame['tn_futuro'] * frame['std_dev_tn'] + frame['average_tn']
    ),
    tn_prediccion_real=lambda frame: (
        frame['tn_prediccion'] * frame['std_dev_tn'] + frame['average_tn']
    ),
)

In [25]:
# Render the results frame as the cell output for a visual check of the
# normalised and rescaled target / prediction columns.
final

Unnamed: 0,product_id,periodo,primer_periodo,ultimo_periodo,tn_norm,periodo_dt,tn_lag_1,tn_lag_2,tn_lag_3,tn_lag_4,...,tn_lag_9,tn_lag_10,tn_lag_11,tn_lag_12,tn_futuro,tn_prediccion,average_tn,std_dev_tn,tn_futuro_real,tn_prediccion_real
0,20001,201912,201701,201912,0.356686,201912,-0.003260,0.547254,0.877630,-0.459504,...,0.242540,-0.467056,-0.411111,0.296306,0.356686,-0.354638,1398.344322,298.145460,1504.68856,1292.610713
1,20002,201912,201701,201912,0.256522,201912,1.363271,3.193077,0.265998,-0.643725,...,0.244400,0.110736,0.847234,0.000284,0.256522,-0.477632,1009.368178,303.834835,1087.30855,864.246788
2,20003,201912,201701,201912,0.011975,201912,0.203021,0.658692,0.269716,-0.867729,...,-0.859359,-0.447470,0.259436,-0.408084,0.011975,-0.582593,889.004243,292.036581,892.50129,718.865687
3,20004,201912,201701,201912,-0.150213,201912,0.233133,1.751307,0.510385,-0.844204,...,-0.230985,-1.024335,-0.714093,-0.383384,-0.150213,-0.705338,671.615383,224.450085,637.90002,513.302313
4,20005,201912,201701,201912,-0.233451,201912,-0.170835,1.615325,1.078133,-0.492651,...,-0.714639,-1.073430,-1.285619,-1.244157,-0.233451,-0.506950,644.200514,218.273222,593.24443,533.547001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,21263,201912,201810,201912,-0.498974,201912,-0.370080,-0.480589,-0.508232,-0.360887,...,-0.149127,-0.195352,-0.296472,-0.186159,-0.498974,-0.686038,0.089233,0.153381,0.01270,-0.015992
776,21265,201912,201903,201912,-0.362775,201912,-0.216364,0.180776,-0.666076,-0.676554,...,-0.676554,,,,-0.362775,-0.491263,0.089541,0.108803,0.05007,0.036090
777,21266,201912,201903,201912,-0.410020,201912,-0.259786,0.223190,-0.624896,-0.753614,...,-0.732192,,,,-0.410020,-0.451882,0.094659,0.105968,0.05121,0.046774
778,21267,201912,201903,201912,-0.965053,201912,-0.654440,0.049100,-0.932403,-0.654190,...,1.537993,,,,-0.965053,-0.423634,0.092835,0.079939,0.01569,0.058970


In [27]:
# Persist the results frame to Parquet; the DataFrame index is not written.
final.to_parquet(path='sell_in_pred.parquet', index=False)