In [11]:
import pandas as pd
import lightgbm as lgb

In [12]:
# Load the training and test tables from their parquet files (train first, then test).
df_train, df_test = map(
    pd.read_parquet,
    ('sell_in_train.parquet', 'sell_in_test.parquet'),
)

In [13]:
# Name of the label column. Selecting it by name (instead of "last column")
# keeps the train and test splits consistent with each other and guards
# against the target leaking into the features if the column order changes.
TARGET_COL = 'tn_futuro'

# Features: every column except the target. `drop` returns a new frame,
# so later edits to the feature matrices do not touch df_train / df_test.
X_train = df_train.drop(columns=TARGET_COL)
X_test = df_test.drop(columns=TARGET_COL)

# Labels: the target column of each table.
y_train = df_train[TARGET_COL]
y_test = df_test[TARGET_COL]

In [14]:
# Sanity check: report the dimensions of each split, one per line.
print(
    f"X_train: {X_train.shape}",
    f"X_test: {X_test.shape}",
    f"y_train: {y_train.shape}",
    f"y_test: {y_test.shape}",
    sep="\n",
)

X_train: (20789, 42)
X_test: (780, 42)
y_train: (20789,)
y_test: (780,)


In [15]:
# Wrap features and labels in LightGBM Dataset objects.
train_data = lgb.Dataset(X_train, label=y_train)
# Tie the evaluation Dataset to the training one explicitly so both are
# binned with the same feature discretisation.
# NOTE(review): the test rows displayed further down in this notebook have
# NaN tn_futuro; if the whole test target is missing, the metric computed on
# test_data carries no information — confirm.
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Gradient-boosted regression trees evaluated with RMSE.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.01,
    'feature_fraction': 0.4  # each tree sees a random 40% of the features
}

# NOTE(review): no early-stopping callback is passed, so all 10000 rounds are
# trained and `best_iteration` presumably keeps its "not set" value, in which
# case predict() falls back to using every tree — confirm against the
# LightGBM documentation before relying on it.
model = lgb.train(params, train_data, num_boost_round=10000, valid_sets=[train_data, test_data])

# Predictions for the whole test feature matrix.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# Predict a single row. `iloc[[0]]` (list indexer) keeps a one-row DataFrame,
# so column names and per-column dtypes reach the model unchanged; going
# through `Series.values.reshape` would drop the names and may yield an
# object-dtype array when the columns have mixed dtypes.
example_row = X_test.iloc[[0]]
example_pred = model.predict(example_row, num_iteration=model.best_iteration)
print(f'Prediction for example row: {example_pred[0]}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002279 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9789
[LightGBM] [Info] Number of data points in the train set: 20789, number of used features: 41
[LightGBM] [Info] Start training from score 0.057580
Prediction for example row: 0.7750481715120037


In [16]:
# Build the results table as a NEW frame: test features plus the true target
# and the model prediction. `final = X_test` would only alias X_test, so the
# two extra columns would be written into the feature matrix itself and any
# re-run of the training/prediction cell would see 44 columns instead of 42.
# `y_test` (a Series) is aligned on the index; `y_pred` (an array) is
# assigned positionally — the same semantics as column assignment.
final = X_test.assign(tn_futuro=y_test, tn_prediccion=y_pred)

In [17]:
# Per-product statistics; only the key and the two columns used below are read.
stat_cols = ['product_id', 'average_tn', 'std_dev_tn']
prod_stats = pd.read_parquet('prod_stats.parquet', columns=stat_cols)

# Attach the statistics to every result row.
# - Dropping any stat columns left over from a previous run keeps this cell
#   idempotent (a second plain merge would create average_tn_x / average_tn_y).
# - validate='many_to_one' raises if prod_stats has repeated product_id values,
#   which would otherwise silently duplicate result rows.
final = (
    final
    .drop(columns=['average_tn', 'std_dev_tn'], errors='ignore')
    .merge(prod_stats, how='left', on='product_id', validate='many_to_one')
)
final.columns

Index(['product_id', 'periodo', 'primer_periodo', 'ultimo_periodo', 'tn_norm',
       'periodo_dt', 'tn_lag_1', 'tn_lag_2', 'tn_lag_3', 'tn_lag_4',
       'tn_lag_5', 'tn_lag_6', 'tn_lag_7', 'tn_lag_8', 'tn_lag_9', 'tn_lag_10',
       'tn_lag_11', 'tn_lag_12', 'tn_diff_1', 'tn_diff_2', 'tn_diff_3',
       'tn_diff_4', 'tn_diff_5', 'tn_diff_6', 'tn_diff_7', 'tn_diff_8',
       'tn_diff_9', 'tn_diff_10', 'tn_diff_11', 'tn_diff_12', 'tn_diff2_1',
       'tn_diff2_2', 'tn_diff2_3', 'tn_diff2_4', 'tn_diff2_5', 'tn_diff2_6',
       'tn_diff2_7', 'tn_diff2_8', 'tn_diff2_9', 'tn_diff2_10', 'tn_diff2_11',
       'tn_diff2_12', 'tn_futuro', 'tn_prediccion', 'average_tn',
       'std_dev_tn'],
      dtype='object')

In [18]:
# Map the target and the prediction through the same affine transform:
# add tn_norm, multiply by the product's std_dev_tn, then add its average_tn.
# NOTE(review): presumably this undoes a per-product standardisation of a
# target expressed as a delta over tn_norm — confirm against the feature
# engineering code.
base_norm = final['tn_norm']
scale = final['std_dev_tn']
offset = final['average_tn']

final['tn_futuro_real'] = base_norm.add(final['tn_futuro']).mul(scale).add(offset)
final['tn_prediccion_real'] = base_norm.add(final['tn_prediccion']).mul(scale).add(offset)

In [19]:
# Preview the first five rows of the results table.
final.head(n=5)

Unnamed: 0,product_id,periodo,primer_periodo,ultimo_periodo,tn_norm,periodo_dt,tn_lag_1,tn_lag_2,tn_lag_3,tn_lag_4,...,tn_diff2_9,tn_diff2_10,tn_diff2_11,tn_diff2_12,tn_futuro,tn_prediccion,average_tn,std_dev_tn,tn_futuro_real,tn_prediccion_real
0,20001,201912,201701,201912,0.356686,201912,-0.00326,0.547254,0.87763,-0.459504,...,-0.34965,0.415891,1.067363,1.454473,,0.775048,1398.344322,298.14546,,1735.765654
1,20002,201912,201701,201912,0.256522,201912,1.363271,3.193077,0.265998,-0.643725,...,-1.240414,-0.37025,-1.953699,1.385908,,0.66293,1009.368178,303.834835,,1288.729765
2,20003,201912,201701,201912,0.011975,201912,0.203021,0.658692,0.269716,-0.867729,...,0.220842,0.51586,-0.858568,1.305646,,0.830771,889.004243,292.036581,,1135.116926
3,20004,201912,201701,201912,-0.150213,201912,0.233133,1.751307,0.510385,-0.844204,...,-1.176696,-0.073104,-0.052637,0.58249,,0.689765,671.615383,224.450085,,792.717853
4,20005,201912,201701,201912,-0.233451,201912,-0.170835,1.615325,1.078133,-0.492651,...,-0.421406,-0.274804,-0.021154,0.380083,,0.206793,644.200514,218.273222,,638.381746


In [20]:
# Persist the results table; the row index is not written to the file.
final.to_parquet(path='sell_in_pred.parquet', index=False)