In [21]:
import pandas as pd
import lightgbm as lgb

In [22]:
# Load the pre-built training and test frames from the working directory.
df_train, df_test = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('sell_in_train.parquet', 'sell_in_test.parquet')
)

In [23]:
# The split is positional: every column except the last is a feature and the
# last column is the target. This silently depends on the column order of the
# parquet files.
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]
X_test, y_test = df_test.iloc[:, :-1], df_test.iloc[:, -1]

In [24]:
# Report the shape of each split as a quick sanity check.
for split_name, split_obj in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name}: {split_obj.shape}")

X_train: (14511, 110)
X_test: (780, 110)
y_train: (14511,)
y_test: (780,)


In [25]:
# Wrap features and targets in LightGBM datasets.
train_data = lgb.Dataset(X_train, label=y_train)
# NOTE(review): the test labels appear to be missing (NaN) for at least some
# rows; if that holds for the whole test set, the rmse LightGBM reports for
# this validation set is not meaningful -- confirm before relying on it.
test_data = lgb.Dataset(X_test, label=y_test)

# Gradient-boosted regression trees evaluated with RMSE.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.01,
    'feature_fraction': 0.4
}

model = lgb.train(params, train_data, num_boost_round=1000, valid_sets=[train_data, test_data])

# No early-stopping callback is configured, so `best_iteration` is not set by
# training and every boosting round is used for prediction. The argument is
# kept so the call stays correct if early stopping is added later.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# Single-row sanity check. The row is selected as a one-row DataFrame
# (`iloc[[0]]`) rather than `iloc[0].values.reshape(1, -1)`: pulling `.values`
# from a mixed-dtype row produces an object array, which drops the column
# names and dtypes and forces LightGBM to coerce the values itself.
example_row = X_test.iloc[[0]]
example_pred = model.predict(example_row, num_iteration=model.best_iteration)
print(f'Prediction for example row: {example_pred[0]}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004983 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16005
[LightGBM] [Info] Number of data points in the train set: 14511, number of used features: 107
[LightGBM] [Info] Start training from score -0.061244
Prediction for example row: -0.9372336717356808


In [26]:
# Build the results frame as a NEW DataFrame. The previous `final = X_test`
# only aliased the feature frame, so adding columns mutated `X_test` itself:
# re-running the prediction cell afterwards would feed the target and the
# prediction back in as features, and pandas may emit SettingWithCopyWarning
# because `X_test` is a slice of `df_test`.
# `y_test` is aligned on the index; `y_pred` is assigned positionally.
final = X_test.assign(tn_futuro=y_test, tn_prediccion=y_pred)

In [27]:
# Per-product statistics used later to map values back to the original scale
# (presumably the mean / standard deviation used for normalisation -- confirm).
# Only the columns that are needed are read from disk.
prod_stats = pd.read_parquet(
    'prod_stats.parquet',
    columns=['product_id', 'average_tn', 'std_dev_tn'],
)

final = (
    final
    # Makes the cell safe to re-run: without this, a second execution would
    # merge again and produce `average_tn_x` / `average_tn_y` style columns.
    .drop(columns=['average_tn', 'std_dev_tn'], errors='ignore')
    # `validate` raises if `prod_stats` has more than one row per product_id,
    # instead of silently duplicating prediction rows.
    .merge(prod_stats, how='left', on='product_id', validate='many_to_one')
)
final.columns

Index(['product_id', 'periodo', 'primer_periodo', 'ultimo_periodo', 'tn_norm',
       'periodo_dt', 'mes', 'quarter', 'month_in_quarter', 'year',
       ...
       'crece_10', 'crece_11', 'crece_12', 'crece_13', 'crece_sum',
       'decrece_sum', 'tn_futuro', 'tn_prediccion', 'average_tn',
       'std_dev_tn'],
      dtype='object', length=114)

In [28]:
# Map the target and the prediction back to the original scale using the same
# formula for both: (tn_norm + value) * std_dev_tn + average_tn.
# NOTE(review): this presumably undoes a per-product standardisation where the
# model target is an offset from `tn_norm` -- confirm against the feature
# engineering step.
for source_col in ('tn_futuro', 'tn_prediccion'):
    shifted = final['tn_norm'] + final[source_col]
    final[f'{source_col}_real'] = shifted * final['std_dev_tn'] + final['average_tn']

In [29]:
# Preview the first five rows of the results frame.
final.head(n=5)

Unnamed: 0,product_id,periodo,primer_periodo,ultimo_periodo,tn_norm,periodo_dt,mes,quarter,month_in_quarter,year,...,crece_12,crece_13,crece_sum,decrece_sum,tn_futuro,tn_prediccion,average_tn,std_dev_tn,tn_futuro_real,tn_prediccion_real
0,20001,201912,201701,201912,0.356686,201912,12,4,3,2019,...,False,False,5,7,,-0.937234,1398.344322,298.14546,,1225.256596
1,20002,201912,201701,201912,0.256522,201912,12,4,3,2019,...,True,False,6,6,,-0.472139,1009.368178,303.834835,,943.856354
2,20003,201912,201701,201912,0.011975,201912,12,4,3,2019,...,True,False,6,6,,-0.502988,889.004243,292.036581,,745.610447
3,20004,201912,201701,201912,-0.150213,201912,12,4,3,2019,...,False,False,5,7,,-0.549373,671.615383,224.450085,,514.593195
4,20005,201912,201701,201912,-0.233451,201912,12,4,3,2019,...,False,False,6,6,,-0.417612,644.200514,218.273222,,502.090965


In [30]:
# Persist the results frame; the index is not written to the file.
final.to_parquet(path='sell_in_pred.parquet', index=False)