In [51]:
import pandas as pd
import lightgbm as lgb
#from lightgbm import LGBMRegressor

In [52]:
# Read the three splits from parquet files in the working directory,
# in the order train -> test -> futuro.
df_train, df_test, df_futuro = map(
    pd.read_parquet,
    ('sell_in_train.parquet', 'sell_in_test.parquet', 'sell_in_futuro.parquet'),
)

# Print the (rows, columns) shape of each frame, one line per split.
for split_name, split_frame in (
    ('df_train', df_train),
    ('df_test', df_test),
    ('df_futuro', df_futuro),
):
    print(f"{split_name}: {split_frame.shape}")

df_train: (13731, 138)
df_test: (780, 138)
df_futuro: (780, 138)


In [53]:
# Replace every pandas `category` column with its integer codes so the
# frames hold plain numeric values.
#
# The columns to convert are discovered on the training frame only.
categorical_features = df_train.select_dtypes(['category']).columns.tolist()
for col in categorical_features:
    print(f"convertida {col}")
    # `.cat.codes` numbers labels by their position in *that frame's* category
    # list. Taking codes independently per frame would let the same label map
    # to different integers in train / test / futuro whenever the category
    # lists differ. Re-express test and futuro with the training categories
    # first, so one label always gets one code; labels unseen in training
    # become -1 (the pandas code for "not in categories").
    train_categories = df_train[col].cat.categories
    df_train[col] = df_train[col].cat.codes
    df_test[col] = df_test[col].cat.set_categories(train_categories).cat.codes
    df_futuro[col] = df_futuro[col].cat.set_categories(train_categories).cat.codes

convertida yearquarter
convertida cat1
convertida cat2
convertida cat3
convertida brand


In [54]:
# Positional split of each frame: every column except the last goes to X,
# the last column goes to y.
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]
X_test, y_test = df_test.iloc[:, :-1], df_test.iloc[:, -1]
X_futuro, y_futuro = df_futuro.iloc[:, :-1], df_futuro.iloc[:, -1]

# Report the shapes: the three feature matrices first, then the three targets.
for part_name, part in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('X_futuro', X_futuro),
    ('y_train', y_train),
    ('y_test', y_test),
    ('y_futuro', y_futuro),
):
    print(f"{part_name}: {part.shape}")

X_train: (13731, 137)
X_test: (780, 137)
X_futuro: (780, 137)
y_train: (13731,)
y_test: (780,)
y_futuro: (780,)


In [55]:
# Wrap each split (features + label) in a LightGBM Dataset.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)
# Not used further down in this cell; kept in case another cell relies on it.
futuro_data = lgb.Dataset(X_futuro, label=y_futuro)

params = {
    'objective': 'regression',
    # A list instead of a set: a set of strings has no stable iteration order
    # between kernel processes, so the metric order could change run to run.
    'metric': ['l2', 'rmse'],
    'boosting_type': 'gbdt',
    'num_leaves': 50,
#    'max_depth': -1,
    'learning_rate': 0.01,
#    'feature_fraction': 0.4
}

# Train for a fixed 1000 rounds, evaluating on both the train and test sets.
model = lgb.train(params, train_data, num_boost_round=1000, valid_sets=[train_data, test_data])

# NOTE(review): no early-stopping callback is passed to lgb.train above, so
# `best_iteration` is presumably not a tuned value here and predict() would
# then fall back to using every tree -- confirm against the LightGBM docs.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
y_pred_futuro = model.predict(X_futuro, num_iteration=model.best_iteration)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006113 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19447
[LightGBM] [Info] Number of data points in the train set: 13731, number of used features: 134
[LightGBM] [Info] Start training from score -0.023311


In [56]:
# Build the evaluation frame: test features plus the target (`tn_futuro`)
# and the model prediction (`tn_prediccion`).
#
# `assign` returns a new DataFrame, so X_test itself is left untouched.
# Binding `final = X_test` and adding columns to it would also add them to
# X_test, and re-running the training/predict cell afterwards would then see
# the target and the prediction among the features.
final = X_test.assign(tn_futuro=y_test, tn_prediccion=y_pred)

In [57]:
# Load the per-product statistics file and keep only the join key plus the
# `average_tn` and `std_dev_tn` columns.
prod_stats = pd.read_parquet('prod_stats.parquet').loc[
    :, ['product_id', 'average_tn', 'std_dev_tn']
]

# Left join on product_id: every row of `final` is kept and gains the two
# statistics columns (NaN where a product has no entry in prod_stats).
final = pd.merge(final, prod_stats, how='left', on='product_id')

# Show the resulting column index as the cell output.
final.columns

Index(['product_id', 'periodo', 'primer_periodo', 'ultimo_periodo', 'tn_norm',
       'periodo_dt', 'mes', 'quarter', 'month_in_quarter', 'year',
       ...
       'prop_product_yearquarter_cat1', 'prop_product_yearquarter_cat2',
       'prop_product_yearquarter_brand', 'prop_product_year_cat1',
       'prop_product_year_cat2', 'prop_product_year_brand', 'tn_futuro',
       'tn_prediccion', 'average_tn', 'std_dev_tn'],
      dtype='object', length=141)

In [58]:
# For both the observed target and the prediction, compute
#     (tn_norm + value) * std_dev_tn + average_tn
# and store it in the matching `*_real` column.
# NOTE(review): presumably this undoes a per-product standardisation of a
# target expressed as a delta over `tn_norm` -- confirm against the
# feature-building step.
for value_col, real_col in (
    ('tn_futuro', 'tn_futuro_real'),
    ('tn_prediccion', 'tn_prediccion_real'),
):
    final[real_col] = (final['tn_norm'] + final[value_col]) * final['std_dev_tn'] + final['average_tn']

In [59]:
# Preview the first five rows of the evaluation frame.
final.head(5)

Unnamed: 0,product_id,periodo,primer_periodo,ultimo_periodo,tn_norm,periodo_dt,mes,quarter,month_in_quarter,year,...,prop_product_yearquarter_brand,prop_product_year_cat1,prop_product_year_cat2,prop_product_year_brand,tn_futuro,tn_prediccion,average_tn,std_dev_tn,tn_futuro_real,tn_prediccion_real
0,20001,201910,201701,201912,0.547254,201910,10,4,1,2019,...,25.615935,0.729053,1.30015,6.721442,-0.190568,-0.537512,1398.344322,298.14546,1504.68856,1401.248846
1,20002,201910,201701,201912,3.193077,201910,10,4,1,2019,...,8.629338,0.924228,1.648214,2.181345,-2.936555,-2.275262,1009.368178,303.834835,1087.30855,1288.232643
2,20003,201910,201701,201912,0.658692,201910,10,4,1,2019,...,9.281088,1.524496,2.028025,2.711313,-0.646717,-0.312087,889.004243,292.036581,892.50129,990.22561
3,20004,201910,201701,201912,1.751307,201910,10,4,1,2019,...,9.138012,1.500995,1.996762,2.669516,-1.90152,-1.96425,671.615383,224.450085,637.90002,623.82014
4,20005,201910,201701,201912,1.615325,201910,10,4,1,2019,...,8.555128,1.405251,1.869395,2.499236,-1.848776,-1.102689,644.200514,218.273222,593.24443,756.095265


In [60]:
# Persist the evaluation frame to 'sell_in_pred.parquet' in the working
# directory; index=False leaves the DataFrame index out of the file.
final.to_parquet('sell_in_pred.parquet', index=False)

In [61]:
# Build the forecast frame: futuro features plus the label column
# (`tn_futuro`) and the model prediction (`tn_prediccion`).
# `assign` returns a new DataFrame, so X_futuro is not modified; aliasing it
# and adding columns in place would leak these columns into the feature
# matrix if the predict cell were re-run.
final_futuro = X_futuro.assign(tn_futuro=y_futuro, tn_prediccion=y_pred_futuro)

# Attach the per-product statistics loaded earlier in the notebook
# (`prod_stats`); the left join keeps every futuro row.
final_futuro = final_futuro.merge(prod_stats, how='left', on='product_id')

# Same transformation as applied to the test frame:
#     (tn_norm + value) * std_dev_tn + average_tn
final_futuro['tn_futuro_real'] = (final_futuro['tn_norm'] + final_futuro['tn_futuro']) * final_futuro['std_dev_tn'] + final_futuro['average_tn']
final_futuro['tn_prediccion_real'] = (final_futuro['tn_norm'] + final_futuro['tn_prediccion']) * final_futuro['std_dev_tn'] + final_futuro['average_tn']

# Persist the forecast frame without the DataFrame index.
final_futuro.to_parquet('sell_in_pred_futuro.parquet', index=False)