In [123]:
import pandas as pd
import lightgbm as lgb

In [124]:
# Load the three pre-built feature tables (train / test / future horizon)
# from the working directory, then report their shapes as a quick sanity check.
df_train, df_test, df_futuro = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'sell_in_train.parquet',
        'sell_in_test.parquet',
        'sell_in_futuro.parquet',
    )
)
print(f"df_train: {df_train.shape}")
print(f"df_test: {df_test.shape}")
print(f"df_futuro: {df_futuro.shape}")

df_train: (14511, 138)
df_test: (780, 138)
df_futuro: (780, 138)


In [125]:
# Replace every pandas-categorical column with its integer codes so the frames
# are fully numeric.
#
# The training frame defines the label -> code mapping. Test and future frames
# are first cast to the *training* categorical dtype, so a given label always
# maps to the same integer in all three frames. Taking `.cat.codes` on each
# frame independently would silently produce inconsistent encodings whenever
# the frames carry different category sets. Labels unseen in training become
# -1 (the pandas code for "missing").
categorical_features = df_train.select_dtypes(['category']).columns.tolist()
for col in categorical_features:
    print(f"convertida {col}")
    # Capture the dtype before the training column is overwritten with codes.
    train_dtype = df_train[col].dtype
    df_test[col] = df_test[col].astype(train_dtype).cat.codes
    df_futuro[col] = df_futuro[col].astype(train_dtype).cat.codes
    df_train[col] = df_train[col].cat.codes

convertida yearquarter
convertida cat1
convertida cat2
convertida cat3
convertida brand


In [126]:
# Positional split of each frame: every column except the last is a feature,
# the last column is the label.
# NOTE(review): presumably the last column is the `tn_futuro` target - confirm
# against the notebook that writes these parquet files.
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]
X_test, y_test = df_test.iloc[:, :-1], df_test.iloc[:, -1]
X_futuro, y_futuro = df_futuro.iloc[:, :-1], df_futuro.iloc[:, -1]

# Shape report: feature matrices first, then label vectors.
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"X_futuro: {X_futuro.shape}")

print(f"y_train: {y_train.shape}")
print(f"y_test: {y_test.shape}")
print(f"y_futuro: {y_futuro.shape}")

X_train: (14511, 137)
X_test: (780, 137)
X_futuro: (780, 137)
y_train: (14511,)
y_test: (780,)
y_futuro: (780,)


In [127]:
# Wrap the feature/label splits in LightGBM Dataset objects. The evaluation
# set is tied to the training set through `reference`, which is what the
# LightGBM documentation asks for on validation data so that both share the
# same feature binning.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
# NOTE(review): futuro_data is not used by any cell visible here; it is kept
# only so the name stays defined for anything that may still rely on it.
futuro_data = lgb.Dataset(X_futuro, label=y_futuro)

# Plain GBDT regression evaluated with RMSE; each tree sees 40% of the features.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.01,
    'feature_fraction': 0.4
}

# Fixed budget of 1000 boosting rounds, with RMSE tracked on the train and
# test sets. No early-stopping callback is registered, so all rounds are kept.
model = lgb.train(params, train_data, num_boost_round=1000, valid_sets=[train_data, test_data])

# Predict with the full model. `model.best_iteration` is only set to a selected
# round by early stopping, which is not used here, so passing it as
# `num_iteration` merely suggested a model selection that never takes place.
# The default already falls back to the best iteration if one is ever recorded.
y_pred = model.predict(X_test)
y_pred_futuro = model.predict(X_futuro)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004940 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19450
[LightGBM] [Info] Number of data points in the train set: 14511, number of used features: 134
[LightGBM] [Info] Start training from score -0.061244


In [128]:
# Build the results frame for the test period: the test features plus the
# label (`tn_futuro`, aligned on index) and the model output (`tn_prediccion`,
# positional array).
#
# `.assign` returns a new DataFrame. Binding `final = X_test` and then adding
# columns would mutate X_test itself, so re-running the prediction cell would
# feed the target and the previous predictions to the model as extra features.
final = X_test.assign(tn_futuro=y_test, tn_prediccion=y_pred)

In [129]:
# Per-product statistics; only the join key and the two columns used later
# for de-normalisation are kept.
stats_columns = ['product_id', 'average_tn', 'std_dev_tn']
prod_stats = pd.read_parquet('prod_stats.parquet')[stats_columns]

# Left join keeps every prediction row even if a product has no stats entry.
# NOTE(review): assumes one row per product_id in prod_stats; duplicates would
# multiply prediction rows - confirm.
final = pd.merge(final, prod_stats, how='left', on='product_id')
final.columns

Index(['product_id', 'periodo', 'primer_periodo', 'ultimo_periodo', 'tn_norm',
       'periodo_dt', 'mes', 'quarter', 'month_in_quarter', 'year',
       ...
       'prop_product_yearquarter_cat1', 'prop_product_yearquarter_cat2',
       'prop_product_yearquarter_brand', 'prop_product_year_cat1',
       'prop_product_year_cat2', 'prop_product_year_brand', 'tn_futuro',
       'tn_prediccion', 'average_tn', 'std_dev_tn'],
      dtype='object', length=141)

In [130]:
# Map the label and the prediction back to the original scale: add the current
# normalised value `tn_norm`, multiply by the product's standard deviation and
# add the product's mean.
# NOTE(review): this implies the model target is a delta in normalised units
# relative to `tn_norm` - confirm against the feature-engineering notebook.
for delta_col, real_col in (
    ('tn_futuro', 'tn_futuro_real'),
    ('tn_prediccion', 'tn_prediccion_real'),
):
    final[real_col] = (final['tn_norm'] + final[delta_col]) * final['std_dev_tn'] + final['average_tn']

In [131]:
# Preview the first five rows of the enriched test-period results.
final.head(n=5)

Unnamed: 0,product_id,periodo,primer_periodo,ultimo_periodo,tn_norm,periodo_dt,mes,quarter,month_in_quarter,year,...,prop_product_yearquarter_brand,prop_product_year_cat1,prop_product_year_cat2,prop_product_year_brand,tn_futuro,tn_prediccion,average_tn,std_dev_tn,tn_futuro_real,tn_prediccion_real
0,20001,201912,201701,201912,0.356686,201912,12,4,3,2019,...,24.683874,0.702526,1.252843,6.476876,,-0.890872,1398.344322,298.14546,,1239.079079
1,20002,201912,201701,201912,0.256522,201912,12,4,3,2019,...,4.739874,0.507655,0.905322,1.198157,,-0.553614,1009.368178,303.834835,,919.101355
2,20003,201912,201701,201912,0.011975,201912,12,4,3,2019,...,7.660107,1.258237,1.673822,2.237771,,-0.653311,889.004243,292.036581,,701.710657
3,20004,201912,201701,201912,-0.150213,201912,12,4,3,2019,...,5.47493,0.899303,1.196336,1.599409,,-0.56977,671.615383,224.450085,,510.015118
4,20005,201912,201701,201912,-0.233451,201912,12,4,3,2019,...,5.091663,0.836348,1.112587,1.487444,,-0.508377,644.200514,218.273222,,482.279275


In [132]:
# Persist the test-period predictions; the row index is not written.
final.to_parquet(path='sell_in_pred.parquet', index=False)

In [133]:
# Same post-processing as for the test period, applied to the future horizon.
#
# `.assign` builds a new frame. Binding `final_futuro = X_futuro` and adding
# columns in place would mutate X_futuro, so any later re-run of the prediction
# cell would pass the label and prediction columns to the model as features.
final_futuro = (
    X_futuro
    .assign(tn_futuro=y_futuro, tn_prediccion=y_pred_futuro)
    # Left join keeps every row even when a product has no stats entry.
    .merge(prod_stats, how='left', on='product_id')
)

# Back to the original scale: (tn_norm + delta) * std_dev_tn + average_tn.
final_futuro['tn_futuro_real'] = (final_futuro['tn_norm'] + final_futuro['tn_futuro']) * final_futuro['std_dev_tn'] + final_futuro['average_tn']
final_futuro['tn_prediccion_real'] = (final_futuro['tn_norm'] + final_futuro['tn_prediccion']) * final_futuro['std_dev_tn'] + final_futuro['average_tn']

# Persist the future-horizon predictions; the row index is not written.
final_futuro.to_parquet('sell_in_pred_futuro.parquet', index=False)