In [129]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [130]:
df = pd.read_csv('sell_in_train.csv')

In [131]:
df['primer_periodo'] = pd.to_datetime(df['primer_periodo']).dt.strftime('%Y%m').astype(int)
df['ultimo_periodo'] = pd.to_datetime(df['ultimo_periodo']).dt.strftime('%Y%m').astype(int)
df['periodo_dt'] = pd.to_datetime(df['periodo_dt']).dt.strftime('%Y%m').astype(int)

In [132]:
periodo = 201910

X = df.iloc[:,:-1]
y = df.iloc[:,-1]

X_train = df[df['periodo'] <= periodo].iloc[:,:-1]
X_test = df[df['periodo'] > periodo].iloc[:,:-1]
y_train = df[df['periodo'] <= periodo].iloc[:,-1]
y_test = df[df['periodo'] > periodo]['tn_norm']

In [133]:
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test: {y_test.shape}")

X_train: (20789, 25)
X_test: (1560, 25)
y_train: (20789,)
y_test: (1560,)


In [134]:
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

model = lgb.train(params, train_data, num_boost_round=100, valid_sets=[train_data, test_data])

y_pred = model.predict(X_test, num_iteration=model.best_iteration)

example_row = X_test.iloc[0]
example_pred = model.predict(example_row.values.reshape(1, -1), num_iteration=model.best_iteration)
print(f'Prediction for example row: {example_pred[0]}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001748 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5226
[LightGBM] [Info] Number of data points in the train set: 20789, number of used features: 24
[LightGBM] [Info] Start training from score -0.020332
Prediction for example row: -0.6269385661870562


In [146]:
final = X_test
final['tn_futuro'] = y_test
final['tn_prediccion'] = y_pred

In [136]:
#prod_stats = pd.read_csv('prod_stats.csv')
#prod_stats = prod_stats[['product_id', 'average_tn','std_dev_tn']]
#final = final.merge(prod_stats, how='left', on='product_id')
#final.columns

In [147]:
final['tn_futuro_real'] = final['tn_futuro'] * final['std_dev_tn'] + final['average_tn']
final['tn_prediccion_real'] = final['tn_prediccion'] * final['std_dev_tn'] + final['average_tn']

In [148]:
final

Unnamed: 0,product_id,periodo,tn,primer_periodo,ultimo_periodo,values,total_tn,average_tn,median_tn,std_dev_tn,...,tn_lag_7,tn_lag_8,tn_lag_9,tn_lag_10,tn_lag_11,tn_lag_12,tn_futuro,tn_prediccion,tn_futuro_real,tn_prediccion_real
34,20001,201911,1397.37231,201701,201912,36.0,50340.39558,1398.344322,1418.023430,298.145460,...,0.836149,0.242540,-0.467056,-0.411111,0.296306,1.390834,-0.003260,-0.626939,1397.37231,1211.425434
35,20001,201912,1504.68856,201701,201912,36.0,50340.39558,1398.344322,1418.023430,298.145460,...,0.776259,0.836149,0.242540,-0.467056,-0.411111,0.296306,0.356686,-0.397791,1504.68856,1279.744833
70,20002,201911,1423.57739,201701,201912,36.0,36337.25439,1009.368178,992.005505,303.834835,...,0.915811,0.244400,0.110736,0.847234,0.000284,2.492942,1.363271,-0.145353,1423.57739,965.204899
71,20002,201912,1087.30855,201701,201912,36.0,36337.25439,1009.368178,992.005505,303.834835,...,0.084326,0.915811,0.244400,0.110736,0.847234,0.000284,0.256522,-0.400489,1087.30855,887.685612
106,20003,201911,948.29393,201701,201912,36.0,32004.15274,889.004243,786.715735,292.036581,...,-1.108308,-0.859359,-0.447470,0.259436,-0.408084,1.088608,0.203021,-0.610928,948.29393,710.590815
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22328,21266,201912,0.05121,201903,201912,10.0,0.94659,0.094659,0.055755,0.105968,...,0.770807,2.520768,-0.732192,,,,-0.410020,-0.466727,0.05121,0.045201
22337,21267,201911,0.04052,201903,201912,10.0,0.92835,0.092835,0.066670,0.079939,...,0.376226,1.537993,,,,,-0.654440,-0.577529,0.04052,0.046668
22338,21267,201912,0.01569,201903,201912,10.0,0.92835,0.092835,0.066670,0.079939,...,1.897394,0.376226,1.537993,,,,-0.965053,-0.434905,0.01569,0.058069
22347,21276,201911,0.03341,201903,201912,10.0,0.45447,0.045447,0.027100,0.043618,...,1.290356,1.766304,,,,,-0.275963,-0.455731,0.03341,0.025569


In [149]:
final.to_csv('sell_in_pred.csv', index=False)