In [1]:
import re

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

from data_pipeline import data_transform_pipeline, cap_sales

# Hold-out split by date: rows before SPLIT_DATE train the model, rows on/after it validate it.
SPLIT_DATE = '2017-08-01'
SALES_CAP = 6500  # upper bound handed to cap_sales for the target column

data = pd.read_csv('data/train.csv')

# Each split is taken as an explicit copy. cap_sales assigns to df.sales and the
# .pop('sales') calls below mutate the frame; doing that on a boolean-mask slice
# of `data` is what raises pandas' SettingWithCopyWarning.
train_data = cap_sales(data[data['date'] < SPLIT_DATE].copy(), SALES_CAP)
# NOTE(review): the hold-out target is capped as well, so metrics reported on it
# are measured against capped sales — confirm this is intended.
test_data = cap_sales(data[data['date'] >= SPLIT_DATE].copy(), SALES_CAP)

# pop() removes the target column from the frame and returns it, leaving only features.
X_train = train_data
y_train = X_train.pop('sales')

X_test = test_data
y_test = X_test.pop('sales')

# The pipeline is fitted on the training split only and then reused unchanged for the hold-out split.
X_train = data_transform_pipeline.fit_transform(X_train)
X_test = data_transform_pipeline.transform(X_test)

# Keep only letters, digits and underscores in the column names
# (presumably because LightGBM rejects other characters in feature names — confirm).
X_train = X_train.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))
X_test = X_test.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.sales = df.sales.map(lambda x: x if x < cap else cap)


In [2]:
# Gradient-boosted regressor reused by the cells below; random_state is pinned to 42.
model = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.1,
    num_leaves=100,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    force_row_wise=True,
    random_state=42,
)

In [40]:
# Train on the training split only; X_test / y_test are not passed to this fit.
model.fit(X=X_train, y=y_train)

[LightGBM] [Info] Total Bins 593
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 40
[LightGBM] [Info] Start training from score 337.277697


In [41]:
# Score the model on the hold-out split; the MSE is computed once and reused for its root.
predictions = model.predict(X_test)
holdout_mse = mean_squared_error(y_test, predictions)
print(f"MSE: {holdout_mse}")
print(f"ROOT of MSE: {holdout_mse**0.5}")

MSE: 418414.79698298447
ROOT of MSE: 646.84990297826


In [3]:
# Refit the same estimator on the training and hold-out rows stacked together.
model.fit(
    X=pd.concat([X_train, X_test]),
    y=pd.concat([y_train, y_test]),
)

[LightGBM] [Info] Total Bins 594
[LightGBM] [Info] Number of data points in the train set: 3000888, number of used features: 40
[LightGBM] [Info] Start training from score 338.214939


In [4]:
# Build features for the competition file with the already-fitted pipeline, then predict.
X_competition = pd.read_csv('data/test.csv')
competition_indexes = X_competition['id']  # grab the ids before X_competition is replaced by the transformed frame

X_competition = (
    data_transform_pipeline
    .transform(X_competition)
    .rename(columns=lambda name: re.sub('[^A-Za-z0-9_]+', '', name))
)

competition_prediction = model.predict(X_competition)

# Negative predictions are replaced with 0 in place.
is_negative = competition_prediction < 0
competition_prediction[is_negative] = 0


In [48]:
# Pair each competition id with its predicted sales and write the submission file.
submission = pd.DataFrame({'id': competition_indexes, 'sales': competition_prediction})
submission.to_csv('data/submission.csv', index=False)

Log-target approach: train on log1p(sales), convert predictions back with expm1

In [35]:
# Log-target variant: fit on log1p(sales) and map the predictions back with expm1.
#
# A fresh clone of `model` (same hyper-parameters, unfitted) is trained here, so the
# shared `model` object keeps whatever fit it already had. Re-fitting `model` in
# place would make it predict in log space for every later cell that still uses it.
y_train_log = np.log1p(y_train)

model_log = clone(model)
model_log.fit(X_train, y_train_log)

predictions_log = model_log.predict(X_test)
predictions = np.expm1(predictions_log)  # inverse of log1p: back to the original sales scale

# The MSE is computed once and reused for its root.
log_target_mse = mean_squared_error(y_test, predictions)
print(f"MSE: {log_target_mse}")
print(f"ROOT of MSE: {log_target_mse**0.5}")



[LightGBM] [Info] Total Bins 593
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 40
[LightGBM] [Info] Start training from score 2.917812
MSE: 779427.0126501742
ROOT of MSE: 882.8516368281673


In [42]:
# Error on the training split itself.
# NOTE(review): this uses whichever fit `model` currently holds; if it was last trained
# on a log-transformed target, these predictions are not on the same scale as y_train — confirm.
predictions_train = model.predict(X_train)
train_mse = mean_squared_error(y_train, predictions_train)
print(f"MSE: {train_mse}")
print(f"ROOT of MSE: {train_mse**0.5}")

MSE: 296602.7318815338
ROOT of MSE: 544.6124602701758


Linear model

In [5]:
from sklearn.linear_model import LinearRegression

# Linear-regression estimator; fit_intercept=True is spelled out here and matches the library default.
linear_model = LinearRegression(fit_intercept=True)

In [None]:
# Train the linear model on the training split and predict the hold-out split.
# fit() returns the estimator itself, so the two steps can be chained.
predictions = linear_model.fit(X_train, y_train).predict(X_test)

In [60]:
# Hold-out error for the current `predictions`; the MSE is computed once and reused for its root.
linear_mse = mean_squared_error(y_test, predictions)
print(f"MSE: {linear_mse}")
print(f"ROOT of MSE: {linear_mse**0.5}")

MSE: 382735.53260258527
ROOT of MSE: 618.6562313616386


In [6]:
# Refit the linear model on the training and hold-out rows stacked together,
# then predict the competition set.
linear_model.fit(
    X=pd.concat([X_train, X_test]),
    y=pd.concat([y_train, y_test]),
)

competition_prediction_linear = linear_model.predict(X_competition)

# Negative predictions are replaced with 0 in place.
negative_mask = competition_prediction_linear < 0
competition_prediction_linear[negative_mask] = 0

In [None]:
# Write the linear model's predictions to the linear submission file.
# The 'sales' column must come from competition_prediction_linear; competition_prediction
# holds the other model's output and would make this file a copy of that submission.
linear_submission = pd.DataFrame({'id': competition_indexes,
                                  'sales': competition_prediction_linear})
linear_submission.to_csv('data/linear_submission.csv', index=False)

In [None]:
# Display the transformed competition feature frame for inspection.
X_competition

In [7]:
# Display the current contents of competition_prediction for inspection.
competition_prediction

array([8.16501585e-01, 0.00000000e+00, 1.25849533e+01, ...,
       1.19327193e+03, 7.47549445e+01, 2.52801040e+01])

In [8]:
# Display the linear model's competition predictions for inspection.
competition_prediction_linear

array([   0.        ,    0.        ,    0.        , ..., 1231.93945312,
         18.70703125,   26.921875  ])

In [10]:
# Blend: element-wise average of the two prediction arrays.
# NOTE: this overwrites competition_prediction with the blend, so running the cell
# a second time averages the blend with the linear predictions again.
competition_prediction = 0.5 * (competition_prediction + competition_prediction_linear)

In [11]:
# Display competition_prediction again; after the averaging cell has run it holds the blended values.
competition_prediction

array([4.08250792e-01, 0.00000000e+00, 6.29247667e+00, ...,
       1.21260569e+03, 4.67309879e+01, 2.61009895e+01])

In [12]:
# Pair each competition id with the current competition_prediction values and write the joined submission file.
joined_submission = pd.DataFrame({'id': competition_indexes, 'sales': competition_prediction})
joined_submission.to_csv('data/joined_submission.csv', index=False)