In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/store-sales-time-series-forecasting/oil.csv
/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv
/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv
/kaggle/input/store-sales-time-series-forecasting/stores.csv
/kaggle/input/store-sales-time-series-forecasting/train.csv
/kaggle/input/store-sales-time-series-forecasting/test.csv
/kaggle/input/store-sales-time-series-forecasting/transactions.csv


In [2]:
# input dataset
train = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/train.csv")
test = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/test.csv")
stores = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/stores.csv')
holidays = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv', parse_dates=['date'])
oil = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv', parse_dates=['date'])
transactions = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/transactions.csv', parse_dates=['date'])

In [3]:
# adjust date formats
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])
transactions['date'] = pd.to_datetime(transactions['date'])
holidays['date'] = pd.to_datetime(holidays['date'])
oil['date'] = pd.to_datetime(oil['date'])

# merge the dataset
train = train.merge(stores, on='store_nbr', how='left')
test = test.merge(stores, on='store_nbr', how='left')

train = train.merge(transactions, on=['date', 'store_nbr'], how='left')
test = test.merge(transactions, on=['date', 'store_nbr'], how='left')

train = train.merge(oil, on='date', how='left')
test = test.merge(oil, on='date', how='left')

holidays = holidays[['date', 'locale', 'transferred']].rename(columns={'locale': 'holidays_locale'})
train = train.merge(holidays, on='date', how='left', suffixes=('', '_holiday'))
test = test.merge(holidays, on='date', how='left', suffixes=('', '_holiday'))

train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3054348 entries, 0 to 3054347
Data columns (total 14 columns):
 #   Column           Dtype         
---  ------           -----         
 0   id               int64         
 1   date             datetime64[ns]
 2   store_nbr        int64         
 3   family           object        
 4   sales            float64       
 5   onpromotion      int64         
 6   city             object        
 7   state            object        
 8   type             object        
 9   cluster          int64         
 10  transactions     float64       
 11  dcoilwtico       float64       
 12  holidays_locale  object        
 13  transferred      object        
dtypes: datetime64[ns](1), float64(3), int64(4), object(6)
memory usage: 326.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28512 entries, 0 to 28511
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----       

In [4]:
# check the missing values
print("---missing value---")
print(train.isnull().sum())
print("-------------------")
print(test.isnull().sum())
print("\n---unique value---")
for col in train:
    print(f"{col}:{train[col].nunique()}")

---missing value---
id                       0
date                     0
store_nbr                0
family                   0
sales                    0
onpromotion              0
city                     0
state                    0
type                     0
cluster                  0
transactions        249117
dcoilwtico          955152
holidays_locale    2551824
transferred        2551824
dtype: int64
-------------------
id                     0
date                   0
store_nbr              0
family                 0
onpromotion            0
city                   0
state                  0
type                   0
cluster                0
transactions       28512
dcoilwtico          7128
holidays_locale    26730
transferred        26730
dtype: int64

---unique value---
id:3000888
date:1684
store_nbr:54
family:33
sales:379610
onpromotion:362
city:22
state:16
type:5
cluster:17
transactions:4993
dcoilwtico:994
holidays_locale:3
transferred:2


In [5]:
#fill in the missing value
train['holidays_locale'] = train['holidays_locale'].fillna(False)
test['holidays_locale'] = test['holidays_locale'].fillna(False)

train['transferred'] = train['transferred'].fillna(False)
test['transferred'] = test['transferred'].fillna(False)

train['transactions'] = train.groupby('store_nbr')['transactions'].transform(lambda x: x.fillna(x.median()))
store_medians = train.groupby('store_nbr')['transactions'].median()
test['transactions'] = test['store_nbr'].map(store_medians)  # fill with median

train['dcoilwtico'] = train['dcoilwtico'].interpolate(method='linear')
train['dcoilwtico'] = train['dcoilwtico'].fillna(method='ffill').fillna(method='bfill')  # linear interpolation
test['dcoilwtico'] = test['dcoilwtico'].interpolate(method='linear')
test['dcoilwtico'] = test['dcoilwtico'].fillna(method='ffill').fillna(method='bfill')

# check the missing values
print("---missing value---")
print(train.isnull().sum())
print("-------------------")
print(test.isnull().sum())

  train['transferred'] = train['transferred'].fillna(False)
  test['transferred'] = test['transferred'].fillna(False)
  train['dcoilwtico'] = train['dcoilwtico'].fillna(method='ffill').fillna(method='bfill')  # linear interpolation
  test['dcoilwtico'] = test['dcoilwtico'].fillna(method='ffill').fillna(method='bfill')


---missing value---
id                 0
date               0
store_nbr          0
family             0
sales              0
onpromotion        0
city               0
state              0
type               0
cluster            0
transactions       0
dcoilwtico         0
holidays_locale    0
transferred        0
dtype: int64
-------------------
id                 0
date               0
store_nbr          0
family             0
onpromotion        0
city               0
state              0
type               0
cluster            0
transactions       0
dcoilwtico         0
holidays_locale    0
transferred        0
dtype: int64


In [6]:
#extract temporal feature
train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month
train['day'] = train['date'].dt.day
train['dayofweek'] = train['date'].dt.dayofweek
train['weekofyear'] = train['date'].dt.isocalendar().week.astype(int)
train['is_weekend'] = train['dayofweek'].isin([5, 6]).astype(int)

test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day
test['dayofweek'] = test['date'].dt.dayofweek
test['weekofyear'] = test['date'].dt.isocalendar().week.astype(int)
test['is_weekend'] = test['dayofweek'].isin([5, 6]).astype(int)

#adjust the order of dataset
train_order = [
    'id','date','year','month','day','dayofweek','weekofyear','is_weekend', 'holidays_locale', 'transferred', 
    'dcoilwtico',
    'store_nbr', 'city', 'state', 'type', 'cluster',
    'onpromotion', 'family', 'transactions','sales'
]

test_order = [
    'id','date', 'year','month','day','dayofweek','weekofyear','is_weekend','holidays_locale', 'transferred', 
    'dcoilwtico',
    'store_nbr',  'city', 'state', 'type', 'cluster',
    'onpromotion', 'family', 'transactions'
]

train = train[train_order]
test = test[test_order]

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_log_error, mean_squared_error
import lightgbm as lgb

# numerical conversion
cat_cols = ['store_nbr', 'family', 'city', 'state', 'type', 'cluster', 'holidays_locale', 'transferred']
encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    all_values = pd.concat([train[col], test[col]], axis=0).astype(str) # merge the collections to avoid potential errors
    le.fit(all_values)
    encoders[col] = le
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))

# logarithm transformation
train['sales_log'] = np.log1p(train['sales'])

# splite train set and validation set，dependent and independent variables
X_train = train[train['date'] < '2017-03-01'].drop(columns=['sales','sales_log','date','id'])
y_train = train[train['date'] < '2017-03-01']['sales_log']
X_val   = train[train['date'] >= '2017-03-01'].drop(columns=['sales','sales_log','date','id'])
y_val   = train[train['date'] >= '2017-03-01']['sales_log']

# build the LightGBM collection
train_data = lgb.Dataset(X_train, label = y_train)
val_data = lgb.Dataset(X_val, label = y_val, reference = train_data)

#set up the parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 128,
    'max_depth': -1,      
    'min_data_in_leaf': 50, 
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1
}

# train the model
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, val_data],
    valid_names=['train', 'valid'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100)
    ]
)

# validation
y_val_pred_log = model.predict(X_val, num_iteration=model.best_iteration)
y_val_pred = np.expm1(y_val_pred_log) 

rmse = mean_squared_error(y_val, y_val_pred, squared=False)
rmsle = np.sqrt(mean_squared_log_error(train.loc[train['date'] >= '2017-03-01','sales'], np.maximum(y_val_pred,0)))

print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation RMSLE: {rmsle:.4f}")

Training until validation scores don't improve for 100 rounds
[100]	train's rmse: 0.559068	valid's rmse: 0.570537
[200]	train's rmse: 0.498857	valid's rmse: 0.533271
[300]	train's rmse: 0.469218	valid's rmse: 0.516598
[400]	train's rmse: 0.446823	valid's rmse: 0.505039
[500]	train's rmse: 0.431145	valid's rmse: 0.500192
[600]	train's rmse: 0.420233	valid's rmse: 0.497347
[700]	train's rmse: 0.409951	valid's rmse: 0.494786
[800]	train's rmse: 0.401599	valid's rmse: 0.493369
[900]	train's rmse: 0.394165	valid's rmse: 0.492471
[1000]	train's rmse: 0.388572	valid's rmse: 0.490765
Did not meet early stopping. Best iteration is:
[998]	train's rmse: 0.388671	valid's rmse: 0.490761
Validation RMSE: 1338.1138
Validation RMSLE: 0.4900


In [8]:
# prediction
X_test = test.drop(columns=['date','id'])
y_test_pred_log = model.predict(X_test, num_iteration=model.best_iteration)
y_test_pred = np.expm1(y_test_pred_log)

# generate submission file
submission = pd.DataFrame({
    'id': test['id'],
    'sales': np.maximum(y_test_pred, 0) 
})

submission.to_csv('submission.csv', index=False)