In [17]:
# -------------------------------------------
# Imports & notebook housekeeping
# -------------------------------------------

import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# -------------------------------------------
from datetime import datetime
import itertools
import calendar as cal                   # weekday / month names
import scipy.stats as stats              # normality, skew, etc.

# -------------------------------------------
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import LeavePGroupsOut
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import lightgbm as lgb                   # gradient boosting (fast baseline)
import xgboost as xgb                    # alternative GBM
# import prophet                          # uncomment if you want fbprophet
import statsmodels.api as sm            # ARIMA, ETS, SARIMAX, etc.


# -------------------------------------------
import plotly.express as px              # interactive EDA
import plotly.graph_objects as go

# -------------------------------------------
# Reproducibility: a single shared seed, applied to NumPy's global random state
SEED = 42
np.random.seed(SEED)

# -------------------------------------------
# Nice-to-have settings
warnings.filterwarnings("ignore")                 # blanket filter: hides every warning
pd.set_option("display.max_columns", None)        # None = no cap on displayed columns
sns.set_style("whitegrid")                        # comment out if you prefer default
# # keep this line by itself if you want explicit inline plotting
# %matplotlib inline

# -------------------------------------------
# Wider, higher-res plots for Matplotlib (size first, then dpi)
plt.rcParams.update({
    "figure.figsize": (12, 5),
    "figure.dpi": 110,
})

In [18]:
# -------------------------------------------
# File locations
# -------------------------------------------

# Root folder holding the CSV inputs; adjust if our folder is elsewhere
DATA_PATH = Path("data")

# Individual input files, each resolved relative to DATA_PATH
train_path = DATA_PATH.joinpath("train.csv")
calendar_path = DATA_PATH.joinpath("calendar_events.csv")
submission_path = DATA_PATH.joinpath("forecast_submission.csv")

In [22]:
# -------------------------------------------
# Load the data
# -------------------------------------------

# The train and calendar files carry a 'date' column; parse_dates turns those
# strings into pandas.Timestamp values while reading.
train_df = pd.read_csv(train_path, parse_dates=["date"])
calendar_df = pd.read_csv(calendar_path, parse_dates=["date"])
# The submission template has no 'date' column: the date is embedded in 'id'.
test_df = pd.read_csv(submission_path)

# Quick sanity check: (rows, columns) of each frame
print("Train shape     :", train_df.shape)
print("Calendar shape  :", calendar_df.shape)
print("Forecast shape  :", test_df.shape)

# Rich display of the three frames, in load order
display(train_df, calendar_df, test_df)

Train shape     : (18766, 4)
Calendar shape  : (162, 2)
Forecast shape  : (1012, 2)


Unnamed: 0,store_id,store_name,date,revenue
0,0,All Stores,2011-01-29,204126.52
1,0,All Stores,2011-01-30,197426.42
2,0,All Stores,2011-01-31,144267.27
3,0,All Stores,2011-02-01,151903.00
4,0,All Stores,2011-02-02,117399.88
...,...,...,...,...
18761,10,Wisconsin – Badger Crossing,2015-09-26,25689.55
18762,10,Wisconsin – Badger Crossing,2015-09-27,26557.53
18763,10,Wisconsin – Badger Crossing,2015-09-28,19067.53
18764,10,Wisconsin – Badger Crossing,2015-09-29,16467.95


Unnamed: 0,date,event
0,2011-02-06,SuperBowl
1,2011-02-14,ValentinesDay
2,2011-02-21,PresidentsDay
3,2011-03-09,LentStart
4,2011-03-16,LentWeek2
...,...,...
157,2016-05-08,Mother's day
158,2016-05-30,MemorialDay
159,2016-06-02,NBAFinalsStart
160,2016-06-07,Ramadan starts


Unnamed: 0,id,prediction
0,0_20151001,0
1,0_20151002,0
2,0_20151003,0
3,0_20151004,0
4,0_20151005,0
...,...,...
1007,10_20151227,0
1008,10_20151228,0
1009,10_20151229,0
1010,10_20151230,0


In [16]:
# ----------------------------------------------------------------------------
# Parse 'id' in forecast_submission into store_id + date for easier joins
# ----------------------------------------------------------------------------
def split_submission_id(df):
    """Return a copy of ``df`` with ``store_id`` and ``date`` derived from ``id``.

    Every ``id`` value is split on underscores. The first piece is cast to
    ``int`` and stored as ``store_id``; the second piece is parsed with the
    ``%Y%m%d`` format and stored as ``date``. The frame passed in is not
    modified, and the ``id`` column is kept in the result.
    """
    # Split once and reuse the pieces for both derived columns.
    id_parts = df['id'].str.split('_')
    return df.assign(
        store_id=id_parts.str[0].astype(int),
        date=pd.to_datetime(id_parts.str[1], format='%Y%m%d'),
    )

# Rebind test_df to the enriched copy. The helper keeps the 'id' column and
# works on a copy, so re-running this line on its own result still succeeds.
test_df = split_submission_id(test_df)

# ----------------------------------------------------------------------------
# Quick RMSE helper (works for both CV and Kaggle sample submissions)
# ----------------------------------------------------------------------------
def rmse(y_true, y_pred):
    """Root-mean-squared error between ``y_true`` and ``y_pred``.

    Computed as the square root of ``mean_squared_error(y_true, y_pred)``,
    called with that function's default settings.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)