In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition CSVs.
# The dataset directory is kept in one place so it can be changed without
# touching every read_csv call.
DATA_DIR = "/kaggle/input/store-sales-time-series-forecasting"

# Every frame that carries a `date` column parses it at load time, so all of
# them share the datetime64 dtype that the later merges on `date` rely on.
train = pd.read_csv(f"{DATA_DIR}/train.csv", parse_dates=['date'])
test = pd.read_csv(f"{DATA_DIR}/test.csv", parse_dates=['date'])
stores = pd.read_csv(f"{DATA_DIR}/stores.csv")
holidays = pd.read_csv(f"{DATA_DIR}/holidays_events.csv", parse_dates=['date'])
oil = pd.read_csv(f"{DATA_DIR}/oil.csv", parse_dates=['date'])
transactions = pd.read_csv(f"{DATA_DIR}/transactions.csv", parse_dates=['date'])

In [None]:
# Make sure every `date` column is datetime64 (a no-op for frames that were
# already parsed when they were read).
for frame in (train, test, transactions, holidays, oil):
    frame['date'] = pd.to_datetime(frame['date'])

# Row counts before merging: every merge below is a left join that must only
# ADD columns, never rows.
n_train_rows = len(train)
n_test_rows = len(test)

# Store metadata, keyed by store number.
train = train.merge(stores, on='store_nbr', how='left')
test = test.merge(stores, on='store_nbr', how='left')

# Transactions are merged into train only; test gets no such column.
train = train.merge(transactions, on=['date', 'store_nbr'], how='left')

# Oil price, keyed by date.
train = train.merge(oil, on='date', how='left')
test = test.merge(oil, on='date', how='left')

# Reduce the holiday table to ONE row per date before merging. Merging on
# `date` alone with several holiday rows for the same day would duplicate every
# matching train/test row (and therefore duplicate test ids).
# Assumes locale takes the values 'National' / 'Regional' / 'Local' -- confirm
# against holidays_events.csv. Any other value gets a NaN rank and sorts last.
LOCALE_PRIORITY = {'National': 0, 'Regional': 1, 'Local': 2}
holidays = (
    # rename-then-select also works if `locale` was already renamed by a
    # previous run of this cell
    holidays.rename(columns={'locale': 'holidays_locale'})
    [['date', 'holidays_locale', 'transferred']]
    .assign(_locale_rank=lambda d: d['holidays_locale'].map(LOCALE_PRIORITY))
    .sort_values(['date', '_locale_rank'])
    .drop_duplicates(subset='date', keep='first')
    .drop(columns='_locale_rank')
    .reset_index(drop=True)
)
# validate= makes pandas raise if the right-hand side still has duplicate dates.
train = train.merge(holidays, on='date', how='left',
                    suffixes=('', '_holiday'), validate='many_to_one')
test = test.merge(holidays, on='date', how='left',
                  suffixes=('', '_holiday'), validate='many_to_one')

assert len(train) == n_train_rows, "a merge changed the number of train rows"
assert len(test) == n_test_rows, "a merge changed the number of test rows"

train.info()
test.info()

In [None]:
# Data-quality report for train: null count per column, then the number of
# distinct (non-null) values per column.
print("---missing value---")
print(train.isna().sum())

print("\n---unique value---")
for col, n_unique in train.nunique().items():
    print(f"{col}:{n_unique}")

In [None]:
# Fill in the missing values.
def _fill_holiday_and_oil(df):
    """Impute the holiday flags and the oil price of ``df`` in place.

    Used for both train and test so that "no holiday on this date" is
    represented by the same value in both frames.
    """
    # Dates without a holiday row came out of the left merge as NaN.
    df['holidays_locale'] = df['holidays_locale'].fillna(False)
    # Cast explicitly so the column is a real bool column, not a mixed object one.
    df['transferred'] = df['transferred'].fillna(False).astype(bool)

    # Interpolate the oil price per DATE, not per row: the frame holds many rows
    # for each date, and interpolating over row position would give rows of the
    # same day different prices.
    oil_by_date = (
        df.groupby('date')['dcoilwtico'].first()  # first non-null price of the day
        .interpolate(method='linear')
        .ffill()   # replaces the deprecated fillna(method='ffill')
        .bfill()   # covers a gap at the very start of the series
    )
    df['dcoilwtico'] = df['date'].map(oil_by_date)


_fill_holiday_and_oil(train)
_fill_holiday_and_oil(test)

# Transactions exist in train only: fill gaps with the median of the same store.
train['transactions'] = train.groupby('store_nbr')['transactions'].transform(lambda x: x.fillna(x.median()))

In [None]:
# Repeat the null-count / cardinality report on the current state of train.
missing_counts = train.isnull().sum()
print("---missing value---")
print(missing_counts)

print("\n---unique value---")
for col in train.columns:
    distinct_count = train[col].nunique()
    print(f"{col}:{distinct_count}")

In [None]:
# Extract temporal features.
def _add_temporal_features(df):
    """Add calendar features derived from ``df['date']`` to ``df`` in place.

    Adds: year, month, day, dayofweek (Monday=0), weekofyear (ISO week number)
    and is_weekend (1 for Saturday/Sunday, else 0).
    """
    dates = df['date'].dt
    df['year'] = dates.year
    df['month'] = dates.month
    df['day'] = dates.day
    df['dayofweek'] = dates.dayofweek
    df['weekofyear'] = dates.isocalendar().week.astype(int)
    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)


# test needs the same columns as train: the column-ordering step selects
# 'year', 'month', ... from test and raises KeyError if they are missing.
_add_temporal_features(train)
_add_temporal_features(test)

In [None]:
# Extract temporal features for BOTH frames.
# test_order below lists these columns, so `test[test_order]` raises KeyError
# if they exist on train only. Recomputing them on train is harmless: every
# column depends only on `date`.
for dated_frame in (train, test):
    dated_frame['year'] = dated_frame['date'].dt.year
    dated_frame['month'] = dated_frame['date'].dt.month
    dated_frame['day'] = dated_frame['date'].dt.day
    dated_frame['dayofweek'] = dated_frame['date'].dt.dayofweek  # Monday=0 ... Sunday=6
    dated_frame['weekofyear'] = dated_frame['date'].dt.isocalendar().week.astype(int)
    dated_frame['is_weekend'] = dated_frame['dayofweek'].isin([5, 6]).astype(int)


# Put the columns of both frames into one canonical order:
# identifiers, calendar features, holiday flags, oil price, store attributes,
# then the item-level columns.
train_order = (
    ['id', 'date']
    + ['year', 'month', 'day', 'dayofweek', 'weekofyear', 'is_weekend']
    + ['holidays_locale', 'transferred']
    + ['dcoilwtico']
    + ['store_nbr', 'city', 'state', 'type', 'cluster']
    + ['onpromotion', 'family', 'transactions', 'sales']
)

# test uses the same layout minus the two columns that are selected for train only.
test_order = [name for name in train_order if name not in ('transactions', 'sales')]

train = train.loc[:, train_order]
test = test.loc[:, test_order]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Label-encode the categorical columns BEFORE splitting.
# X_train / X_val are copies taken from `train`; if the split ran first they
# would keep the raw string values and never see the encoding.
# Gotcha: this cell is not idempotent -- running it twice re-encodes the
# already-encoded integer codes.
cat_cols = ['store_nbr', 'family', 'city', 'state', 'type', 'cluster', 'holidays_locale', 'transferred']
encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    # Fit on train and test together so every value seen in either frame has a
    # code; astype(str) gives mixed-type columns a single comparable type.
    all_values = pd.concat([train[col], test[col]], axis=0).astype(str)
    le.fit(all_values)
    encoders[col] = le  # kept so the mapping can be inverted or reused later
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))

# Time-based hold-out: rows before VAL_START train the model, rows from
# VAL_START onwards validate it.
VAL_START = '2017-01-01'
is_train_period = train['date'] < VAL_START
is_val_period = train['date'] >= VAL_START

# Split into independent variables (X) and the dependent variable (y = sales).
# NOTE(review): X_train / X_val still contain 'transactions', which is not a
# column of `test` -- confirm how predictions on test are meant to supply it.
X_train = train.loc[is_train_period].drop(columns=['sales', 'date', 'id'])
y_train = train.loc[is_train_period, 'sales']
X_val = train.loc[is_val_period].drop(columns=['sales', 'date', 'id'])
y_val = train.loc[is_val_period, 'sales']
