# TPS-JAN22, Quick EDA+XGBoost
reference
* [TPS-JAN22, Quick EDA+XGBoost](https://www.kaggle.com/cv13j0/tps-jan22-quick-eda-xgboost)


## Install

In [1]:
!pip install holidays

In [2]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import holidays
import warnings
warnings.filterwarnings("ignore")
pd.options.display.float_format = "{:,.2f}".format
pd.set_option("display.max_columns", 15)
pd.set_option("display.max_rows", 50)
SEED = 2022

## EDA
### Load data

In [3]:
TRN_PATH = '/kaggle/input/tabular-playground-series-jan-2022/train.csv'
TST_PATH = '/kaggle/input/tabular-playground-series-jan-2022/test.csv'
SUB_PATH = '/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv'

In [4]:
train_df = pd.read_csv(TRN_PATH)
test_df = pd.read_csv(TST_PATH)
submission_df = pd.read_csv(SUB_PATH)

### Exploring the Dataframes, (Size, Stats, Nulls, and Others)

In [5]:
train_df.info()

In [6]:
train_df.head()

In [7]:
test_df.info()

In [8]:
test_df.head()

In [9]:
train_df.describe()

In [10]:
test_df.describe()

In [11]:
country_list = train_df.country.unique()
store_list = train_df.store.unique()
product_list = train_df["product"].unique()

print(f"Country List: {country_list}")
print(f"Store List:{store_list}")
print(f"Product List:{product_list}")

In [12]:
train_df.isnull().sum()

In [13]:
def evaluate_time(df):
    """Print the smallest and largest values found in the ``date`` column of *df*.

    Args:
        df: DataFrame that contains a ``date`` column.

    Returns:
        None. The range is only printed.
    """
    date_column = df["date"]
    earliest, latest = date_column.min(), date_column.max()
    print(f"Min Date: {earliest}/ Max Date:{latest}")
    
evaluate_time(train_df)
evaluate_time(test_df)

## Feature Engineering

In [14]:
TARGET = "num_sold"

### get holidays

In [15]:
# Holiday calendars for the three countries, one per country.
holiday_FI = holidays.CountryHoliday("FI", years = [2015, 2016, 2017, 2018, 2019])
holiday_NO = holidays.CountryHoliday("NO", years = [2015, 2016, 2017, 2018, 2019])
holiday_SE = holidays.CountryHoliday("SE", years = [2015, 2016, 2017, 2018, 2019])

# Single lookup built from the Finnish calendar and then updated with the
# Norwegian and Swedish ones. The lookup below is keyed by date only, so the
# row's own country is not taken into account.
holiday_dict = holiday_FI.copy()
holiday_dict.update(holiday_NO)
holiday_dict.update(holiday_SE)

# Apply the same holiday columns to the train and the test frame.
for frame in (train_df, test_df):
    frame["date"] = pd.to_datetime(frame["date"])
    frame["holiday_name"] = frame["date"].map(holiday_dict)
    # The flag must be derived before the missing names are filled in.
    frame["is_holiday"] = np.where(frame["holiday_name"].notnull(), 1, 0)
    frame["holiday_name"] = frame["holiday_name"].fillna("Not Holiday")


In [16]:
train_df.sample(10)

#### Add New Year's Eve and Christmas Eve (December 24)
Set aside for now.

In [17]:
def add_holiday(df):
    """Flag New Year's Eve and December 24 as holidays in *df*.

    The previous version only built the two date lists and discarded them,
    so calling it had no effect. This version marks the matching rows.

    Rows that are already flagged (``is_holiday == 1``) keep their existing
    ``holiday_name``. The ``is_holiday`` / ``holiday_name`` columns are
    created with the defaults ``0`` / ``"Not Holiday"`` when they are missing.

    Args:
        df: DataFrame with a ``date`` column (datetime or parseable strings).

    Returns:
        The same DataFrame, modified in place.
    """
    new_years_eve = ["12/31/2015","12/31/2016","12/31/2017","12/31/2018","12/31/2019"]
    christmas_eve = ["12/24/2015","12/24/2016","12/24/2017","12/24/2018","12/24/2019"]

    # Compare on the same MM/DD/YYYY text form that the lists above use.
    date_keys = pd.to_datetime(df["date"]).dt.strftime("%m/%d/%Y")

    if "is_holiday" not in df.columns:
        df["is_holiday"] = 0
    if "holiday_name" not in df.columns:
        df["holiday_name"] = "Not Holiday"

    for label, days in (("New Year's Eve", new_years_eve), ("Christmas Eve", christmas_eve)):
        # Only touch rows that are not already marked as a holiday.
        mask = date_keys.isin(days) & (df["is_holiday"] == 0)
        df.loc[mask, "holiday_name"] = label
        df.loc[mask, "is_holiday"] = 1

    return df

### get time features

In [18]:
def create_time_features(df:pd.DataFrame) -> pd.DataFrame:
    """Add calendar feature columns derived from ``date`` to *df*.

    The frame is modified in place and also returned. ``date`` is converted
    to datetime first, so string dates are accepted.

    Columns written: ``year``, ``quarter``, ``month``, ``day``, ``dayofweek``,
    ``dayofmonth``, ``dayofyear``, ``weekofyear``, ``weekday``, ``is_weekend``.

    Note that ``dayofmonth`` holds the number of days in the row's month
    (``dt.days_in_month``), not the day number; that one is in ``day``.

    Args:
        df: DataFrame with a ``date`` column.

    Returns:
        The same DataFrame with the feature columns added.
    """
    df["date"] = pd.to_datetime(df["date"])
    date = df["date"].dt

    df['year'] = date.year
    df['quarter'] = date.quarter
    df['month'] = date.month
    df['day'] = date.day
    df['dayofweek'] = date.dayofweek
    df['dayofmonth'] = date.days_in_month
    df['dayofyear'] = date.dayofyear
    # `Series.dt.weekofyear` is gone from current pandas; the ISO calendar
    # week is the replacement. Cast back to a plain integer column because
    # isocalendar() yields a nullable UInt32 dtype.
    df['weekofyear'] = date.isocalendar().week.astype(int)
    df['weekday'] = date.weekday
    # Saturday is 5 and Sunday is 6 in pandas' weekday numbering.
    df['is_weekend'] = np.where(df['weekday'].isin([5, 6]), 1, 0)

    return df

train_df = create_time_features(train_df)
test_df = create_time_features(test_df)

### Convert the categorical variables to one-hot encoded features

In [19]:
# Names of the categorical columns that the encoders below operate on.
CATEGORICAL = ["country", "store", "product", "holiday_name"]
def creawte_one_hot(df, categ_columns = CATEGORICAL):
    """Return *df* with the given categorical columns one-hot encoded.

    The function name (sic) is kept unchanged so existing callers keep working.

    Args:
        df: Input DataFrame.
        categ_columns: Columns to expand with ``pd.get_dummies``. Defaults to
            ``CATEGORICAL``.

    Returns:
        A new DataFrame in which each listed column is replaced by its
        ``<column>_<value>`` indicator columns.
    """
    # Previously this referenced the undefined name CATEGOCICAL and ignored
    # the `categ_columns` argument.
    df = pd.get_dummies(df, columns = categ_columns)
    return df

def encode_categ_features(df, categ_columns = CATEGORICAL, fit_frames = None):
    """Add an integer-coded ``enc_<column>`` column for each categorical column.

    Args:
        df: DataFrame to encode. It is modified in place and returned.
        categ_columns: Columns to encode. Defaults to ``CATEGORICAL``.
        fit_frames: Optional iterable of DataFrames whose values define the
            label vocabulary. When omitted, the encoder is fitted on *df*
            alone, which is the previous behaviour.

    Returns:
        The same DataFrame with the ``enc_*`` columns added.

    Raises:
        ValueError: From ``LabelEncoder.transform`` if *df* contains a value
            that is absent from ``fit_frames``.
    """
    for col in categ_columns:
        le = LabelEncoder()
        if fit_frames is None:
            le.fit(df[col])
        else:
            # Fit on every supplied frame so the same category always
            # receives the same integer code, whichever frame is encoded.
            le.fit(pd.concat([frame[col] for frame in fit_frames], ignore_index = True))
        df["enc_" + col] = le.transform(df[col])
    return df

# Fitting separately on train and test would assign different codes to the
# same category whenever the two frames do not contain the same set of values,
# so both calls share one vocabulary.
train_df = encode_categ_features(train_df, fit_frames = [train_df, test_df])
test_df = encode_categ_features(test_df, fit_frames = [train_df, test_df])

In [20]:
def transform_target(df, target = TARGET):
    """Replace the ``target`` column of *df* with its natural logarithm.

    Args:
        df: DataFrame holding the target column. It is modified in place.
        target: Name of the column to transform. Defaults to ``TARGET``.

    Returns:
        The same DataFrame, with the target column on the log scale.
    """
    log_values = np.log(df[target])
    df[target] = log_values
    return df

train_df = transform_target(train_df, TARGET)

In [21]:
train_df[TARGET].head()

In [22]:
# Columns that must not be used as model inputs: the row identifier, the raw
# timestamp and the target. The timestamp entry used to be misspelled
# ("data"), which left the `date` column in the feature list.
avoid = ["row_id", "date", "num_sold"]
FEATURES = [feat for feat in train_df.columns if feat not in avoid]

print(FEATURES)

In [23]:
print(FEATURES)

In [24]:
FEATURES = [
    #'country',
    #'store',
    #'product',
    #'holiday_name',
    #'is_holiday',
    "year",
    #'quarter',
    "month",
    "day",
    "dayofweek",
    "is_weekend",
     #'dayofmonth',
    #'dayofyear',
    #'weekofyear',
    #'weekday',
    "enc_country",
    "enc_store",
    "enc_product",
    # "enc_holiday_name",
]

In [25]:
## Creates a Simple Train / Validation Strategy

In [26]:
# Build the train and validation sets with a chronological hold-out:
# rows dated before the cutoff are used for fitting, rows on or after it
# for validation.
CUTOFF_DATE = "2018-01-01"

before_cutoff = train_df["date"] < CUTOFF_DATE
from_cutoff = train_df["date"] >= CUTOFF_DATE

x_train = train_df.loc[before_cutoff, FEATURES]
y_train = train_df.loc[before_cutoff, TARGET]

x_val = train_df.loc[from_cutoff, FEATURES]
y_val = train_df.loc[from_cutoff, TARGET]

In [27]:
def SMAPE(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Computed as ``mean(200 * |y_true - y_pred| / (|y_true| + |y_pred|))``.
    Elements where both values are zero contribute ``0`` to the mean.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, broadcastable to ``y_true``.

    Returns:
        The SMAPE as a float in the range ``[0, 200]``.
    """
    y_true = np.atleast_1d(np.asarray(y_true, dtype = float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype = float))

    # Use |y_true| as well: with the raw value a negative observation could
    # cancel the prediction and zero out the denominator.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)

    # Divide only where it is defined; the remaining entries stay at 0.0,
    # which avoids the divide-by-zero warning and the follow-up patching.
    diff = np.divide(abs_error, denominator, out = np.zeros_like(abs_error), where = denominator != 0)
    return np.mean(diff)

## Train a simple model(XGBoost Regressor)

In [28]:
# Hyper-parameters for the XGBoost regressor, followed by a fit on the
# hold-out split (x_train / y_train) with x_val / y_val as the evaluation set.
# The keys below are forwarded unchanged to the XGBRegressor constructor.
# NOTE(review): "gpu_hist" presumably requires a GPU runtime, and newer
# XGBoost releases spell this differently - confirm against the installed version.
xgboost_params = {
    "eta":0.1,
    "n_estimators":16384,
    "max_depth":8,
    "max_leaves":256,
    "colsample_bylevel":0.75,
    "colsample_bytree":0.75,
    "subsample":0.75,
    "min_child_weight":512,
    "min_split_loss":0.002,
    "alpha":0.08,
    "lambda":128,
    "objective":"reg:squarederror",
    "eval_metric":"rmse",
    "tree_method":"gpu_hist",
    "seed":SEED
}

# XGBRegressor -> https://ichi.pro/xgbregressor-o-shiyoshita-maishu-no-hoteru-kyanseru-no-yosoku-236664363058724

regressor = XGBRegressor(**xgboost_params)

# Early stopping is requested against the validation set, so n_estimators
# above acts as an upper bound on the number of boosting rounds.
# NOTE(review): passing early_stopping_rounds to fit() depends on the xgboost
# version (newer releases expect it on the constructor) - confirm before upgrading.
regressor.fit(x_train,
             y_train,
             eval_set = [(x_val, y_val)],
             early_stopping_rounds = 250,
             verbose = 500)


In [29]:
# Score the hold-out period. The target was log-transformed before fitting,
# so predictions and labels are both mapped back with exp() before the
# metrics are computed.
log_val_pred = regressor.predict(x_val[FEATURES])
val_pred = np.exp(log_val_pred)
# NOTE: y_val is overwritten here, so re-running this cell without
# rebuilding the split would exponentiate it a second time.
y_val = np.exp(y_val)

score = np.sqrt(mean_squared_error(y_val, val_pred))
print(f'RMSE: {score} / SMAPE: {SMAPE(y_val, val_pred)}')

## Model Results: Features Used in Training and Validation

1. Plain features, nothing added to the model. Removed Id, Datetime and Target </br> RMSE: 141.17269369190075 / SMAPE: 17.040551866223385

2. Added Datetime features,'year', 'month', 'day', 'dayofweek', 'dayofmonth', 'dayofyear', 'weekofyear', 'weekday' </br> RMSE: 66.89475324109723 / SMAPE: 9.30006322183181

3. Added Datetime features,'year', 'month', 'day', 'dayofweek', 'dayofmonth', 'dayofyear', 'weekofyear', 'weekday', 'quarter' </br> RMSE: 67.4018691784641 / SMAPE: 9.343389593022566

4. Added Datetime features,'year', 'month', 'day', 'dayofweek', 'dayofmonth', 'dayofyear', 'weekofyear', 'weekday', 'quarter' </br> Added new Features,'is_holiday'</br> RMSE: 66.59882566819414 / SMAPE: 9.477461518875648

5. Added Datetime features,'year', 'month', 'day', 'dayofweek', 'dayofmonth', 'dayofyear', 'weekofyear', 'weekday', 'quarter' </br> Added new Features,'is_holiday', 'is_weekend'</br> RMSE: 66.27489712300181 / SMAPE: 9.370856195608114

6. Added Datetime features,'year', 'month', 'day', 'dayofweek', 'dayofmonth', 'dayofyear', 'weekofyear', 'weekday', 'quarter' </br> Added new Features,'is_holiday', 'is_weekend','enc_holiday_name' </br> RMSE: 65.93668135230337 / SMAPE: 9.428644170683123

7. Added Datetime features,'year', 'month', 'day', 'dayofweek', 'dayofmonth', 'dayofyear', 'weekofyear', 'weekday'</br> Added new Features,'is_weekend' </br> RMSE: 66.73112188359103 / SMAPE: 9.29087254951728

8. Added Datetime features,'year', 'month', 'day', 'dayofweek', 'dayofyear', 'weekofyear', 'weekday'</br> Added new Features,'is_weekend' </br> RMSE: 66.1329325693428 / SMAPE: 9.290678813131464

1. Added Datetime features,'year', 'month', 'day', 'dayofweek', 'weekofyear', 'weekday'</br> Added new Features,'is_weekend' </br> RMSE: 66.13737123847237 / SMAPE: 9.256808780901792

1. Added Datetime features,'year', 'month', 'day', 'dayofweek', 'weekday'</br> Added new Features,'is_weekend' </br> RMSE: 65.40444050929132 / SMAPE: 9.045220024208168

1. Added Datetime features,'year', 'month', 'day', 'dayofweek'</br> Added new Features,'is_weekend' </br> RMSE: 65.20180075198031 / SMAPE: 9.049180607434174




In [30]:
# Pair every feature name with the importance reported by the hold-out model.
feats = dict(zip(FEATURES, regressor.feature_importances_))

# One row per feature, sorted from most to least important, drawn as a bar chart.
importances = pd.DataFrame.from_dict(feats, orient = "index").rename(columns = {0:"Gini-importance"})
ranked_importances = importances.sort_values(by = "Gini-importance", ascending = False)
ranked_importances.plot(kind = "bar", rot = 45, figsize = (10, 5))

## Train a simple Model(XGBoost Regressor) using a CV(Cross Validation) Loop.

In [31]:
N_SPLITS = 3
EARLY_STOPPING_ROUNDS = 150
VERBOSE = 0

In [32]:
# pipeline -> https://blog.amedama.jp/entry/2018/07/07/223257
#             https://www.salesanalytics.co.jp/datascience/datascience007/
# Two-stage scaling pipeline: standardise first, then squash into [0, 1].
scaling_steps = [
    ("scaler", StandardScaler()),
    ("min_max", MinMaxScaler(feature_range = (0, 1))),
]
transformer = Pipeline(steps = scaling_steps)

#### Pipeline process
reference
* [Python:scikit-learnのPipelineを使ってみる](https://blog.amedama.jp/entry/2018/07/07/223257)
* [scikit-learn(sklearn)の使い方](https://qiita.com/kenta1984/items/c2f3b2609071717dcf71)
* [How to use Pipeline of scikit-learn library (Notebook)](https://www.kaggle.com/uchiborikoki/how-to-use-pipeline-of-scikit-learn-libarary/edit)

use example (PCA -> RandomForest)
```python
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# set Pipeline
steps = [
    ('pca', PCA()),
    ('rf', RandomForestClassifier())
]
pipeline = Pipeline(steps=steps)
# train and predict
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.33,
                                                    random_state = 42)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# evaluate
accuracy_score(y_test, y_pred)
```

In [33]:
def cross_validation_train(train, labels, test, model, model_params, n_folds = 5, dates = None):
    """
    Train a model over a ``TimeSeriesSplit`` cross-validation loop and
    predict the given test set with every fold's model.

    Each fold trains on an initial run of rows and validates on the rows
    that follow, so the rows are assumed to be in chronological order
    (NOTE(review): confirm against how the caller's frame is sorted).

    RMSE and SMAPE are computed on ``labels`` exactly as passed; no inverse
    transform is applied inside this function.

    Args:
    train  (Dataframe): feature matrix, one row per observation.
    labels (Series): target values, positionally aligned with ``train``.
    test   (Dataframe): feature matrix to predict, same columns as ``train``.
    model  (Model): estimator class, instantiated once per fold as
                    ``model(**model_params)``.
    model_params (dict): keyword arguments for the estimator constructor.
    n_folds (int): number of splits passed to ``TimeSeriesSplit``.
    dates  (Series): optional dates, positionally aligned with ``train``,
                     used only for the per-fold date-range printout. When
                     omitted, the module-level ``train_df["date"]`` is
                     used, which matches the previous behaviour.

    Return:
    regressor   (Model): the estimator fitted on the LAST fold only.
    feat_import (ndarray): feature importances summed (not averaged) over folds.
    test_pred   (ndarray): test predictions averaged over the folds.
    oof_label   (ndarray): labels at validated positions, 0 elsewhere.
    oof_pred    (ndarray): out-of-fold predictions at validated positions,
                           0 for rows that never fall in a validation fold.
    """
    if dates is None:
        # Backward-compatible fallback to the notebook-level frame.
        dates = train_df["date"]

    # oof(out of fold) and test prediction
    oof_pred = np.zeros(len(train))
    oof_label = np.zeros(len(train))
    test_pred = np.zeros(len(test))
    val_index_chunks = []

    # feature importance: sized from the data actually passed in rather than
    # from the global FEATURES list, so any feature subset works.
    feat_import = np.zeros(train.shape[1])

    # Kfold object
    # TimeSeriesSplit -> https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html
    Kf = TimeSeriesSplit(n_splits = n_folds)

    # validation loop
    for fold, (train_idx, val_idx) in enumerate(Kf.split(train)):
        print(f"Fold:{fold}")
        train_dates = dates.iloc[train_idx]
        valid_dates = dates.iloc[val_idx]

        print(f"Train Min / Max Dates: {train_dates.min()} / {train_dates.max()}")
        print(f"Valid Min / Max Dates: {valid_dates.min()} / {valid_dates.max()}")

        x_trn, y_trn = train.iloc[train_idx], labels.iloc[train_idx]
        x_val, y_val = train.iloc[val_idx], labels.iloc[val_idx]

        val_index_chunks.append(val_idx)

        regressor = model(**model_params)
        regressor.fit(x_trn,
                     y_trn,
                     eval_set = [(x_val, y_val)],
                     early_stopping_rounds = EARLY_STOPPING_ROUNDS,
                     verbose = VERBOSE)
        val_pred = regressor.predict(x_val)
        oof_pred[val_idx] = val_pred
        oof_label[val_idx] = y_val

        error = np.sqrt(mean_squared_error(y_val, val_pred))

        print(f"RMSE:{error}")
        print(f"SMAPE:{SMAPE(y_val, val_pred)}")
        print("."*50)

        feat_import += regressor.feature_importances_

        # Each fold contributes an equal share to the averaged test prediction.
        test_pred += (regressor.predict(test)) / n_folds

    # Global score over every row that was part of some validation fold.
    val_indexes_used = np.concatenate(val_index_chunks).astype(int)
    global_error = np.sqrt(mean_squared_error(labels.iloc[val_indexes_used], oof_pred[val_indexes_used]))
    print("")
    print(f"RMSE:{global_error}")
    print(f"SMAPE:{SMAPE(labels.iloc[val_indexes_used], oof_pred[val_indexes_used])}...")

    return regressor, feat_import, test_pred, oof_label, oof_pred

In [34]:
# Run the time-series cross-validation on the selected features. Unpacks the
# last fold's model, the summed feature importances, the fold-averaged test
# predictions and the out-of-fold labels / predictions.
xgbr, feat_imp, predictions, oof_label, oof_pred = cross_validation_train(
    train = train_df[FEATURES],
    labels = train_df[TARGET],
    test = test_df[FEATURES],
    model = XGBRegressor,
    model_params = xgboost_params,
    n_folds = N_SPLITS,
)


### check shape of train_df, oof_label and oof_pred

In [35]:
print(f"train_df:{train_df.shape}")
print(f"oof_label:{oof_label.shape}")
print(f"oof_pred:{oof_pred.shape}")

### feature importance

In [36]:
# Map feature name -> importance, taken from the model returned by the CV run.
feats = dict(zip(FEATURES, xgbr.feature_importances_))

# One row per feature, sorted from most to least important, drawn as a bar chart.
importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
ranked_importances = importances.sort_values(by='Gini-importance', ascending=False)
ranked_importances.plot(kind='bar', rot=45, figsize=(12,5))

## Model inference(submission to Kaggle)

In [37]:
# Use the created model to predict the sales for 2019....
# NOTE(review): `regressor` is the model fitted in the hold-out cell (rows
# before CUTOFF_DATE only). The cross-validation outputs `xgbr` and
# `predictions` are not used for the submission - confirm this is intended.
pred = regressor.predict(test_df[FEATURES])
# The model was trained on the log of the target, so map back with exp().
pred = np.exp(pred)
submission_df["num_sold"] = pred
submission_df.head(10)

In [39]:
# Create a submission file for kaggle..
submission_df.to_csv("submission.csv", index = False)