In [None]:
from sklearn import preprocessing # One-hot-Encoder y LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from xgboost import plot_tree
from sklearn.metrics import root_mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
import seaborn as sns
%matplotlib inline

In [None]:
sns.set_style('darkgrid')
plt.rcParams['font.size'] = 8
plt.rcParams['figure.facecolor'] = '#00000000'

In [None]:
# get data
# Load the four CSV inputs from ./dataset: training rows, per-store attributes
# (joined on 'Store' later), test rows, and the sample submission frame that
# the final predictions are written into.
# low_memory=False makes pandas read train.csv in a single pass, so each
# column's dtype is inferred from the whole column instead of chunk by chunk.
ross_df = pd.read_csv('./dataset/train.csv', low_memory=False)
store_df = pd.read_csv('./dataset/store.csv')
test_df = pd.read_csv('./dataset/test.csv')
submission_df = pd.read_csv('./dataset/sample_submission.csv')

In [None]:
ross_df.head()

In [None]:
# Merge data: attach the store_df columns to every train and test row.
# A left join keeps every original row even if a Store id has no match.
merged_df = pd.merge(ross_df, store_df, how='left', on='Store')
merged_test_df = pd.merge(test_df, store_df, how='left', on='Store')

merged_df.info()

In [None]:
# convert Date
def split_date(df):
    """Parse the 'Date' column and add calendar-part columns, in place.

    Replaces ``df['Date']`` with its datetime conversion and adds the columns
    'Year', 'Month', 'Day' and 'WeekOfYear' (ISO calendar week) to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Date' column that ``pd.to_datetime`` can parse.

    Returns
    -------
    None
        The frame is modified in place.
    """
    df['Date'] = pd.to_datetime(df['Date'])
    when = df['Date'].dt
    df['Year'] = when.year
    df['Month'] = when.month
    df['Day'] = when.day
    df['WeekOfYear'] = when.isocalendar().week

split_date(merged_df)
split_date(merged_test_df)

In [None]:
# Sales on days the store was closed (Open == 0); the next cell keeps only Open == 1
merged_df[merged_df.Open == 0].Sales.value_counts()

In [None]:
merged_df = merged_df[merged_df.Open == 1].copy()

In [None]:
def comp_months(df):
    """Add a 'CompetitionOpen' column (months elapsed), in place.

    The value is ``12 * (Year - CompetitionOpenSinceYear) +
    (Month - CompetitionOpenSinceMonth)``. Negative results are replaced by 0,
    and rows where the result is missing are filled with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'Year', 'Month', 'CompetitionOpenSinceYear' and
        'CompetitionOpenSinceMonth' columns.

    Returns
    -------
    None
        The frame is modified in place.
    """
    def _floor_at_zero(months):
        # NaN compares False against 0, so it passes through and is filled below.
        return 0 if months < 0 else months

    year_gap = df.Year - df.CompetitionOpenSinceYear
    month_gap = df.Month - df.CompetitionOpenSinceMonth
    elapsed = 12 * year_gap + month_gap
    df['CompetitionOpen'] = elapsed.map(_floor_at_zero).fillna(0)

comp_months(merged_df)
comp_months(merged_test_df)

merged_df.head()

In [None]:
# Promotion
# Month number -> abbreviation, built once instead of on every row.
# NOTE(review): 'Sept' (not 'Sep') presumably mirrors the spelling used in the
# PromoInterval column -- confirm against store.csv before changing it.
_MONTH_ABBR = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
               7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}


def check_promo_month(row):
    """Return 1 if the row's month is listed in its PromoInterval, else 0.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        Must provide 'PromoInterval' (comma-separated month abbreviations, or
        a missing value), 'Promo2Open' and 'Month'.

    Returns
    -------
    int
        1 when 'Promo2Open' is truthy and the abbreviation for 'Month' appears
        in 'PromoInterval'. 0 otherwise, including when 'PromoInterval' is not
        a string (NaN/None) or 'Month' is not in 1..12.

    Raises
    ------
    KeyError
        If a required field is missing from ``row``. This used to be swallowed
        by a blanket ``except`` and reported as 0, which hid typos in column
        names.
    """
    interval = row['PromoInterval']
    # A missing interval arrives as a float NaN (or None), which has no
    # .split(); handle it explicitly instead of relying on a caught exception.
    if not isinstance(interval, str) or not row['Promo2Open']:
        return 0
    # .get() yields None for an out-of-range month, which is never in the list.
    month_name = _MONTH_ABBR.get(row['Month'])
    return 1 if month_name in interval.split(',') else 0

def promo_cols(df):
    """Add 'Promo2Open' and 'IsPromo2Month' columns to ``df`` in place.

    'Promo2Open' is the approximate number of months since
    Promo2SinceYear/Promo2SinceWeek (weeks are converted with 7 / 30.5).
    Negative values become 0, missing values are filled with 0, and the
    result is multiplied by 'Promo2'.

    'IsPromo2Month' is ``check_promo_month`` applied to every row, multiplied
    by 'Promo2'.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'Year', 'WeekOfYear', 'Month', 'Promo2', 'Promo2SinceYear',
        'Promo2SinceWeek' and 'PromoInterval' columns.

    Returns
    -------
    None
        The frame is modified in place.
    """
    # Months since Promo2 was open
    months_from_years = 12 * (df.Year - df.Promo2SinceYear)
    months_from_weeks = (df.WeekOfYear - df.Promo2SinceWeek) * 7 / 30.5
    df['Promo2Open'] = months_from_years + months_from_weeks
    non_negative = df['Promo2Open'].map(lambda x: 0 if x < 0 else x).fillna(0)
    df['Promo2Open'] = non_negative * df['Promo2']

    # Whether a new round of promotions was started in the current month.
    # Must run after 'Promo2Open' is final: check_promo_month reads it.
    in_interval = df.apply(check_promo_month, axis=1)
    df['IsPromo2Month'] = in_interval * df['Promo2']

promo_cols(merged_df)
promo_cols(merged_test_df)

In [None]:
# Input & target
# Columns copied out of the merged frames as model inputs.
input_cols = [
    'Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
    'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpen',
    'Day', 'Month', 'Year', 'WeekOfYear', 'Promo2',
    'Promo2Open', 'IsPromo2Month',
]
target_col = 'Sales'

# Split of input_cols into the two groups that are preprocessed differently.
numeric_cols = [
    'Store', 'Promo', 'SchoolHoliday', 'CompetitionDistance', 'CompetitionOpen',
    'Promo2', 'Promo2Open', 'IsPromo2Month', 'Day', 'Month', 'Year', 'WeekOfYear',
]
categorical_cols = ['DayOfWeek', 'StateHoliday', 'StoreType', 'Assortment']

# .copy() so later in-place preprocessing does not touch merged_df / merged_test_df.
inputs = merged_df[input_cols].copy()
targets = merged_df[target_col].copy()
test_inputs = merged_test_df[input_cols].copy()

In [None]:
# missing numeric data
inputs[numeric_cols].isna().sum()

In [None]:
# Fill missing values in the columns listed in `label`
label = ["CompetitionDistance"]

# The mean is learned from the training inputs only and then reused for the
# test inputs, so both are filled with the same value.
imputer = SimpleImputer(strategy="mean")
imputer.fit(inputs[label])

inputs[label] = imputer.transform(inputs[label])
test_inputs[label] = imputer.transform(test_inputs[label])


In [None]:
test_inputs[numeric_cols].isna().sum()

In [None]:
# Scale numeric values: the scaler is fitted on the training inputs only and
# the same fitted scaler is applied to both train and test.
scaler = MinMaxScaler()
scaler.fit(inputs[numeric_cols])

inputs[numeric_cols] = scaler.transform(inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])

# One-hot encode the categorical columns. handle_unknown='ignore' means a
# category seen only in the test set encodes as all zeros instead of raising.
encoder = preprocessing.OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(inputs[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))

inputs[encoded_cols] = encoder.transform(inputs[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])

# Final model matrices: scaled numeric columns followed by the one-hot columns.
X = inputs[numeric_cols + encoded_cols]
X_test = test_inputs[numeric_cols + encoded_cols]

## Gradient Boosting

In [None]:
model = XGBRegressor(random_state=42, n_jobs=-1, n_estimators=20, max_depth=4)
model.fit( X, targets)

In [None]:
preds = model.predict(X)
preds

In [None]:
root_mean_squared_error(targets, preds)

In [None]:
rcParams['figure.figsize'] = 30,30
plot_tree(model, rankdir="LR");

In [None]:
plot_tree(model, rankdir='LR', num_trees=19);

In [None]:
trees = model.get_booster().get_dump()
len(trees)

In [None]:
print(trees[0])

In [None]:
# Feature importance
import_df = pd.DataFrame({"feature": X.columns,
                         "importance": model.feature_importances_}).sort_values("importance", ascending=False)
import_df.head(10)

In [None]:
plt.figure(figsize=(3,3))
sns.barplot(import_df.head(10), x="importance", y="feature");

## K-Fold Cross-Validation

In [None]:
def train_and_evaluate(X_train, train_targets, X_val, val_targets, **params):
    """Fit an XGBRegressor and measure its RMSE on the training and validation sets.

    Parameters
    ----------
    X_train, train_targets
        Features and targets the model is fitted on.
    X_val, val_targets
        Held-out features and targets used only for scoring.
    **params
        Extra keyword arguments forwarded to ``XGBRegressor``. ``random_state``
        defaults to 42 and ``n_jobs`` to -1; both may be overridden here.

    Returns
    -------
    tuple
        ``(model, train_rmse, val_rmse)``: the fitted model and the two RMSE values.
    """
    # Defaults first so a caller-supplied random_state / n_jobs overrides them
    # instead of raising a duplicate-keyword TypeError.
    xgb_kwargs = {'random_state': 42, 'n_jobs': -1, **params}
    model = XGBRegressor(**xgb_kwargs)
    model.fit(X_train, train_targets)
    # scikit-learn metrics take (y_true, y_pred). RMSE is symmetric, so the
    # values are unchanged, but the order now matches the documented signature.
    train_rmse = root_mean_squared_error(train_targets, model.predict(X_train))
    val_rmse = root_mean_squared_error(val_targets, model.predict(X_val))
    return model, train_rmse, val_rmse

In [None]:
# 5-fold cross-validation over X / targets. No shuffle argument is passed, so
# KFold uses its default behaviour for forming the folds.
kfold = KFold(n_splits=5)
# One fitted model per fold, kept so their predictions can be averaged later.
models = []

for train_idxs, val_idxs in kfold.split(X):
    
    # Positional row indices from the splitter -> .iloc on both X and targets.
    X_train, train_targets = X.iloc[train_idxs], targets.iloc[train_idxs]
    X_val, val_targets = X.iloc[val_idxs], targets.iloc[val_idxs]
    
    # Same hyperparameters for every fold; only the data split changes.
    model, train_rmse, val_rmse = train_and_evaluate(X_train, train_targets, 
                                                     X_val, val_targets, 
                                                     max_depth=4, n_estimators=20)
    models.append(model)
    print('Train RMSE: {}, Validation RMSE: {}'.format(train_rmse, val_rmse))

In [None]:
# Ensemble prediction: element-wise mean over the given models
def predict_avg(models, inputs):
    """Average the predictions of several fitted models.

    Parameters
    ----------
    models : iterable
        Objects exposing ``predict(inputs)``.
    inputs
        Feature matrix passed unchanged to every model's ``predict``.

    Returns
    -------
    numpy.ndarray
        Mean of the per-model predictions, taken across models (axis 0).
    """
    per_model = []
    for fitted in models:
        per_model.append(fitted.predict(inputs))
    return np.mean(per_model, axis=0)

preds = predict_avg(models, X)
preds

In [None]:
# Hyperparameter Tuning and Regularization
def test_params_kfold(n_splits, **params):
    """Cross-validate one hyperparameter setting and print the mean RMSEs.

    Splits the notebook-level ``X`` / ``targets`` with ``KFold(n_splits)``,
    fits one model per fold through ``train_and_evaluate`` and prints the mean
    training and validation RMSE across folds.

    Parameters
    ----------
    n_splits : int
        Number of folds.
    **params
        Forwarded to ``train_and_evaluate`` (and from there to the model).

    Returns
    -------
    list
        The fitted models, one per fold, in fold order.
    """
    fold_models, fold_train_scores, fold_val_scores = [], [], []
    splitter = KFold(n_splits)
    for fit_rows, held_rows in splitter.split(X):
        fitted, fit_rmse, held_rmse = train_and_evaluate(
            X.iloc[fit_rows], targets.iloc[fit_rows],
            X.iloc[held_rows], targets.iloc[held_rows],
            **params,
        )
        fold_models.append(fitted)
        fold_train_scores.append(fit_rmse)
        fold_val_scores.append(held_rmse)
    mean_train = np.mean(fold_train_scores)
    mean_val = np.mean(fold_val_scores)
    print('Train RMSE: {}, Validation RMSE: {}'.format(mean_train, mean_val))
    return fold_models

# split data
# Hold out 10% for validation. The seed is fixed so the split, and therefore
# every RMSE printed by test_params, is the same after Restart & Run All.
X_train, X_val, train_targets, val_targets = train_test_split(X, targets, test_size=0.1, random_state=42)

def test_params(**params):
    """Fit one model on the hold-out split and print its train/validation RMSE.

    Uses the notebook-level ``X_train``, ``train_targets``, ``X_val`` and
    ``val_targets``. Fitting and scoring are delegated to
    ``train_and_evaluate`` so both tuning helpers share one implementation.

    Parameters
    ----------
    **params
        Hyperparameters forwarded to the model via ``train_and_evaluate``.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    _, train_rmse, val_rmse = train_and_evaluate(X_train, train_targets,
                                                 X_val, val_targets, **params)
    print('Train RMSE: {}, Validation RMSE: {}'.format(train_rmse, val_rmse))

In [None]:
# n_estimators
test_params(n_estimators=10)

In [None]:
test_params(n_estimators=200)

In [None]:
# max_depth
test_params(max_depth=2)

In [None]:
test_params(max_depth=10)

In [None]:
# learning_rate
test_params(n_estimators=50, learning_rate=0.01)

In [None]:
test_params(n_estimators=50, learning_rate=0.99)

In [None]:
# booster
test_params(booster='gblinear')

In [None]:
# Putting together and making predictions
# Final model: fitted on all training rows (no hold-out).
model = XGBRegressor(
    n_estimators=200,
    max_depth=10,
    learning_rate=0.2,
    subsample=0.9,
    colsample_bytree=0.7,
    random_state=42,
    n_jobs=-1,
)
model.fit(X, targets)

test_preds = model.predict(X_test)
submission_df['Sales'] = test_preds

# Zero out predictions for rows marked closed (Open == 0); rows with a missing
# Open flag are treated as open (multiplier 1).
submission_df['Sales'] = submission_df['Sales'] * test_df.Open.fillna(1.)
submission_df