In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt

import seaborn as sns
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

from numpy import random
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score

import warnings
warnings.simplefilter("ignore")



To bring in the code from the 'from learntools.time_series.style import * command from the Kaggle learntools components, since it was very difficult to actually install learntools on a computer. 

In [2]:
plt.style.use("seaborn-whitegrid")
plt.rc(
    "figure",
    autolayout=True,
    figsize=(11, 4),
    titlesize=18,
    titleweight='bold',
)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)

get_ipython().config.InlineBackend.figure_format = 'retina'

To create the seasonal_plot function from the Kaggle learntools files.

In [3]:
def seasonal_plot(X, y, period, freq, ax=None):
    if ax is None:
        _, ax = plt.subplots()
    palette = sns.color_palette(
        "husl",
        n_colors=X[period].nunique(),
    )
    ax = sns.lineplot(
        x=freq,
        y=y,
        hue=period,
        data=X,
        ci=False,
        ax=ax,
        palette=palette,
        legend=False,
    )
    ax.set_title(f"Seasonal Plot ({period}/{freq})")
    for line, name in zip(ax.lines, X[period].unique()):
        y_ = line.get_ydata()[-1]
        ax.annotate(
            name,
            xy=(1, y_),
            xytext=(6, 0),
            color=line.get_color(),
            xycoords=ax.get_yaxis_transform(),
            textcoords="offset points",
            size=14,
            va="center",
        )
    return ax


To create the plot_periodograms function from the Kaggle learntools files.

In [4]:
def plot_periodogram(ts, detrend='linear', ax=None):
    from scipy.signal import periodogram
    fs = pd.Timedelta("1Y") / pd.Timedelta("1D")
    freqencies, spectrum = periodogram(
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots()
    ax.step(freqencies, spectrum, color="purple")
    ax.set_xscale("log")
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
    ax.set_xticklabels(
        [
            "Annual (1)",
            "Semiannual (2)",
            "Quarterly (4)",
            "Bimonthly (6)",
            "Monthly (12)",
            "Biweekly (26)",
            "Weekly (52)",
            "Semiweekly (104)",
        ],
        rotation=30,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram")
    return ax

In [5]:
test = pd.read_csv('test_features.csv')
train = pd.read_csv('train_features.csv')
holidays = pd.read_csv('holidays_events.csv')
oil = pd.read_csv('oil.csv')
stores = pd.read_csv('stores.csv')
transactions = pd.read_csv('transactions.csv')
submission = pd.read_csv('sample_submission.csv')

In [6]:
submission

Unnamed: 0,id,sales
0,3000888,0.0
1,3000889,0.0
2,3000890,0.0
3,3000891,0.0
4,3000892,0.0
...,...,...
28507,3029395,0.0
28508,3029396,0.0
28509,3029397,0.0
28510,3029398,0.0


In [7]:
test

Unnamed: 0,id,date,store_nbr,family,onpromotion,day_of_week,weekend,year,month,day_of_month,...,city,state,type,cluster,date_str,national_holiday,national_event,national_workday,local_holiday,regional_holiday
0,3000888,2017-08-16,1,AUTOMOTIVE,0,3,0,2017,8,16,...,Quito,Pichincha,D,13,2017-08-16,0,0,0,0,0
1,3000889,2017-08-16,1,BABY CARE,0,3,0,2017,8,16,...,Quito,Pichincha,D,13,2017-08-16,0,0,0,0,0
2,3000890,2017-08-16,1,BEAUTY,2,3,0,2017,8,16,...,Quito,Pichincha,D,13,2017-08-16,0,0,0,0,0
3,3000891,2017-08-16,1,BEVERAGES,20,3,0,2017,8,16,...,Quito,Pichincha,D,13,2017-08-16,0,0,0,0,0
4,3000892,2017-08-16,1,BOOKS,0,3,0,2017,8,16,...,Quito,Pichincha,D,13,2017-08-16,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY,1,4,0,2017,8,31,...,Quito,Pichincha,B,6,2017-08-31,0,0,0,0,0
28508,3029396,2017-08-31,9,PREPARED FOODS,0,4,0,2017,8,31,...,Quito,Pichincha,B,6,2017-08-31,0,0,0,0,0
28509,3029397,2017-08-31,9,PRODUCE,1,4,0,2017,8,31,...,Quito,Pichincha,B,6,2017-08-31,0,0,0,0,0
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9,4,0,2017,8,31,...,Quito,Pichincha,B,6,2017-08-31,0,0,0,0,0


# Exercise 2: Hybrid Models 

To create the sales by store number for all years, not just 2017. 

In [8]:
store_sales = pd.read_csv(
    'train.csv',
    usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
store_sales['date'] = store_sales.date.dt.to_period('D')
store_sales = store_sales.set_index(['store_nbr', 'family', 'date']).sort_index()

In [9]:
store_sales

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales,onpromotion
store_nbr,family,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,AUTOMOTIVE,2013-01-01,0.000000,0
1,AUTOMOTIVE,2013-01-02,2.000000,0
1,AUTOMOTIVE,2013-01-03,3.000000,0
1,AUTOMOTIVE,2013-01-04,3.000000,0
1,AUTOMOTIVE,2013-01-05,5.000000,0
...,...,...,...,...
9,SEAFOOD,2017-08-11,23.830999,0
9,SEAFOOD,2017-08-12,16.859001,4
9,SEAFOOD,2017-08-13,20.000000,0
9,SEAFOOD,2017-08-14,17.000000,0


To create a dataframe for average sales by family and then promotions by family, although I'm not sure what the promotions by family signify. 

In [10]:
family_sales = (
    store_sales
    .groupby(['family', 'date'])
    .mean()
    .unstack('family')
    .loc['2017']
)

In [11]:
family_sales

Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,...,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion
family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2017-01-01,0.092593,0.037037,0.055556,74.222221,0.000000,9.084685,0.129630,7.500000,11.518518,3.629167,...,0.0,0.018519,0.111111,0.018519,0.000000,0.000000,0.037037,0.129630,0.000000,0.000000
2017-01-02,11.481482,0.259259,11.648149,6208.055664,0.481481,844.836304,14.203704,2233.648193,1545.000000,539.114807,...,0.0,0.462963,10.592593,0.537037,0.000000,0.259259,1.166667,5.629630,0.000000,0.407407
2017-01-03,8.296296,0.296296,7.185185,4507.814941,0.814815,665.124084,10.629630,1711.907349,1204.203735,404.300079,...,0.0,0.481481,9.722222,0.444444,0.000000,0.388889,1.351852,56.296296,0.000000,0.407407
2017-01-04,6.833333,0.333333,6.888889,3911.833252,0.759259,594.160583,11.185185,1508.036987,1107.796265,309.397675,...,0.0,0.370370,12.037037,0.444444,0.000000,0.296296,5.444444,101.277778,0.000000,0.333333
2017-01-05,6.333333,0.351852,5.925926,3258.796387,0.407407,495.511597,12.444445,1241.833374,829.277771,260.776489,...,0.0,8.981481,5.666667,0.000000,0.000000,0.296296,0.907407,5.018519,0.000000,0.444444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,8.166667,0.129630,6.500000,3502.055664,0.000000,522.305298,16.111111,1031.148193,831.648132,345.934967,...,0.0,22.925926,9.000000,0.000000,0.000000,27.407407,0.537037,6.000000,2.666667,2.537037
2017-08-12,7.462963,0.055556,6.833333,3376.259277,0.000000,507.473114,15.722222,1072.870361,793.592590,290.553589,...,0.0,0.000000,8.703704,0.000000,0.018519,0.111111,0.277778,6.000000,2.759259,0.759259
2017-08-13,8.907408,0.166667,8.018518,3747.296387,0.018519,615.976990,10.074074,1143.648193,928.000000,325.801361,...,0.0,0.000000,9.240741,0.000000,0.000000,0.185185,0.222222,5.907407,2.777778,0.018519
2017-08-14,5.407407,0.166667,6.240741,3237.629639,0.000000,518.564026,11.000000,1019.111084,797.222229,271.532227,...,0.0,0.000000,8.722222,0.000000,0.000000,0.129630,0.148148,6.018519,2.851852,0.000000


To create a new class to use for the boosted hybrid model

In [12]:
class BoostedHybrid:
    def __init__(self, model_1, model_2):
        self.model_1 = model_1
        self.model_2 = model_2
        self.y_columns = None  # store column names from fit method

To create a method to fit two X datasets against one y dataset. Each X is for one of the models. 

In [13]:
def fit(self, X_1, X_2, y):
    self.model_1.fit(X_1, y)

    y_fit = pd.DataFrame(
        self.model_1.predict(X_1),
        index=X_1.index, columns=y.columns,
    )

    # Compute residuals
    y_resid = y - y_fit
    y_resid = y_resid.stack().squeeze() # wide to long

    # Fit self.model_2 on residuals
    self.model_2.fit(X_2, y_resid)

    # Save column names for predict method
    self.y_columns = y.columns
    # Save data for question checking
    self.y_fit = y_fit
    self.y_resid = y_resid


# Add method to class
BoostedHybrid.fit = fit


Now define the predict for the 2 X values. I'm not sure how the stack and unstack lines work. 

In [14]:
def predict(self, X_1, X_2):
    y_pred = pd.DataFrame(
        self.model_1.predict(X_1),
        index=X_1.index, columns=self.y_columns,
    )
    y_pred = y_pred.stack().squeeze()  # wide to long

    y_pred += self.model_2.predict(X_2)
    
    return y_pred.unstack()  # long to wide


# Add method to class
BoostedHybrid.predict = predict

To create a y value for all the average sales for each product for each day of 2017. 

In [15]:
y = family_sales.loc[:, 'sales']

In [16]:
y

family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01,0.092593,0.037037,0.055556,74.222221,0.000000,9.084685,0.129630,7.500000,11.518518,3.629167,...,0.074074,5.013166,2.629630,0.074074,0.222222,2.855537,1.040722,31.163778,0.000000,0.259259
2017-01-02,11.481482,0.259259,11.648149,6208.055664,0.481481,844.836304,14.203704,2233.648193,1545.000000,539.114807,...,7.777778,488.522339,714.370361,12.814815,22.777779,581.413147,153.555542,3409.500488,2.925926,33.581944
2017-01-03,8.296296,0.296296,7.185185,4507.814941,0.814815,665.124084,10.629630,1711.907349,1204.203735,404.300079,...,3.500000,427.520905,488.333344,8.870370,13.555555,462.013672,125.820534,3394.559814,2.018518,27.135260
2017-01-04,6.833333,0.333333,6.888889,3911.833252,0.759259,594.160583,11.185185,1508.036987,1107.796265,309.397675,...,3.500000,369.121490,426.574066,8.129630,12.092592,431.276398,116.740814,3467.447998,1.722222,22.034130
2017-01-05,6.333333,0.351852,5.925926,3258.796387,0.407407,495.511597,12.444445,1241.833374,829.277771,260.776489,...,3.203704,368.945679,360.925934,7.629630,10.333333,345.673492,101.434570,2069.316650,1.425926,20.615334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,8.166667,0.129630,6.500000,3502.055664,0.000000,522.305298,16.111111,1031.148193,831.648132,345.934967,...,9.259259,498.964661,295.888885,10.870370,12.111111,468.857361,96.286926,2198.854492,65.240738,23.566963
2017-08-12,7.462963,0.055556,6.833333,3376.259277,0.000000,507.473114,15.722222,1072.870361,793.592590,290.553589,...,8.944445,329.178711,294.500000,9.407408,13.185185,354.342773,84.693817,2070.154785,67.481483,19.037592
2017-08-13,8.907408,0.166667,8.018518,3747.296387,0.018519,615.976990,10.074074,1143.648193,928.000000,325.801361,...,8.685185,345.055817,336.814819,10.018518,13.722222,379.801208,91.509422,2331.922363,68.851852,20.704575
2017-08-14,5.407407,0.166667,6.240741,3237.629639,0.000000,518.564026,11.000000,1019.111084,797.222229,271.532227,...,8.462963,314.364563,279.203705,7.722222,9.259259,344.398285,86.062500,2134.399902,52.333332,17.975555


To create a trend X value for the linear programming model. 

In [17]:
dp = DeterministicProcess(index=y.index, order=1)
X_1 = dp.in_sample()

In [18]:
X_1

Unnamed: 0_level_0,trend
date,Unnamed: 1_level_1
2017-01-01,1.0
2017-01-02,2.0
2017-01-03,3.0
2017-01-04,4.0
2017-01-05,5.0
...,...
2017-08-11,223.0
2017-08-12,224.0
2017-08-13,225.0
2017-08-14,226.0


To create the X factor for XGBoost, with just the onpromotion feature utilized. 

In [19]:
X_2 = family_sales.drop('sales', axis=1).stack()

In [20]:
X_2

Unnamed: 0_level_0,Unnamed: 1_level_0,onpromotion
date,family,Unnamed: 2_level_1
2017-01-01,AUTOMOTIVE,0.000000
2017-01-01,BABY CARE,0.000000
2017-01-01,BEAUTY,0.000000
2017-01-01,BEVERAGES,0.703704
2017-01-01,BOOKS,0.000000
...,...,...
2017-08-15,POULTRY,0.111111
2017-08-15,PREPARED FOODS,0.166667
2017-08-15,PRODUCE,58.685185
2017-08-15,SCHOOL AND OFFICE SUPPLIES,2.740741


The label encoding for 'family' to change it from a categorical value to a numerical value. 

In [21]:
le = LabelEncoder()
X_2 = X_2.reset_index('family')
X_2['family'] = le.fit_transform(X_2['family'])

In [22]:
X_2

Unnamed: 0_level_0,family,onpromotion
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-01,0,0.000000
2017-01-01,1,0.000000
2017-01-01,2,0.000000
2017-01-01,3,0.703704
2017-01-01,4,0.000000
...,...,...
2017-08-15,28,0.111111
2017-08-15,29,0.166667
2017-08-15,30,58.685185
2017-08-15,31,2.740741


To add the day of the month to the XGBoost X dataframe. 

In [23]:
X_2["day"] = X_2.index.day  

In [24]:
X_2

Unnamed: 0_level_0,family,onpromotion,day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,0,0.000000,1
2017-01-01,1,0.000000,1
2017-01-01,2,0.000000,1
2017-01-01,3,0.703704,1
2017-01-01,4,0.000000,1
...,...,...,...
2017-08-15,28,0.111111,15
2017-08-15,29,0.166667,15
2017-08-15,30,58.685185,15
2017-08-15,31,2.740741,15


To define the model combining linear regression and XGBRegressor. 

In [25]:
model = BoostedHybrid(LinearRegression(fit_intercept=False), XGBRegressor())

model.fit(X_1, X_2, y)
y_pred = model.predict(X_1, X_2)

y_pred = y_pred.clip(0.0)

In [26]:
y_pred

family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01,5.153979,5.106370,5.141543,30.272480,5.105710,8.760529,5.101250,4.430238,0.000000,17.100396,...,4.929466,66.147739,14.408252,15.246856,8.699985,105.378399,0.000000,133.196875,0.000000,1.855846
2017-01-02,0.000000,5.107683,9.363442,5658.788755,5.106363,195.628053,9.375859,1720.985178,1225.152294,280.311414,...,4.972707,107.096350,354.042766,48.926488,8.777052,232.866052,122.686107,2694.659803,0.000000,15.572623
2017-01-03,7.072391,6.929562,11.220496,4370.230740,6.927582,71.927950,11.285620,1103.294357,1008.938707,185.046314,...,6.836515,110.579889,209.169572,50.008247,9.877468,187.029790,83.266694,3287.621663,1.057266,16.738031
2017-01-04,8.208842,8.018404,8.811626,3799.133663,8.015764,291.246982,12.462344,1339.691426,596.853905,170.108790,...,7.967282,114.127606,252.736335,51.154189,11.042064,138.249010,152.736716,3324.209255,2.231471,17.967614
2017-01-05,8.498277,8.260230,9.088625,2703.309779,8.256929,292.653125,2.207640,1069.324579,713.676085,30.465612,...,8.251034,66.694611,286.194272,11.266374,11.359642,127.656393,85.434094,1806.616182,2.318149,18.109669
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,12.478059,1.861143,12.113112,3614.011141,1.713950,783.500085,23.962945,1443.377091,898.529635,556.262983,...,10.992023,787.212925,522.500760,17.361869,21.521635,665.403891,155.069444,2335.226364,47.525629,54.204141
2017-08-12,12.526981,1.862456,8.939096,3637.608888,1.714603,787.155557,24.052140,1957.995107,1338.576683,548.756138,...,11.035263,603.918041,524.615481,17.420282,21.598702,620.798636,161.785535,2350.903556,47.612307,42.605354
2017-08-13,11.729587,1.017452,8.129264,3832.418549,0.868939,1144.616503,23.295017,1793.284991,1144.788557,572.950907,...,10.232186,605.531917,437.167790,16.632377,20.829450,640.339790,161.566131,3032.931028,46.852668,27.821864
2017-08-14,11.778509,1.018765,8.165751,3560.917785,0.869592,912.084437,23.384212,1340.704926,869.841511,583.195703,...,10.275426,607.992110,526.049097,16.690790,20.906517,625.112560,162.193044,2799.787053,59.357193,27.963919


In [27]:
mean_squared_log_error(y,y_pred)

0.47065321714293523

In [28]:
store_sales_prep = store_sales.drop('onpromotion', axis=1)

In [29]:
store_sales_prep

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales
store_nbr,family,date,Unnamed: 3_level_1
1,AUTOMOTIVE,2013-01-01,0.000000
1,AUTOMOTIVE,2013-01-02,2.000000
1,AUTOMOTIVE,2013-01-03,3.000000
1,AUTOMOTIVE,2013-01-04,3.000000
1,AUTOMOTIVE,2013-01-05,5.000000
...,...,...,...
9,SEAFOOD,2017-08-11,23.830999
9,SEAFOOD,2017-08-12,16.859001
9,SEAFOOD,2017-08-13,20.000000
9,SEAFOOD,2017-08-14,17.000000


In [30]:
y_prep = store_sales_prep.unstack(['store_nbr', 'family']).loc["2017"]

In [31]:
y_prep

Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales
store_nbr,1,1,1,1,1,1,1,1,1,1,...,9,9,9,9,9,9,9,9,9,9
family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2017-01-01,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000
2017-01-02,5.0,0.0,0.0,1434.0,0.0,166.819000,0.0,332.0,376.0,44.980000,...,5.0,659.570007,1243.0,11.0,41.0,843.596008,115.188995,3136.895996,1.0,23.000000
2017-01-03,4.0,0.0,4.0,3081.0,2.0,519.348022,15.0,952.0,1045.0,209.300003,...,2.0,547.364014,876.0,6.0,15.0,714.659973,133.039001,3229.558105,1.0,14.000000
2017-01-04,1.0,0.0,4.0,3039.0,2.0,543.250977,17.0,1055.0,1029.0,135.944000,...,3.0,395.287994,677.0,6.0,13.0,536.830017,75.201004,1491.416992,7.0,0.000000
2017-01-05,2.0,0.0,3.0,2617.0,0.0,533.479980,40.0,918.0,853.0,137.005997,...,2.0,470.768005,604.0,7.0,10.0,414.100006,113.698997,1566.821045,1.0,17.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,1.0,0.0,1.0,1006.0,0.0,145.606995,4.0,341.0,343.0,64.302002,...,5.0,309.244995,373.0,11.0,2.0,525.223999,112.099998,1453.078003,140.0,23.830999
2017-08-12,6.0,0.0,3.0,1659.0,0.0,243.220001,3.0,351.0,526.0,99.487999,...,2.0,260.298004,400.0,7.0,10.0,383.386993,129.903992,1419.264038,138.0,16.859001
2017-08-13,1.0,0.0,1.0,803.0,0.0,136.679001,1.0,169.0,266.0,47.770000,...,3.0,327.205994,510.0,2.0,9.0,412.458008,105.168999,1693.607056,200.0,20.000000
2017-08-14,1.0,0.0,6.0,2201.0,0.0,346.037994,4.0,571.0,699.0,154.578003,...,12.0,330.975006,445.0,2.0,14.0,283.428986,114.120003,1348.425049,182.0,17.000000


In [32]:
model = BoostedHybrid(LinearRegression(fit_intercept=False), XGBRegressor())

model.fit(X_1, X_2, y_prep)
y_pred = model.predict(X_1, X_2)

y_pred = y_pred.clip(0.0)

In [33]:
y_pred

Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales
store_nbr,1,1,1,1,1,1,1,1,1,1,...,9,9,9,9,9,9,9,9,9,9
family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2017-01-01,0.000000,0.000000,0.000000,10.548725,2.396941,8.704517,2.492772,2.579921,22.496998,0.000000,...,4.525074,95.249794,29.468325,26.361579,26.382714,116.467177,24.947480,212.594920,0.000000,5.263458
2017-01-02,0.000000,0.000000,3.057951,1346.656127,2.399198,59.602875,8.461819,412.058904,312.199696,64.761569,...,4.548871,235.829011,750.673433,116.685350,26.455475,196.828534,136.965902,2357.113881,0.000000,22.652409
2017-01-03,5.196943,2.119076,1.324664,2786.010741,6.185059,61.927794,1.690518,476.864662,487.063735,80.464786,...,6.690299,197.036462,397.648577,60.401992,19.745717,192.966183,96.384087,3032.783737,0.000000,22.736387
2017-01-04,4.486724,1.380449,7.886149,2849.096434,2.397282,330.494804,5.694886,722.175975,820.614175,68.725684,...,9.883440,202.795549,303.449679,63.622963,22.987822,199.055705,130.038997,1227.754007,0.157824,26.028503
2017-01-05,6.099514,1.380449,9.492153,2079.391356,2.399538,185.200399,3.749150,447.639356,690.547550,14.355948,...,11.995837,236.107904,335.566181,22.954905,23.060582,149.229410,126.903318,1493.287755,0.311460,26.151278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,5.505688,0.000000,6.970515,1597.027159,1.357816,505.894966,22.728108,668.713737,265.127007,142.043332,...,6.218524,603.305299,758.110293,15.206579,19.919782,774.838553,210.333990,1945.858390,87.199359,35.105107
2017-08-12,5.534095,0.000000,6.448590,1967.142637,1.360073,508.219885,22.826195,682.796076,653.806537,221.312686,...,6.242320,619.487325,699.117626,15.258204,19.992542,720.254948,196.181800,1956.853441,87.352995,37.182838
2017-08-13,5.562502,0.000000,10.551252,1473.433286,1.362330,389.499941,22.924283,691.182431,525.477949,231.503224,...,2.703172,618.514127,676.935566,11.746884,16.502358,743.348598,193.419332,2349.092800,83.943688,20.488415
2017-08-14,5.397211,0.000000,10.922723,2166.274717,1.170887,648.287819,22.828671,883.918014,702.569954,184.354669,...,0.000000,617.524225,720.775544,8.218861,12.995470,718.952685,190.640154,2124.168708,124.305039,17.031542


In [34]:
df_test = pd.read_csv(
    'test.csv',
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'onpromotion': 'uint32',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
df_test['date'] = df_test.date.dt.to_period('D')
df_test = df_test.set_index(['store_nbr', 'family', 'date']).sort_index()

In [35]:
df_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,onpromotion
store_nbr,family,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,AUTOMOTIVE,2017-08-16,3000888,0
1,AUTOMOTIVE,2017-08-17,3002670,0
1,AUTOMOTIVE,2017-08-18,3004452,0
1,AUTOMOTIVE,2017-08-19,3006234,0
1,AUTOMOTIVE,2017-08-20,3008016,0
...,...,...,...,...
9,SEAFOOD,2017-08-27,3022271,0
9,SEAFOOD,2017-08-28,3024053,0
9,SEAFOOD,2017-08-29,3025835,0
9,SEAFOOD,2017-08-30,3027617,0


In [36]:
family_sales_test = (
    df_test
    .groupby(['family', 'date'])
    .mean()
    .unstack('family')
    .loc['2017']
)

In [37]:
family_sales_test

Unnamed: 0_level_0,id,id,id,id,id,id,id,id,id,id,...,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion
family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2017-08-16,3001762.5,3001763.5,3001764.5,3001765.5,3001766.5,3001767.5,3001768.5,3001769.5,3001770.5,3001771.5,...,0.0,0.018519,18.0,0.0,0.148148,0.185185,0.148148,223.314815,13.592593,0.0
2017-08-17,3003544.5,3003545.5,3003546.5,3003547.5,3003548.5,3003549.5,3003550.5,3003551.5,3003552.5,3003553.5,...,0.0,8.981481,8.740741,0.018519,0.074074,0.092593,0.018519,2.074074,2.62963,0.0
2017-08-18,3005326.5,3005327.5,3005328.5,3005329.5,3005330.5,3005331.5,3005332.5,3005333.5,3005334.5,3005335.5,...,0.0,22.796296,9.907407,0.0,0.148148,25.240741,0.37037,2.611111,2.407407,2.592593
2017-08-19,3007108.5,3007109.5,3007110.5,3007111.5,3007112.5,3007113.5,3007114.5,3007115.5,3007116.5,3007117.5,...,0.0,0.037037,10.722222,0.0,0.203704,0.092593,0.0,2.833333,2.740741,0.777778
2017-08-20,3008890.5,3008891.5,3008892.5,3008893.5,3008894.5,3008895.5,3008896.5,3008897.5,3008898.5,3008899.5,...,0.0,0.0,11.203704,0.0,0.12963,0.12963,0.0,2.814815,2.814815,0.0
2017-08-21,3010672.5,3010673.5,3010674.5,3010675.5,3010676.5,3010677.5,3010678.5,3010679.5,3010680.5,3010681.5,...,0.0,0.055556,10.314815,0.0,0.111111,0.12963,0.0,1.777778,2.685185,0.0
2017-08-22,3012454.5,3012455.5,3012456.5,3012457.5,3012458.5,3012459.5,3012460.5,3012461.5,3012462.5,3012463.5,...,0.0,0.0,10.185185,0.0,0.166667,0.12963,0.018519,53.574074,2.611111,0.0
2017-08-23,3014236.5,3014237.5,3014238.5,3014239.5,3014240.5,3014241.5,3014242.5,3014243.5,3014244.5,3014245.5,...,0.0,0.0,12.018519,0.018519,0.074074,0.12963,0.018519,101.759259,2.907407,0.111111
2017-08-24,3016018.5,3016019.5,3016020.5,3016021.5,3016022.5,3016023.5,3016024.5,3016025.5,3016026.5,3016027.5,...,0.0,9.203704,9.648148,0.0,0.092593,0.12963,0.018519,1.574074,2.37037,0.314815
2017-08-25,3017800.5,3017801.5,3017802.5,3017803.5,3017804.5,3017805.5,3017806.5,3017807.5,3017808.5,3017809.5,...,0.0,23.092593,9.796296,0.0,0.12963,24.962963,0.37037,1.277778,2.574074,2.407407


In [38]:
y_prep = store_sales.unstack(['store_nbr', 'family']).loc["2017"]

In [39]:
y_prep

Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,...,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion
store_nbr,1,1,1,1,1,1,1,1,1,1,...,9,9,9,9,9,9,9,9,9,9
family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2017-01-01,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
2017-01-02,5.0,0.0,0.0,1434.0,0.0,166.819000,0.0,332.0,376.0,44.980000,...,0,0,13,0,0,2,1,4,0,0
2017-01-03,4.0,0.0,4.0,3081.0,2.0,519.348022,15.0,952.0,1045.0,209.300003,...,0,0,11,0,0,1,2,150,0,0
2017-01-04,1.0,0.0,4.0,3039.0,2.0,543.250977,17.0,1055.0,1029.0,135.944000,...,0,0,15,0,0,1,8,9,0,0
2017-01-05,2.0,0.0,3.0,2617.0,0.0,533.479980,40.0,918.0,853.0,137.005997,...,0,21,8,0,0,1,1,5,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,1.0,0.0,1.0,1006.0,0.0,145.606995,4.0,341.0,343.0,64.302002,...,0,0,11,0,0,22,3,6,7,0
2017-08-12,6.0,0.0,3.0,1659.0,0.0,243.220001,3.0,351.0,526.0,99.487999,...,0,0,7,0,0,0,1,7,10,4
2017-08-13,1.0,0.0,1.0,803.0,0.0,136.679001,1.0,169.0,266.0,47.770000,...,0,0,9,0,0,0,1,7,8,0
2017-08-14,1.0,0.0,6.0,2201.0,0.0,346.037994,4.0,571.0,699.0,154.578003,...,0,0,10,0,0,0,0,7,11,0


In [40]:
dp = DeterministicProcess(index=family_sales_test.index, order=1)
X_1_submit = dp.in_sample()

In [41]:
X_1_submit

Unnamed: 0_level_0,trend
date,Unnamed: 1_level_1
2017-08-16,1.0
2017-08-17,2.0
2017-08-18,3.0
2017-08-19,4.0
2017-08-20,5.0
2017-08-21,6.0
2017-08-22,7.0
2017-08-23,8.0
2017-08-24,9.0
2017-08-25,10.0


In [42]:
X_2_submit = family_sales_test.drop('id', axis=1).stack()

In [43]:
X_2_submit

Unnamed: 0_level_0,Unnamed: 1_level_0,onpromotion
date,family,Unnamed: 2_level_1
2017-08-16,AUTOMOTIVE,0.000000
2017-08-16,BABY CARE,0.000000
2017-08-16,BEAUTY,2.000000
2017-08-16,BEVERAGES,31.592593
2017-08-16,BOOKS,0.000000
...,...,...
2017-08-31,POULTRY,0.388889
2017-08-31,PREPARED FOODS,0.111111
2017-08-31,PRODUCE,3.351852
2017-08-31,SCHOOL AND OFFICE SUPPLIES,2.870370


In [44]:
le = LabelEncoder()
X_2_submit = X_2_submit.reset_index('family')
X_2_submit['family'] = le.fit_transform(X_2_submit['family'])

In [45]:
X_2_submit

Unnamed: 0_level_0,family,onpromotion
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-08-16,0,0.000000
2017-08-16,1,0.000000
2017-08-16,2,2.000000
2017-08-16,3,31.592593
2017-08-16,4,0.000000
...,...,...
2017-08-31,28,0.388889
2017-08-31,29,0.111111
2017-08-31,30,3.351852
2017-08-31,31,2.870370


In [46]:
X_2_submit["day"] = X_2_submit.index.day  

In [47]:
X_2_submit

Unnamed: 0_level_0,family,onpromotion,day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-08-16,0,0.000000,16
2017-08-16,1,0.000000,16
2017-08-16,2,2.000000,16
2017-08-16,3,31.592593,16
2017-08-16,4,0.000000,16
...,...,...,...
2017-08-31,28,0.388889,31
2017-08-31,29,0.111111,31
2017-08-31,30,3.351852,31
2017-08-31,31,2.870370,31


In [96]:
#y_submit = pd.DataFrame(model.predict(X_1_submit, X_2_submit), index=X_1_submit.index, columns=y.columns)

In [48]:
y_submit = pd.DataFrame(model.predict(X_1_submit, X_2_submit), index=X_1_submit.index, columns=y_prep.columns)

In [50]:
y_submit

Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,...,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion
store_nbr,1,1,1,1,1,1,1,1,1,1,...,9,9,9,9,9,9,9,9,9,9
family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2017-08-16,-0.994442,-1.022849,16.293124,1968.02124,0.663139,413.246733,0.75897,23.959231,739.935145,-128.775601,...,,,,,,,,,,
2017-08-17,6.113851,6.057036,12.043701,638.317565,8.553826,199.354473,8.745488,548.215215,560.710133,45.549194,...,,,,,,,,,,
2017-08-18,1.352686,1.267464,10.193163,657.230224,3.766509,198.341349,4.054002,454.703713,500.280166,327.297908,...,,,,,,,,,,
2017-08-19,-1.212298,-1.325927,7.077847,1014.834777,1.175377,194.159126,2.571336,413.914256,80.192976,41.480338,...,,,,,,,,,,
2017-08-20,0.891885,-0.119952,8.380675,1314.292235,2.383608,-139.372034,2.862762,71.653261,379.004764,43.632594,...,,,,,,,,,,
2017-08-21,0.050492,-0.119952,8.402297,1438.566405,2.385864,154.743672,2.96085,408.114517,378.537108,44.503645,...,,,,,,,,,,
2017-08-22,0.078899,-0.119952,10.093356,550.644316,2.388121,190.796801,3.928738,470.612872,955.478081,45.374697,...,,,,,,,,,,
2017-08-23,-5.243748,-5.471007,2.518259,390.310484,-5.157404,296.8004,-2.083219,128.084997,-705.668927,52.130033,...,,,,,,,,,,
2017-08-24,7.197053,4.63385,8.726172,-505.495364,5.84806,238.254515,6.710538,453.035912,182.651685,49.347543,...,,,,,,,,,,
2017-08-25,-0.07412,-0.358193,2.166239,-55.003589,-0.903684,292.098278,0.537851,392.601985,135.823806,-422.738704,...,,,,,,,,,,


In [51]:
y_submit = y_submit.stack(['store_nbr', 'family'])

In [52]:
y_submit['sales'] = [a if a>0 else 0 for a in y_submit['sales']]

In [53]:
y_submit

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,onpromotion,sales
date,store_nbr,family,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-08-16,1,AUTOMOTIVE,,0.000000
2017-08-16,1,BABY CARE,,0.000000
2017-08-16,1,BEAUTY,,16.293124
2017-08-16,1,BEVERAGES,,1968.021240
2017-08-16,1,BOOKS,,0.663139
...,...,...,...,...
2017-08-31,9,POULTRY,,132.482012
2017-08-31,9,PREPARED FOODS,,1.832343
2017-08-31,9,PRODUCE,,716.125099
2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,,206.065567


In [54]:
y_submit = y_submit.join(df_test.id).reindex(columns=['id', 'sales'])

In [55]:
y_submit

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,sales
date,store_nbr,family,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-08-16,1,AUTOMOTIVE,3000888,0.000000
2017-08-16,1,BABY CARE,3000889,0.000000
2017-08-16,1,BEAUTY,3000890,16.293124
2017-08-16,1,BEVERAGES,3000891,1968.021240
2017-08-16,1,BOOKS,3000892,0.663139
...,...,...,...,...
2017-08-31,9,POULTRY,3029395,132.482012
2017-08-31,9,PREPARED FOODS,3029396,1.832343
2017-08-31,9,PRODUCE,3029397,716.125099
2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,3029398,206.065567


In [56]:
y_submit.to_csv('./submissions/submit-exercise2.csv', index=False)

# End of Core Exercise - Looking at Using other Models

Depending on your problem, you might want to use other hybrid combinations than the linear regression + XGBoost hybrid you've created in the previous questions. Run the next cell to try other algorithms from scikit-learn.

In [None]:
# Model 1 (trend)
from sklearn.linear_model import ElasticNet, Lasso, Ridge

# Model 2
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

# Boosted Hybrid

# YOUR CODE HERE: Try different combinations of the algorithms above
model = BoostedHybrid(
    model_1=Ridge(),
    model_2=KNeighborsRegressor(),
)

These are just some suggestions. You might discover other algorithms you like in the scikit-learn User Guide.

Use the code in this cell to see the predictions your hybrid makes.

In [None]:
y_train, y_valid = y[:"2017-07-01"], y["2017-07-02":]
X1_train, X1_valid = X_1[: "2017-07-01"], X_1["2017-07-02" :]
X2_train, X2_valid = X_2.loc[:"2017-07-01"], X_2.loc["2017-07-02":]

# Some of the algorithms above do best with certain kinds of
# preprocessing on the features (like standardization), but this is
# just a demo.
model.fit(X1_train, X2_train, y_train)
y_fit = model.predict(X1_train, X2_train).clip(0.0)
y_pred = model.predict(X1_valid, X2_valid).clip(0.0)

families = y.columns[0:6]
axs = y.loc(axis=1)[families].plot(
    subplots=True, sharex=True, figsize=(11, 9), **plot_params, alpha=0.5,
)
_ = y_fit.loc(axis=1)[families].plot(subplots=True, sharex=True, color='C0', ax=axs)
_ = y_pred.loc(axis=1)[families].plot(subplots=True, sharex=True, color='C3', ax=axs)
for ax, family in zip(axs, families):
    ax.legend([])
    ax.set_ylabel(family)