In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
import category_encoders as ce

In [5]:
df = pd.read_csv(r"https://raw.githubusercontent.com/sydneyproject000/dat-11-15/main/ClassMaterial/Unit1/data/master.csv", parse_dates = ['visit_date'])

In [6]:
df['yesterday'] = df.groupby('id').apply(lambda x: x['visitors'].shift()).values
df['last_week'] = df.groupby('id').apply(lambda x: x['visitors'].shift(7)).values

# fill in missing reservations
df['reserve_visitors'] = df['reserve_visitors'].fillna(0)

# drop missing values from shifts
df = df.dropna()

In [7]:
X = df[['id', 'yesterday', 'day_of_week']]
y = df['visitors']

In [8]:
# let's assume these were our optimized parameters
tree = DecisionTreeRegressor(max_depth = 7, max_features = 0.8, min_samples_leaf = 10)

pipe = make_pipeline(ce.TargetEncoder(), tree)

In [9]:
# fit the tree, and export it
pipe.fit(X, y)

Pipeline(steps=[('targetencoder', TargetEncoder(cols=['id', 'day_of_week'])),
                ('decisiontreeregressor',
                 DecisionTreeRegressor(max_depth=7, max_features=0.8,
                                       min_samples_leaf=10))])

In [10]:
# the pickle module allows you to export saved models
import pickle

# rb -- WRITE the file in BYTES
with open('pipe.pkl', 'wb') as export:
    # this creates an external version of the file that we can now import later on
    pickle.dump(pipe, export)

In [11]:
# we can now import this, and re-use it on new data:  very handy

with open('pipe.pkl', 'rb') as import_:
    # this will import the pickled object again
    pipe2 = pickle.load(import_)

In [12]:
# here it is
pipe2

Pipeline(steps=[('targetencoder', TargetEncoder(cols=['id', 'day_of_week'])),
                ('decisiontreeregressor',
                 DecisionTreeRegressor(max_depth=7, max_features=0.8,
                                       min_samples_leaf=10))])

In [13]:
# and we can use it to make new predictions
pipe2.predict(X)

array([19.67006418, 24.71952428, 32.24199288, ..., 61.10708899,
       40.35955424, 61.10708899])

In [14]:
id_='fgdsfdakfd'
day_of_week = 'Sunday'
yesterday = 33


In [16]:
sample = {'id':id_,
         'yesterday': yesterday,
         'day_of_week': day_of_week}
sample=pd.DataFrame(sample, index = [0])

In [18]:
pipe2.predict(sample)[0]

23.098383895662035