import pandas as pd
from ts2ml .core import add_missing_slots
from ts2ml .core import transform_ts_data_into_features_and_target
# Toy ride-counts dataset: hourly ride totals per pickup location, with
# some hours deliberately absent so the gap-filling step has work to do.
ride_records = {
    'pickup_hour': [
        '2022-01-01 00:00:00',
        '2022-01-01 01:00:00',
        '2022-01-01 03:00:00',
        '2022-01-01 01:00:00',
        '2022-01-01 02:00:00',
        '2022-01-01 05:00:00',
    ],
    'pickup_location_id': [1, 1, 1, 2, 2, 2],
    'rides': [2, 3, 1, 1, 2, 1],
}
df = pd.DataFrame(ride_records)
df
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
pickup_hour
pickup_location_id
rides
0
2022-01-01 00:00:00
1
2
1
2022-01-01 01:00:00
1
3
2
2022-01-01 03:00:00
1
1
3
2022-01-01 01:00:00
2
1
4
2022-01-01 02:00:00
2
2
5
2022-01-01 05:00:00
2
1
Let’s fill the missing slots with zeros
# Fill gaps in each location's hourly series: any (hour, location) slot
# absent from the data is inserted with rides = 0.
# NOTE(review): freq='H' is the legacy pandas hourly alias; newer pandas
# prefers 'h' — confirm against the installed pandas version.
df = add_missing_slots (df , datetime_col = 'pickup_hour' , entity_col = 'pickup_location_id' , value_col = 'rides' , freq = 'H' )
df
100%|██████████| 2/2 [00:00<00:00, 907.86it/s]
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
pickup_hour
pickup_location_id
rides
0
2022-01-01 00:00:00
1
2
1
2022-01-01 01:00:00
1
3
2
2022-01-01 02:00:00
1
0
3
2022-01-01 03:00:00
1
1
4
2022-01-01 04:00:00
1
0
5
2022-01-01 05:00:00
1
0
6
2022-01-01 00:00:00
2
0
7
2022-01-01 01:00:00
2
1
8
2022-01-01 02:00:00
2
2
9
2022-01-01 03:00:00
2
0
10
2022-01-01 04:00:00
2
0
11
2022-01-01 05:00:00
2
1
Now, let’s build features and targets to predict the number of rides for
the next hour for each location_id, by using the historical number of
rides for the last 3 hours
# Build a supervised dataset per location: each row uses the previous
# 3 hourly ride counts as features to predict the next hour's rides,
# sliding the window forward one hour (step_size) at a time.
features , targets = transform_ts_data_into_features_and_target (
df ,
n_features = 3 ,
datetime_col = 'pickup_hour' ,
entity_col = 'pickup_location_id' ,
value_col = 'rides' ,
n_targets = 1 ,
step_size = 1 ,
step_name = 'hour'
)
100%|██████████| 2/2 [00:00<00:00, 597.86it/s]
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
rides_previous_3_hour
rides_previous_2_hour
rides_previous_1_hour
pickup_hour
pickup_location_id
0
2.0
3.0
0.0
2022-01-01 03:00:00
1
1
3.0
0.0
1.0
2022-01-01 04:00:00
1
2
0.0
1.0
2.0
2022-01-01 03:00:00
2
3
1.0
2.0
0.0
2022-01-01 04:00:00
2
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
target_rides_next_hour
0
1.0
1
0.0
2
0.0
3
0.0
# Column-wise concat: features and their target, row-aligned by index.
Xy_df = pd .concat ([features , targets ], axis = 1 )
Xy_df
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
rides_previous_3_hour
rides_previous_2_hour
rides_previous_1_hour
pickup_hour
pickup_location_id
target_rides_next_hour
0
2.0
3.0
0.0
2022-01-01 03:00:00
1
1.0
1
3.0
0.0
1.0
2022-01-01 04:00:00
1
0.0
2
0.0
1.0
2.0
2022-01-01 03:00:00
2
0.0
3
1.0
2.0
0.0
2022-01-01 04:00:00
2
0.0
Monthly spaced time series
import pandas as pd
import numpy as np

# Monthly timestamps covering Jan 2020 through Dec 2022 (month starts).
date_rng = pd.date_range(start='1/1/2020', end='12/1/2022', freq='MS')
cities = ['FOR', 'SP', 'RJ']
# Each city gets an equal, contiguous third of the date range, paired with
# a random sales figure in [1000, 5000) for every month.
months_per_city = len(date_rng) // len(cities)
df = pd.DataFrame({
    'date': date_rng,
    'city': np.repeat(cities, months_per_city),
    'sales': np.random.randint(1000, 5000, size=len(date_rng)),
})
df
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
date
city
sales
0
2020-01-01
FOR
4944
1
2020-02-01
FOR
3435
2
2020-03-01
FOR
4543
3
2020-04-01
FOR
3879
4
2020-05-01
FOR
2601
5
2020-06-01
FOR
2922
6
2020-07-01
FOR
4542
7
2020-08-01
FOR
1338
8
2020-09-01
FOR
2938
9
2020-10-01
FOR
2695
10
2020-11-01
FOR
4065
11
2020-12-01
FOR
3864
12
2021-01-01
SP
2652
13
2021-02-01
SP
2137
14
2021-03-01
SP
2663
15
2021-04-01
SP
1168
16
2021-05-01
SP
4523
17
2021-06-01
SP
4135
18
2021-07-01
SP
3566
19
2021-08-01
SP
2121
20
2021-09-01
SP
1070
21
2021-10-01
SP
1624
22
2021-11-01
SP
3034
23
2021-12-01
SP
4063
24
2022-01-01
RJ
2297
25
2022-02-01
RJ
3430
26
2022-03-01
RJ
2903
27
2022-04-01
RJ
4197
28
2022-05-01
RJ
4141
29
2022-06-01
RJ
2899
30
2022-07-01
RJ
4529
31
2022-08-01
RJ
3612
32
2022-09-01
RJ
1856
33
2022-10-01
RJ
4804
34
2022-11-01
RJ
1764
35
2022-12-01
RJ
4425
The FOR city only has data for the year 2020, RJ only for 2022, and SP only
for 2021. Let’s also simulate more missing slots within those years.
# Simulate missing slots: randomly remove 20% of the rows (without
# replacement), then renumber the index so it stays contiguous.
n_to_drop = int(len(df) * 0.2)
drop_indices = np.random.choice(df.index, size=n_to_drop, replace=False)
df = df.drop(drop_indices).reset_index(drop=True)
df
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
date
city
sales
0
2020-01-01
FOR
4944
1
2020-02-01
FOR
3435
2
2020-03-01
FOR
4543
3
2020-04-01
FOR
3879
4
2020-05-01
FOR
2601
5
2020-06-01
FOR
2922
6
2020-07-01
FOR
4542
7
2020-08-01
FOR
1338
8
2020-09-01
FOR
2938
9
2020-11-01
FOR
4065
10
2020-12-01
FOR
3864
11
2021-01-01
SP
2652
12
2021-02-01
SP
2137
13
2021-03-01
SP
2663
14
2021-07-01
SP
3566
15
2021-08-01
SP
2121
16
2021-10-01
SP
1624
17
2021-11-01
SP
3034
18
2021-12-01
SP
4063
19
2022-01-01
RJ
2297
20
2022-02-01
RJ
3430
21
2022-03-01
RJ
2903
22
2022-04-01
RJ
4197
23
2022-05-01
RJ
4141
24
2022-06-01
RJ
2899
25
2022-09-01
RJ
1856
26
2022-10-01
RJ
4804
27
2022-11-01
RJ
1764
28
2022-12-01
RJ
4425
Now let’s fill the missing slots. The function will complete them with
zeros:
# Complete each city's monthly series: any (month, city) slot absent from
# the data is inserted with sales = 0, across the full span of observed dates.
df_full = add_missing_slots (df , datetime_col = 'date' , entity_col = 'city' , value_col = 'sales' , freq = 'MS' )
df_full
100%|██████████| 3/3 [00:00<00:00, 843.70it/s]
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
date
city
sales
0
2020-01-01
FOR
4944
1
2020-02-01
FOR
3435
2
2020-03-01
FOR
4543
3
2020-04-01
FOR
3879
4
2020-05-01
FOR
2601
...
...
...
...
103
2022-08-01
RJ
0
104
2022-09-01
RJ
1856
105
2022-10-01
RJ
4804
106
2022-11-01
RJ
1764
107
2022-12-01
RJ
4425
108 rows × 3 columns
Let’s build a dataset for training a machine learning model to predict
the sales for the next month, for each city, based on historical data of
sales for the previous 3 months.
# Per city: use the previous 3 monthly sales values as features to predict
# the next month's sales, sliding the window one month (step_size) at a time.
features , targets = transform_ts_data_into_features_and_target (
df_full ,
n_features = 3 ,
datetime_col = 'date' ,
entity_col = 'city' ,
value_col = 'sales' ,
n_targets = 1 ,
step_size = 1 ,
step_name = 'month'
)
100%|██████████| 3/3 [00:00<00:00, 205.58it/s]
pd .concat ([features , targets ], axis = 1 )
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
sales_previous_3_month
sales_previous_2_month
sales_previous_1_month
date
city
target_sales_next_month
0
4944.0
3435.0
4543.0
2020-04-01
FOR
3879.0
1
3435.0
4543.0
3879.0
2020-05-01
FOR
2601.0
2
4543.0
3879.0
2601.0
2020-06-01
FOR
2922.0
3
3879.0
2601.0
2922.0
2020-07-01
FOR
4542.0
4
2601.0
2922.0
4542.0
2020-08-01
FOR
1338.0
...
...
...
...
...
...
...
91
4197.0
4141.0
2899.0
2022-07-01
RJ
0.0
92
4141.0
2899.0
0.0
2022-08-01
RJ
0.0
93
2899.0
0.0
0.0
2022-09-01
RJ
1856.0
94
0.0
0.0
1856.0
2022-10-01
RJ
4804.0
95
0.0
1856.0
4804.0
2022-11-01
RJ
1764.0
96 rows × 6 columns
Embedding on Sklearn Pipelines
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

# Wrap both ts2ml functions as stateless sklearn transformers so the whole
# raw-data -> features/target conversion runs as a single pipeline.
fill_gaps_kwargs = {
    'datetime_col': 'date',
    'entity_col': 'city',
    'value_col': 'sales',
    'freq': 'MS',
}
featurize_kwargs = {
    'n_features': 3,
    'datetime_col': 'date',
    'entity_col': 'city',
    'value_col': 'sales',
    'n_targets': 1,
    'step_size': 1,
    'step_name': 'month',
    # concat_Xy=True returns one DataFrame holding features and target
    # together, which is what a pipeline step must hand downstream.
    'concat_Xy': True,
}
add_missing_slots_transformer = FunctionTransformer(
    add_missing_slots, kw_args=fill_gaps_kwargs
)
transform_ts_data_into_features_and_target_transformer = FunctionTransformer(
    transform_ts_data_into_features_and_target, kw_args=featurize_kwargs
)
ts_data_to_features_and_target_pipeline = make_pipeline(
    add_missing_slots_transformer,
    transform_ts_data_into_features_and_target_transformer,
)
ts_data_to_features_and_target_pipeline
ts_data_to_features_and_target_pipeline
<style>#sk-container-id-3 {color: black;background-color: white;}#sk-container-id-3 pre{padding: 0;}#sk-container-id-3 div.sk-toggleable {background-color: white;}#sk-container-id-3 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-3 label.sk-toggleable__label-arrow:before {content: "▸";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-3 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-3 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-3 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-3 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-3 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-3 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: "▾";}#sk-container-id-3 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-3 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-3 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-3 div.sk-parallel-item::after {content: "";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-3 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 div.sk-serial::before {content: "";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-3 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-3 div.sk-item {position: relative;z-index: 1;}#sk-container-id-3 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-3 div.sk-item::before, #sk-container-id-3 div.sk-parallel-item::before {content: "";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-3 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-3 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-3 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-3 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-3 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-3 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-3 div.sk-label-container {text-align: center;}#sk-container-id-3 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: scikit-learn/scikit-learn#21755 */display: inline-block !important;position: relative;}#sk-container-id-3 div.sk-text-repr-fallback {display: none;}</style>Pipeline(steps=[('functiontransformer-1',
FunctionTransformer(func=<function add_missing_slots at 0x11f8f49d0>,
kw_args={'datetime_col': 'date',
'entity_col': 'city',
'freq': 'MS',
'value_col': 'sales'})),
('functiontransformer-2',
FunctionTransformer(func=<function transform_ts_data_into_features_and_target at 0x11f925ca0>,
kw_args={'concat_Xy': True,
'datetime_col': 'date',
'entity_col': 'city',
'n_features': 3, 'n_targets': 1,
'step_name': 'month',
'step_size': 1,
'value_col': 'sales'}))])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class="sk-container" hidden><div class="sk-item sk-dashed-wrapped"><div class="sk-label-container"><div class="sk-label sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-7" type="checkbox" ><label for="sk-estimator-id-7" class="sk-toggleable__label sk-toggleable__label-arrow">Pipeline</label><div class="sk-toggleable__content"><pre>Pipeline(steps=[('functiontransformer-1',
FunctionTransformer(func=<function add_missing_slots at 0x11f8f49d0>,
kw_args={'datetime_col': 'date',
'entity_col': 'city',
'freq': 'MS',
'value_col': 'sales'})),
('functiontransformer-2',
FunctionTransformer(func=<function transform_ts_data_into_features_and_target at 0x11f925ca0>,
kw_args={'concat_Xy': True,
'datetime_col': 'date',
'entity_col': 'city',
'n_features': 3, 'n_targets': 1,
'step_name': 'month',
'step_size': 1,
'value_col': 'sales'}))])</pre></div></div></div><div class="sk-serial"><div class="sk-item"><div class="sk-estimator sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-8" type="checkbox" ><label for="sk-estimator-id-8" class="sk-toggleable__label sk-toggleable__label-arrow">FunctionTransformer</label><div class="sk-toggleable__content"><pre>FunctionTransformer(func=<function add_missing_slots at 0x11f8f49d0>,
kw_args={'datetime_col': 'date', 'entity_col': 'city',
'freq': 'MS', 'value_col': 'sales'})</pre></div></div></div><div class="sk-item"><div class="sk-estimator sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-9" type="checkbox" ><label for="sk-estimator-id-9" class="sk-toggleable__label sk-toggleable__label-arrow">FunctionTransformer</label><div class="sk-toggleable__content"><pre>FunctionTransformer(func=<function transform_ts_data_into_features_and_target at 0x11f925ca0>,
kw_args={'concat_Xy': True, 'datetime_col': 'date',
'entity_col': 'city', 'n_features': 3,
'n_targets': 1, 'step_name': 'month',
'step_size': 1, 'value_col': 'sales'})</pre></div></div></div></div></div></div></div>
# Run the full pipeline on the raw (gappy) monthly sales data: missing
# slots are zero-filled, then windowed into features + target in one call.
Xy_df = ts_data_to_features_and_target_pipeline .fit_transform (df )
Xy_df
100%|██████████| 3/3 [00:00<00:00, 715.47it/s]
100%|██████████| 3/3 [00:00<00:00, 184.12it/s]
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
sales_previous_3_month
sales_previous_2_month
sales_previous_1_month
date
city
target_sales_next_month
0
4944.0
3435.0
4543.0
2020-04-01
FOR
3879.0
1
3435.0
4543.0
3879.0
2020-05-01
FOR
2601.0
2
4543.0
3879.0
2601.0
2020-06-01
FOR
2922.0
3
3879.0
2601.0
2922.0
2020-07-01
FOR
4542.0
4
2601.0
2922.0
4542.0
2020-08-01
FOR
1338.0
...
...
...
...
...
...
...
91
4197.0
4141.0
2899.0
2022-07-01
RJ
0.0
92
4141.0
2899.0
0.0
2022-08-01
RJ
0.0
93
2899.0
0.0
0.0
2022-09-01
RJ
1856.0
94
0.0
0.0
1856.0
2022-10-01
RJ
4804.0
95
0.0
1856.0
4804.0
2022-11-01
RJ
1764.0
96 rows × 6 columns