## Dealing with timeseries

- https://pola-rs.github.io/polars-book/user-guide/howcani/timeseries/intro.html

In [1]:
import polars as pl

# to enrich the examples in this quickstart with dates
from datetime import datetime, timedelta 
# to generate data for the examples
import numpy as np 

In [8]:
# Relative path to the SPY daily-price CSV (a Yahoo Finance export covering
# 2008-2018, judging by the file name); resolved against the working directory.
file_csv = "data/yahoofinance-SPY-20080101-20180101.csv"

In [5]:
# parse_dates=True asks Polars to parse date-like columns while reading,
# so "Date" arrives with the `date` dtype instead of as a string.
df = pl.read_csv(file_csv, parse_dates=True)   # enable parse_dates

In [6]:
# Inspect the last 10 rows: confirms the column dtypes and where the data ends.
df.tail(10)

Date,Open,High,Low,Close,Adj Close,Volume
date,f64,f64,f64,f64,f64,i64
2017-12-15,265.450012,267.040009,265.390015,266.51001,266.51001,144610300
2017-12-18,268.100006,268.600006,267.980011,268.200012,268.200012,83653600
2017-12-19,268.480011,268.529999,267.089996,267.170013,267.170013,82382900
2017-12-20,268.269989,268.329987,266.690002,267.029999,267.029999,76751500
2017-12-21,267.73999,268.390015,267.299988,267.579987,267.579987,67032300
2017-12-22,267.600006,267.640015,266.899994,267.51001,267.51001,78720900
2017-12-26,267.049988,267.440002,266.890015,267.190002,267.190002,45244400
2017-12-27,267.380005,267.730011,267.01001,267.320007,267.320007,57751000
2017-12-28,267.890015,267.920013,267.450012,267.869995,267.869995,45116100
2017-12-29,268.529999,268.549988,266.640015,266.859985,266.859985,96007400


In [10]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(2519, 7)

In [13]:
# Select the row(s) whose Date equals 2017-11-02.
filtered_df = df.filter(pl.col("Date") == datetime(2017, 11, 2))
filtered_df

Date,Open,High,Low,Close,Adj Close,Volume
date,f64,f64,f64,f64,f64,i64
2017-11-02,257.410004,257.75,256.190002,257.589996,256.280029,56449500


filter rows by date range

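`is_between` is exclusive on both start and stop in the Polars version used here (the output below starts at 2017-11-02, not 2017-11-01). Newer Polars releases include both bounds by default and accept a `closed` argument to control this.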

In [11]:
# Keep the rows whose Date falls in the window 2017-11-01 .. 2017-12-01;
# whether the endpoints themselves are kept is decided by is_between's defaults.
df_1 = df.filter(
    pl.col("Date").is_between(
        datetime(2017, 11, 1),
        datetime(2017, 12, 1),
    )
)



In [12]:
# Display the date-range selection built in the previous cell.
df_1

Date,Open,High,Low,Close,Adj Close,Volume
date,f64,f64,f64,f64,f64,i64
2017-11-02,257.410004,257.75,256.190002,257.589996,256.280029,56449500
2017-11-03,257.769989,258.5,257.299988,258.450012,257.135681,59589700
2017-11-06,258.299988,259.0,258.220001,258.850006,257.53363,49652600
2017-11-07,258.970001,259.350006,258.089996,258.670013,257.354553,57502200
2017-11-08,258.470001,259.220001,258.149994,259.109985,257.792297,50469600
2017-11-09,257.730011,258.390015,256.359985,258.170013,256.857086,95085500
2017-11-10,257.730011,258.290009,257.369995,258.089996,256.777496,59984700
2017-11-13,257.309998,258.589996,257.269989,258.329987,257.016266,50228600
2017-11-14,257.410004,257.850006,256.519989,257.730011,256.419342,61315200
2017-11-15,256.619995,257.220001,255.630005,256.440002,255.13588,80811500


Use `strptime` to parse a string column into dates.

In [9]:
# Read the CSV with automatic date parsing switched off, then convert the
# "Date" strings to the Date dtype explicitly using the ISO "%Y-%m-%d" format.
df2 = pl.read_csv(file_csv, parse_dates=False).with_column(
    pl.col("Date").str.strptime(pl.Date, fmt="%Y-%m-%d")
)
df2.tail(10)

Date,Open,High,Low,Close,Adj Close,Volume
date,f64,f64,f64,f64,f64,i64
2017-12-15,265.450012,267.040009,265.390015,266.51001,266.51001,144610300
2017-12-18,268.100006,268.600006,267.980011,268.200012,268.200012,83653600
2017-12-19,268.480011,268.529999,267.089996,267.170013,267.170013,82382900
2017-12-20,268.269989,268.329987,266.690002,267.029999,267.029999,76751500
2017-12-21,267.73999,268.390015,267.299988,267.579987,267.579987,67032300
2017-12-22,267.600006,267.640015,266.899994,267.51001,267.51001,78720900
2017-12-26,267.049988,267.440002,266.890015,267.190002,267.190002,45244400
2017-12-27,267.380005,267.730011,267.01001,267.320007,267.320007,57751000
2017-12-28,267.890015,267.920013,267.450012,267.869995,267.869995,45116100
2017-12-29,268.529999,268.549988,266.640015,266.859985,266.859985,96007400


### Fixed and rolling temporal groupby

In [15]:
# One combined output: the frame's shape followed by its first and last rows.
(
    df.shape,
    df.head(),
    df.tail(),
)

((2519, 7),
 shape: (5, 7)
 ┌────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬───────────┐
 │ Date       ┆ Open       ┆ High       ┆ Low        ┆ Close      ┆ Adj Close  ┆ Volume    │
 │ ---        ┆ ---        ┆ ---        ┆ ---        ┆ ---        ┆ ---        ┆ ---       │
 │ date       ┆ f64        ┆ f64        ┆ f64        ┆ f64        ┆ f64        ┆ i64       │
 ╞════════════╪════════════╪════════════╪════════════╪════════════╪════════════╪═══════════╡
 │ 2007-12-31 ┆ 147.100006 ┆ 147.610001 ┆ 146.059998 ┆ 146.210007 ┆ 118.624741 ┆ 108126800 │
 │ 2008-01-02 ┆ 146.529999 ┆ 146.990005 ┆ 143.880005 ┆ 144.929993 ┆ 117.586205 ┆ 204935600 │
 │ 2008-01-03 ┆ 144.910004 ┆ 145.490005 ┆ 144.070007 ┆ 144.860001 ┆ 117.529449 ┆ 125133300 │
 │ 2008-01-04 ┆ 143.339996 ┆ 143.440002 ┆ 140.910004 ┆ 141.309998 ┆ 114.649185 ┆ 232330900 │
 │ 2008-01-07 ┆ 141.809998 ┆ 142.229996 ┆ 140.100006 ┆ 141.190002 ┆ 114.551826 ┆ 234991000 │
 └────────────┴────────────┴────────────┴──

In [16]:
# Add a "year" column extracted from the Date column; df itself is left untouched.
df_with_year = df.with_column(
    pl.col("Date").dt.year().alias("year")
)
df_with_year

Date,Open,High,Low,Close,Adj Close,Volume,year
date,f64,f64,f64,f64,f64,i64,i32
2007-12-31,147.100006,147.610001,146.059998,146.210007,118.624741,108126800,2007
2008-01-02,146.529999,146.990005,143.880005,144.929993,117.586205,204935600,2008
2008-01-03,144.910004,145.490005,144.070007,144.860001,117.529449,125133300,2008
2008-01-04,143.339996,143.440002,140.910004,141.309998,114.649185,232330900,2008
2008-01-07,141.809998,142.229996,140.100006,141.190002,114.551826,234991000,2008
2008-01-08,142.080002,142.899994,138.440002,138.910004,112.702011,326365700,2008
2008-01-09,139.089996,140.789993,137.699997,140.369995,113.886528,301824900,2008
2008-01-10,139.679993,142.800003,139.369995,141.289993,114.632973,335701200,2008
2008-01-11,140.779999,141.899994,139.0,140.149994,113.708046,267076600,2008
2008-01-14,141.160004,141.860001,140.399994,141.279999,114.624893,170365500,2008


In [17]:
# IPython-only help lookup for DataFrame.sample (the trailing `?` is not plain Python).
# NOTE(review): interactive leftover -- consider removing before sharing the notebook.
df.sample?