## Overview
This notebook contains all the code covered in the UnseriousAI YouTube video:

This notebook and video are the first in the time series tutorials.

## Packages Needed

In [1]:
# Install the two third-party dependencies with the interpreter that backs this
# kernel (sys.executable), so pip targets the same Python that runs the imports
# further down in this cell.
import sys
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install pandas

# numpy supplies the zero-filled arrays used as placeholder data;
# pandas supplies the date/time functionality this notebook demonstrates.
import numpy as np
import pandas as pd



## Creating Fake Data
For our fake data, we will first create some clean data and then a few dirty examples.

### Example 1

Using the native pandas function `date_range`, we can create a series of dates:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.date_range.html

Signature:
`pandas.date_range(start=None, end=None, periods=None, freq=None, tz=None, normalize=False, name=None, closed=_NoDefault.no_default, inclusive=None, **kwargs)`

In [2]:
# Daily timestamps from 1 Jan 2023 through 3 Jan 2023, both ends included.
# No freq is given, so consecutive values are one calendar day apart.
example_series_1 = pd.date_range(
    start="01/01/2023",
    end="01/03/2023",
)

# Show each value next to its type: the elements are pandas Timestamps.
for stamp in example_series_1:
    print(stamp, type(stamp))

2023-01-01 00:00:00 <class 'pandas._libs.tslibs.timestamps.Timestamp'>
2023-01-02 00:00:00 <class 'pandas._libs.tslibs.timestamps.Timestamp'>
2023-01-03 00:00:00 <class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [3]:
# NOTE: each element is a pandas Timestamp object, not a string.

In [4]:
# Produces the same three timestamps as example_series_1, but described by a
# start date plus a number of periods instead of an end date.
# When only periods is given, each period is one day long.
example_series_2 = pd.date_range(start="01/01/2023", periods=3)

for stamp in example_series_2:
    print(stamp, type(stamp))

2023-01-01 00:00:00 <class 'pandas._libs.tslibs.timestamps.Timestamp'>
2023-01-02 00:00:00 <class 'pandas._libs.tslibs.timestamps.Timestamp'>
2023-01-03 00:00:00 <class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [5]:
# Asking for 3 periods across only 2 days spaces the points evenly between the
# two dates, so a time other than midnight (12:00) shows up in the middle.
example_series_3 = pd.date_range(
    start="01/01/2023",
    end="01/02/2023",
    periods=3,
)

for moment in example_series_3:
    print(moment, type(moment))

2023-01-01 00:00:00 <class 'pandas._libs.tslibs.timestamps.Timestamp'>
2023-01-01 12:00:00 <class 'pandas._libs.tslibs.timestamps.Timestamp'>
2023-01-02 00:00:00 <class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [6]:
# To step by a specific interval, pass it through the freq parameter.
# "23h" means "every 23 hours". The lowercase "h" is the hour alias; the
# uppercase "H" spelling is deprecated since pandas 2.2, so it is avoided here.
example_series_4 = pd.date_range(start='01/01/2023', periods=3, freq='23h')
for date in example_series_4:
    print(date, type(date))

2023-01-01 00:00:00 <class 'pandas._libs.tslibs.timestamps.Timestamp'>
2023-01-01 23:00:00 <class 'pandas._libs.tslibs.timestamps.Timestamp'>
2023-01-02 22:00:00 <class 'pandas._libs.tslibs.timestamps.Timestamp'>


For other examples, see the `date_range` documentation linked above.

In [7]:
# Put the timestamps into a DataFrame: a 3 x 3 zero-filled frame with columns
# a, b, c, plus the values of example_series_4 as a regular "date" column.
zeros_df = pd.DataFrame(
    np.zeros((3, 3)),
    columns=list("abc"),
).assign(date=example_series_4)
zeros_df.head()


Unnamed: 0,a,b,c,date
0,0.0,0.0,0.0,2023-01-01 00:00:00
1,0.0,0.0,0.0,2023-01-01 23:00:00
2,0.0,0.0,0.0,2023-01-02 22:00:00


In [8]:
# Important: inside the DataFrame the column carries a datetime dtype
# (not strings/objects). Print it to confirm.
date_dtype = zeros_df["date"].dtype
print(date_dtype)

datetime64[ns]


### Extracting attributes

In [9]:
# Split the datetime column into its calendar components, each stored in a
# new column, using the .dt accessor.
date_parts = zeros_df["date"].dt
zeros_df["year"] = date_parts.year
zeros_df["month"] = date_parts.month
zeros_df["day"] = date_parts.day
zeros_df.head()

Unnamed: 0,a,b,c,date,year,month,day
0,0.0,0.0,0.0,2023-01-01 00:00:00,2023,1,1
1,0.0,0.0,0.0,2023-01-01 23:00:00,2023,1,1
2,0.0,0.0,0.0,2023-01-02 22:00:00,2023,1,2


In [10]:
# Derive readable names and ordinal positions from the datetime column.
# Attribute access (zeros_df.date) is used here to show that it is an
# alternative to zeros_df["date"].
date_accessor = zeros_df.date.dt
zeros_df["day_name"] = date_accessor.day_name()
zeros_df["month_name"] = date_accessor.month_name()
zeros_df["day_of_year"] = date_accessor.dayofyear
zeros_df["day_of_week"] = date_accessor.dayofweek  # Monday is 0, Sunday is 6

zeros_df.head()

Unnamed: 0,a,b,c,date,year,month,day,day_name,month_name,day_of_year,day_of_week
0,0.0,0.0,0.0,2023-01-01 00:00:00,2023,1,1,Sunday,January,1,6
1,0.0,0.0,0.0,2023-01-01 23:00:00,2023,1,1,Sunday,January,1,6
2,0.0,0.0,0.0,2023-01-02 22:00:00,2023,1,2,Monday,January,2,0


In [11]:
# The time-of-day components are available through the same accessor.
clock = zeros_df.date.dt
zeros_df["hour"] = clock.hour
zeros_df["minute"] = clock.minute
zeros_df["second"] = clock.second

# Display only the timestamp and the columns derived from its time part.
time_columns = ["date", "hour", "minute", "second"]
zeros_df[time_columns].head()

Unnamed: 0,date,hour,minute,second
0,2023-01-01 00:00:00,0,0,0
1,2023-01-01 23:00:00,23,0,0
2,2023-01-02 22:00:00,22,0,0


## Example 2
Converting string based dates into datetime objects.

In [12]:
# Deliberately inconsistent date strings: different separators and field
# orders, numeric and named months, optional times, plus an empty string and
# a missing value (None).
bad_dates = [
    "4/12/2023 2:47:30 AM",
    "20230101",
    "1-1-23",
    "1-01-23 8:00",
    "1-jan-2023",
    "13/12/2023",
    "Oct 20 2023",
    "July 4 2023",
    "March 8, 2022, 22:00",
    "31 December 2022, 23:59:59",
    "",
    None,
]

row_count = len(bad_dates)
print(row_count)

# One row per raw string, next to three zero-filled placeholder columns.
zeros_df2 = pd.DataFrame(
    np.zeros((row_count, 3)),
    columns=list("abc"),
).assign(dates=bad_dates)

# Ask for one row more than exists so that every row is displayed.
zeros_df2.head(row_count + 1)

12


Unnamed: 0,a,b,c,dates
0,0.0,0.0,0.0,4/12/2023 2:47:30 AM
1,0.0,0.0,0.0,20230101
2,0.0,0.0,0.0,1-1-23
3,0.0,0.0,0.0,1-01-23 8:00
4,0.0,0.0,0.0,1-jan-2023
5,0.0,0.0,0.0,13/12/2023
6,0.0,0.0,0.0,Oct 20 2023
7,0.0,0.0,0.0,July 4 2023
8,0.0,0.0,0.0,"March 8, 2022, 22:00"
9,0.0,0.0,0.0,"31 December 2022, 23:59:59"


In [13]:
# pandas.to_datetime can work out most of these formats on its own, but not all.
# Every string is converted separately so that one failure does not stop the
# rest; when a conversion fails, its error message is printed in place of the
# parsed value. The broad `except Exception` is deliberate for that reason.
#
# infer_datetime_format is no longer passed: it is deprecated since pandas 2.0
# and only ever selected a faster parsing path, so the results are unchanged.
for d in bad_dates:
    try:
        newdate = pd.to_datetime(d)
    except Exception as e:
        print(d, "\t|\t", e)
    else:
        print(d, "\t|\t", newdate)

4/12/2023 2:47:30 AM 	|	 2023-04-12 02:47:30
20230101 	|	 2023-01-01 00:00:00
1-1-23 	|	 2023-01-01 00:00:00
1-01-23 8:00 	|	 2023-01-01 08:00:00
1-jan-2023 	|	 2023-01-01 00:00:00
13/12/2023 	|	 2023-12-13 00:00:00
Oct 20 2023 	|	 2023-10-20 00:00:00
July 4 2023 	|	 2023-07-04 00:00:00
March 8, 2022, 22:00 	|	 2022-03-08 22:00:00
31 December 2022, 23:59:59 	|	 2022-12-31 23:59:59
 	|	 NaT
None 	|	 None


In [14]:
# A date string needs at least a day, a month and a year; without the year the
# conversion fails. That failure is what this cell demonstrates, so it is
# caught and displayed instead of being left to halt a "Run All".
no_year = "1-jan"
try:
    clean_date1 = pd.to_datetime(no_year)
    print(clean_date1)
except ValueError as err:
    # OutOfBoundsDatetime and the date-parsing errors are ValueError subclasses.
    clean_date1 = pd.NaT  # keep the name defined for any later cell
    print(f"{type(err).__name__}: {err}")

OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 1-01-01 00:00:00

In [15]:
# Convert the whole "dates" column in a single vectorised call and store the
# result in a new "inferred" column.
# NOTE(review): the output recorded under this cell echoes this exact line, the
# way Python prints the source of a warning. Presumably pandas is warning about
# the mixed formats in the column; confirm with the pandas version in use.
zeros_df2["inferred"] = pd.to_datetime(zeros_df2["dates"], infer_datetime_format=True)

  zeros_df2["inferred"] = pd.to_datetime(zeros_df2["dates"], infer_datetime_format=True)


In [16]:
# Convert the strings one at a time rather than as a single column, so each
# value has its format worked out independently of the others.
# infer_datetime_format is no longer passed: it is deprecated since pandas 2.0
# and only ever selected a faster parsing path, so the results are unchanged.
zeros_df2["inferred"] = [pd.to_datetime(x) for x in bad_dates]
zeros_df2.head(20)

Unnamed: 0,a,b,c,dates,inferred
0,0.0,0.0,0.0,4/12/2023 2:47:30 AM,2023-04-12 02:47:30
1,0.0,0.0,0.0,20230101,2023-01-01 00:00:00
2,0.0,0.0,0.0,1-1-23,2023-01-01 00:00:00
3,0.0,0.0,0.0,1-01-23 8:00,2023-01-01 08:00:00
4,0.0,0.0,0.0,1-jan-2023,2023-01-01 00:00:00
5,0.0,0.0,0.0,13/12/2023,2023-12-13 00:00:00
6,0.0,0.0,0.0,Oct 20 2023,2023-10-20 00:00:00
7,0.0,0.0,0.0,July 4 2023,2023-07-04 00:00:00
8,0.0,0.0,0.0,"March 8, 2022, 22:00",2022-03-08 22:00:00
9,0.0,0.0,0.0,"31 December 2022, 23:59:59",2022-12-31 23:59:59


In [17]:
# When a layout cannot be inferred, describe it explicitly with Python's
# strptime format codes:
# https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
explicit_layout = "%m-%d-%Y %H:%M"  # month-day-year, then 24-hour hours:minutes

from_format = pd.to_datetime("01-31-2023 05:15", format=explicit_layout)
print(from_format, type(from_format))


2023-01-31 05:15:00 <class 'pandas._libs.tslibs.timestamps.Timestamp'>


## Filtering / Selecting

### Example 3

In [18]:
# 300 evenly spaced timestamps running from 1 Jan 2023 to 15 May 2023, stored
# in a "dates" column next to three zero-filled placeholder columns.
large_date_series = pd.date_range(
    start="01/01/2023",
    end="05/15/2023",
    periods=300,
)
zeros_df3 = pd.DataFrame(
    np.zeros((large_date_series.size, 3)),
    columns=list("abc"),
).assign(dates=large_date_series)

In [19]:
# Preview the first five rows of the frame.
zeros_df3.head(n=5)

Unnamed: 0,a,b,c,dates
0,0.0,0.0,0.0,2023-01-01 00:00:00.000000000
1,0.0,0.0,0.0,2023-01-01 10:45:21.070234113
2,0.0,0.0,0.0,2023-01-01 21:30:42.140468227
3,0.0,0.0,0.0,2023-01-02 08:16:03.210702341
4,0.0,0.0,0.0,2023-01-02 19:01:24.280936454


In [20]:
# Total time covered by the "dates" column, followed by the number of rows.
date_column = zeros_df3["dates"]
print(date_column.max() - date_column.min())
print(zeros_df3.shape[0])

134 days 00:00:00
300


In [21]:
# Promote "dates" from a column to the index so rows can be selected by date.
# The guard makes the cell safe to re-run: once "dates" is the index, the
# column no longer exists and a second set_index("dates") would raise KeyError.
if "dates" in zeros_df3.columns:
    zeros_df3 = zeros_df3.set_index("dates")
zeros_df3.head()

Unnamed: 0_level_0,a,b,c
dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-01-01 00:00:00.000000000,0.0,0.0,0.0
2023-01-01 10:45:21.070234113,0.0,0.0,0.0
2023-01-01 21:30:42.140468227,0.0,0.0,0.0
2023-01-02 08:16:03.210702341,0.0,0.0,0.0
2023-01-02 19:01:24.280936454,0.0,0.0,0.0


In [22]:
# Select rows by date with a label slice on the datetime index. Rows that fall
# anywhere within the end date are included (the result runs up to
# 2023-03-01 03:46).
range_start, range_end = "2023-02-01", "2023-03-01"
zeros_df3.loc[range_start:range_end]

Unnamed: 0_level_0,a,b,c
dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-02-01 08:54:34.916387960,0.0,0.0,0.0
2023-02-01 19:39:55.986622073,0.0,0.0,0.0
2023-02-02 06:25:17.056856187,0.0,0.0,0.0
2023-02-02 17:10:38.127090301,0.0,0.0,0.0
2023-02-03 03:55:59.197324414,0.0,0.0,0.0
...,...,...,...
2023-02-27 19:30:18.060200669,0.0,0.0,0.0
2023-02-28 06:15:39.130434782,0.0,0.0,0.0
2023-02-28 17:01:00.200668896,0.0,0.0,0.0
2023-03-01 03:46:21.270903010,0.0,0.0,0.0
