In [30]:
import numpy as np
import os
import datetime
import pandas as pd
import random
# from tqdm import tqdm
from matplotlib import pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from sklearn import preprocessing
from sklearn import datasets

### Date and Time Processing

In [9]:
# Raw day-first date strings using a 12-hour clock with an AM/PM marker
date_strings = np.array(
    [
        '03-04-2020 11:33:12 PM',
        '01-12-1994 03:21:22 AM',
        '22-10-2002 09:48:22 PM',
    ]
)

In [10]:
# Parse every string with an explicit day-month-year / 12-hour format;
# errors='coerce' turns anything that does not match into NaT instead of raising.
[
    pd.to_datetime(raw_stamp, errors='coerce', format='%d-%m-%Y %I:%M:%S %p')
    for raw_stamp in date_strings
]

[Timestamp('2020-04-03 23:33:12'),
 Timestamp('1994-12-01 03:21:22'),
 Timestamp('2002-10-22 21:48:22')]

### Time Zones

In [19]:
from pytz import all_timezones

In [20]:
# Number of zone names in the imported list, plus a one-element sample slice
(len(all_timezones), all_timezones[3:4])

(592, ['Africa/Algiers'])

In [11]:
# Build a zone-aware timestamp in a single step.
# Note: this string is read month-first (3 November 1992, as the output shows),
# unlike the day-first strings parsed with an explicit format earlier.
pd.Timestamp(ts_input="11-03-1992 10:09 PM", tz='Europe/London')

Timestamp('1992-11-03 22:09:00+0000', tz='Europe/London')

In [13]:
# Attach a time zone to a timestamp that was created without one
london_timestamp = (
    pd.Timestamp("11-03-1992 10:09 PM")   # naive: carries no zone information
    .tz_localize('Europe/London')          # same wall-clock time, now zone-aware
)
london_timestamp

Timestamp('1992-11-03 22:09:00+0000', tz='Europe/London')

In [15]:
# Re-express the zone-aware London timestamp in another time zone
africa_timestamp = london_timestamp.tz_convert(tz='Africa/Abidjan')
africa_timestamp

Timestamp('1992-11-03 22:09:00+0000', tz='Africa/Abidjan')

In [16]:
# Four consecutive month-end dates, starting with the first month end on or
# after 2 Feb 2022.
# An explicit MonthEnd offset is used instead of the string alias 'M': pandas 2.2
# deprecated that alias in favour of 'ME', while the offset object is accepted by
# both older and newer pandas releases and yields the same dates.
dates_one = pd.Series(
    pd.date_range('2/2/2022', periods=4, freq=pd.offsets.MonthEnd())
)
dates_one

0   2022-02-28
1   2022-03-31
2   2022-04-30
3   2022-05-31
dtype: datetime64[ns]

In [17]:
# Series version of tz_localize: the .dt accessor applies it to every element
dates_one.dt.tz_localize(tz='Africa/Abidjan')

0   2022-02-28 00:00:00+00:00
1   2022-03-31 00:00:00+00:00
2   2022-04-30 00:00:00+00:00
3   2022-05-31 00:00:00+00:00
dtype: datetime64[ns, Africa/Abidjan]

### Selecting Date Times & Parsing

In [21]:
# 200 hourly timestamps starting at midnight on 2 Feb 2022.
# An explicit Hour offset is used instead of the string alias 'H': pandas 2.2
# deprecated that alias in favour of 'h', while the offset object is accepted by
# both older and newer pandas releases and yields the same timestamps.
dates_two = pd.Series(
    pd.date_range('2/2/2022', periods=200, freq=pd.offsets.Hour())
)
# Positional slice: rows 33, 34 and 35
dates_two[33:36]

33   2022-02-03 09:00:00
34   2022-02-03 10:00:00
35   2022-02-03 11:00:00
dtype: datetime64[ns]

In [27]:
# Wrap the hourly series in a one-column frame; the column can then be set as the index
df = pd.DataFrame({'my_date': dates_two})
df.head(3)

Unnamed: 0,my_date
0,2022-02-02 00:00:00
1,2022-02-02 01:00:00
2,2022-02-02 02:00:00


In [28]:
# Use the timestamps as the index; drop=False keeps 'my_date' as a regular
# column too, so the frame ends up with both (as the output shows).
df = df.set_index('my_date', drop=False)
df.head(3)

Unnamed: 0_level_0,my_date
my_date,Unnamed: 1_level_1
2022-02-02 00:00:00,2022-02-02 00:00:00
2022-02-02 01:00:00,2022-02-02 01:00:00
2022-02-02 02:00:00,2022-02-02 02:00:00


In [29]:
# Rebuild the frame from scratch and break each timestamp into its calendar
# and clock components via the .dt accessor, one column per component.
df = pd.DataFrame(
    {
        'my_date': dates_two,
        'year': dates_two.dt.year,
        'month': dates_two.dt.month,
        'day': dates_two.dt.day,
        'hour': dates_two.dt.hour,
        'minute': dates_two.dt.minute,
    }
)
df.head(3)

Unnamed: 0,my_date,year,month,day,hour,minute
0,2022-02-02 00:00:00,2022,2,2,0,0
1,2022-02-02 01:00:00,2022,2,2,1,0
2,2022-02-02 02:00:00,2022,2,2,2,0


### Rolling Window Calculations

In [36]:
# Synthetic price column: one value per row, drawn with replacement from a
# small fixed set of prices.
# A dedicated generator with a fixed seed makes the draw (and every rolling
# statistic derived from it) identical on each Restart & Run All, without
# touching the global `random` state. Outputs recorded from an earlier,
# unseeded run will differ until the notebook is re-executed.
price_rng = random.Random(42)
df['price'] = price_rng.choices([1, 4, 3, 8], k=len(df))
df.head(3)

Unnamed: 0,my_date,year,month,day,hour,minute,price
0,2022-02-02 00:00:00,2022,2,2,0,0,8
1,2022-02-02 01:00:00,2022,2,2,1,0,4
2,2022-02-02 02:00:00,2022,2,2,2,0,4


In [37]:
# Trailing mean of price over a 5-row window; min_periods=2 lets the mean be
# reported as soon as two observations are available in the window.
df['RollPrice'] = df['price'].rolling(5, min_periods=2).mean()
df.iloc[12:16]

Unnamed: 0,my_date,year,month,day,hour,minute,price,RollPrice
12,2022-02-02 12:00:00,2022,2,2,12,0,3,2.2
13,2022-02-02 13:00:00,2022,2,2,13,0,1,1.8
14,2022-02-02 14:00:00,2022,2,2,14,0,8,3.2
15,2022-02-02 15:00:00,2022,2,2,15,0,1,3.2


### Fill in gaps
- interpolate
- bfill
- ffill

In [39]:
# Two interpolate() arguments worth knowing when filling gaps:
#   limit           - how many NaNs in a row to fill
#   limit_direction - consecutive NaNs will be filled in this direction
#                     (the signature printed below shows the default, 'forward')
help(df.interpolate)

Help on method interpolate in module pandas.core.generic:

interpolate(method='linear', axis=0, limit=None, inplace=False, limit_direction='forward', limit_area=None, downcast=None, **kwargs) method of pandas.core.frame.DataFrame instance
    Interpolate values according to different methods.
    
    Please note that only ``method='linear'`` is supported for
    DataFrame/Series with a MultiIndex.
    
    Parameters
    ----------
    method : str, default 'linear'
        Interpolation technique to use. One of:
    
        * 'linear': Ignore the index and treat the values as equally
          spaced. This is the only method supported on MultiIndexes.
        * 'time': Works on daily and higher resolution data to interpolate
          given length of interval.
        * 'index', 'values': use the actual numerical values of the index.
        * 'pad': Fill in NaNs using existing values.
        * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'spline',
          'barycentric', '