# Time series

A time series is a sequence of data points indexed by time.

# Applications
- NLP(natural language processing)
    + How computers understand what we say
- Brain signals/ brain function understanding.
    +  electroencephalograms (EEG), magnetoencephalograms (MEG), and magnetic resonance imaging (MRI).
- Understanding national/global social issues, economic issues    
- Video understanding
- Finance, health care, Physics

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

# Date Time Data Types in pandas

We have already used the pandas `to_datetime` function.

In [None]:
dates = ['2011-07-06 12:00:00', '2011-08-06 00:00:00', None]

In [None]:
index = pd.to_datetime(dates)
index

In [None]:
index.dtype

In [None]:
index[0]

# timestamp

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timestamp.html

In [None]:
ts= pd.Timestamp('1/2/2018')
ts

In [None]:
ts.day_name(), ts.dayofweek, ts.days_in_month, ts.tzinfo

In [None]:
pd.to_datetime(['04-01-2012 10:00'], dayfirst=True)

# Timedelta
https://pandas.pydata.org/pandas-docs/stable/user_guide/timedeltas.html

In [None]:
interval = pd.Timedelta("1 day 2 hours")
interval

In [None]:
ts+interval

# Creating time index based on Timestamp

In [None]:
time_range= pd.date_range(start='1/1/2018', periods=5)
time_range

In [None]:
time_range[0]

# Can you change the frequency to monthly?

In [None]:
# 'M' is the month-end alias: five consecutive month-end timestamps starting from January 2018.
# NOTE(review): pandas 2.2+ deprecates 'M' in favour of 'ME' — confirm against the installed version.
date_range = pd.date_range(start='1/1/2018', periods=5, freq='M')
date_range

In [None]:
date_range = pd.date_range(start='1/1/2018', periods=5, freq='1h30min')
date_range

In [None]:
date_range + interval

# Period

In [None]:
p = pd.Period('3/11/2019')
p,p.start_time, p.end_time

In [None]:
pq= pd.Period('2017Q1', freq='Q')
pq.start_time, pq.end_time

## period_range

In [None]:
pd.period_range(start='2017-01-01', end='2017-03-01', freq='D')


In [None]:
pd.period_range(start=pd.Period('2017Q1'),
               end=pd.Period('2017Q2'), freq='M')

# time zone

In [None]:
rng =pd.date_range('2019-2-1', periods=5, freq='D', tz='Europe/London')
rng

In [None]:
rng.tz

In [None]:
rng[0].tz_convert('Asia/Rangoon')

In [None]:
from pytz import common_timezones, all_timezones, country_timezones

In [None]:
country_timezones('ch')

In [None]:
common_timezones[:15]

In [None]:
# Exercise: find the time zones that are in all_timezones but not in common_timezones
# ???

# Daylight saving time

In [None]:
rng =pd.date_range('2019-3-8 9 pm', periods=10, tz='US/Mountain')
rng

# Let's make some time series

In [None]:

pd.Series(np.random.randint(low =1, high =10, size = 10), pd.date_range('2019 1 1', periods= 10, freq = 'H'))

# reading time series data

https://archive.ics.uci.edu/ml/datasets/Air+Quality#

In [None]:

!curl -O https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip

In [None]:
!unzip AirQualityUCI.zip

In [None]:
!ls *.csv

In [None]:
!shuf -n 10 AirQualityUCI.csv

In [None]:
# Exercise: write a grep command to remove ;; and store the results in the AirQualityUCI_clean.csv file


In [None]:
air_quality_df = pd.read_csv('AirQualityUCI_clean.csv',sep = ';')
air_quality_df.head(10)

In [None]:
%timeit air_quality_df.Time.apply(lambda x: ':'.join(x.split('.')))

In [None]:
%timeit air_quality_df[['Time']].apply(lambda x: x.str.replace('.', ':'))

In [None]:
# Replace the '.' separators in the Time column with ':' so the values look like
# ordinary clock times. regex=False makes '.' a literal dot rather than the regex
# wildcard, whatever the default of the installed pandas version is.
air_quality_df['Time'] = air_quality_df['Time'].str.replace('.', ':', regex=False)
air_quality_df.head(10)

In [None]:
air_quality_df.to_csv('AirQualityUCI1.csv', index=False)

In [None]:
!head -n 10 AirQualityUCI1.csv

In [None]:
! ls *.csv

In [None]:
# Combine columns 0 and 1 (Date, Time) into a single datetime index.
# NOTE: the UCI description lists Date as DD/MM/YYYY, so dayfirst=True is required;
# without it an ambiguous date such as 10/03/2004 is read month-first (October 3rd).
air_quality_df0 = pd.read_csv(
    'AirQualityUCI1.csv',
    sep=',',
    index_col=0,
    parse_dates=[[0, 1]],
    dayfirst=True,
    infer_datetime_format=True,
)
air_quality_df0.head(10)

In [None]:
air_quality_df0.index

# Let's say we want to avoid the Time manipulation and write a custom date parser for reading the csv file

In [None]:
# Read the unmodified file and again ask pandas to combine columns 0 and 1 into the index.
# NOTE(review): Time in this file presumably still uses '.' separators (the replacement
# earlier was saved to a different file), so the combined column may not be parsed as
# datetimes — check the index dtype in the next cell.
air_quality_df1 = pd.read_csv('AirQualityUCI_clean.csv',sep= ';',
                index_col=0,parse_dates = [[0, 1]] )
air_quality_df1.head(10)

In [None]:
air_quality_df1.index

# Aside: string to datetime object

In [None]:
# pd.datetime was only an alias of the standard-library class; it was deprecated in
# pandas 1.0 and removed in 2.0, so import datetime directly.
from datetime import datetime

date_str = '3/11/19'
# %m/%d/%y -> month/day/two-digit year
datetime.strptime(date_str, '%m/%d/%y')

## List of formatting options

In [None]:
tbls = pd.read_html('http://strftime.org/')

In [None]:
tbls[0]

In [None]:
def my_date_parser(y, t):
    """Combine a date and a time into datetime values.

    Parameters
    ----------
    y : str or array of str
        Date part. NOTE: assumed to be DD/MM/YYYY as in the UCI description.
    t : str or array of str
        Time part. NOTE: assumed to be HH.MM.SS ('.'-separated) as in the raw file.

    Returns
    -------
    A Timestamp for scalar input, or a DatetimeIndex for array input.
    """
    # String concatenation works for single strings and element-wise for object arrays,
    # so read_csv can call this on whole columns or row by row.
    return pd.to_datetime(y + ' ' + t, format='%d/%m/%Y %H.%M.%S')


# Columns 0 and 1 (Date, Time) are handed to my_date_parser and become the index.
air_quality_df2 = pd.read_csv('AirQualityUCI_clean.csv', sep=';',
                              index_col=0, parse_dates=[[0, 1]],
                              date_parser=my_date_parser)

In [None]:
air_quality_df2.index

In [None]:
air_quality_df2.head(10)

# Resampling

In [None]:
sample_df =  air_quality_df2['PT08.S1(CO)']
sample_df.head(10)

In [None]:
sample_df.index

In [None]:
sampled_30min_df =sample_df.asfreq('30min')
sampled_30min_df.head(10)

In [None]:
# upsample
sampled_30min_df =sample_df.asfreq('30min', method='ffill')
sampled_30min_df.head(10)

In [None]:
# downsample
sampled_30min_df.asfreq('4H').head(10)

# resample

In [None]:
sample_df =sample_df.asfreq('1H')
sample_df.head(10)

In [None]:
sample_df.index

In [None]:
downsample_df = sample_df.resample('2H')
type(downsample_df)

In [None]:
sample_df.head(10)

In [None]:
(1360+1292)/2

In [None]:
downsample_df.mean()

# Shifting time series (Leading and Lagging)

In [None]:
import numpy as np

# Ten daily timestamps starting on 1 January 2018 ...
daily_index = pd.date_range('1/1/2018', periods=10, freq='D')
# ... carrying standard-normal noise shifted up by 2.
ts = pd.Series(np.random.randn(10) + 2, index=daily_index)
ts

In [None]:
ts.shift()

In [None]:
ts.diff()

In [None]:
lag_analysis = pd.concat([ts, ts.shift(1), ts.diff(1)], axis=1)

In [None]:
lag_analysis.columns= ['ori', 'shifted', 'diff']

In [None]:
lag_analysis

In [None]:
lag_analysis.plot()

Use periods to move forward (positive) or backward (negative)

# Moving Window Functions
Moving aggregate measures

In [None]:
# 400 daily observations of standard-normal noise in a single column 'D1'.
noise = np.random.randn(400, 1)
daily_dates = pd.date_range('2018/7/10', periods=400, freq='D')
df = pd.DataFrame(noise, index=daily_dates, columns=['D1'])

In [None]:
df.head()

In [None]:
rolling_df = df.rolling(window = 30)
rolling_df

In [None]:
# Plot Series rather than single-column DataFrames: DataFrame.plot labels each line
# with its column name and ignores `label`, so every legend entry would read 'D1'.
ax = df['D1'].plot(color='gray', figsize=(12, 10), label='raw')

# Select the column on the rolling object so each aggregate comes back as a Series.
rolling_d1 = rolling_df['D1']
rolling_d1.mean().plot(ax=ax, color='green', label='mean')
rolling_d1.max().plot(ax=ax, color='red', label='max')
rolling_d1.min().plot(ax=ax, color='blue', label='min')
rolling_d1.quantile(.40).plot(ax=ax, color='orange', label='40th quantile')

# Build the legend from the labels given above.
ax.legend()

Computing more aggregate functions

In [None]:
rolling_df.agg(['median', 'mean']).head(40)

If you have a requirement that is not built into pandas, you can use the `apply` function.

In [None]:
epsilon = .8

In [None]:
# Exercise: apply a rolling window of size 2 with weights epsilon and 1 - epsilon
ax = df.plot(color = 'gray', figsize = (12,10))
## ????

# Exponentially Weighted average


In [None]:
# Example data from the wesm/pydata-book repository (2nd edition); requires network access.
close_px_all = pd.read_csv('https://raw.githubusercontent.com/wesm/pydata-book/2nd-edition/examples/stock_px_2.csv',
                               parse_dates=True, index_col=0)
# Keep three of the columns.
close_px = close_px_all[['AAPL', 'MSFT', 'XOM']]
# Resample to business-day frequency ('B'), forward-filling any gaps.
close_px = close_px.resample('B').ffill()
# AAPL values for the years 2006 through 2007.
aapl_px = close_px.AAPL['2006':'2007']
# NOTE(review): despite the "60" in the names, both averages below use a window/span of 30.
# Simple moving average; min_periods=20 gives a value once 20 observations are available.
ma60 = aapl_px.rolling(30, min_periods=20).mean()
# Exponentially weighted moving average with the same span.
ewma60 = aapl_px.ewm(span=30).mean()
ma60.plot(style='k--', label='Simple MA')
ewma60.plot(style='k-', label='EW MA')
plt.legend()

In [None]:
aapl_px.index

In [None]:
ma60.plot(style='k--', label='Simple MA')
aapl_px.ewm(span = 30, min_periods= 20).mean().plot(style='k-', label='EW MA')
plt.legend()

# Binary Moving Window Functions

Correlation and covariance between two time series

In [None]:
spx_px = close_px_all['SPX']

spx_rets = spx_px.pct_change()

returns = close_px.pct_change()

In [None]:
corr = returns.rolling(125, min_periods=100).corr(spx_rets)

corr.plot(figsize= (10, 10))


## Resources
- https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.expanding.html