# Chapter 7

### 將字串轉成日期

In [2]:
import numpy as np
import pandas as pd 

In [3]:
# Sample input: day-month-year date strings with a 12-hour clock and an AM/PM marker.
data_strings = np.array([
    '03-04-2005 11:35 PM',
    '23-05-2010 12:01 AM',
    '04-09-2009 09:09 PM',
])

In [7]:
# Parse all strings in a single vectorized call using an explicit format.
# errors="coerce" is opt-in (the pandas default is errors="raise"): a value that
# does not match the format becomes NaT instead of raising an exception.
# list(...) keeps the result a plain list of Timestamp objects.
list(pd.to_datetime(data_strings, format="%d-%m-%Y %I:%M %p", errors="coerce"))

[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2010-05-23 00:01:00'),
 Timestamp('2009-09-04 21:09:00')]

#### Notes:
碼 | 說明 | 範例
---|---|---
%Y| 年| 1996
%m| 補0後的月| 05
%d| 補0後的日| 24
%I| 補0後的時(12時制)| 10
%p| AM或PM| AM
%M| 補0後的分| 05
%S| 補0後的秒| 09

### 時區的處理

In [13]:
import pandas as pd

# Attach a time zone to a timestamp.

# Method 1: pass tz while constructing the Timestamp (the result is not stored).
pd.Timestamp('2017-05-01 06:00:00', tz='Europe/London')

# Method 2: create a naive Timestamp first, then localize it.
date = pd.Timestamp('2017-05-01 06:00:00')
date_in_london = date.tz_localize(tz='Europe/London')

In [16]:
# Convert the London-localized timestamp to another time zone.
date_in_london.tz_convert('Africa/Abidjan')

Timestamp('2017-05-01 05:00:00+0000', tz='Africa/Abidjan')

In [18]:
# Build three month-end dates, then attach a time zone to the whole Series.
# An offset object is passed as the frequency because the 'M' string alias is
# deprecated in pandas 2.2+ (renamed 'ME'); MonthEnd() works on old and new versions.
dates = pd.Series(pd.date_range('2/2/2002', periods=3, freq=pd.offsets.MonthEnd()))
dates.dt.tz_localize('Africa/Abidjan')

0   2002-02-28 00:00:00+00:00
1   2002-03-31 00:00:00+00:00
2   2002-04-30 00:00:00+00:00
dtype: datetime64[ns, Africa/Abidjan]

In [21]:
# Every time zone name known to pytz; preview the first five.
import pytz
all_timezones = pytz.all_timezones
all_timezones[:5]

['Africa/Abidjan',
 'Africa/Accra',
 'Africa/Addis_Ababa',
 'Africa/Algiers',
 'Africa/Asmara']

### 日期時間的選取

In [31]:
import pandas as pd

# One hourly timestamp per row; the start date is written month/day/year.
# An offset object is passed as the frequency because the 'H' string alias is
# deprecated in pandas 2.2+ (renamed 'h'); Hour() works on old and new versions.
df = pd.DataFrame({"date": pd.date_range('1/1/2001', periods=100000, freq=pd.offsets.Hour())})

# Method 1: boolean mask - strictly after 01:00 and up to and including 04:00.
in_window = (df["date"] > '2002-1-1 01:00:00') & (df["date"] <= '2002-1-1 04:00:00')
df[in_window]

Unnamed: 0,date
8762,2002-01-01 02:00:00
8763,2002-01-01 03:00:00
8764,2002-01-01 04:00:00


In [38]:
# Method 2: use the dates as the index and select with a label slice.
# drop=False keeps the "date" column next to the new index.
df = df.set_index("date", drop=False)
# A label slice is shorter than comparing df.index against both bounds.
# Label slices include both endpoints, so the 01:00 row is part of the result.
df.loc['2002-1-1 01:00:00':'2002-1-1 04:00:00']

Unnamed: 0_level_0,date
date,Unnamed: 1_level_1
2002-01-01 01:00:00,2002-01-01 01:00:00
2002-01-01 02:00:00,2002-01-01 02:00:00
2002-01-01 03:00:00,2002-01-01 03:00:00
2002-01-01 04:00:00,2002-01-01 04:00:00


### 資料日期拆分成幾個特徵(.dt)

In [39]:
# 150 weekly dates, then one column per calendar component read through .dt.
df = pd.DataFrame({"date": pd.date_range('1/1/2001', periods=150, freq='W')})
df = df.assign(
    year=df["date"].dt.year,
    month=df["date"].dt.month,
    day=df["date"].dt.day,
    hour=df["date"].dt.hour,
    minute=df["date"].dt.minute,
)
df.head(3)

Unnamed: 0,date,year,month,day,hour,minute
0,2001-01-07,2001,1,7,0,0
1,2001-01-14,2001,1,14,0,0
2,2001-01-21,2001,1,21,0,0


### 日期差值計算(TimeDelta)

In [41]:
import pandas as pd

# Two stays, each with an arrival and a departure timestamp.
df = pd.DataFrame({
    "Arrived": [pd.Timestamp('01-01-2017'), pd.Timestamp('01-04-2017')],
    "Left": [pd.Timestamp('01-04-2017'), pd.Timestamp('01-06-2017')],
})

# Subtracting two datetime columns gives the duration of each stay.
df["Left"] - df["Arrived"]

0   3 days
1   2 days
dtype: timedelta64[ns]

In [45]:
# Store the length of each stay as a new column; the subtraction yields timedelta64[ns].
df["Interval"] = df["Left"].sub(df["Arrived"])
df  # the values are shown with a "days" unit

Unnamed: 0,Arrived,Left,Interval
0,2017-01-01,2017-01-04,3 days
1,2017-01-04,2017-01-06,2 days


In [49]:
# Keep only the whole-day count of each stay as an integer column.
# The vectorized .dt.days accessor replaces a Python-level loop over Timedelta
# objects and keeps the frame's own index, so the assignment cannot misalign
# when the index is not the default 0..n-1 range.
df["Interval"] = (df["Left"] - df["Arrived"]).dt.days
df  # Interval now holds plain integers instead of timedeltas

Unnamed: 0,Arrived,Left,Interval
0,2017-01-01,2017-01-04,3
1,2017-01-04,2017-01-06,2


### 星期編碼

In [59]:
import pandas as pd

# 150 month-end dates. An offset object is passed as the frequency because the
# 'M' string alias is deprecated in pandas 2.2+ (renamed 'ME').
dates = pd.Series(pd.date_range('2/2/2002', periods=150, freq=pd.offsets.MonthEnd()))

# Name of the weekday for each date. Series.dt.weekday_name was removed in
# pandas 1.0; dt.day_name() is its replacement and returns the same strings.
dates.dt.day_name()

0       Thursday
1         Sunday
2        Tuesday
3         Friday
4         Sunday
         ...    
145       Monday
146    Wednesday
147     Saturday
148       Monday
149     Thursday
Length: 150, dtype: object

In [60]:
# Day of the week as an integer (the outputs show Monday = 0 through Sunday = 6).
dates.dt.weekday

0      3
1      6
2      1
3      4
4      6
      ..
145    0
146    2
147    5
148    0
149    3
Length: 150, dtype: int64

### 後延特徵(lagging)

In [63]:
import pandas as pd

# Five consecutive days with a price for each day.
df = pd.DataFrame({
    "dates": pd.date_range("1/1/2001", periods=5, freq="D"),
    "stock_price": [1.1, 2.2, 3.3, 4.4, 5.5],
})

# Lag feature: each row receives the price of the row before it.
df["previous_days_stock_price"] = df["stock_price"].shift(periods=1)

In [69]:
df # the first row has no earlier price to copy, so its lagged value is missing

Unnamed: 0,dates,stock_price,previous_days_stock_price
0,2001-01-01,1.1,
1,2001-01-02,2.2,1.1
2,2001-01-03,3.3,2.2
3,2001-01-04,4.4,3.3
4,2001-01-05,5.5,4.4


### 使用滾動時間的視窗

In [71]:
import pandas as pd

# Five month-end timestamps used as the index. An offset object is passed as the
# frequency because the 'M' string alias is deprecated in pandas 2.2+ (renamed 'ME').
time_index = pd.date_range("01/01/2010", periods=5, freq=pd.offsets.MonthEnd())
df = pd.DataFrame(index=time_index)
df["Stock_Price"] = [1, 2, 3, 4, 5]
df

Unnamed: 0,Stock_Price
2010-01-31,1
2010-02-28,2
2010-03-31,3
2010-04-30,4
2010-05-31,5


In [73]:
# Mean over a sliding window of two rows; the first row has no full window, so it is NaN.
df.rolling(window=2).mean()
# Other aggregations are available on the rolling window as well, e.g. max(), mean(), count().

Unnamed: 0,Stock_Price
2010-01-31,
2010-02-28,1.5
2010-03-31,2.5
2010-04-30,3.5
2010-05-31,4.5


### 時間序列的填補空值處理

In [78]:
import pandas as pd
import numpy as np

# Five month-end timestamps used as the index. An offset object is passed as the
# frequency because the 'M' string alias is deprecated in pandas 2.2+ (renamed 'ME').
time_index = pd.date_range("01/01/2010", periods=5, freq=pd.offsets.MonthEnd())
df = pd.DataFrame(index=time_index)
# Two consecutive missing values sit between known observations.
df["Sales"] = [1.0, 2.0, np.nan, np.nan, 5.0]
df

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,
2010-04-30,
2010-05-31,5.0


#### 插補法

In [79]:
# Fill the gap by interpolating between the known neighbours (2.0 and 5.0 -> 3.0, 4.0).
df.interpolate()

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.0
2010-04-30,4.0
2010-05-31,5.0


In [87]:
# limit caps how many consecutive NaNs are filled; "forward" counts from the start of the gap.
df.interpolate(limit=1, limit_direction="forward")

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.0
2010-04-30,
2010-05-31,5.0


In [88]:
# limit caps how many consecutive NaNs are filled; "backward" counts from the end of the gap.
df.interpolate(limit=1, limit_direction="backward")

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,
2010-04-30,4.0
2010-05-31,5.0


In [83]:
# Use a non-linear interpolation method.
# NOTE(review): this method presumably needs SciPy to be installed - confirm.
df.interpolate(method="quadratic")

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.059808
2010-04-30,4.038069
2010-05-31,5.0


#### 往前填充(forward-filling) 中文應該說是用前項去補後面空值

In [80]:
# Forward fill: each NaN takes the last known value before it (2.0 here).
df.ffill()

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,2.0
2010-04-30,2.0
2010-05-31,5.0


#### 往後填充(back-filling) 中文應該說是用後項去補前面空值

In [81]:
# Back fill: each NaN takes the next known value after it (5.0 here).
df.bfill()

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,5.0
2010-04-30,5.0
2010-05-31,5.0


# Chapter 8

In [91]:
# Import the cv2 module and display the version that is installed.
import cv2
cv2.__version__

'4.1.2'