## 時系列データ処理の練習

In [1]:
from datetime import datetime
from datetime import timedelta

### 各日時要素へのアクセス

In [50]:
d = datetime(2019, 1, 12, 7, 40, 10)
d

datetime.datetime(2019, 1, 12, 7, 40, 10)

In [57]:
# Read each calendar/clock field of the datetime individually.
print(d.year)
print(d.month)
print(d.day)
print(d.weekday()) # weekday(): 0 = Monday ... 6 = Sunday (so 5 is Saturday)
print(d.hour)
print(d.minute)
print(d.second)

2019
1
12
5
7
40
10


### datetime型の変換/加工の練習

In [2]:
d = datetime(2019, 1, 12, 7, 40, 10)
d

datetime.datetime(2019, 1, 12, 7, 40, 10)

#### datetimeの加減算

In [60]:
d2 = datetime(2019, 1, 14, 7, 42, 10)
d2

datetime.datetime(2019, 1, 14, 7, 42, 10)

In [59]:
d2-d

datetime.timedelta(2, 120)

In [3]:
# Shift d forward by 2 days, 2 hours, 2 minutes and 2 seconds in turn.
for offset in (
    timedelta(days=2),
    timedelta(hours=2),
    timedelta(minutes=2),
    timedelta(seconds=2),
):
    print(d + offset)

2019-01-14 07:40:10
2019-01-12 09:40:10
2019-01-12 07:42:10
2019-01-12 07:40:12


In [4]:
# Shift d backward by 2 days, 2 hours, 2 minutes and 2 seconds in turn.
for offset in (
    timedelta(days=2),
    timedelta(hours=2),
    timedelta(minutes=2),
    timedelta(seconds=2),
):
    print(d - offset)

2019-01-10 07:40:10
2019-01-12 05:40:10
2019-01-12 07:38:10
2019-01-12 07:40:08


#### 文字列⇔datetime

In [5]:
str_datetime = "2019-01-12 7:40:10"
datetime.strptime(str_datetime, "%Y-%m-%d %H:%M:%S")

datetime.datetime(2019, 1, 12, 7, 40, 10)

In [6]:
# Day/month/year ordering is common in many international locales, so parse it with %d/%m/%Y
str_datetime = "12/1/2019 7:40:10"
datetime.strptime(str_datetime, "%d/%m/%Y %H:%M:%S")

datetime.datetime(2019, 1, 12, 7, 40, 10)

In [7]:
import numpy as np
import pandas as pd

### pandasでのTimestamp(datetime64)の作成
* Timestampは、datetimeを継承した型

In [42]:
dt = pd.to_datetime("2000-01-01 10:11:12", format="%Y-%m-%d %H:%M:%S")
dt

Timestamp('2000-01-01 10:11:12')

In [49]:
# Extract the equivalent standard-library datetime object from the Timestamp
dt.to_pydatetime()

datetime.datetime(2000, 1, 1, 10, 11, 12)

In [47]:
dt.date()

datetime.date(2000, 1, 1)

In [48]:
dt.time()

datetime.time(10, 11, 12)

In [54]:
dt.dayofweek

5

In [67]:
dts = pd.date_range("1/1/2000", periods=10)
dts

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08',
               '2000-01-09', '2000-01-10'],
              dtype='datetime64[ns]', freq='D')

### Timestamp(datetime64)の加減算

In [77]:
dt1 = pd.to_datetime("2000-01-01 10:11:12", format="%Y-%m-%d %H:%M:%S")
dt1

Timestamp('2000-01-01 10:11:12')

In [76]:
dt2 = pd.to_datetime("2001-01-02 12:12:13", format="%Y-%m-%d %H:%M:%S")
dt2

Timestamp('2001-01-02 12:12:13')

In [80]:
dt2 - dt1

Timedelta('367 days 02:01:01')

In [84]:
print(dt2.year - dt1.year)
print(dt2.month - dt1.month)
print(dt2.day - dt1.day)
print(dt2.hour - dt1.hour)
print(dt2.minute - dt1.minute)
print(dt2.second - dt1.second)

1
0
1
2
1
1


In [69]:
dts = pd.date_range("1/1/2000", periods=10)
dts

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08',
               '2000-01-09', '2000-01-10'],
              dtype='datetime64[ns]', freq='D')

In [86]:
dlt = dts[2] - dts[1]
dlt

Timedelta('1 days 00:00:00')

In [85]:
print(dts[2].year - dts[1].year)
print(dts[2].month - dts[1].month)
print(dts[2].day - dts[1].day)
print(dts[2].hour - dts[1].hour)
print(dts[2].minute - dts[1].minute)
print(dts[2].second - dts[1].second)

0
0
1
0
0
0


### pandasでの参照

In [8]:
# Ten random values indexed by consecutive days starting at 2000-01-01.
# NOTE(review): no random seed is set in the visible cells, so the values change on every run.
ts = pd.Series(np.random.randn(10), index=pd.date_range("1/1/2000", periods=10))
ts

2000-01-01    0.584602
2000-01-02   -0.068160
2000-01-03   -1.109544
2000-01-04   -0.871536
2000-01-05   -0.033493
2000-01-06   -0.244895
2000-01-07    1.172066
2000-01-08    0.493776
2000-01-09   -0.405087
2000-01-10    1.323217
Freq: D, dtype: float64

In [9]:
# A value can be looked up by integer position, by any date string pandas can
# parse, or by a datetime object.
# Position access goes through .iloc: plain ts[1] on a datetime-indexed Series
# relies on the integer-as-position fallback that pandas 2.1+ deprecates.
print(ts.iloc[1])
print(ts["2000-01-02"])
print(ts["01/02/2000"])
print(ts["20000102"])
print(ts[datetime(2000, 1, 2)])

-0.06815961138810982
-0.06815961138810982
-0.06815961138810982
-0.06815961138810982
-0.06815961138810982


In [10]:
# Partial-string indexing: a year or a year-month string selects every row in that period
print(ts["2000"])
print(ts["2000-01"])

2000-01-01    0.584602
2000-01-02   -0.068160
2000-01-03   -1.109544
2000-01-04   -0.871536
2000-01-05   -0.033493
2000-01-06   -0.244895
2000-01-07    1.172066
2000-01-08    0.493776
2000-01-09   -0.405087
2000-01-10    1.323217
Freq: D, dtype: float64
2000-01-01    0.584602
2000-01-02   -0.068160
2000-01-03   -1.109544
2000-01-04   -0.871536
2000-01-05   -0.033493
2000-01-06   -0.244895
2000-01-07    1.172066
2000-01-08    0.493776
2000-01-09   -0.405087
2000-01-10    1.323217
Freq: D, dtype: float64


In [11]:
# Slicing works with datetime objects or date strings; both endpoints are included.
# The start date gets its own name so that the `d` defined by the earlier
# datetime cells is not overwritten when this cell runs.
range_start = datetime(2000, 1, 4)
print(ts[range_start:range_start + timedelta(days=2)])
print(ts["2000-01-04":"2000-01-06"])

2000-01-04   -0.871536
2000-01-05   -0.033493
2000-01-06   -0.244895
Freq: D, dtype: float64
2000-01-04   -0.871536
2000-01-05   -0.033493
2000-01-06   -0.244895
Freq: D, dtype: float64


### 時系列データに重複がないことの確認
* インデックスに重複があってもエラーにはならないが、同じ日時で参照すると複数行が返るため、事前に確認しておく

In [12]:
ts.index.is_unique

True

### 時系列範囲（date_range）

In [13]:
# Specify the start date and the number of periods (the output shows a daily frequency)
pd.date_range("2000-01-01", periods=5)

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05'],
              dtype='datetime64[ns]', freq='D')

In [14]:
# Step in 2-day intervals
pd.date_range("2000-01-01", periods=5, freq="2d")

DatetimeIndex(['2000-01-01', '2000-01-03', '2000-01-05', '2000-01-07',
               '2000-01-09'],
              dtype='datetime64[ns]', freq='2D')

In [15]:
# Step in intervals of 1 hour 30 minutes (units can be combined in one frequency string)
pd.date_range("2000-01-01", periods=5, freq="1h30min")

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00',
               '2000-01-01 03:00:00', '2000-01-01 04:30:00',
               '2000-01-01 06:00:00'],
              dtype='datetime64[ns]', freq='90T')

In [16]:
# WOM-2MON is a week-of-month frequency: the second Monday of each month
pd.date_range("2000-01-01", periods=5, freq="WOM-2MON")

DatetimeIndex(['2000-01-10', '2000-02-14', '2000-03-13', '2000-04-10',
               '2000-05-08'],
              dtype='datetime64[ns]', freq='WOM-2MON')

### タイムゾーン

In [17]:
#pandasではタイムゾーンをpytzを使って取得している
import pytz

In [18]:
pytz.common_timezones

['Africa/Abidjan', 'Africa/Accra', 'Africa/Addis_Ababa', 'Africa/Algiers', 'Africa/Asmara', 'Africa/Bamako', 'Africa/Bangui', 'Africa/Banjul', 'Africa/Bissau', 'Africa/Blantyre', 'Africa/Brazzaville', 'Africa/Bujumbura', 'Africa/Cairo', 'Africa/Casablanca', 'Africa/Ceuta', 'Africa/Conakry', 'Africa/Dakar', 'Africa/Dar_es_Salaam', 'Africa/Djibouti', 'Africa/Douala', 'Africa/El_Aaiun', 'Africa/Freetown', 'Africa/Gaborone', 'Africa/Harare', 'Africa/Johannesburg', 'Africa/Juba', 'Africa/Kampala', 'Africa/Khartoum', 'Africa/Kigali', 'Africa/Kinshasa', 'Africa/Lagos', 'Africa/Libreville', 'Africa/Lome', 'Africa/Luanda', 'Africa/Lubumbashi', 'Africa/Lusaka', 'Africa/Malabo', 'Africa/Maputo', 'Africa/Maseru', 'Africa/Mbabane', 'Africa/Mogadishu', 'Africa/Monrovia', 'Africa/Nairobi', 'Africa/Ndjamena', 'Africa/Niamey', 'Africa/Nouakchott', 'Africa/Ouagadougou', 'Africa/Porto-Novo', 'Africa/Sao_Tome', 'Africa/Tripoli', 'Africa/Tunis', 'Africa/Windhoek', 'America/Adak', 'America/Anchorage', 'Amer

In [19]:
pytz.timezone("US/Hawaii")

<DstTzInfo 'US/Hawaii' LMT-1 day, 13:29:00 STD>

In [20]:
# No time zone is attached by default (the resulting index is tz-naive)
ts1= pd.date_range("1/1/2000 10:00", periods=10)
ts1

DatetimeIndex(['2000-01-01 10:00:00', '2000-01-02 10:00:00',
               '2000-01-03 10:00:00', '2000-01-04 10:00:00',
               '2000-01-05 10:00:00', '2000-01-06 10:00:00',
               '2000-01-07 10:00:00', '2000-01-08 10:00:00',
               '2000-01-09 10:00:00', '2000-01-10 10:00:00'],
              dtype='datetime64[ns]', freq='D')

In [21]:
# Attach the UTC time zone to the naive index.
# The localized index gets its own name so this cell can be re-run safely:
# calling tz_localize on an index that is already tz-aware raises TypeError.
ts1_utc = ts1.tz_localize("UTC")
ts1_utc

DatetimeIndex(['2000-01-01 10:00:00+00:00', '2000-01-02 10:00:00+00:00',
               '2000-01-03 10:00:00+00:00', '2000-01-04 10:00:00+00:00',
               '2000-01-05 10:00:00+00:00', '2000-01-06 10:00:00+00:00',
               '2000-01-07 10:00:00+00:00', '2000-01-08 10:00:00+00:00',
               '2000-01-09 10:00:00+00:00', '2000-01-10 10:00:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')

In [22]:
# Alternatively, specify the time zone explicitly when creating the range
ts2 = pd.date_range("1/1/2000 10:00", periods=10, tz="UTC")
ts2

DatetimeIndex(['2000-01-01 10:00:00+00:00', '2000-01-02 10:00:00+00:00',
               '2000-01-03 10:00:00+00:00', '2000-01-04 10:00:00+00:00',
               '2000-01-05 10:00:00+00:00', '2000-01-06 10:00:00+00:00',
               '2000-01-07 10:00:00+00:00', '2000-01-08 10:00:00+00:00',
               '2000-01-09 10:00:00+00:00', '2000-01-10 10:00:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')

In [23]:
# Convert to a different time zone (same instants, shown in US/Hawaii local time)
ts2.tz_convert("US/Hawaii")

DatetimeIndex(['2000-01-01 00:00:00-10:00', '2000-01-02 00:00:00-10:00',
               '2000-01-03 00:00:00-10:00', '2000-01-04 00:00:00-10:00',
               '2000-01-05 00:00:00-10:00', '2000-01-06 00:00:00-10:00',
               '2000-01-07 00:00:00-10:00', '2000-01-08 00:00:00-10:00',
               '2000-01-09 00:00:00-10:00', '2000-01-10 00:00:00-10:00'],
              dtype='datetime64[ns, US/Hawaii]', freq='D')

### 期間（Period, PeriodIndex）

In [24]:
pd.Period("2000", "A-DEC")

Period('2000', 'A-DEC')

In [25]:
index = pd.PeriodIndex(["2000"], freq="A-DEC")
index

PeriodIndex(['2000'], dtype='period[A-DEC]', freq='A-DEC')

In [26]:
rng = pd.period_range("2000", "2001", freq="Q-DEC")
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts

2000Q1    1.723710
2000Q2   -0.851864
2000Q3   -0.883801
2000Q4   -0.481505
2001Q1    0.190178
Freq: Q-DEC, dtype: float64

In [27]:
ts.asfreq("M", how="start")

2000-01    1.723710
2000-04   -0.851864
2000-07   -0.883801
2000-10   -0.481505
2001-01    0.190178
Freq: M, dtype: float64

### 再サンプル

#### ダウンサンプリング

In [28]:
# 100 consecutive daily timestamps from 2000-01-01, numbered 1..100.
rng = pd.date_range(start="2000/1/1", periods=100)
ts = pd.Series(data=range(1, rng.size + 1), index=rng)
ts.head()

2000-01-01    1
2000-01-02    2
2000-01-03    3
2000-01-04    4
2000-01-05    5
Freq: D, dtype: int64

In [29]:
ts.resample("M").mean()

2000-01-31    16
2000-02-29    46
2000-03-31    76
2000-04-30    96
Freq: M, dtype: int64

In [30]:
ts.resample("M", kind="period").mean()

2000-01    16
2000-02    46
2000-03    76
2000-04    96
Freq: M, dtype: int64

In [31]:
# Twelve one-minute timestamps starting at midnight, numbered 1..12.
rng = pd.date_range(start="2000/1/1 0:0", periods=12, freq="1min")
ts = pd.Series(data=range(1, rng.size + 1), index=rng)
ts

2000-01-01 00:00:00     1
2000-01-01 00:01:00     2
2000-01-01 00:02:00     3
2000-01-01 00:03:00     4
2000-01-01 00:04:00     5
2000-01-01 00:05:00     6
2000-01-01 00:06:00     7
2000-01-01 00:07:00     8
2000-01-01 00:08:00     9
2000-01-01 00:09:00    10
2000-01-01 00:10:00    11
2000-01-01 00:11:00    12
Freq: T, dtype: int64

In [32]:
ts.resample("5min").sum()

2000-01-01 00:00:00    15
2000-01-01 00:05:00    40
2000-01-01 00:10:00    23
Freq: 5T, dtype: int64

In [33]:
ts.resample("5min", label="right").sum()

2000-01-01 00:05:00    15
2000-01-01 00:10:00    40
2000-01-01 00:15:00    23
Freq: 5T, dtype: int64

In [34]:
ts.resample("5min", closed="left", label="right").sum()

2000-01-01 00:05:00    15
2000-01-01 00:10:00    40
2000-01-01 00:15:00    23
Freq: 5T, dtype: int64

In [35]:
ts.resample("5min", closed="right", label="right").sum()

2000-01-01 00:00:00     1
2000-01-01 00:05:00    20
2000-01-01 00:10:00    45
2000-01-01 00:15:00    12
Freq: 5T, dtype: int64

In [36]:
ts.resample("5min", closed="right", label="right").ohlc()

Unnamed: 0,open,high,low,close
2000-01-01 00:00:00,1,1,1,1
2000-01-01 00:05:00,2,6,2,6
2000-01-01 00:10:00,7,11,7,11
2000-01-01 00:15:00,12,12,12,12


#### アップサンプリング

In [37]:
# Two weekly timestamps anchored on Mondays, holding the values 1 and 2.
rng = pd.date_range(start="2000-01-01", periods=2, freq="W-MON")
ts = pd.Series(data=[1, 2], index=rng)
ts

2000-01-03    1
2000-01-10    2
Freq: W-MON, dtype: int64

In [38]:
ts.asfreq("D")

2000-01-03    1.0
2000-01-04    NaN
2000-01-05    NaN
2000-01-06    NaN
2000-01-07    NaN
2000-01-08    NaN
2000-01-09    NaN
2000-01-10    2.0
Freq: D, dtype: float64

In [39]:
ts.asfreq("D", fill_value=1.5)

2000-01-03    1.0
2000-01-04    1.5
2000-01-05    1.5
2000-01-06    1.5
2000-01-07    1.5
2000-01-08    1.5
2000-01-09    1.5
2000-01-10    2.0
Freq: D, dtype: float64

In [40]:
ts.resample("D").ffill()

2000-01-03    1
2000-01-04    1
2000-01-05    1
2000-01-06    1
2000-01-07    1
2000-01-08    1
2000-01-09    1
2000-01-10    2
Freq: D, dtype: int64