In [1]:
# 时间序列基础
import numpy as np
import pandas as pd
from datetime import datetime
from datetime import timedelta
from dateutil.parser import parse
from pandas import DataFrame, Series

In [2]:
dates = [datetime(2011, 1, 2),
         datetime(2011, 1, 5),
         datetime(2011, 1, 7),
         datetime(2011, 1, 8),
         datetime(2011, 1, 10),
         datetime(2011, 1, 12)]
ts = Series(np.random.randn(6), index=dates)
print(type(ts)) # 注意，和书上说的TimeSeries不一样。
print(type(ts.index))
ts

<class 'pandas.core.series.Series'>
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>


2011-01-02    0.772477
2011-01-05    1.753706
2011-01-07    0.687584
2011-01-08    1.050631
2011-01-10   -1.166636
2011-01-12   -1.201186
dtype: float64

In [3]:
ts + ts[::2] # 无法对齐的地方自动填充NA

2011-01-02    1.544953
2011-01-05         NaN
2011-01-07    1.375168
2011-01-08         NaN
2011-01-10   -2.333272
2011-01-12         NaN
dtype: float64

In [4]:
ts.index.dtype # 以ns为单位保存时间戳

dtype('<M8[ns]')

In [5]:
stamp = ts.index[0]
stamp

Timestamp('2011-01-02 00:00:00')

In [6]:
# 索引、选取、子集构造

In [7]:
ts['1/10/2011']

-1.1666361345741005

In [8]:
ts['20110110']

-1.1666361345741005

In [9]:
longer_ts = Series(np.random.randn(1000),
                   index=pd.date_range('1/1/2000', periods=1000)) # 连续1000天的数据
print(longer_ts.head())
print(longer_ts.tail())

2000-01-01    0.535422
2000-01-02    0.218639
2000-01-03   -0.798365
2000-01-04    1.438230
2000-01-05    1.365312
Freq: D, dtype: float64
2002-09-22   -0.180060
2002-09-23   -0.270658
2002-09-24   -0.963958
2002-09-25   -0.499083
2002-09-26    0.598380
Freq: D, dtype: float64


In [10]:
longer_ts['2001'].head() # 直接选年份

2001-01-01    0.562310
2001-01-02    0.256819
2001-01-03   -1.480738
2001-01-04   -1.004320
2001-01-05    0.318483
Freq: D, dtype: float64

In [11]:
longer_ts['2001-05'].head() # 年 + 月

2001-05-01    0.871160
2001-05-02    0.664891
2001-05-03   -0.472416
2001-05-04   -0.626066
2001-05-05   -0.594843
Freq: D, dtype: float64

In [12]:
dates = [datetime(2011, 1, 2),
         datetime(2011, 1, 5),
         datetime(2011, 1, 7),
         datetime(2011, 1, 8),
         datetime(2011, 1, 10),
         datetime(2011, 1, 12)] # 重新构造一遍，便于查询数据
ts = Series(np.random.randn(6), index=dates)
ts[datetime(2011, 1, 7):] # 为什么切片只返回4天？
ts

2011-01-02   -1.027031
2011-01-05   -0.055124
2011-01-07   -1.830234
2011-01-08    2.434101
2011-01-10    0.404510
2011-01-12   -0.593850
dtype: float64

In [13]:
ts['1/6/2011':'1/20/2011'] # 1/6和1/20不存在没关系，自动会查。

2011-01-07   -1.830234
2011-01-08    2.434101
2011-01-10    0.404510
2011-01-12   -0.593850
dtype: float64

In [14]:
ts.truncate(after='1/8/2011') # 最远到2011/1/8

2011-01-02   -1.027031
2011-01-05   -0.055124
2011-01-07   -1.830234
2011-01-08    2.434101
dtype: float64

In [15]:
dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')
long_df = DataFrame(np.random.randn(100, 4),
                    index=dates,
                    columns=['Colorado', 'Texas', 'New York', 'Ohio'])
long_df.head()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-0.341666,-1.23155,-1.016394,-1.301498
2000-01-12,0.744188,0.186536,-1.014354,-0.497517
2000-01-19,0.273858,-2.213061,1.391597,1.384184
2000-01-26,0.859288,0.995682,-0.511652,-0.617411
2000-02-02,0.019173,0.207258,-0.474774,-0.428453


In [16]:
long_df.loc['5-2001'] # 2001年5月

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,1.26719,-0.556569,-0.722254,-2.236654
2001-05-09,-0.685621,1.460324,0.942097,0.950523
2001-05-16,1.059505,-1.452266,1.687294,0.636464
2001-05-23,-0.194557,-0.142167,1.460389,-0.28487
2001-05-30,-1.556288,-2.129977,0.125112,-1.470031


In [17]:
# 带有重复索引的时间序列

In [18]:
dates = pd.DatetimeIndex(['1/1/2000',
                          '1/2/2000',
                          '1/2/2000',
                          '1/2/2000',
                          '1/3/2000'])
dup_ts = Series(np.arange(5), index=dates)

In [19]:
dup_ts.index.is_unique

False

In [20]:
dup_ts['1/3/2000'] # 不重复

4

In [21]:
dup_ts['1/2/2000'] # 重复

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int32

In [22]:
grouped = dup_ts.groupby(level=0)
print(grouped.mean())

2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int32


In [23]:
grouped.count()

2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64