In [2]:
import pandas as pd
import numpy as np

# Raw date strings: day-month-year followed by a 12-hour clock time.
date_string = np.array(['03-04-2005 11:35 PM',
                        '23-05-2010 12:01 AM',
                        '01-09-2004 09:09 PM'])

# Parse the whole array in one vectorized call instead of looping in Python;
# list(...) keeps the displayed result a plain list of Timestamp objects.
list(pd.to_datetime(date_string, format="%d-%m-%Y %I:%M %p"))

[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2010-05-23 00:01:00'),
 Timestamp('2004-09-01 21:09:00')]

In [3]:
# Pass errors='coerce' so that a string which cannot be parsed does not raise;
# such a value is set to NaT instead. The array is parsed in one vectorized
# call and wrapped in list(...) so the result is still a list of Timestamps.
list(pd.to_datetime(date_string, format="%d-%m-%Y %I:%M %p", errors='coerce'))

[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2010-05-23 00:01:00'),
 Timestamp('2004-09-01 21:09:00')]

In [4]:
# Working with time zones
import pandas as pd

# Build a timestamp that carries its time zone from the start
pd.Timestamp("2017-05-01 06:00:00", tz="Europe/London")


Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')

In [5]:
# Start from a timestamp created without any time zone
date = pd.Timestamp("2017-05-01 06:00:00")
# Attach a time zone to it
date_in_london = date.tz_localize(tz="Europe/London")

date_in_london

Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')

In [9]:
# Convert the timestamp to a different time zone
date_in_london.tz_convert(tz="Africa/Abidjan")

Timestamp('2017-05-01 05:00:00+0000', tz='Africa/Abidjan')

In [11]:
# Selecting dates and times
import pandas as pd

# Create the data frame
dataframe = pd.DataFrame()

# Fill it with 100,000 consecutive hourly timestamps. The lowercase 'h'
# alias is used because the uppercase 'H' spelling is deprecated in
# recent pandas releases.
dataframe['date'] = pd.date_range('1/1/2001', periods=100000, freq='h')

# Keep the observations later than 01:00 and up to and including 04:00
# on 2002-01-01.
dataframe[(dataframe['date'] > '2002-1-1 01:00:00') &
          (dataframe['date'] <= '2002-1-1 04:00:00')]

Unnamed: 0,date
8762,2002-01-01 02:00:00
8763,2002-01-01 03:00:00
8764,2002-01-01 04:00:00


In [12]:
# Use the date column as the index (the column itself is kept as well),
# then select rows with a label slice; both end labels are included.
dataframe = dataframe.set_index('date', drop=False)

dataframe.loc['2002-1-1 01:00:00':'2002-1-1 04:00:00']

Unnamed: 0_level_0,date
date,Unnamed: 1_level_1
2002-01-01 01:00:00,2002-01-01 01:00:00
2002-01-01 02:00:00,2002-01-01 02:00:00
2002-01-01 03:00:00,2002-01-01 03:00:00
2002-01-01 04:00:00,2002-01-01 04:00:00


In [15]:
# Split a date into several features

# Create 150 weekly dates, then derive one column per date/time component
dataframe = pd.DataFrame(
    {'date': pd.date_range('1/1/2001', periods=150, freq='W')}
).assign(
    year=lambda frame: frame['date'].dt.year,
    month=lambda frame: frame['date'].dt.month,
    day=lambda frame: frame['date'].dt.day,
    hour=lambda frame: frame['date'].dt.hour,
    minute=lambda frame: frame['date'].dt.minute,
)

dataframe.head()


Unnamed: 0,date,year,month,day,hour,minute
0,2001-01-07,2001,1,7,0,0
1,2001-01-14,2001,1,14,0,0
2,2001-01-21,2001,1,21,0,0
3,2001-01-28,2001,1,28,0,0
4,2001-02-04,2001,2,4,0,0


In [17]:
# Compute the time difference between two dates

# Two datetime features, built in a single constructor call
dataframe = pd.DataFrame({
    'Arrived': [pd.Timestamp('01/01/2001'), pd.Timestamp('01-04-2017')],
    'left': [pd.Timestamp('01/01/2017'), pd.Timestamp('01-06-2017')],
})

# Elapsed time between the two features
dataframe['left'].sub(dataframe['Arrived'])

0   5844 days
1      2 days
dtype: timedelta64[ns]

In [18]:
# Drop the "days" unit and keep only the integer number of days.
# The vectorized .dt.days accessor replaces a Python-level loop over the
# individual Timedelta values.
(dataframe['left'] - dataframe['Arrived']).dt.days

0    5844
1       2
dtype: int64

In [19]:
# Encode the day of the week

# NOTE: pandas >= 2.2 deprecates the "M" alias in favour of "ME"; "M" is
# kept here so the cell still runs on older pandas.
dates = pd.Series(pd.date_range('2/2/2002', periods=3, freq="M"))
# Show the weekday name of each date. Series.dt.weekday_name was removed
# in pandas 1.0; day_name() is its replacement.
dates.dt.day_name()

0    Thursday
1      Sunday
2     Tuesday
dtype: object

In [20]:
# Show the weekday as a number instead (Monday is 0)
dates.dt.dayofweek

0    3
1    6
2    1
dtype: int64

In [1]:
# Create a lagged feature
import pandas as pd

# Five daily dates with one price per day
dataframe = pd.DataFrame({
    'dates': pd.date_range('1/1/2001', periods=5, freq="D"),
    'stock_price': [1.1, 2.2, 3.3, 4.4, 5.5],
})
# Shift the prices down by one row so each row also holds the previous value
dataframe['previous_days_stock_price'] = dataframe['stock_price'].shift(periods=1)

dataframe

Unnamed: 0,dates,stock_price,previous_days_stock_price
0,2001-01-01,1.1,
1,2001-01-02,2.2,1.1
2,2001-01-03,3.3,2.2
3,2001-01-04,4.4,3.3
4,2001-01-05,5.5,4.4


In [2]:
# Use a rolling time window

# NOTE: pandas >= 2.2 deprecates the "M" alias in favour of "ME"; "M" is
# kept here so the cell still runs on older pandas.
time_index = pd.date_range("01/01/2010", periods=5, freq="M")

# One price per date in the index
dataframe = pd.DataFrame({'Stock_Price': [1, 2, 3, 4, 5]}, index=time_index)

# Mean over a window of two consecutive rows
dataframe.rolling(2).mean()

Unnamed: 0,Stock_Price
2010-01-31,
2010-02-28,1.5
2010-03-31,2.5
2010-04-30,3.5
2010-05-31,4.5


In [3]:
# Same rolling mean, this time over a window of three rows
dataframe.rolling(3).mean()

Unnamed: 0,Stock_Price
2010-01-31,
2010-02-28,
2010-03-31,2.0
2010-04-30,3.0
2010-05-31,4.0


In [4]:
# Handle missing values in a time series
import numpy as np
# NOTE: pandas >= 2.2 deprecates the 'M' alias in favour of 'ME'; 'M' is
# kept here so the cell still runs on older pandas.
time_index = pd.date_range('01/01/2010', periods=5, freq='M')

# Sales figures with a gap of two missing values in the middle
dataframe = pd.DataFrame({'Sales': [1.0, 2.0, np.nan, np.nan, 5.0]},
                         index=time_index)

dataframe

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,
2010-04-30,
2010-05-31,5.0


In [5]:
# Fill the missing values by interpolating between the known ones
# (linear is the default method; it is spelled out here for clarity)
dataframe.interpolate(method='linear')

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.0
2010-04-30,4.0
2010-05-31,5.0


In [6]:
# Forward fill: each missing value takes the last known value before it
dataframe.ffill()

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,2.0
2010-04-30,2.0
2010-05-31,5.0


In [7]:
# Backward fill: each missing value takes the next known value after it
dataframe.bfill()

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,5.0
2010-04-30,5.0
2010-05-31,5.0
