# 特征工程

# 导入包

In [1]:
import pandas as pd 
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder

# 导入数据

In [2]:
# CSV that is loaded into `data` with pd.read_csv in a later cell.
inputfile = '../datasets/data_processed.csv'
# CSV that the feature-engineered frame is written to at the end of the notebook.
outputfile = '../datasets/data_feature_processed.csv'

In [24]:
# Load the input CSV; per the recorded data.info() output it has the columns
# time_stamp (object), loc_id (int64) and num_of_people (float64).
data = pd.read_csv(inputfile)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218592 entries, 0 to 218591
Data columns (total 3 columns):
time_stamp       218592 non-null object
loc_id           218592 non-null int64
num_of_people    218592 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 5.0+ MB


In [8]:
data.head()

Unnamed: 0,time_stamp,loc_id,num_of_people
0,2017-01-01 00:00:00,1,120.0
1,2017-01-01 01:00:00,1,143.0
2,2017-01-01 02:00:00,1,87.0
3,2017-01-01 03:00:00,1,90.0
4,2017-01-01 04:00:00,1,49.0


# 特征工程

In [25]:
# Work on a copy so the frame loaded into `data` stays untouched.
df = data.copy()

## 1. 增加历史特征和季节性特征

In [26]:
%time df = feature_engineering(df)

Wall time: 23.5 s


In [18]:
df[(df.loc_id==2)&(df.time_stamp>='2017-10-01')]

Unnamed: 0,time_stamp,loc_id,num_of_people,month_of_year,day_of_week,week_of_year,history_1h_population,history_2h_population,history_3h_population,history_4h_population,...,history_21d_population,history_22d_population,history_23d_population,history_24d_population,history_25d_population,history_26d_population,history_27d_population,history_28d_population,history_29d_population,history_30d_population
12504,2017-10-01 00:00:00,2,497.0,10,6,39,335.0,516.0,407.0,368.0,...,226.0,170.0,174.0,210.0,119.0,209.0,120.0,284.0,222.0,244.0
12505,2017-10-01 01:00:00,2,389.0,10,6,39,497.0,335.0,516.0,407.0,...,151.0,183.0,142.0,117.0,97.0,133.0,167.0,208.0,162.0,174.0
12506,2017-10-01 02:00:00,2,305.0,10,6,39,389.0,497.0,335.0,516.0,...,92.0,147.0,154.0,144.0,131.0,108.0,165.0,215.0,192.0,300.0
12507,2017-10-01 03:00:00,2,215.0,10,6,39,305.0,389.0,497.0,335.0,...,86.0,131.0,138.0,121.0,98.0,133.0,133.0,196.0,201.0,189.0
12508,2017-10-01 04:00:00,2,249.0,10,6,39,215.0,305.0,389.0,497.0,...,124.0,135.0,148.0,109.0,130.0,112.0,141.0,172.0,250.0,151.0
12509,2017-10-01 05:00:00,2,127.0,10,6,39,249.0,215.0,305.0,389.0,...,126.0,140.0,127.0,130.0,136.0,121.0,147.0,118.0,237.0,333.0
12510,2017-10-01 06:00:00,2,174.0,10,6,39,127.0,249.0,215.0,305.0,...,127.0,200.0,198.0,177.0,201.0,201.0,245.0,187.0,303.0,476.0
12511,2017-10-01 07:00:00,2,281.0,10,6,39,174.0,127.0,249.0,215.0,...,303.0,318.0,519.0,491.0,519.0,532.0,679.0,325.0,407.0,399.0
12512,2017-10-01 08:00:00,2,448.0,10,6,39,281.0,174.0,127.0,249.0,...,406.0,415.0,877.0,855.0,694.0,855.0,762.0,504.0,521.0,467.0
12513,2017-10-01 09:00:00,2,558.0,10,6,39,448.0,281.0,174.0,127.0,...,368.0,497.0,468.0,459.0,423.0,403.0,406.0,381.0,453.0,455.0


## 2. one-hot编码loc_id 

In [27]:
# One-hot encode loc_id: one indicator column per distinct location id, named
# after the id itself, joined onto the frame by index.
# dtype is pinned to uint8 (the default before pandas 2.0) so the indicators
# stay 0/1; pandas >= 2.0 would otherwise emit booleans and the exported CSV
# would contain True/False.
df = df.join(pd.get_dummies(df.loc_id, dtype='uint8'))

# 删除数据

In [28]:
# Building the 24-hour history features requires dropping the first day of data
# for the 33 locations, i.e. 2017-01-01.
# Building the one-month history features requires dropping the first month of data.
# Both are covered by keeping only rows from 2017-02-01 onwards.
df.sort_values(by=['time_stamp','loc_id'],inplace=True)
df = df[df.time_stamp>='2017-02-01'] 

In [29]:
# Count missing values per column and display only the columns that have any.
# The recorded output is an empty Series, i.e. no column contains nulls here.
nulldata = df.isnull().sum()
nulldata[nulldata>0]

Series([], dtype: int64)

In [22]:
df.iloc[:,-34:].head()

Unnamed: 0,history_30d_population,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
744,96.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7368,110.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13992,29.0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20616,49.0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27240,66.0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 导出数据

In [30]:
# Write the engineered frame to `outputfile`; index=False leaves the row index out of the CSV.
df.to_csv(outputfile,index=False)

# 以下是模型用到的公共函数（这些单元格需要先于上面调用 `feature_engineering` 的单元格执行）

In [11]:
# Matches any run of whitespace (spaces, tabs, newlines). Written as a raw
# string so the backslash escape reaches the regex engine unchanged; \s already
# covers tab and newline, so no separate alternatives are needed.
reg = re.compile(r'\s+')
# Column names for the hourly lag features: 1h through 23h back
# (23 columns, despite the "24h" in the variable names).
col_24h='''history_1h_population,history_2h_population,history_3h_population,history_4h_population
    ,history_5h_population,history_6h_population,history_7h_population,history_8h_population
    ,history_9h_population,history_10h_population,history_11h_population,history_12h_population
    ,history_13h_population,history_14h_population,history_15h_population,history_16h_population,history_17h_population
    ,history_18h_population,history_19h_population,history_20h_population,history_21h_population,history_22h_population
    ,history_23h_population'''
# Strip the layout whitespace from the literal, then split into a list of names.
columns_24h=reg.sub('',col_24h).split(',')
# Column names for the daily lag features: 1d through 30d back (30 columns).
col_30d='''history_1d_population,history_2d_population,history_3d_population,
    history_4d_population,history_5d_population,history_6d_population,
    history_7d_population,history_8d_population,history_9d_population,history_10d_population,history_11d_population,
    history_12d_population,history_13d_population,history_14d_population,history_15d_population,history_16d_population
    ,history_17d_population,history_18d_population,history_19d_population,history_20d_population,history_21d_population
    ,history_22d_population,history_23d_population,history_24d_population,history_25d_population,history_26d_population
    ,history_27d_population,history_28d_population,history_29d_population,history_30d_population'''
columns_30d=reg.sub('',col_30d).split(',')

In [12]:
def feature_engineering(data):
    """Run the full feature pipeline and return the enriched frame.

    A copy of ``data`` is passed through ``seasonal_feature_engineering`` and
    the result through ``history_features``; the caller's frame is not
    modified.
    """
    with_seasonal = seasonal_feature_engineering(data.copy())
    return history_features(with_seasonal)

In [13]:
# Rows are processed one location at a time so that the shift operations never
# carry values across locations: the tail of location 1's series must not be
# shifted into the head of location 2's.
def history_features(data):
    """Add the hourly and daily lag columns to every location's series.

    The input is sorted by ``loc_id`` then ``time_stamp`` (on a copy), split
    by ``loc_id``, and each part is passed through ``history_24h_population``
    and ``history_30d_population`` with the module-level ``columns_24h`` /
    ``columns_30d`` name lists. The parts are concatenated back in ascending
    ``loc_id`` order, keeping the original row index labels.

    Every distinct ``loc_id`` present in ``data`` is processed; the earlier
    implementation only looked at ids 1..33 and silently dropped anything
    else. For data whose ids are exactly 1..33 the result is unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``loc_id``, ``time_stamp`` and ``num_of_people``.

    Returns
    -------
    pandas.DataFrame
        New frame with the lag columns appended; ``data`` is not modified.
    """
    def _add_history(frame):
        # Hourly lags first, then daily lags, matching the original column order.
        frame = history_24h_population(frame,columns_24h)
        return history_30d_population(frame,columns_30d)

    df = data.sort_values(by=['loc_id','time_stamp'])
    # groupby(sort=True) yields the groups in ascending loc_id order.
    per_location = [_add_history(group) for _, group in df.groupby('loc_id', sort=True)]
    if not per_location:
        # No rows at all: pd.concat rejects an empty list, so just attach the
        # (empty) lag columns to the empty frame.
        return _add_history(df)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat(per_location)

In [14]:
def history_24h_population(data,features):
    """Add row-lagged copies of ``num_of_people`` as new columns.

    The k-th name in ``features`` (counting from 1) receives
    ``num_of_people`` shifted down by k rows, so the first k rows of that
    column are NaN. The shift is positional: it equals "k hours earlier" only
    when ``data`` holds one row per hour in chronological order for a single
    location.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``num_of_people`` column.
    features : sequence of str
        Names of the lag columns to create, nearest lag first.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the lag columns added; ``data`` is not modified.
    """
    df = data.copy()
    population = df['num_of_people']
    # No NaN pre-fill is needed: each column is assigned in full by the shift.
    for lag, column in enumerate(features, start=1):
        df[column] = population.shift(lag)
    return df

In [15]:
def history_30d_population(data,features,rows_per_day=24):
    """Add day-lagged copies of ``num_of_people`` as new columns.

    The k-th name in ``features`` (counting from 1) receives
    ``num_of_people`` shifted down by ``k * rows_per_day`` rows; the leading
    rows of each column are NaN. The shift is positional: it equals "the same
    time k days earlier" only when ``data`` holds exactly ``rows_per_day``
    evenly spaced rows per day, in chronological order, for a single location.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``num_of_people`` column.
    features : sequence of str
        Names of the lag columns to create, nearest day first.
    rows_per_day : int, optional
        Number of rows that make up one day. Defaults to 24 (hourly data),
        which is the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the lag columns added; ``data`` is not modified.
    """
    df = data.copy()
    population = df['num_of_people']
    # No NaN pre-fill is needed: each column is assigned in full by the shift.
    for days_back, column in enumerate(features, start=1):
        df[column] = population.shift(days_back * rows_per_day)
    return df

In [16]:
def seasonal_feature_engineering(data):
    """Parse ``time_stamp`` and add calendar features derived from it.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``time_stamp`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` in which ``time_stamp`` is converted to datetimes and
        three columns are added:

        * ``month_of_year`` - calendar month, 1-12
        * ``day_of_week``   - weekday number, Monday=0 ... Sunday=6
        * ``week_of_year``  - ISO 8601 week number, 1-53 (it does not start at 0)

        ``data`` itself is not modified.
    """
    df = data.copy()
    stamps = pd.to_datetime(df['time_stamp'])
    df['time_stamp'] = stamps
    # The vectorized accessors return int32/UInt32 on recent pandas. Cast to
    # what the previous per-row extraction produced: int64 normally, float64
    # (with NaN) when some timestamps are missing.
    out_dtype = 'float64' if stamps.isna().any() else 'int64'
    df['month_of_year'] = stamps.dt.month.astype(out_dtype)
    df['day_of_week'] = stamps.dt.weekday.astype(out_dtype)
    df['week_of_year'] = stamps.dt.isocalendar().week.astype(out_dtype)
    return df