# 特征工程

# 导入包

In [1]:
import pandas as pd 
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder

# 导入数据

In [41]:
# CSV paths, relative to the notebook's working directory:
# `inputfile` is loaded with pd.read_csv, `outputfile` receives the final feature table.
inputfile = '../datasets/data_processed.csv'
outputfile = '../datasets/data_feature_processed.csv'

In [57]:
# Load the input CSV into a DataFrame.
data = pd.read_csv(inputfile)

In [43]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234432 entries, 0 to 234431
Data columns (total 3 columns):
time_stamp       234432 non-null object
loc_id           234432 non-null int64
num_of_people    234432 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 5.4+ MB


In [44]:
# Display the first rows of the loaded frame.
data.head()

Unnamed: 0,time_stamp,loc_id,num_of_people
0,2017-01-01 00:00:00,1,0.0
1,2017-01-01 01:00:00,1,0.0
2,2017-01-01 02:00:00,1,0.0
3,2017-01-01 03:00:00,1,0.0
4,2017-01-01 04:00:00,1,0.0


# 特征工程

In [51]:
# Work on a copy so `data` keeps the rows exactly as loaded.
df = data.copy()

## 1. 增加历史特征和季节性特征

In [52]:
# Build the calendar and lag features; the IPython %time magic reports how long the call took.
%time df = feature_engineering(df)

Wall time: 8.91 s


## 2. one-hot编码loc_id 

In [53]:
# One-hot encode loc_id: get_dummies produces one indicator column per distinct
# loc_id value (the column labels are the loc_id values themselves), and join
# attaches those columns to df by index. The original loc_id column is kept.
df = df.join(pd.get_dummies(df.loc_id))

# 删除数据

In [54]:
# The hourly-lag features need the preceding day, so the first day (2017-01-01)
# of each of the 33 locations has to be dropped.
# The daily-lag features need the preceding month, so the first month has to be
# dropped as well; keeping rows from 2017-02-01 onward covers both.
df = df[df.time_stamp>='2017-02-01'] 

In [55]:
# Count missing values per column, then show only the columns that still have any
# (an empty Series means no missing values are left).
nulldata = df.isnull().sum()
nulldata[nulldata>0]

Series([], dtype: int64)

## 导出数据

In [56]:
# Write the feature table to `outputfile`; index=False leaves the row index out of the CSV.
df.to_csv(outputfile,index=False)

# 以下是模型用到的公共函数

In [8]:
# Matches any run of whitespace (spaces, tabs, newlines) so that the multi-line
# column-name literals below can be collapsed before splitting on commas.
# A raw string is required: '\s' inside a normal string literal is an invalid
# escape sequence (SyntaxWarning on Python 3.12+). '\s+' already covers tabs
# and newlines, so the former '|\t|\n' alternatives were redundant.
reg = re.compile(r'\s+')
# Column names for the hourly lags of num_of_people: 1 to 23 hours back.
col_24h='''history_1h_population,history_2h_population,history_3h_population,history_4h_population
    ,history_5h_population,history_6h_population,history_7h_population,history_8h_population
    ,history_9h_population,history_10h_population,history_11h_population,history_12h_population
    ,history_13h_population,history_14h_population,history_15h_population,history_16h_population,history_17h_population
    ,history_18h_population,history_19h_population,history_20h_population,history_21h_population,history_22h_population
    ,history_23h_population'''
columns_24h = reg.sub('', col_24h).split(',')
# Column names for the daily lags of num_of_people: 1 to 30 days back
# (an earlier comment said 15 days, but 30 names are listed and used).
col_30d='''history_1d_population,history_2d_population,history_3d_population,
    history_4d_population,history_5d_population,history_6d_population,
    history_7d_population,history_8d_population,history_9d_population,history_10d_population,history_11d_population,
    history_12d_population,history_13d_population,history_14d_population,history_15d_population,history_16d_population
    ,history_17d_population,history_18d_population,history_19d_population,history_20d_population,history_21d_population
    ,history_22d_population,history_23d_population,history_24d_population,history_25d_population,history_26d_population
    ,history_27d_population,history_28d_population,history_29d_population,history_30d_population'''
columns_30d = reg.sub('', col_30d).split(',')

In [9]:
def feature_engineering(data):
    """Run the full feature pipeline on a copy of ``data``.

    The calendar features are added first (``seasonal_feature_engineering``),
    then the lagged-population features (``history_features``).

    Args:
        data: Input DataFrame; it is copied before any processing.

    Returns:
        The DataFrame produced by ``history_features``.
    """
    working = data.copy()
    with_calendar = seasonal_feature_engineering(working)
    return history_features(with_calendar)

In [10]:
def history_features(data):
    """Add the lagged-population columns, computed separately for each location.

    Rows are sorted by ``loc_id`` then ``time_stamp`` and split by ``loc_id``
    before any shifting, so a lag never picks up a value from another location
    (for example, the tail of location 1 is never shifted into the head of
    location 2).

    Every distinct ``loc_id`` present in ``data`` is processed. Earlier versions
    only handled ids 1..33 and silently dropped any other id; for data that
    contains exactly those ids the result is the same.

    Args:
        data: DataFrame with ``loc_id``, ``time_stamp`` and ``num_of_people``
            columns. It is not modified.

    Returns:
        A new DataFrame ordered by ``loc_id`` then ``time_stamp``, keeping the
        original index labels, with the columns named in ``columns_24h`` and
        ``columns_30d`` appended.
    """
    ordered = data.sort_values(by=['loc_id', 'time_stamp'])
    # groupby iterates groups in ascending loc_id and keeps the row order inside
    # each group, so concatenating the pieces preserves the sorted order.
    pieces = [
        history_30d_population(history_24h_population(group, columns_24h), columns_30d)
        for _, group in ordered.groupby('loc_id', sort=True)
    ]
    if not pieces:
        # Nothing to group: return an empty frame that still carries the lag
        # columns (pd.concat raises on an empty list).
        empty = ordered.iloc[:0]
        return history_30d_population(history_24h_population(empty, columns_24h), columns_30d)
    # One concat instead of repeated DataFrame.append, which was removed in
    # pandas 2.0 and re-copied the accumulated frame on every iteration.
    return pd.concat(pieces)

In [11]:
def history_24h_population(data, features):
    """Append row-lagged copies of ``num_of_people``, one column per name.

    ``features[k]`` receives ``num_of_people`` shifted down by ``k + 1`` rows,
    so its first ``k + 1`` rows are NaN. With one row per hour in time order
    (assumed, not checked here) that is the population ``k + 1`` hours earlier.

    Args:
        data: DataFrame with a ``num_of_people`` column. It is not modified.
        features: Names of the columns to create, in increasing lag order.

    Returns:
        A copy of ``data`` with the lag columns added after the existing ones.
    """
    df = data.copy()
    population = df['num_of_people']
    # No NaN pre-fill pass is needed: every column is assigned here, and
    # shift() already leaves NaN in the rows that have no history.
    for lag, name in enumerate(features, start=1):
        df[name] = population.shift(lag)
    return df

In [12]:
def history_30d_population(data, features, rows_per_day=24):
    """Append day-lagged copies of ``num_of_people``, one column per name.

    ``features[k]`` receives ``num_of_people`` shifted down by
    ``(k + 1) * rows_per_day`` rows; rows without that much history are NaN.
    With one row per hour in time order (assumed, not checked here) and the
    default ``rows_per_day=24``, that is the population at the same hour
    ``k + 1`` days earlier.

    Args:
        data: DataFrame with a ``num_of_people`` column. It is not modified.
        features: Names of the columns to create, in increasing lag order.
        rows_per_day: Number of rows that make up one day. Defaults to 24,
            the value that used to be hard-coded.

    Returns:
        A copy of ``data`` with the lag columns added after the existing ones.
    """
    df = data.copy()
    population = df['num_of_people']
    # No NaN pre-fill pass is needed: every column is assigned here, and
    # shift() already leaves NaN in the rows that have no history.
    for days_back, name in enumerate(features, start=1):
        df[name] = population.shift(days_back * rows_per_day)
    return df

In [50]:
def seasonal_feature_engineering(data):
    """Parse ``time_stamp`` and derive calendar features from it.

    Added columns:
        hour_of_day:   hour of the timestamp (0-23).
        month_of_year: month number (1-12).
        day_of_week:   weekday with Monday = 0 and Sunday = 6.
        week_of_year:  ISO week number (1-53, not 0-based), so the first days
                       of January can belong to week 52/53 of the previous
                       ISO year.

    The columns are int64 when every timestamp parses; if any timestamp is
    missing (NaT) they are float64 with NaN in those rows, which is what the
    earlier per-row implementation produced.

    Args:
        data: DataFrame with a ``time_stamp`` column that ``pd.to_datetime``
            can parse. It is not modified.

    Returns:
        A copy of ``data`` with ``time_stamp`` converted to datetime and the
        four calendar columns added.
    """
    df = data.copy()
    stamps = pd.to_datetime(df['time_stamp'])
    df['time_stamp'] = stamps

    # Vectorised .dt accessors replace four per-row Python lambdas.
    parts = stamps.dt
    if hasattr(parts, 'isocalendar'):
        week = parts.isocalendar().week
    else:
        # pandas < 1.1 has no isocalendar(); weekofyear is the same ISO week
        # there (it was removed from the .dt accessor in pandas 2.0).
        week = parts.weekofyear

    # Normalise the dtype: the accessors return int32/UInt32 on recent pandas.
    target = 'float64' if stamps.isna().any() else 'int64'
    df['hour_of_day'] = parts.hour.astype(target)
    df['month_of_year'] = parts.month.astype(target)
    df['day_of_week'] = parts.dayofweek.astype(target)
    df['week_of_year'] = week.astype(target)
    return df