# 特征工程
* 数据来源：../datasets/data_processed.csv
* 代码内容：对经过数据处理的数据，进行特征工程
    * 历史24小时人数及统计特征
    * 历史30天人数及统计特征
    * 季节性特征：小时、星期、每个月第几天

# 导入包

In [15]:
import pandas as pd 
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder

# 导入数据

In [13]:
# inputfile: the processed data set this notebook reads.
# outputfile: where the finished feature table is written at the end of the notebook.
inputfile = '../datasets/data_processed.csv'
outputfile = '../datasets/data_feature_processed.csv'

In [14]:
# Load the processed data with default parsing, so time_stamp arrives as plain
# strings (object dtype) and is converted to datetime later during feature building.
data = pd.read_csv(inputfile)

In [16]:
# Show column dtypes and non-null counts to confirm the load has no missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242352 entries, 0 to 242351
Data columns (total 3 columns):
time_stamp       242352 non-null object
loc_id           242352 non-null int64
num_of_people    242352 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 5.5+ MB


In [17]:
# Preview the first rows (hourly time_stamp, loc_id, num_of_people).
data.head()

Unnamed: 0,time_stamp,loc_id,num_of_people
0,2017-01-01 00:00:00,1,120.0
1,2017-01-01 01:00:00,1,143.0
2,2017-01-01 02:00:00,1,87.0
3,2017-01-01 03:00:00,1,90.0
4,2017-01-01 04:00:00,1,63.0


# 特征工程

In [86]:
# Work on a copy so the loaded `data` frame is left unmodified.
df = data.copy()

## 1. 增加历史特征和季节性特征

In [87]:
# Add the seasonal, lag and lag-statistic columns; `%time` is an IPython magic
# that prints how long the statement took.
%time df = feature_engineering(df)

Wall time: 7.43 s


## 2. one-hot编码loc_id 

In [88]:
# One-hot encode loc_id: get_dummies yields one indicator column per distinct
# loc_id value, named by that value (no prefix); loc_id itself is kept.
df = df.join(pd.get_dummies(df.loc_id))

In [89]:
# Check the row and column counts after the feature and indicator columns were added.
df.shape

(242352, 101)

# 删除数据

In [90]:
# Building the 24-hour lag features requires dropping the first day of data
# for each of the 33 locations, i.e. 2017-01-01.
# Building the one-month lag features requires dropping the first month of data.
# NOTE(review): the cutoff below keeps only rows from 2017-04-01 onward, which
# removes more than one month if the data starts on 2017-01-01 -- confirm the
# intended cutoff date.
df = df[df.time_stamp>='2017-04-01']

In [92]:
# Check for missing values: count NaNs per column, then display only the
# columns that still contain any.
nulldata = df.isnull().sum()
nulldata[nulldata>0]

Series([], dtype: int64)

## 导出数据

In [94]:
# Write the feature table to outputfile without the index column;
# `%time` is an IPython magic that prints how long the write took.
%time df.to_csv(outputfile,index=False)

Wall time: 13.6 s


# 以下是模型用到的公共函数

## 1. 特征列表

In [85]:
# Raw string so the regex escapes reach the regex engine untouched; a plain
# string containing `\s` triggers an invalid-escape-sequence warning.
# The pattern strips spaces, tabs and newlines from the layouts below.
reg = re.compile(r'\s+|\t|\n')

# Lag column names: population 1 to 23 rows (hours) before the current row.
col_24h='''history_1h_population,history_2h_population,history_3h_population,history_4h_population
    ,history_5h_population,history_6h_population,history_7h_population,history_8h_population
    ,history_9h_population,history_10h_population,history_11h_population,history_12h_population
    ,history_13h_population,history_14h_population,history_15h_population,history_16h_population,history_17h_population
    ,history_18h_population,history_19h_population,history_20h_population,history_21h_population,history_22h_population
    ,history_23h_population'''
columns_24h = reg.sub('', col_24h).split(',')

# Lag column names: population at the same hour 1 to 30 days earlier.
col_30d='''history_1d_population,history_2d_population,history_3d_population,
    history_4d_population,history_5d_population,history_6d_population,
    history_7d_population,history_8d_population,history_9d_population,history_10d_population,history_11d_population,
    history_12d_population,history_13d_population,history_14d_population,history_15d_population,history_16d_population
    ,history_17d_population,history_18d_population,history_19d_population,history_20d_population,history_21d_population
    ,history_22d_population,history_23d_population,history_24d_population,history_25d_population,history_26d_population
    ,history_27d_population,history_28d_population,history_29d_population,history_30d_population'''
columns_30d = reg.sub('', col_30d).split(',')

# Statistic column names computed row-wise over the hourly lag columns.
col_24h_stat = '''history_24h_population_max,history_24h_population_min,history_24h_population_mean,history_24h_population_median,history_24h_population_std'''
columns_24h_stat = reg.sub('', col_24h_stat).split(',')

# Statistic column names computed row-wise over the 30 daily lag columns.
col_30d_stat = '''history_30d_population_max,history_30d_population_min,history_30d_population_mean,history_30d_population_median,history_30d_population_std'''
columns_30d_stat = reg.sub('', col_30d_stat).split(',')

## 2. 特征工程调用函数

In [84]:
def feature_engineering(data):
    """Run the complete feature pipeline on a copy of ``data``.

    Stages, in order: seasonal (calendar) columns, per-location lagged
    population columns, then row-wise statistics over the hourly lag
    columns and over the daily lag columns.

    Args:
        data: frame with ``time_stamp``, ``loc_id`` and ``num_of_people``.

    Returns:
        A new DataFrame carrying all engineered columns; ``data`` itself
        is not modified.
    """
    return (
        data.copy()
        .pipe(seasonal_feature_engineering)
        .pipe(history_features)
        .pipe(history_24h_population_stat, columns_24h_stat)
        .pipe(history_30d_population_stat, columns_30d_stat)
    )

### 2.1 历史特征构建函数

In [83]:
# Lag columns are built one location at a time so that shift() never carries
# the tail of one location's series into the head of the next location's.
def history_features(data):
    """Add the hourly and daily lagged-population columns for every location.

    Rows are sorted by ``(loc_id, time_stamp)``, split by ``loc_id``, and each
    location's rows get the lag columns named in ``columns_24h`` and
    ``columns_30d``. Every ``loc_id`` present in ``data`` is processed, so the
    number of locations does not have to be known in advance.

    Args:
        data: frame with ``loc_id``, ``time_stamp`` and ``num_of_people``.

    Returns:
        A new DataFrame with the per-location results stacked in ascending
        ``loc_id`` order, keeping the original index labels. ``data`` is not
        modified.
    """
    # sort_values returns a new frame, so the caller's data stays untouched.
    ordered = data.sort_values(by=['loc_id','time_stamp'])

    def add_lags(group):
        group = history_24h_population(group,columns_24h)
        return history_30d_population(group,columns_30d)

    pieces = [add_lags(group) for _, group in ordered.groupby('loc_id')]
    if not pieces:
        # No rows at all: still return a frame that has the lag columns.
        pieces = [add_lags(ordered)]
    # One concat instead of DataFrame.append in a loop: append was removed in
    # pandas 2.0 and re-copied the accumulated frame on every iteration.
    return pd.concat(pieces)

#### 2.1.1 历史24小时人数特征构建函数

In [82]:
def history_24h_population(data,features):
    """Add hourly lag columns of ``num_of_people``.

    The k-th name in ``features`` (counting from 1) receives
    ``num_of_people`` shifted down by k rows, so with one row per hour it
    holds the value from k hours earlier. The first k rows of that column
    are NaN.

    Args:
        data: frame for a single location, ordered by time, with a
            ``num_of_people`` column.
        features: lag column names, ordered from the nearest lag outward.

    Returns:
        A copy of ``data`` with the lag columns added; ``data`` is not
        modified.
    """
    df = data.copy()
    population = df['num_of_people']
    # Each column is assigned exactly once, so no NaN pre-fill pass is needed.
    for lag, column in enumerate(features, start=1):
        df[column] = population.shift(lag)
    return df

#### 2.1.2 历史30天特征构建函数

In [80]:
def history_30d_population(data,features):
    """Add daily lag columns of ``num_of_people``.

    The k-th name in ``features`` (counting from 1) receives
    ``num_of_people`` shifted down by ``k * 24`` rows, so with one row per
    hour it holds the value from the same hour k days earlier. The first
    ``k * 24`` rows of that column are NaN.

    Args:
        data: frame for a single location, ordered by time, with a
            ``num_of_people`` column.
        features: lag column names, ordered from the nearest day outward.

    Returns:
        A copy of ``data`` with the lag columns added; ``data`` is not
        modified.
    """
    df = data.copy()
    population = df['num_of_people']
    # Each column is assigned exactly once, so no NaN pre-fill pass is needed.
    for days_back, column in enumerate(features, start=1):
        df[column] = population.shift(days_back * 24)
    return df

### 2.3 历史24小时人数统计特征构建函数

In [None]:
# Comma-separated names of the five statistic columns over the hourly lags.
col_24h_stat = '''history_24h_population_max,history_24h_population_min,history_24h_population_mean,history_24h_population_median,history_24h_population_std'''
def history_24h_population_stat(data,features):
    """Add rounded row-wise statistics over the hourly lag columns.

    For every row, the max, min, mean, median and standard deviation of the
    columns listed in the module-level ``columns_24h`` are computed, rounded
    with ``np.round`` and stored in the ``history_24h_population_*`` columns.

    Args:
        data: frame that already contains every column in ``columns_24h``.
        features: column names to make sure exist in the result; names other
            than the five statistic columns are left filled with NaN.

    Returns:
        A copy of ``data`` with the statistic columns added; ``data`` is not
        modified.
    """
    df = data.copy()
    for item in features:
        df[item] = np.nan
    lags = df[columns_24h]
    # Bracket assignment always writes a column. Attribute-style assignment
    # (df.name = ...) only does so when the column already exists and
    # otherwise sets a plain Python attribute, silently losing the result.
    df['history_24h_population_max'] = np.round(lags.max(axis=1))
    df['history_24h_population_min'] = np.round(lags.min(axis=1))
    df['history_24h_population_mean'] = np.round(lags.mean(axis=1))
    df['history_24h_population_median'] = np.round(lags.median(axis=1))
    df['history_24h_population_std'] = np.round(lags.std(axis=1))
    return df

### 2.4 历史30天人数统计特征构建函数

In [79]:
# Comma-separated names of the five statistic columns over the daily lags.
col_30d_stat = '''history_30d_population_max,history_30d_population_min,history_30d_population_mean,history_30d_population_median,history_30d_population_std'''
def history_30d_population_stat(data,features):
    """Add rounded row-wise statistics over the daily lag columns.

    For every row, the max, min, mean, median and standard deviation of the
    columns listed in the module-level ``columns_30d`` are computed, rounded
    with ``np.round`` and stored in the ``history_30d_population_*`` columns.

    Args:
        data: frame that already contains every column in ``columns_30d``.
        features: column names to make sure exist in the result; names other
            than the five statistic columns are left filled with NaN.

    Returns:
        A copy of ``data`` with the statistic columns added; ``data`` is not
        modified.
    """
    df = data.copy()
    for item in features:
        df[item] = np.nan
    lags = df[columns_30d]
    # Bracket assignment always writes a column. Attribute-style assignment
    # (df.name = ...) only does so when the column already exists and
    # otherwise sets a plain Python attribute, silently losing the result.
    df['history_30d_population_max'] = np.round(lags.max(axis=1))
    df['history_30d_population_min'] = np.round(lags.min(axis=1))
    df['history_30d_population_mean'] = np.round(lags.mean(axis=1))
    df['history_30d_population_median'] = np.round(lags.median(axis=1))
    df['history_30d_population_std'] = np.round(lags.std(axis=1))
    return df

### 2.5 季节性特征构建函数

In [78]:
def seasonal_feature_engineering(data):
    """Add calendar columns derived from ``time_stamp``.

    ``time_stamp`` is converted to datetime and three columns are added:
    ``hour_of_day`` (0-23), ``day_of_week`` (Monday is 0) and
    ``day_of_month`` (1-31).

    Args:
        data: frame with a ``time_stamp`` column that ``pd.to_datetime``
            can parse.

    Returns:
        A copy of ``data`` with ``time_stamp`` as datetime and the three
        calendar columns added; ``data`` is not modified.
    """
    df = data.copy()
    df['time_stamp'] = pd.to_datetime(df['time_stamp'])
    # The vectorized .dt accessor replaces a Python-level lambda call per row.
    stamps = df['time_stamp'].dt
    df['hour_of_day'] = stamps.hour
    df['day_of_week'] = stamps.dayofweek
    df['day_of_month'] = stamps.day
    return df