# 特征工程

# 导入包

In [2]:
import pandas as pd 
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder
import datetime

# 导入数据

In [3]:
inputfile = '../datasets/data_processed.csv'
outputfile = '../datasets/data_feature_processed.csv'

In [21]:
data = pd.read_csv(inputfile,parse_dates=['time_stamp'])

In [22]:
data.head()

Unnamed: 0,time_stamp,loc_id,num_of_people
0,2017-03-01 00:00:00,1,0.0
1,2017-03-01 01:00:00,1,0.0
2,2017-03-01 02:00:00,1,54.0
3,2017-03-01 03:00:00,1,456.0
4,2017-03-01 04:00:00,1,470.0


In [23]:
data.tail()

Unnamed: 0,time_stamp,loc_id,num_of_people
187699,2017-10-31 19:00:00,33,407.0
187700,2017-10-31 20:00:00,33,496.0
187701,2017-10-31 21:00:00,33,434.0
187702,2017-10-31 22:00:00,33,272.0
187703,2017-10-31 23:00:00,33,120.0


# 10月测试数据导出

In [33]:
df = data.copy()
df.loc[df.time_stamp>='2017-10-01','num_of_people'] = np.nan
df = feature_engineering(df)
df = df.join(pd.get_dummies(df.loc_id))
df[df.time_stamp>='2017-10-01'].to_csv('../datasets/Oct_test_data.csv',index=False)

# 11月测试数据导出

In [35]:
df1 = data.copy()

In [36]:
df_nov = df1[df1.time_stamp>='2017-09-01']
df_nov.time_stamp = df_nov.time_stamp+datetime.timedelta(61)
df_nov = df_nov[df_nov.time_stamp<'2017-12-01']
df_nov.loc[df_nov.time_stamp>='2017-11-01','num_of_people'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [37]:
df_nov.head()

Unnamed: 0,time_stamp,loc_id,num_of_people
4416,2017-11-01 00:00:00,1,
4417,2017-11-01 01:00:00,1,
4418,2017-11-01 02:00:00,1,
4419,2017-11-01 03:00:00,1,
4420,2017-11-01 04:00:00,1,


In [38]:
df_nov.tail()

Unnamed: 0,time_stamp,loc_id,num_of_people
187147,2017-11-30 19:00:00,33,
187148,2017-11-30 20:00:00,33,
187149,2017-11-30 21:00:00,33,
187150,2017-11-30 22:00:00,33,
187151,2017-11-30 23:00:00,33,


In [39]:
df1 = df1.append(df_nov)
df1.reset_index(inplace=True)
df1.drop('index',inplace=True,axis=1)

In [40]:
df1 = feature_engineering(df1)
df1 = df1.join(pd.get_dummies(df1.loc_id))

In [42]:
len(df1[df1.time_stamp>='2017-11-01'])

23760

In [41]:
df1[df1.time_stamp>='2017-11-01'].to_csv('../datasets/Nov_test_data.csv',index=False)

# 以下是模型用到的公共函数

In [7]:
reg = re.compile('\s+|\t|\n') # 正则匹配空格、Tab键、换行
#列名，历史24小时人数
col_24h='''history_1h_population,history_2h_population,history_3h_population,history_4h_population
    ,history_5h_population,history_6h_population,history_7h_population,history_8h_population
    ,history_9h_population,history_10h_population,history_11h_population,history_12h_population
    ,history_13h_population,history_14h_population,history_15h_population,history_16h_population,history_17h_population
    ,history_18h_population,history_19h_population,history_20h_population,history_21h_population,history_22h_population
    ,history_23h_population'''
columns_24h=re.sub(reg,'',col_24h).split(',')  
#列名，同一时刻历史15天人数
col_30d='''history_1d_population,history_2d_population,history_3d_population,
    history_4d_population,history_5d_population,history_6d_population,
    history_7d_population,history_8d_population,history_9d_population,history_10d_population,history_11d_population,
    history_12d_population,history_13d_population,history_14d_population,history_15d_population,history_16d_population
    ,history_17d_population,history_18d_population,history_19d_population,history_20d_population,history_21d_population
    ,history_22d_population,history_23d_population,history_24d_population,history_25d_population,history_26d_population
    ,history_27d_population,history_28d_population,history_29d_population,history_30d_population'''
columns_30d=re.sub(reg,'',col_30d).split(',')    

In [8]:
def feature_engineering(data):
    df = data.copy()
    df = seasonal_feature_engineering(df)
    df = history_features(df)
    return df

In [9]:
# 按地点分组
def history_features(data):
    res = [None]*33
    df = data.copy()
    df.sort_values(by=['loc_id','time_stamp'],inplace=True)
    for i in range(33):
        res[i] = df.loc[df.loc_id == i+1]
    train = pd.DataFrame()
    for tmp in res:
        tmp = history_24h_population(tmp,columns_24h)
        tmp = history_30d_population(tmp,columns_30d)
        train = train.append(tmp)
    return train

In [10]:
def history_24h_population(data,features):
    df = data.copy()
    for item in features:
        df[item] = np.nan
    for i in range(len(features)):
        df[features[i]] = df['num_of_people'].shift(i+1)
    return df

In [11]:
def history_30d_population(data,features):
    df = data.copy()
    for item in features:
        df[item] = np.nan
    for i in range(len(features)):
        df[features[i]] = df['num_of_people'].shift((i+1)*24)
    return df

In [43]:
def seasonal_feature_engineering(data):
    df = data.copy()
    df.time_stamp = pd.to_datetime(df.time_stamp)
    df['hour_of_day']=df.time_stamp.apply(lambda x : x.hour) #小时
    df['day_of_week'] = df.time_stamp.apply(lambda x : x.weekday()) #周几，从0开始
    return df   