# 测试数据生成
* 第一阶段：生成10月和11月测试数据
* 第二阶段：生成12月测试数据

# 导入包

In [4]:
import pandas as pd 
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder
import datetime

# 导入数据

In [2]:
inputfile = '../datasets/data_processed.csv'
outputfile = '../datasets/data_feature_processed.csv'

In [88]:
data = pd.read_csv(inputfile,parse_dates=['time_stamp'])

# 10月测试数据导出

In [33]:
df = data.copy()
df.loc[df.time_stamp>='2017-10-01','num_of_people'] = np.nan
df = feature_engineering(df)
df = df.join(pd.get_dummies(df.loc_id))
df[df.time_stamp>='2017-10-01'].to_csv('../datasets/Oct_test_data.csv',index=False)

# 11月测试数据导出

In [35]:
df1 = data.copy()
df_nov = df1[df1.time_stamp>='2017-09-01']
df_nov.time_stamp = df_nov.time_stamp+datetime.timedelta(61)
df_nov = df_nov[df_nov.time_stamp<'2017-12-01']
df_nov.loc[df_nov.time_stamp>='2017-11-01','num_of_people'] = np.nan

In [39]:
df1 = df1.append(df_nov)
df1.reset_index(inplace=True)
df1.drop('index',inplace=True,axis=1)

In [40]:
df1 = feature_engineering(df1)
df1 = df1.join(pd.get_dummies(df1.loc_id))

In [42]:
len(df1[df1.time_stamp>='2017-11-01'])

23760

In [41]:
df1[df1.time_stamp>='2017-11-01'].to_csv('../datasets/Nov_test_data.csv',index=False)

# 12月测试数据导出

In [91]:
df2 = data.copy()

In [93]:
df_dec = df2[df2.time_stamp>='2017-10-01']
df_dec.time_stamp = df_dec.time_stamp+datetime.timedelta(61)
df_dec = df_dec[(df_dec.time_stamp>='2017-12-01')&(df_dec.time_stamp<'2018')]
df_dec.loc[df_dec.time_stamp>='2017-12-01','num_of_people'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [94]:
df_dec.shape

(24552, 3)

In [95]:
df2 = df2.append(df_dec)
df2.reset_index(inplace=True)
df2.drop('index',inplace=True,axis=1)

In [96]:
%time df2 = feature_engineering(df2)
df2 = df2.join(pd.get_dummies(df2.loc_id))

Wall time: 11.5 s


In [97]:
df2[df2.time_stamp>='2017-12-01'].shape

(24552, 101)

In [98]:
df2[df2.time_stamp>='2017-12-01'].to_csv('../datasets/Dec_test_data.csv',index=False)

# 以下是模型用到的公共函数

In [80]:
reg = re.compile('\s+|\t|\n') # 正则匹配空格、Tab键、换行
#列名，历史24小时人数
col_24h='''history_1h_population,history_2h_population,history_3h_population,history_4h_population
    ,history_5h_population,history_6h_population,history_7h_population,history_8h_population
    ,history_9h_population,history_10h_population,history_11h_population,history_12h_population
    ,history_13h_population,history_14h_population,history_15h_population,history_16h_population,history_17h_population
    ,history_18h_population,history_19h_population,history_20h_population,history_21h_population,history_22h_population
    ,history_23h_population'''
columns_24h=re.sub(reg,'',col_24h).split(',')  
#列名，同一时刻历史15天人数
col_30d='''history_1d_population,history_2d_population,history_3d_population,
    history_4d_population,history_5d_population,history_6d_population,
    history_7d_population,history_8d_population,history_9d_population,history_10d_population,history_11d_population,
    history_12d_population,history_13d_population,history_14d_population,history_15d_population,history_16d_population
    ,history_17d_population,history_18d_population,history_19d_population,history_20d_population,history_21d_population
    ,history_22d_population,history_23d_population,history_24d_population,history_25d_population,history_26d_population
    ,history_27d_population,history_28d_population,history_29d_population,history_30d_population'''
columns_30d=re.sub(reg,'',col_30d).split(',')    
col_24h_stat = '''history_24h_population_max,history_24h_population_min,history_24h_population_mean,history_24h_population_median,history_24h_population_std'''
columns_24h_stat = re.sub(reg,'',col_24h_stat).split(',')  
#列名，同一时刻历史15天人数
col_30d_stat = '''history_30d_population_max,history_30d_population_min,history_30d_population_mean,history_30d_population_median,history_30d_population_std'''
columns_30d_stat = re.sub(reg,'',col_30d_stat).split(',')   

In [81]:
def feature_engineering(data):
    df = data.copy()
    df = seasonal_feature_engineering(df)
    df = history_features(df)
    df = history_24h_population_stat(df,columns_24h_stat)
    df = history_30d_population_stat(df,columns_30d_stat)
    return df

In [82]:
# 按地点分组
def history_features(data):
    res = [None]*33
    df = data.copy()
    df.sort_values(by=['loc_id','time_stamp'],inplace=True)
    for i in range(33):
        res[i] = df.loc[df.loc_id == i+1]
    train = pd.DataFrame()
    for tmp in res:
        tmp = history_24h_population(tmp,columns_24h)
        tmp = history_30d_population(tmp,columns_30d)
        train = train.append(tmp)
    return train

In [83]:
def history_24h_population(data,features):
    df = data.copy()
    for item in features:
        df[item] = np.nan
    for i in range(len(features)):
        df[features[i]] = df['num_of_people'].shift(i+1)
    return df

In [84]:
col_24h_stat = '''history_24h_population_max,history_24h_population_min,history_24h_population_mean,history_24h_population_median,history_24h_population_std'''
def history_24h_population_stat(data,features):
    df = data.copy()
    for item in features:
        df[item] = np.nan
    df.history_24h_population_max = np.round(df[columns_24h].max(axis = 1))
    df.history_24h_population_min = np.round(df[columns_24h].min(axis = 1))
    df.history_24h_population_mean = np.round(df[columns_24h].mean(axis = 1))
    df.history_24h_population_median = np.round(df[columns_24h].median(axis = 1))
    df.history_24h_population_std = np.round(df[columns_24h].std(axis = 1))
    return df

In [85]:
def history_30d_population(data,features):
    df = data.copy()
    for item in features:
        df[item] = np.nan
    for i in range(len(features)):
        df[features[i]] = df['num_of_people'].shift((i+1)*24)
    return df

In [86]:
col_30d_stat = '''history_30d_population_max,history_30d_population_min,history_30d_population_mean,history_30d_population_median,history_30d_population_std'''
def history_30d_population_stat(data,features):
    df = data.copy()
    for item in features:
        df[item] = np.nan
    df.history_30d_population_max = np.round(df[columns_30d].max(axis = 1))
    df.history_30d_population_min = np.round(df[columns_30d].min(axis = 1))
    df.history_30d_population_mean = np.round(df[columns_30d].mean(axis = 1))
    df.history_30d_population_median = np.round(df[columns_30d].median(axis = 1))
    df.history_30d_population_std = np.round(df[columns_30d].std(axis = 1))
    return df

In [99]:
def seasonal_feature_engineering(data):
    df=data.copy()
    df.time_stamp = pd.to_datetime(df.time_stamp)
    df['hour_of_day'] = df.time_stamp.apply(lambda x : x.hour) #小时
    df['day_of_week'] = df.time_stamp.apply(lambda x : x.weekday()) #周几，从0开始
    df['day_of_month'] = df.time_stamp.apply(lambda x: x.day)
    return df   