In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm as tqdm
from multiprocessing import Pool
import time,gc

In [2]:
# 第一步:准备好所有数据
# 第二部:建模测试

In [3]:
# 2 根据上一天的特征预测下一天的流量,end2end
# f1:上一天此刻的流量
# f2:上一天此刻(n时刻(半小时/附近三小时)范围内的流量)
# f3:上一天该站总流量
# f5:上一天该线路总流量
# f6:上一天该线路该时刻总流量
# f7:上一天该节点附近节点流入总流量
# f8:上一天该节点附近节点该时刻流入总流量

# f4:周几?
# f9:时刻
# f12:第几周


In [4]:
# 统计进出站每天的变化情况
def get_daily_activity(i):
    if i<10:
        s = '0'+str(i)
    else:
        s = str(i)
    day_activ = pd.read_csv('data/Metro_train/record_2019-01-%s.csv'%s)
    day_activ['time'] = pd.to_datetime(day_activ.time)
    day_activ['stationID'] = day_activ['stationID']
    day_activ['deviceID'] = day_activ['deviceID']
    day_activ['status'] = day_activ['status']
    day_activ['payType'] = day_activ['payType']
    day_activ['lineID'] = day_activ['lineID'].apply(lambda x: {'A':0,'B':2,'C':1}[x])
    # 去除1-3用户
    return day_activ
# 上一天附近n个十分钟的总流量
def count_cum(params):
    i,n = params
    time_range = [i/10+x for x in range(0-n,1+n)]
    time_range = [143+1+x if x<0 else x for x in time_range]
    count = 0
    for t in time_range:
        try:
            count+=come_byminute.loc[t,'count_10min_all']
        except:
            count+=0
    return [i/10,count]

def get_cum_feature(function,n):
    pool = Pool(6)  #创建拥有5个进程数量的进程池
    timestep = [(x,n) for x in range(1440)]
    count_result =pool.map(function, timestep) 
    pool.close()#关闭进程池，不再接受新的进程
    pool.join()#主进程阻塞等待子进程的退出
    return count_result

# 分站点统计
def count_cum_bystation(params):
    i,n = params
    time_range = [i/10+x for x in range(0-n,1+n)]
    time_range = [143+1+x if x<0 else x for x in time_range]
    station_num = []
    for s in range(80):
        count = 0
        for t in time_range:
            try:
                count = come_bystation_dict[(s,t)]
                #count = come_bystation.loc[(s,t),'count_10min']
            except:
                count += 0
        station_num.append([s,i/10,count])
    return station_num
# 按邻居来做统计
def count_cum_bynearstation(params):
    i,n = params
    time_range = [i/10+x for x in range(0-n,1+n)]
    time_range = [143+1+x if x<0 else x for x in time_range]
    station_num = []
    for s in range(80):
        nodes = near_node[s]+[s]
        count = 0
        for node in nodes:
            for t in time_range:
                try:
                    count = come_bystation_dict[(s,t)]
                    #count = come_bystation.loc[(s,t),'count_10min']
                except:
                    count += 0
        station_num.append([s,i/10,count])
    return station_num
# 按邻居来做统计
def count_cum_bynearstation_all():
    station_num = []
    for s in range(80):
        nodes = near_node[s]+[s]
        count = 0
        for node in nodes:
            try:
                count += come_bystation.loc[(s,),'count_10min'].sum()
            except:
                count += 0
        station_num.append([s,count])
    return station_num

In [5]:
# 构建邻接矩阵
roadmap = pd.read_csv('data/Metro_roadMap.csv',index_col=0)
near_node = {}
for index,row in roadmap.iterrows():
    for i in range(80):
        if row[i]==1:
            if index not in near_node:
                near_node[index] = []
            near_node[index].append(i)

In [6]:
def construct_df():
    final = pd.DataFrame()
    for i in range(81):
        temp = pd.DataFrame()
        temp['minute'] = list(range(1440))
        temp['stationID'] = i
        final = final.append(temp)
    final['minute'] = final['minute']/10
    return final

In [7]:
t1 = time.time()
all_df = pd.DataFrame()
for date in [12,13,14,19,20,21,26,28]:
    final = construct_df()
    day_activ = get_daily_activity(date)[['time','lineID','stationID','deviceID','status','payType']]
    df = pd.DataFrame()
    for i in range(10):
        day_activ['minute'] = (day_activ['time'].dt.minute-i+day_activ['time'].dt.hour*60)//10+0.1*i
        day_activ['minute'] = day_activ['minute'].apply(lambda x:143+1+x if x<0 else x)
        df = df.append(day_activ)
    df.sort_values(by='minute',inplace=True)
    come = df[df.status==1]
    come_byminute = come.groupby('minute').agg({'status':'count'}).rename(columns={'status':'count_10min_all'})
    
    f1 = come.groupby('minute',as_index=False).agg({'status':'count'}).rename(columns={'status':'count_10min_all'})
    count_result = get_cum_feature(count_cum,1)
    f2 = pd.DataFrame(count_result)
    f2.rename(columns={0:'minute',1:'count_30min_all'},inplace=True)
    count_result = get_cum_feature(count_cum,6)
    f3 = pd.DataFrame(count_result)
    f3.rename(columns={0:'minute',1:'count_2h_all'},inplace=True)
    f4 = come.shape[0]
    t2 = time.time()
    final = final.merge(f1,on='minute',how='left')
    final = final.merge(f2,on='minute',how='left')
    final = final.merge(f3,on='minute',how='left')
    final['count_24h_all'] = f4
    print('now f4 %f 秒'%(t2-t1))
    come_bystation = come.groupby(['stationID','minute']).agg({'status':'count'})\
    .rename(columns={'status':'count_10min'})
    come_bystation_dict = come_bystation['count_10min'].to_dict()
    f5 = come.groupby(['stationID','minute'],as_index=False).agg({'status':'count'})\
    .rename(columns={'status':'count_10min_bystation'})
    final = final.merge(f5,on=['stationID','minute'],how='left')
    count_result = get_cum_feature(count_cum_bystation,1)
    result = []
    for x in count_result:
        for i in x:
            result.append(i)
    f6 = pd.DataFrame(result)
    f6.rename(columns={0:'stationID',1:'minute',2:'count_30min_bystation'},inplace=True)
    final = final.merge(f6,on=['stationID','minute'],how='left')

    count_result = get_cum_feature(count_cum_bystation,6)
    result = []
    for x in count_result:
        for i in x:
            result.append(i)
    f7 = pd.DataFrame(result)
    f7.rename(columns={0:'stationID',1:'minute',2:'count_2h_bystation'},inplace=True)
    final = final.merge(f7,on=['stationID','minute'],how='left')

    f8 = come_bystation.reset_index().groupby('stationID',as_index=False).agg({'count_10min':'sum'}).rename(columns={'count_10min':'count_24h_bystation'})
    count_result = get_cum_feature(count_cum_bynearstation,0)
    result = []
    for x in count_result:
        for i in x:
            result.append(i)
    final = final.merge(f8,on='stationID',how='left')


    f88 = come_bystation.reset_index().groupby('minute',as_index=False).agg({'count_10min':'sum'}).rename(columns={'count_10min':'count_24h_byminute'})
    count_result = get_cum_feature(count_cum_bynearstation,0)
    result = []
    for x in count_result:
        for i in x:
            result.append(i)
    final = final.merge(f88,on='minute',how='left')
    
    f9 = pd.DataFrame(result)
    f9.rename(columns={0:'stationID',1:'minute',2:'count_10min_near'},inplace=True)
    final = final.merge(f9,on=['stationID','minute'],how='left')

    count_result = get_cum_feature(count_cum_bynearstation,1)
    result = []
    for x in count_result:
        for i in x:
            result.append(i)
    f10 = pd.DataFrame(result)
    f10.rename(columns={0:'stationID',1:'minute',2:'count_30min_near'},inplace=True)
    final = final.merge(f10,on=['stationID','minute'],how='left')

    count_result = get_cum_feature(count_cum_bynearstation,6)
    result = []
    for x in count_result:
        for i in x:
            result.append(i)
    f11 = pd.DataFrame(result)
    f11.rename(columns={0:'stationID',1:'minute',2:'count_2h_near'},inplace=True)
    final = final.merge(f11,on=['stationID','minute'],how='left')

    count_result = count_cum_bynearstation_all()
    f12 = pd.DataFrame(count_result)
    f12.rename(columns={0:'stationID',1:'count_24h_near'},inplace=True)
    final = final.merge(f12,on=['stationID'],how='left')
    
    # 加入时间特征 第几周和周几
    final['week_th'] = (date-7)//7
    final['weekday'] = (date-7)%7+1
    t3 = time.time()
    # 数据格式减少内存
    final['date'] = date
    final['date'] = final['date']
    final.fillna(0,inplace=True)
    for c in final.dtypes.index:
        if final.dtypes[c] == 'float64':
            final[c] = final[c].astype('float32')
    all_df = all_df.append(final)
    print('epoch %d, now f12 %f 秒'%(date,t3-t1))
    #def final
    gc.collect()
    #break

now f4 14.916130 秒
epoch 12, now f12 18.897938 秒
now f4 33.024836 秒
epoch 13, now f12 37.430923 秒
now f4 54.939477 秒
epoch 14, now f12 59.417694 秒
now f4 74.818809 秒
epoch 19, now f12 79.149479 秒
now f4 93.905863 秒
epoch 20, now f12 98.227571 秒
now f4 115.260324 秒
epoch 21, now f12 119.869306 秒
now f4 135.391105 秒
epoch 26, now f12 139.824406 秒
now f4 156.532858 秒
epoch 28, now f12 161.149729 秒


In [8]:
all_df.to_pickle('features/progress2.pkl')

In [9]:
all_df

Unnamed: 0,minute,stationID,count_10min_all,count_30min_all,count_2h_all,count_24h_all,count_10min_bystation,count_30min_bystation,count_2h_bystation,count_24h_bystation,count_24h_byminute,count_10min_near,count_30min_near,count_2h_near,count_24h_near,week_th,weekday,date
0,0.000000,0,16.0,56,2281,10837970,0.0,0.0,0.0,88840.0,16.0,0.0,0.0,0.0,177680.0,0,6,12
1,0.100000,0,21.0,52,2120,10837970,0.0,0.0,0.0,88840.0,21.0,0.0,0.0,0.0,177680.0,0,6,12
2,0.200000,0,21.0,53,1990,10837970,0.0,0.0,0.0,88840.0,21.0,0.0,0.0,0.0,177680.0,0,6,12
3,0.300000,0,0.0,34,1841,10837970,0.0,0.0,0.0,88840.0,0.0,0.0,0.0,0.0,177680.0,0,6,12
4,0.400000,0,21.0,54,1726,10837970,0.0,0.0,0.0,88840.0,21.0,0.0,0.0,0.0,177680.0,0,6,12
5,0.500000,0,22.0,53,1598,10837970,0.0,0.0,0.0,88840.0,22.0,0.0,0.0,0.0,177680.0,0,6,12
6,0.600000,0,0.0,29,1444,10837970,0.0,0.0,0.0,88840.0,0.0,0.0,0.0,0.0,177680.0,0,6,12
7,0.700000,0,0.0,13,1316,10837970,0.0,0.0,0.0,88840.0,0.0,0.0,0.0,0.0,177680.0,0,6,12
8,0.800000,0,22.0,48,1222,10837970,0.0,0.0,0.0,88840.0,22.0,0.0,0.0,0.0,177680.0,0,6,12
9,0.900000,0,22.0,47,1113,10837970,0.0,0.0,0.0,88840.0,22.0,0.0,0.0,0.0,177680.0,0,6,12


In [10]:
all_df['date'].value_counts()

28    116640
26    116640
21    116640
20    116640
19    116640
14    116640
13    116640
12    116640
Name: date, dtype: int64

In [11]:
import sys
sys.getsizeof(all_df)/1024/1024

96.1084213256836