In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm_notebook
import xgboost as xgb
from sklearn.model_selection import KFold, RepeatedKFold

In [2]:
## Count entries and exits per 10-minute window
def status_sum(df_data, lineID=None, stationID=None):
    """Aggregate individual records into 10-minute entry/exit counts.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Records with a datetime ``time`` column and a ``status`` column. The
        ``lineID`` / ``stationID`` columns are only read when the matching
        filter argument is given. The frame is not modified.
    lineID, stationID : optional
        When not ``None``, only rows whose column equals the value are
        counted, and the value is repeated in a column of the same name in
        the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by the start of each 10-minute bin, with columns (in order)
        ``lineID`` / ``stationID`` (only for the filters that were given),
        ``outNums`` and ``inNums``.
    """
    df_filtered = df_data
    dict_series = {}

    # Filter first, so only the selected rows are re-indexed (the original
    # copied the whole frame on every call before filtering).
    if lineID is not None:
        df_filtered = df_filtered[df_filtered['lineID'] == lineID]
        dict_series['lineID'] = lineID
    if stationID is not None:
        df_filtered = df_filtered[df_filtered['stationID'] == stationID]
        dict_series['stationID'] = stationID

    # set_index returns a new frame, so the caller's data is left untouched.
    series_resample = df_filtered.set_index('time')['status'].resample('10min', closed='left')

    # inNums is the sum of `status`; outNums is the remaining rows of each bin.
    # NOTE(review): this assumes status is 1 for an entry and 0 for an exit -- confirm.
    series_status1 = series_resample.sum()
    series_status0 = series_resample.count() - series_status1
    dict_series.update({'outNums': series_status0, 'inNums': series_status1})
    return pd.DataFrame(dict_series)

## Convert timestamps to their sequential 10-minute slot of the day
def time_convert_to_index(df_data):
    """Return a copy of ``df_data`` with a ``time_index`` column (0-143).

    The calendar day is taken from the first row's ``time`` value, and each
    row is mapped to the position of its timestamp among the 144 ten-minute
    marks of that day.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame with a datetime ``time`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the extra integer ``time_index`` column. An
        empty input yields an empty copy with an empty ``time_index`` column.

    Raises
    ------
    KeyError
        If a timestamp is not one of the 144 ten-minute marks of the first
        row's day (a different day, or not aligned to 10 minutes).
    """
    df_copy = df_data.copy()
    if df_copy.empty:
        # No first row to take the day from; nothing to map.
        df_copy['time_index'] = pd.Series(dtype='int64')
        return df_copy

    # Positional access: `df.time[0]` is a label lookup and fails whenever the
    # index does not contain the label 0 (e.g. after filtering rows).
    str_date = df_copy['time'].iloc[0].strftime('%Y-%m-%d')
    range_time = pd.date_range(str_date, periods=144, freq='10min')
    dict_timestamp = {timestamp: n_index for n_index, timestamp in enumerate(range_time)}

    df_copy['time_index'] = df_copy['time'].apply(lambda x: dict_timestamp[x])
    return df_copy

In [4]:
data_path = '../dataset/Metro_train'
# The first daily file is only used to enumerate the line / station IDs, so
# just those two columns are read.
df_train = pd.read_csv(os.path.join(data_path, 'record_2019-01-01.csv'),
                       usecols=['lineID', 'stationID'])

## Transfer-point data: one `joint_<k>` column per column of the road map
metromap_path = '../dataset/'
metromap = pd.read_csv(os.path.join(metromap_path, 'Metro_roadMap.csv'), index_col=0)
# Derived from the file instead of a hard-coded 81, so the rename cannot fail
# on a length mismatch.
list_joint = ['joint_' + str(i) for i in range(metromap.shape[1])]
metromap.columns = list_joint

## One-hot table for the 144 ten-minute slots of a day (row k <-> `time_index_k`)
df_time_index = pd.DataFrame(np.eye(144, dtype=np.uint8))
list_time = ['time_index_' + str(i) for i in range(144)]
df_time_index.columns = list_time

## One-hot table for the day of week; row label k (0-6) maps to column `week_<k+1>`
df_weekday_onehot = pd.DataFrame(np.eye(7, dtype=np.uint8))
list_weekday = ['week_' + str(i) for i in range(1, 8)]
df_weekday_onehot.columns = list_weekday

## Line and station one-hot column names, in the `<column>_<value>` form that
## pd.get_dummies produces. Despite its name the list holds both groups.
list_lineID = ['lineID_' + str(i) for i in sorted(df_train.lineID.unique())]
list_lineID += ['stationID_' + str(i) for i in sorted(df_train.stationID.unique())]

## Full list of feature columns used for training
columns_train = list_joint + list_time + list_weekday + list_lineID

del df_train

In [5]:
# data_path = 'Metro_train'
# train = pd.read_csv(os.path.join(data_path, 'record_2019-01-01.csv'))
# train.head()


def process_single_file(filename):
    """Turn one record CSV into a per-station, per-10-minute feature table.

    Relies on the notebook-level helpers ``status_sum`` and
    ``time_convert_to_index`` and on the lookup tables ``df_weekday_onehot``,
    ``metromap`` and ``df_time_index``.

    Parameters
    ----------
    filename : str
        Path of a CSV with at least ``time`` (``%Y-%m-%d %H:%M:%S``),
        ``lineID``, ``stationID`` and ``status`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per station and 10-minute bin, holding ``time``, ``outNums``,
        ``inNums``, ``time_index``, ``weekdays`` plus the weekday, transfer,
        line/station and time-slot one-hot columns.
    """
    ## Read the file
    train = pd.read_csv(filename)
    ## Parse the timestamps
    train['time'] = pd.to_datetime(train['time'], format='%Y-%m-%d %H:%M:%S')

    ## Entries / exits per station per 10 minutes.
    # Build all per-station frames and concatenate once: DataFrame.append in a
    # loop re-copies the accumulated frame every time and no longer exists in
    # pandas 2.0.
    list_station_frames = [
        status_sum(train, None, station_id).reset_index()
        for station_id in sorted(train.stationID.unique())
    ]
    df_train = pd.concat(list_station_frames, ignore_index=True)

    ## Add the 10-minute slot index
    df_train = time_convert_to_index(df_train)

    ## Day of week (0-6), used as the key into df_weekday_onehot
    df_train['weekdays'] = df_train['time'].dt.dayofweek

    ## Fill in the line ID of every station (first occurrence in the file wins).
    # `keep` must be passed by keyword: it is keyword-only in pandas >= 2.0.
    dict_lineID = (train.drop_duplicates('stationID', keep='first')
                        .set_index('stationID')['lineID']
                        .to_dict())
    df_train['lineID'] = df_train['stationID'].map(dict_lineID)

    ## Day-of-week one-hot
    df_train = df_train.merge(df_weekday_onehot, left_on='weekdays', right_index=True)

    ## Transfer-point columns of the station
    df_train = df_train.merge(metromap, left_on='stationID', right_index=True)

    ## Line and station one-hot (replaces the `lineID` / `stationID` columns).
    # dtype is pinned so the result matches the other uint8 one-hot tables on
    # every pandas version (pandas 2.x defaults to bool).
    # NOTE(review): only IDs present in this file get a column -- confirm that
    # every daily file contains the same set of stations.
    df_train = pd.get_dummies(df_train, columns=['lineID', 'stationID'], dtype=np.uint8)

    ## Time-slot one-hot
    df_train = df_train.merge(df_time_index, left_on='time_index', right_index=True)

    return df_train

In [6]:
## Collect the record CSV files under data_path, sorted by name within each folder
list_filepath = []
for root, dirs, files in os.walk(data_path):
    # Sorting in place makes os.walk descend into sub-folders in a
    # deterministic order.
    dirs.sort()
    for file in sorted(files):
        # Skip anything that is not a CSV so stray files in the data folder
        # are never handed to pd.read_csv.
        if file.endswith('.csv'):
            list_filepath.append(os.path.join(root, file))
list_filepath

['../dataset/Metro_train/record_2019-01-01.csv',
 '../dataset/Metro_train/record_2019-01-02.csv',
 '../dataset/Metro_train/record_2019-01-03.csv',
 '../dataset/Metro_train/record_2019-01-04.csv',
 '../dataset/Metro_train/record_2019-01-05.csv',
 '../dataset/Metro_train/record_2019-01-06.csv',
 '../dataset/Metro_train/record_2019-01-07.csv',
 '../dataset/Metro_train/record_2019-01-08.csv',
 '../dataset/Metro_train/record_2019-01-09.csv',
 '../dataset/Metro_train/record_2019-01-10.csv',
 '../dataset/Metro_train/record_2019-01-11.csv',
 '../dataset/Metro_train/record_2019-01-12.csv',
 '../dataset/Metro_train/record_2019-01-13.csv',
 '../dataset/Metro_train/record_2019-01-14.csv',
 '../dataset/Metro_train/record_2019-01-15.csv',
 '../dataset/Metro_train/record_2019-01-16.csv',
 '../dataset/Metro_train/record_2019-01-17.csv',
 '../dataset/Metro_train/record_2019-01-18.csv',
 '../dataset/Metro_train/record_2019-01-19.csv',
 '../dataset/Metro_train/record_2019-01-20.csv',
 '../dataset/Metro_t

In [7]:
## Build the training set: each row of one file is paired with the counts of
## the SAME station at the same clock time one day later (taken from the next
## file in list_filepath).

def _station_key(df_day):
    """Return, per row, the name of the `stationID_*` one-hot column that is set."""
    station_cols = [c for c in df_day.columns if str(c).startswith('stationID_')]
    return df_day[station_cols].idxmax(axis=1)


list_day_pairs = []
df_temp1 = process_single_file(list_filepath[0])
# Wrapping the list (not the enumerate object) lets tqdm know the total.
for n_index, file in enumerate(tqdm_notebook(list_filepath[1:])):
    print(n_index, file)

    df_temp2 = process_single_file(file)

    # Left side: current day's features, plus the timestamp to look up tomorrow
    # and the station each row belongs to.
    df_left = df_temp1.assign(next_day=df_temp1['time'] + pd.Timedelta(1, 'D'),
                              station_key=_station_key(df_temp1))

    # Right side: next day's counts, tagged with their station.
    df_next = df_temp2[['time', 'inNums', 'outNums']].copy()
    df_next['station_key'] = _station_key(df_temp2)

    # Joining on the timestamp alone would pair every row with the counts of
    # ALL stations at that time; the station has to be part of the key.
    df_temp3 = df_left.merge(df_next,
                             left_on=['next_day', 'station_key'],
                             right_on=['time', 'station_key'],
                             suffixes=("", "_pre"))

    list_day_pairs.append(df_temp3.drop(columns='station_key'))
    df_temp1 = df_temp2

# Concatenate once at the end: DataFrame.append in a loop is quadratic and was
# removed in pandas 2.0.
df_train_set = (pd.concat(list_day_pairs, ignore_index=True)
                if list_day_pairs else pd.DataFrame())

del df_temp1

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

0 ../dataset/Metro_train/record_2019-01-02.csv
1 ../dataset/Metro_train/record_2019-01-03.csv
2 ../dataset/Metro_train/record_2019-01-04.csv
3 ../dataset/Metro_train/record_2019-01-05.csv
4 ../dataset/Metro_train/record_2019-01-06.csv
5 ../dataset/Metro_train/record_2019-01-07.csv
6 ../dataset/Metro_train/record_2019-01-08.csv
7 ../dataset/Metro_train/record_2019-01-09.csv
8 ../dataset/Metro_train/record_2019-01-10.csv
9 ../dataset/Metro_train/record_2019-01-11.csv
10 ../dataset/Metro_train/record_2019-01-12.csv
11 ../dataset/Metro_train/record_2019-01-13.csv
12 ../dataset/Metro_train/record_2019-01-14.csv
13 ../dataset/Metro_train/record_2019-01-15.csv
14 ../dataset/Metro_train/record_2019-01-16.csv
15 ../dataset/Metro_train/record_2019-01-17.csv
16 ../dataset/Metro_train/record_2019-01-18.csv
17 ../dataset/Metro_train/record_2019-01-19.csv
18 ../dataset/Metro_train/record_2019-01-20.csv
19 ../dataset/Metro_train/record_2019-01-21.csv
20 ../dataset/Metro_train/record_2019-01-22.csv
21

In [None]:
## Features: the one-hot / transfer columns plus the current `inNums`;
## target: `inNums_pre`, the count merged in from the following day's file.
X_train = df_train_set[columns_train + ['inNums']]
y_train = df_train_set['inNums_pre'].values

# Drop the reference to the merged frame. After this, the cell cannot be
# re-run without rebuilding df_train_set first.
del df_train_set

# NOTE(review): recent xgboost releases use `verbosity` instead of `silent` --
# confirm against the installed version.
xgb_params = { 'booster':'gblinear','eval_metric': 'mae', 'silent': True, 'nthread': 4}

folds = KFold(n_splits=5, shuffle=True, random_state=2019)
# Keep every fold's booster; `clf` alone only ever holds the last one.
list_clf = []
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))

    trn_data = xgb.DMatrix(X_train.iloc[trn_idx], y_train[trn_idx])
    val_data = xgb.DMatrix(X_train.iloc[val_idx], y_train[val_idx])

    # Early stopping is evaluated on the last entry of the watchlist.
    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
    clf = xgb.train(dtrain=trn_data, num_boost_round=20000,
                    evals=watchlist, early_stopping_rounds=200,
                    verbose_eval=100, params=xgb_params)
    list_clf.append(clf)

fold n°1
