## 分析提取特征
- data/com_training.txt：预处理过后的数据
 - 2017.03.01-2017.07.31
 - [6, 7, 8, 13, 14, 15, 16, 17, 18]
 - log1p处理
 - 填充nan值
 - 5049000 rows 

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.model_selection import ParameterGrid

In [3]:
df = pd.read_csv('../../TimeSeriesPrediction/data/com_training.txt', delimiter=';', parse_dates=['time_interval_begin'], dtype={'link_ID': object})
df.head(10)

Unnamed: 0,link_ID,date,time_interval_begin,travel_time,imputation1
0,3377906280028510514,2017-03-01,2017-03-01 06:00:00,1.647113,True
1,3377906280028510514,2017-03-01,2017-03-01 06:02:00,1.656736,True
2,3377906280028510514,2017-03-01,2017-03-01 06:04:00,1.658209,True
3,3377906280028510514,2017-03-01,2017-03-01 06:06:00,1.663662,True
4,3377906280028510514,2017-03-01,2017-03-01 06:08:00,1.672619,True
5,3377906280028510514,2017-03-01,2017-03-01 06:10:00,1.629241,False
6,3377906280028510514,2017-03-01,2017-03-01 06:12:00,1.629241,False
7,3377906280028510514,2017-03-01,2017-03-01 06:14:00,1.629241,False
8,3377906280028510514,2017-03-01,2017-03-01 06:16:00,1.689991,True
9,3377906280028510514,2017-03-01,2017-03-01 06:18:00,1.695072,True


In [None]:
# 时间特征
def create_lagging(df, df_original, i):
    df1 = df_original.copy()
    df1['time_interval_begin'] = df1['time_interval_begin'] + pd.DateOffset(minutes=i * 2)
    df1 = df1.rename(columns={'travel_time': 'lagging' + str(i)})
    df2 = pd.merge(df, df1[['link_ID', 'time_interval_begin', 'lagging' + str(i)]],
                   on=['link_ID', 'time_interval_begin'],
                   how='left')
    return df2

df1 = create_lagging(df, df, 1)

for i in range(2, 6):
    df1 = create_lagging(df1, df, i)

In [10]:
# 长、宽特征
link_infos = pd.read_csv('data/gy_contest_link_info.txt', delimiter=';', dtype={'link_ID': object})
link_tops = pd.read_csv('data/gy_contest_link_top.txt', delimiter=';', dtype={'link_ID': object})
link_tops['in_links'] = link_tops['in_links'].str.len().apply(lambda x: np.floor(x / 19))
link_tops['out_links'] = link_tops['out_links'].str.len().apply(lambda x: np.floor(x / 19))
link_tops = link_tops.fillna(0)
link_infos = pd.merge(link_infos, link_tops, on=['link_ID'], how='left')
link_infos['links_num'] = link_infos["in_links"].astype('str') + "," + link_infos["out_links"].astype('str')
link_infos['area'] = link_infos['length'] * link_infos['width']
df2 = pd.merge(df1, link_infos[['link_ID', 'length', 'width', 'links_num', 'area']], on=['link_ID'], how='left')

# links_num feature
df2.loc[df2['links_num'].isin(['0.0,2.0', '2.0,0.0', '1.0,0.0']), 'links_num'] = 'other'
# df.boxplot(by=['links_num'], column='travel_time')
# plt.show()

# vacation feature
df2.loc[df2['date'].isin(
    ['2017-04-02', '2017-04-03', '2017-04-04', '2017-04-29', '2017-04-30', '2017-05-01',
     '2017-05-28', '2017-05-29', '2017-05-30']), 'vacation'] = 1
df2.loc[~df2['date'].isin(
    ['2017-04-02', '2017-04-03', '2017-04-04', '2017-04-29', '2017-04-30', '2017-05-01',
     '2017-05-28', '2017-05-29', '2017-05-30']), 'vacation'] = 0

# minute_series for CV
# 早午晚都编码成0-178
df2.loc[df2['time_interval_begin'].dt.hour.isin([6, 7, 8]), 'minute_series'] = \
    df2['time_interval_begin'].dt.minute + (df2['time_interval_begin'].dt.hour - 6) * 60

df2.loc[df2['time_interval_begin'].dt.hour.isin([13, 14, 15]), 'minute_series'] = \
    df2['time_interval_begin'].dt.minute + (df2['time_interval_begin'].dt.hour - 13) * 60

df2.loc[df2['time_interval_begin'].dt.hour.isin([16, 17, 18]), 'minute_series'] = \
    df2['time_interval_begin'].dt.minute + (df2['time_interval_begin'].dt.hour - 16) * 60

# day_of_week_en feature
df2['day_of_week'] = df2['time_interval_begin'].map(lambda x: x.weekday() + 1)
df2.loc[df2['day_of_week'].isin([1, 2, 3]), 'day_of_week_en'] = 1
df2.loc[df2['day_of_week'].isin([4, 5]), 'day_of_week_en'] = 2
df2.loc[df2['day_of_week'].isin([6, 7]), 'day_of_week_en'] = 3

# hour_en feature
df2.loc[df['time_interval_begin'].dt.hour.isin([6, 7, 8]), 'hour_en'] = 1
df2.loc[df['time_interval_begin'].dt.hour.isin([13, 14, 15]), 'hour_en'] = 2
df2.loc[df['time_interval_begin'].dt.hour.isin([16, 17, 18]), 'hour_en'] = 3

# week_hour feature
df2['week_hour'] = df2["day_of_week_en"].astype('str') + "," + df2["hour_en"].astype('str')

# df2.boxplot(by=['week_hour'], column='travel_time')
# plt.show()

df2 = pd.get_dummies(df2, columns=['week_hour', 'links_num', 'width'])

# ID Label Encode
def mean_time(group):
    group['link_ID_en'] = group['travel_time'].mean()
    return group

df2 = df2.groupby('link_ID').apply(mean_time)
sorted_link = np.sort(df2['link_ID_en'].unique())
df2['link_ID_en'] = df2['link_ID_en'].map(lambda x: np.argmin(x >= sorted_link))
# df.boxplot(by=['link_ID_en'], column='travel_time')
# plt.show()

In [11]:
df2.to_csv('../../TimeSeriesPrediction/data/training.txt', header=True, index=None, sep=';', mode='w')

In [23]:
def xgboost_submit(df, params):
    train_df = df.loc[df['time_interval_begin'] < pd.to_datetime('2017-07-01')]

    train_df = train_df.dropna()
    X = train_df[train_feature].values
    y = train_df['travel_time'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

    eval_set = [(X_test, y_test)]
    regressor = xgb.XGBRegressor(learning_rate=params['learning_rate'], n_estimators=params['n_estimators'],
                                 booster='gbtree', objective='reg:linear', n_jobs=-1, subsample=params['subsample'],
                                 colsample_bytree=params['colsample_bytree'], random_state=0,
                                 max_depth=params['max_depth'], gamma=params['gamma'],
                                 min_child_weight=params['min_child_weight'], reg_alpha=params['reg_alpha'])
    regressor.fit(X_train, y_train, verbose=True, early_stopping_rounds=10, eval_metric=mape_ln,
                  eval_set=eval_set)
    feature_vis(regressor, train_feature)
    joblib.dump(regressor, 'model/xgbr.pkl')
    print regressor
    submission(train_feature, regressor, df, 'submission/xgbr1.txt', 'submission/xgbr2.txt', 'submission/xgbr3.txt',
               'submission/xgbr4.txt')


def fit_evaluate(df, df_test, params):
    df = df.dropna()
    X = df[train_feature].values
    y = df['travel_time'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

    df_test = df_test[valid_feature].values
    valid_data = bucket_data(df_test)

    eval_set = [(X_test, y_test)]
    regressor = xgb.XGBRegressor(learning_rate=params['learning_rate'], n_estimators=params['n_estimators'],
                                 objective='reg:linear', subsample=params['subsample'],
                                 colsample_bytree=params['colsample_bytree'],
                                 max_depth=params['max_depth'], gamma=params['gamma'],
                                 min_child_weight=params['min_child_weight'], reg_alpha=params['reg_alpha'])
    regressor.fit(X_train, y_train, verbose=False, early_stopping_rounds=10, eval_metric=mape_ln,
                  eval_set=eval_set)
    # feature_vis(regressor, train_feature)

    return regressor, cross_valid(regressor, valid_data,
                                  lagging=lagging), regressor.best_iteration, regressor.best_score


def train(df, params, best, vis=False):
    train1 = df.loc[df['time_interval_begin'] <= pd.to_datetime('2017-03-24')]  # 0301-0324
    train2 = df.loc[
        (df['time_interval_begin'] > pd.to_datetime('2017-03-24')) & (
            df['time_interval_begin'] <= pd.to_datetime('2017-04-18'))]  # 0324-0418
    train3 = df.loc[
        (df['time_interval_begin'] > pd.to_datetime('2017-04-18')) & (
            df['time_interval_begin'] <= pd.to_datetime('2017-05-12'))]  # 0418-0512
    train4 = df.loc[
        (df['time_interval_begin'] > pd.to_datetime('2017-05-12')) & (
            df['time_interval_begin'] <= pd.to_datetime('2017-06-06'))]  # 0512-0606
    train5 = df.loc[
        (df['time_interval_begin'] > pd.to_datetime('2017-06-06')) & (
            df['time_interval_begin'] <= pd.to_datetime('2017-06-30'))]  # 0606-0630

    regressor, loss1, best_iteration1, best_score1 = fit_evaluate(pd.concat([train1, train2, train3, train4]), train5,
                                                                  params)
    print (best_iteration1, best_score1, loss1)

    regressor, loss2, best_iteration2, best_score2 = fit_evaluate(pd.concat([train1, train2, train3, train5]), train4,
                                                                  params)
    print (best_iteration2, best_score2, loss2)

    regressor, loss3, best_iteration3, best_score3 = fit_evaluate(pd.concat([train1, train2, train4, train5]), train3,
                                                                  params)
    print (best_iteration3, best_score3, loss3)

    regressor, loss4, best_iteration4, best_score4 = fit_evaluate(pd.concat([train1, train3, train4, train5]), train2,
                                                                  params)
    print (best_iteration4, best_score4, loss4)

    regressor, loss5, best_iteration5, best_score5 = fit_evaluate(pd.concat([train2, train3, train4, train5]), train1,
                                                                  params)
    print (best_iteration5, best_score5, loss5)

    if vis:
        xgb.plot_tree(regressor, num_trees=5)
        results = regressor.evals_result()
        epochs = len(results['validation_0']['rmse'])
        x_axis = range(0, epochs)
        fig, ax = plt.subplots()
        ax.plot(x_axis, results['validation_0']['rmse'], label='Train')
        ax.plot(x_axis, results['validation_1']['rmse'], label='Test')
        ax.legend()
        plt.ylabel('rmse Loss')
        plt.ylim((0.2, 0.3))
        plt.show()

    loss = [loss1, loss2, loss3, loss4, loss5]
    params['loss_std'] = np.std(loss)
    params['loss'] = str(loss)
    params['mean_loss'] = np.mean(loss)
    params['n_estimators'] = str([best_iteration1, best_iteration2, best_iteration3, best_iteration4, best_iteration5])
    params['best_score'] = str([best_score1, best_score2, best_score3, best_score4, best_score5])

    print str(params)
    if np.mean(loss) <= best:
        best = np.mean(loss)
        print "best with: " + str(params)
#         feature_vis(regressor, train_feature)
    return best

In [6]:
# train_feature中包括lagging特征
# valid_feature中不包括lagging特征，包括['minute_series', 'travel_time']

lagging = 5
df = pd.read_csv('../../TimeSeriesPrediction/data/training.txt', delimiter=';', parse_dates=['time_interval_begin'], dtype={'link_ID': object})
lagging_feature = ['lagging%01d' % e for e in range(lagging, 0, -1)]  # 生成lagging特征
base_feature = [x for x in df.columns.values.tolist() if x not in ['time_interval_begin', 'link_ID', 'link_ID_int',
                                                                   'date', 'travel_time', 'imputation1',
                                                                   'minute_series', 'area', 'hour_en', 'day_of_week']]
base_feature = [x for x in base_feature if x not in lagging_feature]
train_feature = list(base_feature)
train_feature.extend(lagging_feature)
valid_feature = list(base_feature)
valid_feature.extend(['minute_series', 'travel_time'])
print train_feature
print valid_feature

['length', 'vacation', 'day_of_week_en', 'week_hour_1.0,1.0', 'week_hour_1.0,2.0', 'week_hour_1.0,3.0', 'week_hour_2.0,1.0', 'week_hour_2.0,2.0', 'week_hour_2.0,3.0', 'week_hour_3.0,1.0', 'week_hour_3.0,2.0', 'week_hour_3.0,3.0', 'links_num_0.0,1.0', 'links_num_1.0,1.0', 'links_num_1.0,2.0', 'links_num_1.0,3.0', 'links_num_1.0,4.0', 'links_num_2.0,1.0', 'links_num_2.0,2.0', 'links_num_3.0,1.0', 'links_num_4.0,1.0', 'links_num_other', 'width_3', 'width_6', 'width_9', 'width_12', 'width_15', 'link_ID_en', 'lagging5', 'lagging4', 'lagging3', 'lagging2', 'lagging1']
['length', 'vacation', 'day_of_week_en', 'week_hour_1.0,1.0', 'week_hour_1.0,2.0', 'week_hour_1.0,3.0', 'week_hour_2.0,1.0', 'week_hour_2.0,2.0', 'week_hour_2.0,3.0', 'week_hour_3.0,1.0', 'week_hour_3.0,2.0', 'week_hour_3.0,3.0', 'links_num_0.0,1.0', 'links_num_1.0,1.0', 'links_num_1.0,2.0', 'links_num_1.0,3.0', 'links_num_1.0,4.0', 'links_num_2.0,1.0', 'links_num_2.0,2.0', 'links_num_3.0,1.0', 'links_num_4.0,1.0', 'links_num_o

In [24]:
# train
params_grid = {
    'learning_rate': [0.05],
    'n_estimators': [100],
    'subsample': [0.6],
    'colsample_bytree': [0.6],
    'max_depth': [7],
    'min_child_weight': [1],
    'reg_alpha': [2],
    'gamma': [0]
}

grid = ParameterGrid(params_grid)
best = 1

for params in grid:
    best = train(df, params, best)

(0, 0.865125, 0.80786214674780721)
(0, 0.866922, 0.8240044555484739)
(0, 0.866211, 0.79502990524713169)
(0, 0.866606, 0.79436080458845282)
(0, 0.866445, 0.77696943932234408)
{'loss': '[0.80786214674780721, 0.8240044555484739, 0.79502990524713169, 0.79436080458845282, 0.77696943932234408]', 'reg_alpha': 2, 'colsample_bytree': 0.6, 'learning_rate': 0.05, 'min_child_weight': 1, 'best_score': '[0.865125, 0.866922, 0.866211, 0.866606, 0.866445]', 'n_estimators': '[0, 0, 0, 0, 0]', 'subsample': 0.6, 'mean_loss': 0.79964535029084194, 'loss_std': 0.015648050661031108, 'max_depth': 7, 'gamma': 0}
best with: {'loss': '[0.80786214674780721, 0.8240044555484739, 0.79502990524713169, 0.79436080458845282, 0.77696943932234408]', 'reg_alpha': 2, 'colsample_bytree': 0.6, 'learning_rate': 0.05, 'min_child_weight': 1, 'best_score': '[0.865125, 0.866922, 0.866211, 0.866606, 0.866445]', 'n_estimators': '[0, 0, 0, 0, 0]', 'subsample': 0.6, 'mean_loss': 0.79964535029084194, 'loss_std': 0.015648050661031108, '

In [18]:
# key为倒数第二位，value为余下特征
def bucket_data(lines):  # lines是df[valid_feature].values
    bucket = {}
    for line in lines:
        time_series = line[-2]  #　倒数第二位为time_series特征，0-178
        bucket[time_series] = []  # bucket的keys为0-178
    for line in lines:
        time_series, y1 = line[-2:]  # 倒数第二位，y1最后一位
        line = np.delete(line, -2, axis=0)  # 删除倒数第二位
        bucket[time_series].append(line)  # key为倒数第二位，value为余下特征
    return bucket


def cross_valid(regressor, bucket, lagging):
    valid_loss = []
    last = [[] for i in range(len(bucket[bucket.keys()[0]]))]
    for time_series in sorted(bucket.keys(), key=float):
        if time_series >= 120:
            if int(time_series) in range(120, 120 + lagging * 2, 2):
                last = np.concatenate((last, np.array(bucket[time_series], dtype=float)[:, -1].reshape(-1, 1)), axis=1)
            else:
                batch = np.array(bucket[time_series], dtype=float)
                y = batch[:, -1]
                batch = np.delete(batch, -1, axis=1)
                batch = np.concatenate((batch, last), axis=1)
                y_pre = regressor.predict(batch)
                last = np.delete(last, 0, axis=1)
                last = np.concatenate((last, y_pre.reshape(-1, 1)), axis=1)
                loss = np.mean(abs(np.expm1(y) - np.expm1(y_pre)) / np.expm1(y))
                valid_loss.append(loss)
    # print 'day: %d loss: %f' % (int(day), day_loss)
    return np.mean(valid_loss)


def mape_ln(y, d):
    c = d.get_label()
    result = np.sum(np.abs((np.expm1(y) - np.expm1(c)) / np.expm1(c))) / len(c)
    return "mape", result


def feature_vis(regressor, train_feature):
    importances = regressor.feature_importances_
    indices = np.argsort(importances)[::-1]
    selected_features = [train_feature[e] for e in indices]
    plt.figure(figsize=(20, 10))
    plt.title("train_feature importances")
    plt.bar(range(len(train_feature)), importances[indices],
            color="r", align="center")
    plt.xticks(range(len(selected_features)), selected_features, rotation=70)
    plt.show()


# ------------------------------------------------Submission ---------------------------------------------


def submission(train_feature, regressor, df, file1, file2, file3, file4):
    test_df = df.loc[((df['time_interval_begin'].dt.year == 2017) & (df['time_interval_begin'].dt.month == 7)
                      & (df['time_interval_begin'].dt.hour.isin([7, 14, 17])) & (
                          df['time_interval_begin'].dt.minute == 58))].copy()

    test_df['lagging5'] = test_df['lagging4']
    test_df['lagging4'] = test_df['lagging3']
    test_df['lagging3'] = test_df['lagging2']
    test_df['lagging2'] = test_df['lagging1']
    test_df['lagging1'] = test_df['travel_time']

    with open(file1, 'w'):
        pass
    with open(file2, 'w'):
        pass
    with open(file3, 'w'):
        pass
    with open(file4, 'w'):
        pass

    for i in range(30):
        test_X = test_df[train_feature]
        y_prediction = regressor.predict(test_X.values)

        test_df['lagging5'] = test_df['lagging4']
        test_df['lagging4'] = test_df['lagging3']
        test_df['lagging3'] = test_df['lagging2']
        test_df['lagging2'] = test_df['lagging1']
        test_df['lagging1'] = y_prediction

        test_df['predicted'] = np.expm1(y_prediction)
        test_df['time_interval_begin'] = test_df['time_interval_begin'] + pd.DateOffset(minutes=2)
        test_df['time_interval'] = test_df['time_interval_begin'].map(
            lambda x: '[' + str(x) + ',' + str(x + pd.DateOffset(minutes=2)) + ')')
        test_df.time_interval = test_df.time_interval.astype(object)
        if i < 7:
            test_df[['link_ID', 'date', 'time_interval', 'predicted']].to_csv(file1, mode='a', header=False,
                                                                              index=False,
                                                                              sep=';')
        elif (7 <= i) and (i < 14):
            test_df[['link_ID', 'date', 'time_interval', 'predicted']].to_csv(file2, mode='a', header=False,
                                                                              index=False,
                                                                              sep=';')
        elif (14 <= i) and (i < 22):
            test_df[['link_ID', 'date', 'time_interval', 'predicted']].to_csv(file3, mode='a', header=False,
                                                                              index=False,
                                                                              sep=';')
        else:
            test_df[['link_ID', 'date', 'time_interval', 'predicted']].to_csv(file4, mode='a', header=False,
                                                                              index=False,
                                                                              sep=';')

In [2]:
0.80786214674780721+0.8240044555484739+0.79502990524713169+0.79436080458845282+0.77696943932234408

3.99822675145421

In [4]:
1- 3.99822675145421/5

0.20035464970915806

In [16]:
# key为倒数第二位，value为余下特征
def bucket_data(lines):  # lines是df[valid_feature].values
    bucket = {}
    for line in lines:
        time_series = line[-2]  #　倒数第二位为time_series特征，0-178
        bucket[time_series] = []  # bucket的keys为0-178
    for line in lines:
        time_series, y1 = line[-2:]  # 倒数第二位，y1最后一位
        line = np.delete(line, -2, axis=0)  # 删除倒数第二位
        bucket[time_series].append(line)  # key为倒数第二位，value为余下特征
    return bucket


def cross_valid(regressor, bucket, lagging):
    valid_loss = []
    last = [[] for i in range(len(bucket[bucket.keys()[0]]))]  # 有多少个数据
    for time_series in sorted(bucket.keys(), key=float):
        if time_series >= 120:  # 增加lagging特征
            if int(time_series) in range(120, 120 + lagging * 2, 2):
                last = np.concatenate((last, np.array(bucket[time_series], dtype=float)[:, -1].reshape(-1, 1)), axis=1)
            else:
                batch = np.array(bucket[time_series], dtype=float)  # 取出特征的值
                y = batch[:, -1]  # travel_time
                batch = np.delete(batch, -1, axis=1)  # 删除最后一行travel_time
                batch = np.concatenate((batch, last), axis=1)
                y_pre = regressor.predict(batch)
                last = np.delete(last, 0, axis=1)
                last = np.concatenate((last, y_pre.reshape(-1, 1)), axis=1)
                loss = np.mean(abs(np.expm1(y) - np.expm1(y_pre)) / np.expm1(y))
                valid_loss.append(loss)
    # print 'day: %d loss: %f' % (int(day), day_loss)
    return np.mean(valid_loss)

In [17]:
df_test = train1[valid_feature].values
valid_data = bucket_data(df_test)

In [14]:
train1 = df.loc[df['time_interval_begin'] <= pd.to_datetime('2017-03-24')]  # 0301-0324

In [33]:
batch = np.array(valid_data[valid_data.keys()[0]], dtype=float)

In [38]:
range(120, 120 + 5 * 2, 2)

[120, 122, 124, 126, 128]

In [None]:
last = np.concatenate((last, np.array(bucket[time_series], dtype=float)[:, -1].reshape(-1, 1)), axis=1)

In [55]:
last = np.concatenate((last, np.array(valid_data[valid_data.keys()[70]], dtype=float)[:, -1].reshape(-1, 1)), axis=1)

In [54]:
np.concatenate((last, np.array(valid_data[valid_data.keys()[70]], dtype=float)[:, -1].reshape(-1, 1)), axis=1)

array([[ 1.93703611,  1.93703611],
       [ 1.98123143,  1.98123143],
       [ 1.80828877,  1.80828877],
       ..., 
       [ 1.22377543,  1.22377543],
       [ 1.60943791,  1.60943791],
       [ 1.45861502,  1.45861502]])

In [59]:
np.delete(last, 0, axis=1)

array([[ 1.93703611],
       [ 1.98123143],
       [ 1.80828877],
       ..., 
       [ 1.22377543],
       [ 1.60943791],
       [ 1.45861502]])