In [34]:
import warnings
import numpy as np
import pandas as pd
import xgboost as xgb
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
warnings.filterwarnings('ignore')
%matplotlib inline

In [40]:
def label_holiday_miss(data, holiday):
    """Index ``data`` by its '时间' column and add a ``holiday_flag`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a '时间' column; it becomes the index of the result.
        The caller's frame is not modified (``set_index`` returns a new one).
    holiday : pandas.DataFrame
        Holiday table read positionally: column 0 holds the flag value,
        columns 1/3/5 hold period start dates and columns 2/4/6 the matching
        end dates. Every cell is converted to ``str`` before use.

    Returns
    -------
    pandas.DataFrame
        ``data`` indexed by '时间' with ``holiday_flag`` equal to 0 outside
        every period and to the (string) flag value inside a period.
    """
    holiday = holiday.astype(str)
    data = data.set_index('时间')
    # The flags written below are strings (holiday was cast to str), so the
    # column is created as object dtype up front instead of relying on the
    # implicit int64 -> object upcast, which newer pandas deprecates.
    data['holiday_flag'] = np.zeros(len(data), dtype=object)
    period_starts = [d for col in (1, 3, 5) for d in holiday.iloc[:, col].tolist()]
    period_ends = [d for col in (2, 4, 6) for d in holiday.iloc[:, col].tolist()]
    # The flag column is repeated once per (start, end) column pair.
    flags = holiday.iloc[:, 0].tolist() * 3
    for start, end, flag in zip(period_starts, period_ends, flags):
        # Label-based slice on the time index; both bounds are inclusive.
        data.loc[start:end, 'holiday_flag'] = flag
    return data

def split_abc(data):
    """Split ``data`` into one frame per city ('A', 'B', 'C').

    For each city the rows whose '地市' column equals the city label are
    selected, the '地市' column is dropped, and missing values are
    back-filled (each NaN takes the next valid value below it in the same
    column; trailing NaNs stay NaN).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data_a, data_b, data_c)``; the input frame is left untouched.
    """
    def _city_frame(city):
        # .bfill() replaces the deprecated fillna(method='bfill', inplace=True).
        return data[data['地市'] == city].drop(['地市'], axis=1).bfill()

    return _city_frame('A'), _city_frame('B'), _city_frame('C')

def xgb_mape(y_predicted, dtrain_a):
    """Custom evaluation metric: mean absolute percentage error.

    ``dtrain_a`` must expose ``get_label()`` returning the true values.
    Returns the pair ``('mape', score)``. The score is the mean of
    ``|prediction - label| / |label|`` and is not multiplied by 100.
    """
    labels = dtrain_a.get_label()
    relative_error = np.abs((y_predicted - labels) / labels)
    return 'mape', np.mean(relative_error)

def xgb_smape(y_predicted, dtrain_a):
    """Custom evaluation metric: symmetric MAPE, expressed in percent.

    ``dtrain_a`` must expose ``get_label()`` returning the true values.
    Returns the pair ``('smape', score)`` where the score is
    ``2 * mean(|p - y| / (|p| + |y|)) * 100``.
    """
    labels = dtrain_a.get_label()
    abs_error = np.abs(y_predicted - labels)
    magnitude = np.abs(y_predicted) + np.abs(labels)
    return 'smape', 2.0 * np.mean(abs_error / magnitude) * 100

def add_date_features(train_df, test_df, encode_cols=['Year','Month','WeekofYear','Dayofyear','holiday_flag']):
    """Stack train and test rows and add calendar features from the index.

    Both frames must be indexed by a ``DatetimeIndex``. The columns
    ``Year``, ``Month``, ``WeekofYear``, ``Dayofyear``, ``DayofWeek`` and
    ``quarter`` are derived from that index, then every column named in
    ``encode_cols`` is one-hot encoded with ``pd.get_dummies``.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames to stack (train rows first); neither is modified.
    encode_cols : list of str
        Columns to one-hot encode. With the default, ``holiday_flag`` must
        already be present in the inputs. The default list is never mutated.

    Returns
    -------
    pandas.DataFrame
        The stacked frame with the added and encoded feature columns.
    """
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the same way.
    df = pd.concat([train_df, test_df])
    idx = df.index
    df['Year'] = idx.year
    df['Month'] = idx.month
    # DatetimeIndex.weekofyear was removed in pandas 2.0. isocalendar().week
    # gives the same ISO week number; plain ints are assigned positionally so
    # no index alignment takes place.
    if hasattr(idx, 'isocalendar'):
        df['WeekofYear'] = idx.isocalendar().week.astype('int64').to_numpy()
    else:  # older pandas without isocalendar()
        df['WeekofYear'] = idx.weekofyear
    df['Dayofyear'] = idx.dayofyear
    df['DayofWeek'] = idx.weekday
    df['quarter'] = idx.quarter
    # One-hot encode the requested columns.
    encode_cols = list(encode_cols)
    for col in encode_cols:
        df[col] = df[col].astype('category')
    return pd.get_dummies(df, columns=encode_cols)

def get_test_data(start_date, end_date, bucket_size):
    """Build a placeholder frame covering the forecast horizon.

    The result has a '时间' column holding
    ``pd.date_range(start_date, end_date, freq=bucket_size)`` and a '流量'
    column filled with the sentinel ``-11``.
    """
    timestamps = pd.date_range(start_date, end_date, freq=bucket_size)
    placeholder = pd.DataFrame({'时间': timestamps})
    return placeholder.assign(**{'流量': -11})

def add_trend(train_df, test_df):
    """Fit a linear trend to log-traffic and attach it to train and test rows.

    A ``LinearRegression`` of ``log(流量)`` on the row position
    (0, 1, 2, ...) is fitted on the training rows and extrapolated over the
    test rows that follow them. Three columns are added:

    * ``log流量`` - natural log of '流量' for train rows, ``-11`` for test rows
    * ``trend``   - fitted / extrapolated linear trend
    * ``diff``    - ``log流量 - trend``

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with a '流量' column. Test rows are recognised afterwards by
        ``流量 == -11``. Both inputs are copied, so the caller's frames
        (often slices of a larger frame) are not modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The train and test rows with the added columns.
    """
    # Work on copies so no columns are written into the caller's slices.
    train_df = train_df.copy()
    test_df = test_df.copy()
    train_df['log流量'] = np.log(train_df['流量'])
    test_df['log流量'] = -11

    n_train, n_test = train_df.shape[0], test_df.shape[0]
    lr = LinearRegression()
    lr.fit(np.arange(n_train).reshape(-1, 1), train_df['log流量'])
    # One feature and a 1-D target, so coef_ holds a single slope.
    slope, intercept = lr.coef_[0], lr.intercept_

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the same way.
    df = pd.concat([train_df, test_df])
    # 1-D trend over train + test positions (avoids assigning an (n, 1) array).
    df['trend'] = np.arange(n_train + n_test) * slope + intercept
    df['diff'] = df['log流量'] - df['trend']

    is_test = df['流量'] == -11
    return df[~is_test], df[is_test]

In [41]:
def xgb_regression(data, params, n_fold=5, feval=None):
    """Cross-validated XGBoost fit on the de-trended log target.

    Rows with ``流量 == -11`` are the forecast rows; all others are training
    rows. The model is trained on the ``diff`` column; predictions are mapped
    back to the original scale with ``exp(prediction + trend)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing '流量', 'log流量', 'diff' and 'trend'; every other
        column is used as a feature.
    params : dict
        Parameters passed to ``xgb.train``.
    n_fold : int
        Number of contiguous (unshuffled) K-fold splits.
    feval : callable or None
        Custom evaluation function forwarded to ``xgb.train``. When given,
        it is evaluated on the validation fold alongside the built-in metric
        and therefore takes part in early stopping.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``oof_df`` with out-of-fold predictions for the training rows and
        ``pre_df`` with the fold-averaged forecast, each with columns
        '时间' (taken from the index of ``data``) and '流量'.
    """
    non_feature_cols = ['流量', 'log流量', 'diff', 'trend']
    is_test = data['流量'] == -11
    train, test = data[~is_test], data[is_test]

    # Cast to float so the matrix is numeric even when the one-hot columns
    # are bool (mixed bool/int columns would otherwise give an object array).
    X = train.drop(non_feature_cols, axis=1).values.astype(float)
    Y = train['diff'].values
    test_X = test.drop(non_feature_cols, axis=1).values.astype(float)
    trend = train['trend'].values
    test_trend = test['trend'].values

    # No random_state: without shuffle=True it has no effect, and current
    # scikit-learn rejects the combination. The folds stay contiguous.
    kf = KFold(n_splits=n_fold)
    oof = np.zeros(X.shape[0])
    pre = np.zeros(test_X.shape[0])
    dtest = xgb.DMatrix(test_X)
    for train_index, valid_index in kf.split(X):
        dtrain = xgb.DMatrix(X[train_index, :], Y[train_index])
        dvalid = xgb.DMatrix(X[valid_index, :], Y[valid_index])
        # The caller's feval is forwarded (it used to be hard-coded to None).
        model = xgb.train(params, dtrain, 10000,
                          evals=[(dtrain, 'train'), (dvalid, 'validate')],
                          early_stopping_rounds=30, verbose_eval=0, feval=feval)
        # NOTE(review): predict() is called without an explicit iteration
        # limit; whether the best early-stopped iteration is used depends on
        # the installed xgboost version - confirm.
        oof[valid_index] = model.predict(dvalid)
        pre += model.predict(dtest) / n_fold

    # Undo the de-trending and the log transform.
    oof = np.exp(oof + trend)
    pre = np.exp(pre + test_trend)
    oof_df = pd.DataFrame({'时间': train.index, '流量': oof})
    pre_df = pd.DataFrame({'时间': test.index, '流量': pre})
    return oof_df, pre_df

def result_visualization(pre, oof, original_data):
    """Plot observed traffic, out-of-fold predictions and the forecast.

    ``original_data`` and ``oof`` are drawn from x = 0; the forecast ``pre``
    is drawn immediately after the last row of ``original_data``. All three
    frames must have a '流量' column.
    """
    n_observed = original_data.shape[0]
    forecast_x = np.arange(n_observed, n_observed + pre.shape[0])

    plt.figure(figsize=(30, 15))
    plt.plot(original_data['流量'].values, alpha=0.6, label='train')
    plt.plot(oof['流量'].values, alpha=0.5, label='predict')
    plt.plot(forecast_x, pre['流量'].values, label='forecast')
    plt.legend()

# load data

In [42]:
# Read the holiday calendar and the raw traffic table.
holiday = pd.read_excel('../data/holiday_dates.xlsx')
data = pd.read_excel('../data/data_1.xlsx')

# Flag holiday periods, then split the traffic table by city (A, B, C).
data = label_holiday_miss(data, holiday)
data_a, data_b, data_c = split_abc(data)

# Forecast-horizon placeholder shared by all three cities, with the same
# holiday flags (author's note: only Spring Festival and New Year's Day).
test = label_holiday_miss(
    get_test_data('2018-11-16 00:00:00', '2019-2-18 23:00:00', 'H'),
    holiday,
)

# city A

In [43]:
# XGBoost training parameters shared by all three cities.
params = {
    'booster': 'gbtree',
    # NOTE(review): 'reg:linear' is the legacy name of 'reg:squarederror' in
    # newer xgboost releases - switch when the xgboost version allows it.
    'objective': 'reg:linear',
    'subsample': 0.8,          # row subsampling per tree
    # Was 'colsample', which is not an XGBoost parameter, so the intended
    # 80% column subsampling was never applied.
    'colsample_bytree': 0.8,
    'eta': 0.03,
    'max_depth': 10,
    'seed': 42,
}

In [44]:
# City A: one model per hour of day. The per-hour frames are collected in
# lists and concatenated once, because DataFrame.append was removed in
# pandas 2.0 and growing a frame inside a loop is quadratic.
forecast_parts_a = []
oof_parts_a = []
for i in tqdm(range(24)):
    train_a = data_a[data_a.index.hour == i]
    test_a = test[test.index.hour == i]
    train_a, test_a = add_trend(train_a, test_a)
    temp = add_date_features(train_a, test_a)
    oof, pre = xgb_regression(temp, params, feval=xgb_mape)
    forecast_parts_a.append(pre)
    oof_parts_a.append(oof)
# result_a holds the forecast, pred_a the out-of-fold predictions.
result_a = pd.concat(forecast_parts_a).sort_values(by='时间')
pred_a = pd.concat(oof_parts_a).sort_values(by='时间')

100%|██████████| 24/24 [03:57<00:00,  9.68s/it]


In [46]:
# City B: one model per hour of day. The per-hour frames are collected in
# lists and concatenated once, because DataFrame.append was removed in
# pandas 2.0 and growing a frame inside a loop is quadratic.
forecast_parts_b = []
oof_parts_b = []
for i in tqdm(range(24)):
    train_b = data_b[data_b.index.hour == i]
    test_b = test[test.index.hour == i]
    train_b, test_b = add_trend(train_b, test_b)
    temp = add_date_features(train_b, test_b)
    oof, pre = xgb_regression(temp, params, feval=xgb_mape)
    forecast_parts_b.append(pre)
    oof_parts_b.append(oof)
# result_b holds the forecast, pred_b the out-of-fold predictions.
result_b = pd.concat(forecast_parts_b).sort_values(by='时间')
pred_b = pd.concat(oof_parts_b).sort_values(by='时间')

100%|██████████| 24/24 [03:35<00:00,  8.95s/it]


# city C

In [48]:
# City C: one model per hour of day. The per-hour frames are collected in
# lists and concatenated once, because DataFrame.append was removed in
# pandas 2.0 and growing a frame inside a loop is quadratic.
forecast_parts_c = []
oof_parts_c = []
for i in tqdm(range(24)):
    train_c = data_c[data_c.index.hour == i]
    test_c = test[test.index.hour == i]
    train_c, test_c = add_trend(train_c, test_c)
    temp = add_date_features(train_c, test_c)
    oof, pre = xgb_regression(temp, params, feval=xgb_mape)
    forecast_parts_c.append(pre)
    oof_parts_c.append(oof)
# result_c holds the forecast, pred_c the out-of-fold predictions.
result_c = pd.concat(forecast_parts_c).sort_values(by='时间')
pred_c = pd.concat(oof_parts_c).sort_values(by='时间')

100%|██████████| 24/24 [02:45<00:00,  6.30s/it]


In [32]:
def make_submission(ra, rb, rc, start='2018-11-16 00:00:00', end='2019-2-18 23:00:00'):
    """Assemble the long-format submission table for cities A, B and C.

    Parameters
    ----------
    ra, rb, rc : array-like
        Hourly forecasts for city A, B and C. Each must have one value per
        hour between ``start`` and ``end`` (inclusive).
    start, end : str
        Bounds of the hourly forecast horizon; the defaults are the dates
        that were previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns '时间', '地市', '流量'; all rows of city A, then B, then C.
        Each city block keeps its own 0..n-1 index, as before.
    """
    # An explicit Hour offset avoids the deprecated 'H' frequency alias.
    timeline = pd.date_range(start, end, freq=pd.offsets.Hour())
    city_frames = []
    for city, values in (('A', ra), ('B', rb), ('C', rc)):
        frame = pd.DataFrame({'时间': timeline})
        frame['地市'] = city
        frame['流量'] = values
        city_frames.append(frame)
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the same way.
    return pd.concat(city_frames)

# Combine the three city forecasts, round to 3 decimals and write the
# GBK-encoded submission file.
city_forecasts = [frame['流量'].values for frame in (result_a, result_b, result_c)]
submit = make_submission(*city_forecasts)
submit['流量'] = submit['流量'].round(3)
submit.to_csv('../result_baseline2.csv', index=False, encoding='gbk')