In [40]:
import pandas as pd
from borax.calendars.lunardate import LunarDate
import numpy as np

from tqdm import tqdm
import warnings
import lightgbm as lgb
import xgboost as xgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score,mean_squared_error
import re
import optuna
from optuna.samplers import TPESampler



warnings.filterwarnings('ignore')

df = pd.read_csv('../data/train.csv')
df.columns = ['date','A','B']
df1 = pd.read_csv('../data/wkd_v1.csv')
df1.columns = ['date','type']
df1.date = pd.to_datetime(df1.date)
df.date = pd.to_datetime(df.date)
df = df.merge(df1,on='date',how='left')


In [41]:
def mape(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / (y_true))) 


In [42]:
def yang2nong(date):
    
    year = int(date[:4])
    month = int(date[4:6])
    day = int(date[6:8])
    
    nong =  LunarDate.from_solar_date(year, month, day)
    
    res = nong.__format__('%y-%m-%d')
    return res

In [43]:
# 日期特征工程
def get_inner_date_feature_eng(dataframe):
    dataframe['date'] = pd.to_datetime(dataframe['date'])
    dataframe["month"] = dataframe['date'].dt.month
    dataframe["year"] = dataframe['date'].dt.year
    
    dataframe["day"] = dataframe['date'].dt.day
    dataframe["day_of_week"] = dataframe['date'].dt.dayofweek
    dataframe["dayofy"] = dataframe['date'].dt.dayofyear
    dataframe["week"] = dataframe['date'].dt.week
    dataframe["quarter"] = dataframe['date'].dt.quarter

   

   

    dataframe['week_of_year'] = dataframe.date.dt.weekofyear


    dataframe['is_wknd'] = dataframe.date.dt.weekday // 4
    dataframe['is_month_start'] = dataframe.date.dt.is_month_start.astype(int)
    dataframe['is_month_end'] = dataframe.date.dt.is_month_end.astype(int)

    dataframe['week_block_num'] = [int(x) for x in np.floor((dataframe.date - pd.to_datetime('2017-12-31')).dt.days / 7) + 1]
    dataframe['quarter_block_num'] = (dataframe['year'] - 2018) * 4 + dataframe['quarter']
    dataframe['week_of_month'] = dataframe['week_of_year'].values // 4.35    
    #新增星期几时间变量
    dataframe['is_Mon'] = np.where(dataframe['day_of_week'] == 1, 1, 0)                                                                                       
    dataframe['is_Tue'] = np.where(dataframe['day_of_week'] == 2, 1, 0)                                                                                         
    dataframe['is_Wed'] = np.where(dataframe['day_of_week'] == 3, 1, 0)                                                                                         
    dataframe['is_Thu'] = np.where(dataframe['day_of_week'] == 4, 1, 0)                                                                                         
    dataframe['is_Fri'] = np.where(dataframe['day_of_week'] == 5, 1, 0)                                                                                         
    dataframe['is_Sat'] = np.where(dataframe['day_of_week'] == 6, 1, 0)                                                                                         
    dataframe['is_Sun'] = np.where(dataframe['day_of_week'] == 7, 1, 0)
    
    dataframe['day_of_month_10days']=np.where((dataframe['day']<=10) == 1, 1, 0)
    dataframe['day_of_month_20days']=np.where(((dataframe['day']>10)&(dataframe['day']<=20)) == 1, 1, 0)
    dataframe['day_of_month_30days']=np.where((dataframe['day']>20) == 1, 1, 0)
    dataframe['abs_month']=dataframe['day'].apply(lambda x: abs(x-16)+1)
    
    
    dataframe['CH_month'] = dataframe.China_date.dt.month  #农历月
    dataframe['CH_day_of_month'] = dataframe.China_date.dt.day #农历日
    dataframe['CH_day_of_year'] = dataframe.China_date.dt.dayofyear #农历的哪一天
    dataframe['abs_year']=dataframe['CH_day_of_year'].apply(lambda x: abs(x-182)+1)
    return dataframe


In [44]:
#进行农历转换
df['date1'] = df.date.dt.strftime('%Y%m%d')
df['China_date']=df['date1'].apply(lambda x: yang2nong(x))
#农历2月不同与公历2月，
#因此采用前向填充方式以构建pandas时间戳，难题在与20年存在的闰四月现象    
df.replace(['2018-2-29','2018-2-30','2019-2-29',
            '2020-2-29','2020-2-30'],np.nan,inplace=True)
    
df['China_date']=df['China_date'].fillna(method='ffill')
df['China_date'] = pd.to_datetime(df['China_date'])

In [45]:
k = df[df.date>='2019-06-01']
k1 = k[['date','A','type','China_date']]
k2 = k[['date','B','type','China_date']]

In [49]:
k.columns

Index(['date', 'A', 'type', 'China_date', 'month', 'year', 'day',
       'day_of_week', 'dayofy', 'week', 'quarter', 'week_of_year', 'is_wknd',
       'is_month_start', 'is_month_end', 'week_block_num', 'quarter_block_num',
       'week_of_month', 'is_Mon', 'is_Tue', 'is_Wed', 'is_Thu', 'is_Fri',
       'is_Sat', 'is_Sun', 'day_of_month_10days', 'day_of_month_20days',
       'day_of_month_30days', 'abs_month', 'CH_month', 'CH_day_of_month',
       'CH_day_of_year', 'abs_year'],
      dtype='object')

In [58]:
import numpy as np
k = k1.copy()
k = get_inner_date_feature_eng(k)
train1 = k[['day', 'dayofy', 'week',
            'week_of_year', 'week_block_num', 'CH_day_of_month',
            'CH_day_of_year', 'abs_year',

        ]][:-30]
train1_y = k['A'][:-30]

valid1 = k[['day', 'dayofy', 'week',
            'week_of_year', 'week_block_num', 'CH_day_of_month',
            'CH_day_of_year', 'abs_year',

           ]][-30:]
valid1_y = k['A'][-30:]

In [59]:
test_v1 = pd.read_csv('../data/test.csv')
test_v1.columns = ['date','A','B']
test_v1.date = pd.to_datetime(test_v1.date)
test_v1 = test_v1.merge(df1,on='date',how='left')


#进行农历转换
test_v1['date1'] = test_v1.date.dt.strftime('%Y%m%d')
test_v1['China_date']=test_v1['date1'].apply(lambda x: yang2nong(x))
#农历2月不同与公历2月，
#因此采用前向填充方式以构建pandas时间戳，难题在与20年存在的闰四月现象    
test_v1.replace(['2018-2-29','2018-2-30','2019-2-29',
            '2020-2-29','2020-2-30'],np.nan,inplace=True)
    
test_v1['China_date']=test_v1['China_date'].fillna(method='ffill')
test_v1['China_date'] = pd.to_datetime(test_v1['China_date'])
test_v1_A = test_v1[['date','A','type','China_date']]
test_v1_A = get_inner_date_feature_eng(test_v1_A)
test = test_v1_A[['day', 'dayofy', 'week',
            'week_of_year', 'week_block_num', 'CH_day_of_month',
            'CH_day_of_year', 'abs_year']]

In [60]:
import xgboost as xgb
train_matrix = xgb.DMatrix(train1, train1_y,enable_categorical=True)
test_matrix = xgb.DMatrix(valid1, valid1_y,enable_categorical=True)
evals = [(train_matrix, 'train'), (test_matrix, 'val')]

def my_mape(real_value, pre_value): 
    real_value, pre_value = np.array(real_value), np.array(pre_value)
    return np.mean(np.abs((real_value - pre_value) /( real_value+1)))
def eval_score(pre, train_set):
    real = train_set.get_label()
    score = my_mape(real, pre)
    return 'eval_score', score

params = {
            'booster': 'gbtree',
            'objective': 'reg:squarederror',
            'min_child_weight':10,
            'max_depth': 5,
            'subsample': 0.9,
            'colsample_bytree': 0.8,
            'learning_rate': 0.1,
            'seed': 2021,
            'nthread': 8,
            'lambda':0.05
        }
num_round = 10000
early_stopping_rounds = 200    
        

model = xgb.train(params, train_matrix, num_round, evals=evals, verbose_eval=200,feval=eval_score,
                              early_stopping_rounds=early_stopping_rounds
                              )
oof_train = model.predict(xgb.DMatrix(valid1))
print(mape(valid1_y, oof_train))
test_predict = model.predict(xgb.DMatrix(test))
test_predict_A = test_predict



[0]	train-rmse:204689.45312	train-eval_score:0.89933	val-rmse:212747.39062	val-eval_score:0.88825
[200]	train-rmse:5303.02686	train-eval_score:0.01843	val-rmse:32942.39844	val-eval_score:0.10734
[400]	train-rmse:3666.91455	train-eval_score:0.01282	val-rmse:31550.20312	val-eval_score:0.10054
[600]	train-rmse:2775.86060	train-eval_score:0.00977	val-rmse:31023.03516	val-eval_score:0.09773
[800]	train-rmse:2195.92749	train-eval_score:0.00779	val-rmse:30820.58789	val-eval_score:0.09681
[1000]	train-rmse:1759.72705	train-eval_score:0.00634	val-rmse:30872.51562	val-eval_score:0.09660
[1062]	train-rmse:1654.65649	train-eval_score:0.00596	val-rmse:30827.61914	val-eval_score:0.09639
0.09639228017627116


In [None]:
0.09639228017627116

# B厂

In [62]:
k = k2.copy()
k = get_inner_date_feature_eng(k)
train1 = k[['day', 'dayofy', 'week',
            'week_of_year', 'week_block_num',
            'CH_day_of_month', 'CH_day_of_year',
            'abs_year',]][:-30]
train1_y = k['B'][:-30]

valid1 = k[['day', 'dayofy', 'week',
            'week_of_year', 'week_block_num', 'CH_day_of_month',
            'CH_day_of_year', 'abs_year',]][-30:]
valid1_y = k['B'][-30:]

test = test_v1_A[['day', 'dayofy', 'week',
            'week_of_year', 'week_block_num', 'CH_day_of_month',
            'CH_day_of_year', 'abs_year']]

In [70]:
train_matrix = xgb.DMatrix(train1, train1_y,enable_categorical=True)
test_matrix = xgb.DMatrix(valid1, valid1_y,enable_categorical=True)
evals = [(train_matrix, 'train'), (test_matrix, 'val')]

def my_mape(real_value, pre_value): 
    real_value, pre_value = np.array(real_value), np.array(pre_value)
    return np.mean(np.abs((real_value - pre_value) /( real_value+1)))
def eval_score(pre, train_set):
    real = train_set.get_label()
    score = my_mape(real, pre)
    return 'eval_score', score

params = {
            'booster': 'gbtree',
            'objective': 'reg:squarederror',
            'min_child_weight':12,
            'max_depth': 5,
            'subsample': 0.9,
            'colsample_bytree': 0.8,
            'learning_rate': 0.1,
            'seed': 2021,
            'nthread': 8,
            'lambda':0.05
        }
num_round = 10000
early_stopping_rounds = 200    
        

model = xgb.train(params, train_matrix, num_round, evals=evals, verbose_eval=200,feval=eval_score,
                              early_stopping_rounds=early_stopping_rounds
                              )
oof_train = model.predict(xgb.DMatrix(valid1))
print(mape(valid1_y, oof_train))
test_predict = model.predict(xgb.DMatrix(test))
test_predict_B = test_predict

[0]	train-rmse:185664.78125	train-eval_score:0.89936	val-rmse:192404.01562	val-eval_score:0.89040
[200]	train-rmse:3887.38013	train-eval_score:0.01439	val-rmse:12627.81348	val-eval_score:0.04859
[400]	train-rmse:2584.15527	train-eval_score:0.00963	val-rmse:12541.70410	val-eval_score:0.04729
[600]	train-rmse:1887.86206	train-eval_score:0.00722	val-rmse:12673.55469	val-eval_score:0.04713
[632]	train-rmse:1807.56812	train-eval_score:0.00692	val-rmse:12636.21289	val-eval_score:0.04699
0.04699201692521185


In [71]:
test_v11 = pd.read_csv('../data/test.csv')
test_v11['A厂'] = test_predict_A.astype(int)
test_v11['B厂'] = test_predict_B.astype(int)
test_v11

Unnamed: 0,日期,A厂,B厂
0,2020/11/01,214564,226453
1,2020/11/02,224675,230107
2,2020/11/03,224878,232736
3,2020/11/04,219831,241302
4,2020/11/05,215813,240657
...,...,...,...
146,2021/03/27,224216,165420
147,2021/03/28,221732,166339
148,2021/03/29,220489,166234
149,2021/03/30,221414,167922


In [72]:
test_v11.to_csv('./data/xgb_play.csv',index=False)

In [73]:
df1 = pd.read_csv('./data/new_xgb_A_B.csv')
df1.columns=['date','A','B']
df1

Unnamed: 0,date,A,B
0,2020/11/1,272186.00,236361.00
1,2020/11/2,266976.00,243226.00
2,2020/11/3,268541.00,240503.00
3,2020/11/4,266992.00,236450.00
4,2020/11/5,267706.00,238724.00
...,...,...,...
146,2021/3/27,275785.68,259709.40
147,2021/3/28,272730.36,261152.23
148,2021/3/29,280021.03,264312.06
149,2021/3/30,281195.78,263637.54


In [74]:
test_v11 = pd.read_csv('../data/test.csv')
test_v11['A厂'] = df1.A.astype(int)
test_v11['B厂'] = df1.B.astype(int)
test_v11

Unnamed: 0,日期,A厂,B厂
0,2020/11/01,272186,236361
1,2020/11/02,266976,243226
2,2020/11/03,268541,240503
3,2020/11/04,266992,236450
4,2020/11/05,267706,238724
...,...,...,...
146,2021/03/27,275785,259709
147,2021/03/28,272730,261152
148,2021/03/29,280021,264312
149,2021/03/30,281195,263637


In [75]:
test_v11.to_csv('./data/new_xgb_A_B.csv',index=False)