In [1]:
import pandas as pd
from borax.calendars.lunardate import LunarDate
import numpy as np

from tqdm import tqdm
import warnings
import lightgbm as lgb
import xgboost as xgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score,mean_squared_error
import re
import optuna
from optuna.samplers import TPESampler



# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Day-type lookup table; its columns are renamed positionally to date/type.
df1 = pd.read_csv('../data/wkd_v1.csv')
df1.columns = ['date', 'type']
df1['date'] = pd.to_datetime(df1['date'])

# Training data; its columns are renamed positionally to date/A/B.
df = pd.read_csv('../data/train.csv')
df.columns = ['date', 'A', 'B']
df['date'] = pd.to_datetime(df['date'])

# Left join: every training row is kept, `type` is NaN where the lookup
# table has no entry for that date.
df = pd.merge(df, df1, on='date', how='left')


In [2]:
def mape(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction.

    Both arguments are converted to NumPy arrays. The result is
    mean(|y_true - y_pred| / |y_true|) and is not multiplied by 100.
    Zero entries in ``y_true`` are not special-cased.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.abs(relative_error).mean()

def yang2nong(date):
    """Convert a solar (Gregorian) date string into a lunar date string.

    ``date`` is read positionally as 'YYYYMMDD': characters 0-3 are the year,
    4-5 the month and 6-7 the day. The matching ``LunarDate`` is rendered
    with the pattern '%y-%m-%d' and returned as a string.
    """
    year, month, day = (
        int(date[start:stop]) for start, stop in ((0, 4), (4, 6), (6, 8))
    )
    lunar = LunarDate.from_solar_date(year, month, day)
    # format() dispatches to LunarDate.__format__, same as the direct dunder call.
    return format(lunar, '%y-%m-%d')

# Date feature engineering
def get_inner_date_feature_eng(dataframe):
    """Add Gregorian and lunar calendar feature columns to ``dataframe``.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``date`` column (anything ``pd.to_datetime`` accepts)
        and a datetime-typed ``China_date`` column holding the lunar date
        encoded as a timestamp.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` itself, with the feature columns added.
    """
    dataframe['date'] = pd.to_datetime(dataframe['date'])
    dates = dataframe['date']

    dataframe["month"] = dates.dt.month
    dataframe["year"] = dates.dt.year
    dataframe["day"] = dates.dt.day
    dataframe["day_of_week"] = dates.dt.dayofweek  # 0 = Monday ... 6 = Sunday
    dataframe["dayofy"] = dates.dt.dayofyear

    # Series.dt.week / dt.weekofyear were removed in pandas 2.0;
    # isocalendar().week yields the same ISO week number.
    iso_week = dates.dt.isocalendar().week.astype(int)
    dataframe["week"] = iso_week
    dataframe["quarter"] = dates.dt.quarter
    dataframe['week_of_year'] = iso_week

    # weekday // 4 is 1 for Friday, Saturday and Sunday (4, 5, 6), else 0.
    dataframe['is_wknd'] = dates.dt.weekday // 4
    dataframe['is_month_start'] = dates.dt.is_month_start.astype(int)
    dataframe['is_month_end'] = dates.dt.is_month_end.astype(int)

    # 1-based count of 7-day blocks elapsed since 2017-12-31.
    days_since_origin = (dates - pd.to_datetime('2017-12-31')).dt.days
    dataframe['week_block_num'] = days_since_origin // 7 + 1
    # 1-based quarter counter starting at Q1 2018.
    dataframe['quarter_block_num'] = (dataframe['year'] - 2018) * 4 + dataframe['quarter']
    # Week of year divided by 4.35 (about the number of weeks per month), floored.
    dataframe['week_of_month'] = dataframe['week_of_year'].values // 4.35

    # One-hot weekday flags. dayofweek is 0-based (Monday == 0), so the flags
    # are matched against 0..6; the previous 1..7 comparison shifted every
    # flag by one day and left is_Sun permanently 0.
    for weekday_index, weekday_name in enumerate(
            ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')):
        dataframe['is_' + weekday_name] = (
            dataframe['day_of_week'] == weekday_index).astype(int)

    # Which third of the month the day falls in.
    day = dataframe['day']
    dataframe['day_of_month_10days'] = (day <= 10).astype(int)
    dataframe['day_of_month_20days'] = ((day > 10) & (day <= 20)).astype(int)
    dataframe['day_of_month_30days'] = (day > 20).astype(int)
    # Distance from the 16th of the month, offset by one.
    dataframe['abs_month'] = (day - 16).abs() + 1

    lunar = dataframe['China_date'].dt
    dataframe['CH_month'] = lunar.month              # lunar month
    dataframe['CH_day_of_month'] = lunar.day         # lunar day of month
    dataframe['CH_day_of_year'] = lunar.dayofyear    # day-of-year of the lunar timestamp
    # Distance from day 182 of the lunar year, offset by one.
    dataframe['abs_year'] = (dataframe['CH_day_of_year'] - 182).abs() + 1
    return dataframe


In [3]:
# Convert every solar date to its lunar-calendar string.
df['date1'] = df.date.dt.strftime('%Y%m%d')
df['China_date'] = df['date1'].apply(yang2nong)

# Lunar month 2 is not the same as Gregorian February, so these lunar strings
# are blanked and forward-filled from the previous row before the column is
# parsed into pandas timestamps. (The original note also flags the leap
# fourth lunar month of 2020 as a difficulty.)
# The blanking is scoped to China_date instead of the whole frame, and
# .ffill() replaces the deprecated fillna(method='ffill').
invalid_lunar_dates = ['2018-2-29', '2018-2-30', '2019-2-29',
                       '2020-2-29', '2020-2-30']
is_invalid = df['China_date'].isin(invalid_lunar_dates)
df['China_date'] = df['China_date'].mask(is_invalid).ffill()
df['China_date'] = pd.to_datetime(df['China_date'])

# Keep rows dated 2019-06-01 or later and build one frame per target (A, B).
k = df[df.date >= '2019-06-01']
k1 = k[['date', 'A', 'type', 'China_date']]
k2 = k[['date', 'B', 'type', 'China_date']]

In [40]:
import numpy as np

# Factory A: engineer features on a private copy of k1.
k = get_inner_date_feature_eng(k1.copy())

feature_cols = ['day', 'dayofy', 'week',
                'week_of_year', 'week_block_num', 'CH_day_of_month',
                'CH_day_of_year', 'abs_year']
holdout_rows = 30

# Positional split: the last 30 rows are held out for validation
# (the most recent dates if the frame is date-ordered — TODO confirm).
train1 = k[feature_cols].iloc[:-holdout_rows]
train1_y = k['A'].iloc[:-holdout_rows]

valid1 = k[feature_cols].iloc[-holdout_rows:]
valid1_y = k['A'].iloc[-holdout_rows:]

In [41]:
# Load the prediction period and attach the day-type column, the same way
# the training frame was built.
test_v1 = pd.read_csv('../data/test.csv')
test_v1.columns = ['date', 'A', 'B']
test_v1['date'] = pd.to_datetime(test_v1['date'])
test_v1 = test_v1.merge(df1, on='date', how='left')

# Convert every solar date to its lunar-calendar string.
test_v1['date1'] = test_v1['date'].dt.strftime('%Y%m%d')
test_v1['China_date'] = test_v1['date1'].apply(yang2nong)

# Lunar month 2 is not the same as Gregorian February, so these lunar strings
# are blanked and forward-filled from the previous row before the column is
# parsed into pandas timestamps. (The original note also flags the leap
# fourth lunar month of 2020 as a difficulty.)
# The blanking is scoped to China_date instead of the whole frame, and
# .ffill() replaces the deprecated fillna(method='ffill').
invalid_lunar_dates = ['2018-2-29', '2018-2-30', '2019-2-29',
                       '2020-2-29', '2020-2-30']
is_invalid = test_v1['China_date'].isin(invalid_lunar_dates)
test_v1['China_date'] = test_v1['China_date'].mask(is_invalid).ffill()
test_v1['China_date'] = pd.to_datetime(test_v1['China_date'])

# get_inner_date_feature_eng adds columns in place; take an explicit copy so
# it does not write into a column selection of test_v1 (SettingWithCopy).
test_v1_A = test_v1[['date', 'A', 'type', 'China_date']].copy()
test_v1_A = get_inner_date_feature_eng(test_v1_A)
test = test_v1_A[['day', 'dayofy', 'week',
                  'week_of_year', 'week_block_num', 'CH_day_of_month',
                  'CH_day_of_year', 'abs_year']]

In [44]:

# Wrap the factory A splits as LightGBM datasets.
# Despite its name, test_matrix holds the validation split (valid1).
train_matrix = lgb.Dataset(data=train1, label=train1_y)
test_matrix = lgb.Dataset(data=valid1, label=valid1_y)


def my_mape(real_value, pre_value):
    """Return a MAPE variant whose denominator is ``real_value + 1``.

    Both arguments are converted to NumPy arrays; the result is
    mean(|real_value - pre_value| / |real_value + 1|), expressed as a
    fraction rather than a percentage.
    """
    actual = np.array(real_value)
    predicted = np.array(pre_value)
    scaled_error = (actual - predicted) / (actual + 1)
    return np.abs(scaled_error).mean()
def eval_score(pre, train_set):
    """Score predictions against a dataset's labels with ``my_mape``.

    Parameters
    ----------
    pre : array-like
        Predicted values.
    train_set : object exposing ``get_label()``
        Source of the true values.

    Returns
    -------
    tuple
        ``('eval_score', score)``.

    NOTE(review): LightGBM's custom-metric (``feval``) contract expects a
    third ``is_higher_better`` element — confirm before passing this
    function to ``lgb.train``.
    """
    labels = train_set.get_label()
    return 'eval_score', my_mape(labels, pre)

# LightGBM hyper-parameters for the factory A regressor.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metrics': 'mean_squared_error',
    'num_leaves': 2 ** 5 - 1,
    'feature_fraction': 0.8,
    # NOTE(review): the LightGBM docs say bagging_fraction only takes effect
    # together with a non-zero bagging_freq; left unchanged so the reported
    # scores stay reproducible — confirm whether bagging was intended.
    'bagging_fraction': 0.8,
    'learning_rate': 0.05,
    'lambda_l1': 0.05,
    'lambda_l2': 0.05,
    'seed': 2021,
    'nthread': 8,
    'verbose': -1,
}
num_round = 10000
early_stopping_rounds = 200

# lgb.train() no longer accepts early_stopping_rounds= / verbose_eval=
# (removed in LightGBM 4.0); the same behaviour is requested via callbacks.
# log_evaluation was called print_evaluation in older releases.
log_evaluation_cb = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
model = lgb.train(
    params,
    train_matrix,
    num_round,
    valid_sets=[test_matrix],
    callbacks=[
        lgb.early_stopping(early_stopping_rounds),
        log_evaluation_cb(200),
    ],
)

# Score the held-out rows, then predict the test period for factory A.
oof_train = model.predict(valid1)
print(mape(valid1_y, oof_train))
test_predict = model.predict(test)
test_predict_A = test_predict



Training until validation scores don't improve for 200 rounds
[200]	valid_0's l2: 1.22111e+09
Early stopping, best iteration is:
[109]	valid_0's l2: 1.15337e+09
0.11814220398105223


# B厂

In [45]:
# Factory B: engineer features on a private copy of k2.
k = get_inner_date_feature_eng(k2.copy())

feature_cols = ['day', 'dayofy', 'week',
                'week_of_year', 'week_block_num',
                'CH_day_of_month', 'CH_day_of_year',
                'abs_year']
holdout_rows = 30

# Positional split: the last 30 rows are held out for validation
# (the most recent dates if the frame is date-ordered — TODO confirm).
train1 = k[feature_cols].iloc[:-holdout_rows]
train1_y = k['B'].iloc[:-holdout_rows]

valid1 = k[feature_cols].iloc[-holdout_rows:]
valid1_y = k['B'].iloc[-holdout_rows:]

# The test features are selected from test_v1_A, the same frame used for
# factory A; every selected column is derived from the date columns.
test = test_v1_A[feature_cols]

In [52]:

# Wrap the factory B splits as LightGBM datasets.
# Despite its name, test_matrix holds the validation split (valid1).
train_matrix = lgb.Dataset(train1, train1_y)
test_matrix = lgb.Dataset(valid1, valid1_y)

# LightGBM hyper-parameters for the factory B regressor.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metrics': 'mean_squared_error',
    'num_leaves': 2 ** 5 - 1,
    'feature_fraction': 0.8,
    # NOTE(review): the LightGBM docs say bagging_fraction only takes effect
    # together with a non-zero bagging_freq; left unchanged so the reported
    # scores stay reproducible — confirm whether bagging was intended.
    'bagging_fraction': 0.8,
    'learning_rate': 0.2,
    'lambda_l1': 0.05,
    'lambda_l2': 0.05,
    'seed': 2021,
    'nthread': 8,
    'verbose': -1,
}
num_round = 10000
early_stopping_rounds = 200

# lgb.train() no longer accepts early_stopping_rounds= / verbose_eval=
# (removed in LightGBM 4.0); the same behaviour is requested via callbacks.
# log_evaluation was called print_evaluation in older releases.
log_evaluation_cb = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
model = lgb.train(
    params,
    train_matrix,
    num_round,
    valid_sets=[test_matrix],
    callbacks=[
        lgb.early_stopping(early_stopping_rounds),
        log_evaluation_cb(200),
    ],
)

# Score the held-out rows, then predict the test period for factory B.
oof_train = model.predict(valid1)
print(mape(valid1_y, oof_train))
test_predict = model.predict(test)
test_predict_B = test_predict



Training until validation scores don't improve for 200 rounds
[200]	valid_0's l2: 1.72473e+08
Early stopping, best iteration is:
[95]	valid_0's l2: 1.60257e+08
0.050800488983888864


In [53]:
# Re-read the raw test file and fill both factory columns with the LightGBM
# predictions; astype(int) drops the fractional part.
test_v11 = pd.read_csv('../data/test.csv')
for target_column, predictions in (('A厂', test_predict_A), ('B厂', test_predict_B)):
    test_v11[target_column] = predictions.astype(int)
test_v11

Unnamed: 0,日期,A厂,B厂
0,2020/11/01,218402,218967
1,2020/11/02,219475,219921
2,2020/11/03,220110,223912
3,2020/11/04,218151,226783
4,2020/11/05,218375,224826
...,...,...,...
146,2021/03/27,215984,163340
147,2021/03/28,215578,161085
148,2021/03/29,217024,164958
149,2021/03/30,214074,164632


In [54]:
# Persist the LightGBM submission frame without the row index.
# NOTE(review): this writes under ./data while the inputs are read from ../data — confirm the intended directory.
test_v11.to_csv('./data/lgb_play.csv',index=False)

In [55]:
# Load a saved A/B prediction file and rename its columns positionally to date/A/B.
# NOTE(review): './data/new_lgb_A_B_.csv' (trailing underscore) is not written
# anywhere in the visible notebook — confirm where it comes from.
# NOTE: this rebinds df1, which earlier held the day-type table merged into
# the train and test frames.
df1 = pd.read_csv('./data/new_lgb_A_B_.csv')
df1.columns=['date','A','B']
df1

Unnamed: 0,date,A,B
0,2020/11/1,272186.00,236361.0
1,2020/11/2,266976.00,243226.0
2,2020/11/3,268541.00,240503.0
3,2020/11/4,266992.00,236450.0
4,2020/11/5,267706.00,238724.0
...,...,...,...
146,2021/3/27,276459.52,258077.2
147,2021/3/28,273784.06,260957.7
148,2021/3/29,279960.96,263932.8
149,2021/3/30,280436.94,263411.2


In [56]:
# Re-read the raw test file and fill both factory columns with the loaded
# predictions; astype(int) drops the fractional part.
test_v11 = pd.read_csv('../data/test.csv')
for target_column, source_column in (('A厂', 'A'), ('B厂', 'B')):
    test_v11[target_column] = df1[source_column].astype(int)
test_v11

Unnamed: 0,日期,A厂,B厂
0,2020/11/01,272186,236361
1,2020/11/02,266976,243226
2,2020/11/03,268541,240503
3,2020/11/04,266992,236450
4,2020/11/05,267706,238724
...,...,...,...
146,2021/03/27,276459,258077
147,2021/03/28,273784,260957
148,2021/03/29,279960,263932
149,2021/03/30,280436,263411


In [57]:
# Persist the integer-valued LightGBM submission without the row index.
test_v11.to_csv('./data/new_lgb_A_B.csv',index=False)

In [58]:
# Load the saved LightGBM submission and rename its columns positionally to date/A/B.
df1 = pd.read_csv('./data/new_lgb_A_B.csv')
df1.columns=['date','A','B']

# Load the XGBoost submission in the same layout.
# NOTE(review): './data/new_xgb_A_B.csv' is not written anywhere in the
# visible notebook — presumably it comes from a separate notebook; confirm.
df2 = pd.read_csv('./data/new_xgb_A_B.csv')
df2.columns=['date','A','B']

In [59]:
# Blend the two submissions: per factory, take the plain average of the
# LightGBM (df1) and XGBoost (df2) values; astype(int) drops the fractional part.
test_v11 = pd.read_csv('../data/test.csv')
for target_column, source_column in (('A厂', 'A'), ('B厂', 'B')):
    averaged = (df1[source_column] + df2[source_column]) / 2
    test_v11[target_column] = averaged.astype(int)

test_v11

Unnamed: 0,日期,A厂,B厂
0,2020/11/01,272186,236361
1,2020/11/02,266976,243226
2,2020/11/03,268541,240503
3,2020/11/04,266992,236450
4,2020/11/05,267706,238724
...,...,...,...
146,2021/03/27,276122,258893
147,2021/03/28,273257,261054
148,2021/03/29,279990,264122
149,2021/03/30,280815,263524


In [60]:
# Persist the averaged LightGBM/XGBoost submission as UTF-8 without the row index.
test_v11.to_csv('./data/new_lgb_xgb_mean_A_B.csv',index=False,encoding = 'utf-8')