In [1]:
import pandas as pd
import numpy as np
import os
import lightgbm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error 

from borax.calendars.lunardate import LunarDate



In [2]:
def yang2nong(date):
    
    year = int(date[:4])
    month = int(date[4:6])
    day = int(date[6:8])
    
    nong =  LunarDate.from_solar_date(year, month, day)
    
    res = nong.__format__('%y-%m-%d')
    return res

def create_features(dataframe):
    dataframe['date'] = pd.to_datetime(dataframe['date'])
    dataframe['month'] = dataframe.date.dt.month
    dataframe['day_of_month'] = dataframe.date.dt.day
    dataframe['day_of_year'] = dataframe.date.dt.dayofyear
    dataframe['week_of_year'] = dataframe.date.dt.weekofyear
    dataframe['day_of_week'] = dataframe.date.dt.dayofweek + 1
    dataframe['year'] = dataframe.date.dt.year
    dataframe['is_wknd'] = dataframe.date.dt.weekday // 4
    dataframe['is_month_start'] = dataframe.date.dt.is_month_start.astype(int)
    dataframe['is_month_end'] = dataframe.date.dt.is_month_end.astype(int)
    dataframe['quarter'] = dataframe.date.dt.quarter
    dataframe['week_block_num'] = [int(x) for x in np.floor((dataframe.date - pd.to_datetime('2017-12-31')).dt.days / 7) + 1]
    dataframe['quarter_block_num'] = (dataframe['year'] - 2018) * 4 + dataframe['quarter']
    dataframe['week_of_month'] = dataframe['week_of_year'].values // 4.35
    
    #新增星期几时间变量
    dataframe['is_Mon'] = np.where(dataframe['day_of_week'] == 1, 1, 0)                                                                                       
    dataframe['is_Tue'] = np.where(dataframe['day_of_week'] == 2, 1, 0)                                                                                         
    dataframe['is_Wed'] = np.where(dataframe['day_of_week'] == 3, 1, 0)                                                                                         
    dataframe['is_Thu'] = np.where(dataframe['day_of_week'] == 4, 1, 0)                                                                                         
    dataframe['is_Fri'] = np.where(dataframe['day_of_week'] == 5, 1, 0)                                                                                         
    dataframe['is_Sat'] = np.where(dataframe['day_of_week'] == 6, 1, 0)                                                                                         
    dataframe['is_Sun'] = np.where(dataframe['day_of_week'] == 7, 1, 0)
    #新增每月上中下旬
    dataframe['day_of_month_10days']=np.where((dataframe['day_of_month']<=10) == 1, 1, 0)
    dataframe['day_of_month_20days']=np.where(((dataframe['day_of_month']>10)&(dataframe['day_of_month']<=20)) == 1, 1, 0)
    dataframe['day_of_month_30days']=np.where((dataframe['day_of_month']>20) == 1, 1, 0)
    dataframe['day_of_year_>180days']=np.where((dataframe['day_of_year']>180) == 1, 1, 0)
    dataframe['day_of_year_<180days']=np.where((dataframe['day_of_year']<=180) == 1, 1, 0)


    dataframe['abs_month']=dataframe['day_of_month'].apply(lambda x: abs(x-16)+1)
    
    
    dataframe['CH_month'] = dataframe.China_date.dt.month  #农历月
    dataframe['CH_day_of_month'] = dataframe.China_date.dt.day #农历日
    dataframe['CH_day_of_year'] = dataframe.China_date.dt.dayofyear #农历的哪一天
    dataframe['abs_year']=dataframe['CH_day_of_year'].apply(lambda x: abs(x-182)+1)    
    
    #dataframe.drop(['date','post_id'],axis=1,inplace=True)
    return dataframe

In [5]:
#数据读取与wkd表连接
train_df=pd.read_csv('../data/train.csv')
train_df.columns=['date','A','B']
train_df.date=pd.to_datetime(train_df.date)
test_day=pd.read_csv('../data/test.csv')#按天计算
#wkd=pd.read_csv('../data/wkd_v1.csv')    #不使用用外部数据
#wkd=wkd.rename(columns={'ORIG_DT':'date'})
#wkd.date=pd.to_datetime(wkd.date)
#train_df=train_df.merge(wkd,on='date',how='left')
test_day.columns = ['date','A','B']
test_day.date = pd.to_datetime(test_day.date)
#test_day = test_day.merge(wkd,on='date',how='left')

#进行农历转换
test_day['date1'] = test_day.date.dt.strftime('%Y%m%d')
test_day['China_date']=test_day['date1'].apply(lambda x: yang2nong(x))
#农历2月不同与公历2月，
#因此采用前向填充方式以构建pandas时间戳，难题在与20年存在的闰四月现象    
test_day.replace(['2018-2-29','2018-2-30','2019-2-29',
            '2020-2-29','2020-2-30'],np.nan,inplace=True)
    
test_day['China_date']=test_day['China_date'].fillna(method='ffill')
test_day['China_date'] = pd.to_datetime(test_day['China_date'])

#进行农历转换
train_df['date1'] = train_df.date.dt.strftime('%Y%m%d')
train_df['China_date']=train_df['date1'].apply(lambda x: yang2nong(x))
#农历2月不同与公历2月，
#因此采用前向填充方式以构建pandas时间戳，难题在与20年存在的闰四月现象    
train_df.replace(['2018-2-29','2018-2-30','2019-2-29',
            '2020-2-29','2020-2-30'],np.nan,inplace=True)
    
train_df['China_date']=train_df['China_date'].fillna(method='ffill')
train_df['China_date'] = pd.to_datetime(train_df['China_date'])

train_day_df_A = train_df[['date','A','China_date']]
train_day_df_B = train_df[['date','B','China_date']]

train_day_df_A=create_features(train_day_df_A)
train_day_df_B=create_features(train_day_df_B)

train_day_df_A['A']=train_day_df_A['A']
train_day_df_B['B']=train_day_df_B['B']


test_day_df_A = test_day[['date','A','China_date']]
test_day_df_B = test_day[['date','B','China_date']]
test_day_df_A=create_features(test_day_df_A)
test_day_df_B=create_features(test_day_df_B)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pa

In [6]:
def mape(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / (y_true))) 

In [7]:
def lgb_cv(train_x, train_y, test_x):
    predictors = list(train_x.columns)
    train_x = train_x.values
    test_x = test_x.values
    
    
    folds = 10
    seed = 2021
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    train = np.zeros((train_x.shape[0]))
    test = np.zeros((test_x.shape[0]))
    test_pre = np.zeros((folds, test_x.shape[0]))
    test_pre_all = np.zeros((folds, test_x.shape[0]))
    cv_scores = []
    tpr_scores = []
    cv_rounds = []

    for i, (train_index, test_index) in enumerate(kf.split(train_x, train_y)):
        tr_x = train_x[train_index]
        tr_y = train_y[train_index]
        te_x = train_x[test_index]
        te_y = train_y[test_index]
        train_matrix = lightgbm.Dataset(tr_x, label=tr_y)
        test_matrix = lightgbm.Dataset(te_x, label=te_y)
        params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metrics':'mean_squared_error',
            'num_leaves': 2 ** 5-1,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'learning_rate': 0.05,
            'lambda_l1': 0.01,                                                                                                                    
            'lambda_l2': 0.01,
            'seed': 2021,
            
            
            'nthread': 4,
            'verbose': -1,
            
            
        }
        num_round = 10000
        early_stopping_rounds = 200    
        
        if test_matrix:
            model = lightgbm.train(params, train_matrix, num_round, valid_sets=test_matrix, verbose_eval=200,
                              #feval=tpr_eval_score,
                              early_stopping_rounds=early_stopping_rounds
                              )
            #print("\n".join(("%s: %.2f" % x) for x in list(sorted(zip(predictors, model.feature_importance("gain")),
                        #key=lambda x: x[1],reverse=True))[:30]))
            importance_list=[ x[0] for x in list(sorted(zip(predictors, model.feature_importance("gain")),
                        key=lambda x: x[1],reverse=True))]
            
            pre = model.predict(te_x, num_iteration=model.best_iteration)#
            pred = model.predict(test_x, num_iteration=model.best_iteration)#
            train[test_index] = pre
            test_pre[i, :] = pred
            cv_scores.append(mean_squared_error (te_y, pre))
            cv_rounds.append(model.best_iteration)
            test_pre_all[i, :] = pred
        #
        print("cv_score is:", cv_scores)
    use_mean=True
    if use_mean:
        test[:] = test_pre.mean(axis=0)
    else:
        pass
    #
    print("val_mean:" , np.mean(cv_scores))
    print("val_std:", np.std(cv_scores))
    return model, train, test, test_pre_all, np.mean(cv_scores),importance_list

In [8]:
if __name__=="__main__":

    select_frts=['day_of_year', 'day_of_month', 'week_of_year',
            'week_of_year', 'week_block_num', 'CH_day_of_month',
            'CH_day_of_year', 'abs_year', ]    #burata特征选择结果

    train_df=train_day_df_A#训练集A
    
    
    train_df=train_df[((train_df['date']>='2019-05-01'))].reset_index(drop=True)
    test_df=test_day_df_A#测试集A
    train_x = train_df[select_frts].copy()
    train_y = train_df['A']
    test_x = test_df[select_frts].copy()
    print(train_x.shape,train_y.shape,test_x.shape)
    model,lgb_train, lgb_test, sb, cv_scores, importance_list = lgb_cv(train_x, train_y, test_x)
    lgb_test_A=[item if item>0 else 0 for item in lgb_test]
    val_A_pre = model.predict(train_x[-30:])
    print(f"A厂的验证集mape为{mape(val_A_pre, train_y[-30:])}")
    
    
    train_df=train_day_df_B#训练集B
    train_df=train_df[((train_df['date']>='2019-05-01'))].reset_index(drop=True)
 
    

    test_df=test_day_df_B#测试集B
    train_x = train_df[select_frts].copy()
    train_y = train_df['B']
    test_x = test_df[select_frts].copy()
    print(train_x.shape,train_y.shape,test_x.shape)
    model,lgb_train, lgb_test, sb, cv_scores, importance_list = lgb_cv(train_x, train_y, test_x)
    
    val_B_pre = model.predict(train_x[-30:])
    print(f"B厂的验证集mape为{mape(val_B_pre, train_y[-30:])}")    
    
    lgb_test_B=[item if item>0 else 0 for item in lgb_test]
    print(np.mean(lgb_test_A),np.sum(lgb_test_A),np.mean(lgb_test_B),np.sum(lgb_test_B))
    pre_A=np.array(lgb_test_A)
    pre_B=np.array(lgb_test_B)
    
    pre_day_A=[]
    pre_day_B=[]
    for i in range(151):
        pre_day_A.append(pre_A[i])
        pre_day_B.append(pre_B[i])
    test_day['A']=pre_day_A
    test_day['B']=pre_day_B

(550, 8) (550,) (151, 8)
Training until validation scores don't improve for 200 rounds
[200]	valid_0's l2: 1.96912e+08
[400]	valid_0's l2: 1.47627e+08
[600]	valid_0's l2: 1.28195e+08
[800]	valid_0's l2: 1.20884e+08
[1000]	valid_0's l2: 1.15834e+08
[1200]	valid_0's l2: 1.13484e+08
[1400]	valid_0's l2: 1.11905e+08
[1600]	valid_0's l2: 1.11699e+08
[1800]	valid_0's l2: 1.11296e+08
Early stopping, best iteration is:
[1721]	valid_0's l2: 1.10941e+08
cv_score is: [110940659.75047639]
Training until validation scores don't improve for 200 rounds
[200]	valid_0's l2: 1.23176e+08
[400]	valid_0's l2: 1.08842e+08
[600]	valid_0's l2: 1.06989e+08
[800]	valid_0's l2: 1.07472e+08
Early stopping, best iteration is:
[681]	valid_0's l2: 1.06513e+08
cv_score is: [110940659.75047639, 106513195.56387173]
Training until validation scores don't improve for 200 rounds
[200]	valid_0's l2: 1.57461e+08
[400]	valid_0's l2: 1.4825e+08
[600]	valid_0's l2: 1.45406e+08
[800]	valid_0's l2: 1.41722e+08
[1000]	valid_0's l

Early stopping, best iteration is:
[198]	valid_0's l2: 3.42939e+07
cv_score is: [39567640.3188509, 152396315.07154322, 46985117.20279528, 48076370.16902754, 185085063.29742968, 52782529.678257644, 52193153.23436018, 34293873.57084108]
Training until validation scores don't improve for 200 rounds
[200]	valid_0's l2: 6.26995e+07
[400]	valid_0's l2: 5.43561e+07
[600]	valid_0's l2: 5.36191e+07
Early stopping, best iteration is:
[498]	valid_0's l2: 5.34072e+07
cv_score is: [39567640.3188509, 152396315.07154322, 46985117.20279528, 48076370.16902754, 185085063.29742968, 52782529.678257644, 52193153.23436018, 34293873.57084108, 53407210.23511568]
Training until validation scores don't improve for 200 rounds
[200]	valid_0's l2: 6.99081e+07
[400]	valid_0's l2: 6.21119e+07
[600]	valid_0's l2: 6.14074e+07
[800]	valid_0's l2: 6.12763e+07
Early stopping, best iteration is:
[616]	valid_0's l2: 6.11721e+07
cv_score is: [39567640.3188509, 152396315.07154322, 46985117.20279528, 48076370.16902754, 185085

In [None]:
A厂的验证集mape为0.006748706322373056
B厂的验证集mape为0.006538176240147742

In [22]:
test_day['A'] = test_day['A'].astype(int)
test_day['B'] = test_day['B'].astype(int)

In [25]:
test_v11 = pd.read_csv('../data/test.csv')
test_v11['A厂'] = test_day['A']
test_v11['B厂'] = test_day['B']
test_v11.to_csv('./data/lgb_kfold_play_.csv',index=False,encoding = 'utf-8')   #存储原始结果
test_v11

Unnamed: 0,日期,A厂,B厂
0,2020/11/01,262634,227376
1,2020/11/02,264562,229334
2,2020/11/03,265148,230754
3,2020/11/04,259702,236091
4,2020/11/05,258913,235196
...,...,...,...
146,2021/03/27,220933,163203
147,2021/03/28,219865,161103
148,2021/03/29,219443,162642
149,2021/03/30,217775,162795


In [24]:
test_day[['date','A','B']]

Unnamed: 0,date,A,B
0,2020-11-01,262634,227376
1,2020-11-02,264562,229334
2,2020-11-03,265148,230754
3,2020-11-04,259702,236091
4,2020-11-05,258913,235196
...,...,...,...
146,2021-03-27,220933,163203
147,2021-03-28,219865,161103
148,2021-03-29,219443,162642
149,2021-03-30,217775,162795


excel操作后读取规则2021年结果，规则细节在total_submit文件中有

In [33]:
import pandas as pd
df1 = pd.read_csv('./data/lgb_fold_play__2021_970_20212_101.csv')   
df1.columns=['date','A','B']
df1

Unnamed: 0,date,A,B
0,2020/11/1,272186.00,236361.00
1,2020/11/2,266976.00,243226.00
2,2020/11/3,268541.00,240503.00
3,2020/11/4,266992.00,236450.00
4,2020/11/5,267706.00,238724.00
...,...,...,...
146,2021/3/27,270559.81,254224.07
147,2021/3/28,269251.86,255687.56
148,2021/3/29,275184.60,258130.75
149,2021/3/30,275226.01,258373.15


In [34]:
test_v11 = pd.read_csv('../data/test.csv')
test_v11['A厂'] = df1.A.astype(int)
test_v11['B厂'] = df1.B.astype(int)
test_v11

Unnamed: 0,日期,A厂,B厂
0,2020/11/01,272186,236361
1,2020/11/02,266976,243226
2,2020/11/03,268541,240503
3,2020/11/04,266992,236450
4,2020/11/05,267706,238724
...,...,...,...
146,2021/03/27,270559,254224
147,2021/03/28,269251,255687
148,2021/03/29,275184,258130
149,2021/03/30,275226,258373


In [35]:
test_v11.to_csv('./data/lgb_fold_play__2021_970_20212_101.csv',index=False)