In [1]:
import sys
import numpy as np
import pandas as pd
import os 
import gc
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [2]:
#### 读取数据集 (只用训练集和测试集的销量数据)
path= 'data/Train/'
train_sales  = pd.read_csv(path+'train_sales_data.csv')
train_search = pd.read_csv(path+'train_search_data.csv')
train_user   = pd.read_csv(path+'train_user_reply_data.csv')
test_sales = pd.read_csv(path+'evaluation_public.csv')
submit_example = pd.read_csv(path+'submit_example.csv')

In [3]:
old_model = ['3c974920a76ac9c1','3d7554f1f56dd664','2d0d2c3403909fdb',
 'a28bb927b6fcb33c','17bc272c93f19d56','2a2ab41f8f6ff1cb',
 'c06a2a387c0ee510','7023efdab9cedc03','af6f4f548684e14d',
 '7cf283430b3b5e38','d4efbebb087fd03f','7245e0ee27b195cd',
 '8c915fe4632fb9fa','6155b214590c66e6','28e29f2c03dcd84c',
 '37aa9169b575ef79','63065128401bb3ff','ea489c253676aafc',
 'cd5841d44fd7625e','b25c4e2e3856af22','4a103c30d593fbbe',
 '7a7885e2d7c00bcf','346393c2c6305fb1','02aab221aabc03b9',
 '5d7fb682edd0f937','a207df29ec9583f0','b4be3a4917289c82',
 'ef76a85c4b39f693','bb9fbec9a2833839','da457d15788fe8ee',
 '6858d6dfe680bdf7','79de4e4b24c35b04','12f8b7e14947c34d',
 '04e66e578f653ab9','dff803b4024d261d','61e73e32ad101892',
 'a432c483b5beb856','0797526c057dcf5b','936168bd4850913d',
 'cc21c7e91a3b5a0c','7aab7fca2470987e','fde95ea242abd896',
 '97f15de12cfabbd5','f5d69960089c3614','5b1c11c3efed5312',
 '17363f08d683d52b','06880909932890ca','9c1c7ee8ebdda299',
 'c6833cb891626c17','3e21824be728cbec','f8a6975573af1b33',
 '54fc07138d70374c','212083a9246d2fd3','4f79773e600518a6',
 'fc32b1a017b34efe','feabbf46658382b9','f270f6a489c6a9d7',
 'd0f245b8781e3631','c6cd4e0e073f5ac2','a9a43d1a7ecbe75d']

test_sales = test_sales.iloc[test_sales[~test_sales.model.isin(old_model)].index]
print(test_sales.regMonth.nunique())

4


In [4]:
data = pd.concat([train_sales, test_sales], ignore_index=True)
data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
data['label'] = np.log(data['salesVolume'])
data['id'] = data['id'].fillna(0).astype(int)
#data['popularity'] = np.log(data['popularity'])
data['bodyType'] = data['model'].map(train_sales.drop_duplicates('model').set_index('model')['bodyType'])
#LabelEncoder 类别特征编码
for i in ['bodyType', 'model']:
    data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']
data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']


In [5]:
### 特征工程
def get_stat_feature(df_):    #### 获得前12个月的历史平移特征
    df = df_.copy()
    stat_feat = []
    df['model_adcode'] = df['adcode'] + df['model']
    df['model_adcode_mt'] = df['model_adcode'] * 100 + df['mt']
    for col in tqdm(['label']):
        # shift
        for i in [1,2,3,4,5,6,7,8,9,10,11,12]:
            stat_feat.append('shift_model_adcode_mt_{}_{}'.format(col,i))
            df['model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'] + i   ## 在当前月份的基础上往后偏移 i 个月
            df_last = df[~df[col].isnull()].set_index('model_adcode_mt_{}_{}'.format(col,i)) ### 设置为偏移的月份为下标
            df['shift_model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'].map(df_last[col])  ### 按照偏移的月份匹配，即得到历史偏移特征，即同比环比
    
    for col in tqdm(['popularity']):
        # shift
        for i in [1,2,3,10,11,12]:
            stat_feat.append('shift_model_adcode_mt_{}_{}'.format(col,i))
            df['model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'] + i   ## 在当前月份的基础上往后偏移 i 个月
            df_last = df[~df[col].isnull()].set_index('model_adcode_mt_{}_{}'.format(col,i)) ### 设置为偏移的月份为下标
            df['shift_model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'].map(df_last[col])  ### 按照偏移的月份匹配，即得到历史偏移特征，即同比环比
    
    
    return df,stat_feat


def get_trend_feat(df_):
    df = df_.copy()
    trend_feat = []
    df_month = df.groupby(['adcode', 'model','mt'])['label'].mean().unstack(level=-1).reset_index() ## 统计出每一种model在每个省的月销量均值
    df_1 = df[['adcode','model','mt']]
    df_1 = pd.merge(df_1,df_month,on=['adcode','model'],how='left')
    df_2 = pd.DataFrame()
    trend_feat = ['adcode_model_trend_year_div','adcode_model_trend_month_sub','adcode_model_trend_year_sub',\
                 'adcode_model_trend_half_year','adcode_model_trend_three_month','adcode_model_trend_two_month']
    for month in range(14,29):
        tmp1 = df_1[df_1.mt==month]
        tmp1['adcode_model_trend_year_div'] = df_1[month-1] / df_1[month-13]
        tmp1['adcode_model_trend_month_sub'] = df_1[month-1] - df_1[month-2]
        tmp1['adcode_model_trend_year_sub'] = df_1[month-1] - df_1[month-13]
        tmp1['adcode_model_trend_half_year'] =  (df_1[month-1]+df_1[month-2]+df_1[month-3]+df_1[month-4]+df_1[month-5]+df_1[month-6])/ \
                                            (df_1[month-7]+df_1[month-8]+df_1[month-9]+df_1[month-10]+df_1[month-11]+df_1[month-12])
        tmp1['adcode_model_trend_three_month'] =  (df_1[month-1]+df_1[month-2]+df_1[month-3])/(df_1[month-4]+df_1[month-5]+df_1[month-6])
        tmp1['adcode_model_trend_two_month'] =  (df_1[month-1]+df_1[month-2])/(df_1[month-3]+df_1[month-4])
        df_2 = pd.concat([df_2,tmp1[['adcode','model','mt']+trend_feat]])
    df = pd.merge(df,df_2,on=['adcode','model','mt'],how='left')
    return df,trend_feat

def get_smooth_feat(df_):
    df = df_.copy()
    
    tmp_df = df[['adcode', 'model', 'regMonth','mt']]
    for m in range(1, 29):
        tmp_df = pd.merge(tmp_df, df[df.mt==m][['adcode', 'model', 'label']], on=['adcode', 'model'], how='left').rename(columns={'label': m})
    df_2 = pd.DataFrame()
    smooth_feat = ['a','b','c']
    for m in range(14,29):
        df_3 = pd.DataFrame()
        df_1 = tmp_df[tmp_df.mt==m]
        ### 指数平滑部分 
        df_3[m-14] = (df_1[m-13]+df_1[m-12]+df_1[m-11]) / 3
        ph_factor = 0.5
        for i in range(m-13,m):  ### 一次平滑
            df_3[i] = df_1[i] * ph_factor + (1-ph_factor) * df_3[i-1] 
        df_3[m-14+30] = (df_3[m-13]+df_3[m-12]+df_3[m-11])/3
        for i in range(m-13,m):  ### 二次平滑
            df_3[i+30] = df_3[i] * ph_factor +(1-ph_factor) * df_3[i+30-1]
        df_3[m-14+60] = (df_3[m-13+30]+df_3[m-12+30]+df_3[m-11+30])/3
        for i in range(m-13,m):  ### 三次平滑
            df_3[i+60] = df_3[i+30] * ph_factor +(1-ph_factor) * df_3[i+60-1]
        a = 3 * df_3[m-1] - 3 * df_3[m-1+30] + df_3[m-1+60]
        b = ((6-5*ph_factor)*df_3[m-1]-2*(5-4*ph_factor)*df_3[m-1+30]+(4-3*ph_factor)*df_3[m-1+60]) * ph_factor / (2*(1-ph_factor)*(1-ph_factor))
        c = (df_3[m-1]-2*df_3[m-1+30]+df_3[m-1+60]) * ph_factor*ph_factor/(2*(1-ph_factor)*(1-ph_factor))
        df_1['a'] = a
        df_1['b'] = b
        df_1['c'] = c
        df_2 = pd.concat([df_2,df_1[['adcode', 'model','mt']+smooth_feat]])
    print(df_2.head(5))
    df = pd.merge(df,df_2,on=['adcode','model','mt'],how='left')
    return df,smooth_feat
        
get_smooth_feat(data)

       adcode  model  mt         a         b         c
17160  310000      0  14  5.777354 -0.018044 -0.011865
17161  530000      0  14  6.112306  0.095870  0.001347
17162  150000      0  14  5.235214 -0.043259 -0.009351
17163  110000      0  14  6.204539  0.031391 -0.006780
17164  510000      0  14  6.248989  0.032074 -0.003026


(       adcode  bodyType  forecastVolum    id  model province  regMonth  \
 0      310000         0            NaN     0      0       上海         1   
 1      530000         0            NaN     0      0       云南         1   
 2      150000         0            NaN     0      0      内蒙古         1   
 3      110000         0            NaN     0      0       北京         1   
 4      510000         0            NaN     0      0       四川         1   
 ...       ...       ...            ...   ...    ...      ...       ...   
 45227  350000         1            NaN  7300     67       福建         4   
 45228  210000         1            NaN  7301     67       辽宁         4   
 45229  500000         1            NaN  7302     67       重庆         4   
 45230  610000         1            NaN  7303     67       陕西         4   
 45231  230000         1            NaN  7304     67      黑龙江         4   
 
        regYear  salesVolume  popularity  carCommentVolum  newsReplyVolum  \
 0         2016      

In [6]:
#################### 将特征放入模型进行测评 ###########

In [7]:
#### 评价函数
def score(data, pred='pred_label', label='label', group='model'):
    data['pred_label'] = data['pred_label'].apply(lambda x: 0 if x < 0 else x).round().astype(int)
    data_agg = data.groupby('model').agg({
        pred:  list,
        label: [list, 'mean']
    }).reset_index()
    data_agg.columns = ['_'.join(col).strip() for col in data_agg.columns]
    nrmse_score = []
    for raw in data_agg[['{0}_list'.format(pred), '{0}_list'.format(label), '{0}_mean'.format(label)]].values:
        nrmse_score.append(
            mse(raw[0], raw[1]) ** 0.5 / raw[2]
        )
    print(1 - np.mean(nrmse_score))
    return 1 - np.mean(nrmse_score)	

#### 获得模型
def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb'):   
    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
                                max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2019,
                                n_estimators=2000, subsample=0.9, colsample_bytree=0.7,
                                )
        model.fit(train_x, train_y, 
              eval_set=[(train_x, train_y),(valid_x, valid_y)], 
              categorical_feature=cate_feat, 
              early_stopping_rounds=100, verbose=100)      
    elif m_type == 'xgb':
        model = xgb.XGBRegressor(
                                max_depth=5 , learning_rate=0.05, n_estimators=2000, 
                                objective='reg:gamma', tree_method = 'hist',subsample=0.9, 
                                colsample_bytree=0.7, min_child_samples=5,eval_metric = 'rmse' 
                                )
        model.fit(train_x, train_y, 
              eval_set=[(train_x, train_y),(valid_x, valid_y)], 
              early_stopping_rounds=100, verbose=100)   
    return model

def get_train_model(df_, m, m_type='lgb'):
    df = df_.copy()
    # 数据集划分
    st = 13  ### 只用17年之后的数据
    all_idx   = (df['mt'].between(st , m-1))
    train_idx = (df['mt'].between(st , m-4))
    valid_idx = (df['mt'].between(m-3, m-3))
    test_idx  = (df['mt'].between(m  , m  ))
    print('all_idx  :',st ,m-1)
    
    print('train_idx:',st ,m-4)
    print('valid_idx:',m-3,m-3)
    print('test_idx :',m  ,m  )  
    # 最终确认
    train_x = df[train_idx][features]
    train_y = df[train_idx]['label']
    valid_x = df[valid_idx][features]
    valid_y = df[valid_idx]['label']   
    # get model
    model = get_model_type(train_x,train_y,valid_x,valid_y,m_type)  
    # offline
    df['pred_label'] = model.predict(df[features])
    print(len(valid_idx))
    print(df[valid_idx].shape)
    try:
        best_score = score(df[valid_idx]) 
    except:
        pass
    # online
    if m_type == 'lgb':
        model.n_estimators = model.best_iteration_ + 100
        model.fit(df[all_idx][features], df[all_idx]['label'], categorical_feature=cate_feat)
    elif m_type == 'xgb':
        model.n_estimators = model.best_iteration + 100
        model.fit(df[all_idx][features], df[all_idx]['label'])
    df['forecastVolum'] = model.predict(df[features]) 
    print('valid mean:',np.exp(df[valid_idx]['pred_label'].mean()))
    print('true  mean:',np.exp(df[valid_idx]['label'].mean()))
    print('test  mean:',np.exp(df[test_idx]['forecastVolum'].mean()))
    # 阶段结果
    sub = df[test_idx][['id']]
    sub['forecastVolum'] = df[test_idx]['forecastVolum'].apply(lambda x: 0 if x < 0 else x)
    return sub,df[valid_idx]['pred_label']

In [8]:
for month in [25,26,27,28]: 
    m_type = 'lgb' 
    data_df_1, stat_feat = get_stat_feature(data)  ### 获得历史平移结果
    data_df_2, trend_feat = get_trend_feat(data_df_1)  ### 获得趋势特征
    MEAN = data_df_2.adcode_model_trend_year_div.mean()
    #print(sorted(data_df_2['adcode_model_trend_year_div'].unique()))
    data_df_2.loc[data_df_2.adcode_model_trend_year_div > 2.5 ,'adcode_model_trend_year_div'] = max(1.01,MEAN)
    data_df_2.loc[data_df_2.adcode_model_trend_year_div < 0.7 ,'adcode_model_trend_year_div'] = min(0.99,MEAN)
    MEAN = data_df_2.adcode_model_trend_half_year.mean()
    data_df_2.loc[data_df_2.adcode_model_trend_half_year > 2.5 ,'adcode_model_trend_half_year'] = max(1.01,MEAN)
    data_df_2.loc[data_df_2.adcode_model_trend_half_year < 0.7 ,'adcode_model_trend_half_year'] = min(0.99,MEAN)
    MEAN = data_df_2.adcode_model_trend_three_month.mean()
    data_df_2.loc[data_df_2.adcode_model_trend_three_month > 2.5 ,'adcode_model_trend_three_month'] = max(1.01,MEAN)
    data_df_2.loc[data_df_2.adcode_model_trend_three_month < 0.7 ,'adcode_model_trend_three_month'] = min(0.99,MEAN)
    MEAN = data_df_2.adcode_model_trend_two_month.mean()
    data_df_2.loc[data_df_2.adcode_model_trend_two_month > 2.5 ,'adcode_model_trend_two_month'] = max(1.01,MEAN)
    data_df_2.loc[data_df_2.adcode_model_trend_two_month < 0.7 ,'adcode_model_trend_two_month'] = min(0.99,MEAN)
    
#     for f in feat_list:
#         print(data_df_2[f].min(),data_df_2[f].max())
#         print(sorted(data_df_2[f].unique()))
#     0/0
#     feat_list = ['adcode_model_trend_year_div','adcode_model_trend_half_year','adcode_model_trend_three_month','adcode_model_trend_two_month']
#     for f in feat_list:
#         data_df_2.loc[data_df_2[f] > 9, f] = 1.01
#         data_df_2.loc[(data_df_2[f] <= 9) & (data_df_2[f] > 1.2), f] = np.sqrt(np.sqrt(data_df_2.loc[(data_df_2[f] <= 9) &\
#                                                                                                    (data_df_2[f] > 1.2), f]))
#         data_df_2.loc[data_df_2[f] < 0.4, f] = 0.99
#         data_df_2.loc[(data_df_2[f] <= 0.7) & (data_df_2[f] >= 0.4), f] = np.sqrt(data_df_2.loc[(data_df_2[f] <= 0.6) &\
#                                                                                                      (data_df_2[f] > 0.4), f])

    #print(sorted(data_df_2['adcode_model_trend_year_div'].unique()))
    data_df, smooth_feat = get_smooth_feat(data_df_2)  ### 获得趋势特征
    print(data_df.shape)
    #print(data_df.head(5))
    num_feat = ['regYear'] + stat_feat + trend_feat + smooth_feat#+trend_feat1+trend_feat2#+ stastic_feat   ### 年份特征+历史平移
    cate_feat = ['adcode','bodyType','model','regMonth']  ### 类别特征
    if m_type == 'lgb':
        for i in cate_feat:
            data_df[i] = data_df[i].astype('category')
    elif m_type == 'xgb':
        lbl = LabelEncoder()  
        for i in tqdm(cate_feat):
            data_df[i] = lbl.fit_transform(data_df[i].astype(str))
           
    features = num_feat + cate_feat ### 特征等于 数值特征 + 类别特征
    print(len(features), len(set(features)))   
    
    sub,val_pred = get_train_model(data_df, month, m_type)   
    print(sub.head(5))
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'salesVolume'] = sub['forecastVolum'].values*0.99
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'label'      ] = sub['forecastVolum'].values*0.99

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.07it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.97it/s]


       adcode  model  mt         a         b         c
17160  310000      0  14  5.777354 -0.018044 -0.011865
17161  530000      0  14  6.112306  0.095870  0.001347
17162  150000      0  14  5.235214 -0.043259 -0.009351
17163  110000      0  14  6.204539  0.031391 -0.006780
17164  510000      0  14  6.248989  0.032074 -0.003026
(45232, 61)
32 32
all_idx  : 13 24
train_idx: 13 21
valid_idx: 22 22
test_idx : 25 25
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.0392822	valid_1's l2: 0.0813264
[200]	training's l2: 0.0256018	valid_1's l2: 0.079604
[300]	training's l2: 0.0201341	valid_1's l2: 0.0787603
[400]	training's l2: 0.0167373	valid_1's l2: 0.0779493
[500]	training's l2: 0.0143367	valid_1's l2: 0.0777968
[600]	training's l2: 0.0124814	valid_1's l2: 0.0774659
[700]	training's l2: 0.0108639	valid_1's l2: 0.0773123
[800]	training's l2: 0.00956229	valid_1's l2: 0.0769737
[900]	training's l2: 0.00847194	valid_1's l2: 0.076859
Early stopping, best itera

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.20it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.37it/s]


       adcode  model  mt         a         b         c
17160  310000      0  14  5.777354 -0.018044 -0.011865
17161  530000      0  14  6.112306  0.095870  0.001347
17162  150000      0  14  5.235214 -0.043259 -0.009351
17163  110000      0  14  6.204539  0.031391 -0.006780
17164  510000      0  14  6.248989  0.032074 -0.003026
(45232, 61)
32 32
all_idx  : 13 25
train_idx: 13 22
valid_idx: 23 23
test_idx : 26 26
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.0406562	valid_1's l2: 0.0600421
Early stopping, best iteration is:
[64]	training's l2: 0.0553308	valid_1's l2: 0.0567942
45232
(1804, 62)
0.9359717261490854
valid mean: 289.1663159273284
true  mean: 314.2448985525272
test  mean: 93.40985372690129
         id  forecastVolum
43629  5702       0.410704
43782  5855       5.295335
43783  5856       5.097105
43784  5857       4.609061
43785  5858       3.988941


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.34it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.40it/s]


       adcode  model  mt         a         b         c
17160  310000      0  14  5.777354 -0.018044 -0.011865
17161  530000      0  14  6.112306  0.095870  0.001347
17162  150000      0  14  5.235214 -0.043259 -0.009351
17163  110000      0  14  6.204539  0.031391 -0.006780
17164  510000      0  14  6.248989  0.032074 -0.003026
(45232, 61)
32 32
all_idx  : 13 26
train_idx: 13 23
valid_idx: 24 24
test_idx : 27 27
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.0416155	valid_1's l2: 0.147885
[200]	training's l2: 0.0282917	valid_1's l2: 0.144418
[300]	training's l2: 0.0223131	valid_1's l2: 0.143996
Early stopping, best iteration is:
[294]	training's l2: 0.0225318	valid_1's l2: 0.143762
45232
(1804, 62)
0.9167397851099122
valid mean: 297.1627085751629
true  mean: 400.3399378558482
test  mean: 109.51604358547544
         id  forecastVolum
44265  6338       5.626663
44266  6339       5.347831
44267  6340       4.982661
44268  6341       4.238483
44269  6

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.27it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.30it/s]


       adcode  model  mt         a         b         c
17160  310000      0  14  5.777354 -0.018044 -0.011865
17161  530000      0  14  6.112306  0.095870  0.001347
17162  150000      0  14  5.235214 -0.043259 -0.009351
17163  110000      0  14  6.204539  0.031391 -0.006780
17164  510000      0  14  6.248989  0.032074 -0.003026
(45232, 61)
32 32
all_idx  : 13 27
train_idx: 13 24
valid_idx: 25 25
test_idx : 28 28
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.0437863	valid_1's l2: 0.0174192
[200]	training's l2: 0.0294657	valid_1's l2: 0.00944025
[300]	training's l2: 0.023478	valid_1's l2: 0.00665293
[400]	training's l2: 0.0199394	valid_1's l2: 0.00583789
[500]	training's l2: 0.0172779	valid_1's l2: 0.00467202
[600]	training's l2: 0.0152291	valid_1's l2: 0.00395371
[700]	training's l2: 0.0135173	valid_1's l2: 0.00366767
[800]	training's l2: 0.0121023	valid_1's l2: 0.00310386
[900]	training's l2: 0.0108926	valid_1's l2: 0.00286582
[1000]	training's l

In [9]:
print(sub.head(5))

         id  forecastVolum
43652  5725       1.137304
44749  6822       5.514633
44750  6823       5.156681
44751  6824       4.932480
44752  6825       4.065639


In [10]:


print(data.loc[(data.regMonth>=1)&(data.regYear==2018),'salesVolume'])
data['salesVolume1'] = np.exp(data.loc[(data.regMonth>=1)&(data.regYear==2018), 'salesVolume'])
sub = data.loc[(data.regMonth>=1)&(data.regYear==2018), ['id','regMonth','salesVolume1']]
sub.columns = ['id','regMonth','forecastVolum']
print(sub.groupby('regMonth')['forecastVolum'].mean())
sub[['id','regMonth','forecastVolum']].round().astype(int).to_csv('lgb_new.csv', index=False)
# regMonthexp1m
# 1    264.099377
# 2    159.295483
# 3    196.623678
# 4    164.711936

43296    5.757223
43297    5.554662
43298    5.088444
43299    4.476443
43300    6.098925
           ...   
45227    5.310678
45228    4.636859
45229    4.823340
45230    5.189195
45231    3.275468
Name: salesVolume, Length: 1936, dtype: float64
regMonth
1    264.099377
2    159.295483
3    196.623678
4    164.711936
Name: forecastVolum, dtype: float64


In [11]:
#print(data.loc[(data.regMonth>=1)&(data.regMonth<=4)&(data.regYear==2017)])
print(data.loc[(data.regMonth>=1)&(data.regMonth<=4)&(data.regYear==2017)].groupby('regMonth')['salesVolume'].mean())

print(data.loc[(data.regMonth>=1)&(data.regMonth<=4)&(data.regYear==2016)].groupby('regMonth')['salesVolume'].mean())

regMonth
1    517.026053
2    299.323725
3    453.738914
4    432.867517
Name: salesVolume, dtype: float64
regMonth
1    732.955100
2    334.764967
3    471.858093
4    456.740576
Name: salesVolume, dtype: float64
