<a href="https://colab.research.google.com/github/yuto-kobayashi-1/signate-AIQuest/blob/main/20210922_AIquest_StockPredict_store.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import lightgbm as lgb
from copy import deepcopy
import numpy as np
from datetime import datetime , date 
from sklearn.inspection import permutation_importance
from sklearn.model_selection import TimeSeriesSplit
from itertools import product

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
col_name = {'日付':'date',
          '店舗ID':'store_id',
          '商品ID':'goods_id',
          '商品価格':'price',
          '売上個数':'amount',
          '商品カテゴリID':'category_id'
          }

In [4]:
sales_data = pd.read_csv("/content/drive/MyDrive/signate/competiton/210919_AIquest/sales_history.csv",encoding="utf_8")
test_data = pd.read_csv("/content/drive/MyDrive/signate/competiton/210919_AIquest/test.csv",encoding="utf_8")

In [5]:
def preproces(df,col_name):
  df = df.rename(columns=col_name)
 
  # イレギュラーデータを削除する
  df = df[df['price']!=0]
  df = df[df['amount']>0]

  df['date'] = pd.to_datetime(df['date'])
  df['base_date'] = df['date'].dt.strftime("%Y%m")

  df['sales_amount'] = (df['price'] * df['amount']).astype('int')
 
  #月次単位にデータを集計
  df = df.drop(['price'],axis=1).groupby(['base_date','store_id','goods_id']).sum(['amount','sales_amount']).reset_index()
  df['av_price'] = (df['sales_amount'] /  df['amount'] )

  # df = df.groupby(['base_date','store_id','goods_id']).agg({'amount':'sum','price':'mean'}).reset_index()
  # df = df.rename(columns={'price':'av_price'})

   # 「日付」カラムの文字列から、「年」「月」の情報を抽出する
  df['year'] = df['base_date'].apply(lambda x: x[:4]) 
  df['month'] = df['base_date'].apply(lambda x: x[4:])
  
  #月ブロックの作成
  gp_time = df.groupby(['year', 'month']).count().reset_index()[['year', 'month']]
  # カラム名「月ブロック」として、通し番号をつける
  gp_time['base_date_count'] = list(range(len(gp_time)))

  # testデータ用に、評価対象期間である2019年12月(月ブロック: 23)のレコードを追加する
  gp_time = gp_time.append({
    'base_date_count': 23,
    'year': '2019',
    'month': '12'
  }, ignore_index=True)

  # 「月ブロック」カラムをsalesに統合する
  df = pd.merge(df, gp_time, on=['year', 'month'], how='left')

  all_combination = zero_padding_amount(df)

  all_combination = pd.merge(all_combination, gp_time, on=['base_date_count'], how='left')
  df = pd.merge(all_combination, df, on=['base_date_count', 'goods_id', 'store_id','year','month'], how='left')

  #発売後経過月数を算出
  tmp = df.groupby(['goods_id','base_date_count']).count().reset_index()[['goods_id','base_date_count']]
  tmp['num_of_month'] = tmp.groupby(['goods_id'])['base_date_count'].rank()
  df = df.merge(tmp,on=['goods_id','base_date_count'],how='left')
  
  # fillna_cols = ['amount','sales_amount']
  fillna_cols = ['amount']
  for col in fillna_cols:
    df[col] = df[col].fillna(0)

  # カテゴリーデータの登録
  category = pd.read_csv('/content/drive/MyDrive/signate/competiton/210919_AIquest/item_categories.csv')
  category = category.rename(columns=col_name)

  df = pd.merge(df,category,on=['goods_id'],how='left')

  return df , gp_time

In [6]:
def make_features(df):

  #グッズID単位の月間平均販売数
  tmp = df.groupby(['base_date_count','goods_id']).sum().reset_index()[['base_date_count','goods_id','amount']]
  tmp['mean_goods_amount'] = tmp['amount'] / len(df['store_id'].unique())
  df = pd.merge(df,tmp[['base_date_count','goods_id','mean_goods_amount']],on=['base_date_count','goods_id'],how='left')
  df['div_mean_goods_amount'] = df['amount'] - df['mean_goods_amount']
  df['div_mean_goods_amount'] = df['div_mean_goods_amount'].replace([np.inf, -np.inf], 0)

  #グッズID単位の平均価格
  tmp = df[df['av_price'] > 0] 
  tmp = tmp.groupby(['base_date_count','goods_id']).mean().reset_index()[['base_date_count','goods_id','av_price']]
  tmp['mean_av_price'] = tmp['av_price']
  df = pd.merge(df,tmp[['base_date_count','goods_id','mean_av_price']],on=['base_date_count','goods_id'],how='left')
  # df['div_mean_av_price'] = df['amount'] - df['mean_goods_amount']
  # df['div_mean_av_price'] = df['div_mean_goods_amount'].replace([np.inf, -np.inf], 0)

  #グッズID単位の平均価格との差
  df['diff_price'] = df['mean_av_price'] - df['av_price']

  #カテゴリID単位の月間平均販売数
  tmp = df_sales_data.groupby(['base_date_count','category_id']).sum().reset_index()[['base_date_count','category_id','amount']]
  tmp['mean_category_amount'] = tmp['amount'] / len(df['store_id'].unique())
  df = pd.merge(df,tmp[['base_date_count','category_id','mean_category_amount']],on=['base_date_count','category_id'],how='left')


  del tmp
  
  #移動平均作成
  df = df.sort_values(['base_date_count','goods_id','store_id'])
  period = [3,6,12]
  for i in period:
    # 販売個数の移動平均算出
    tmp = df[['goods_id','store_id','amount']].groupby(['store_id','goods_id'],group_keys=False).rolling(window=i).mean()['amount'].reset_index()
    df[f'mv{i}m_amount'] = tmp[['level_2','amount']].set_index('level_2')['amount']
    #移動平均乖離率の算出
    df[f'div_mv{i}m_amount'] = (df[f'mv{i}m_amount'] - df['amount'])/df['amount']
    df[f'div_mv{i}m_amount'] = df[f'div_mv{i}m_amount'].replace([np.inf, -np.inf], 0)
  
  
  #lag特徴量作成
  for i in range(2,13,1):
    lag = df.copy()
    lag['base_date_count'] = lag['base_date_count'] + i
    lag = lag.rename(columns={'amount': f'bf{i}m_amount'})
    lag = lag.rename(columns={'av_price': f'bf{i}m_av_price'})
    lag = lag.rename(columns={'mean_goods_amount': f'bf{i}m_mean_goods_amount'})
    lag = lag.rename(columns={'div_mean_goods_amount': f'bf{i}m_div_mean_goods_amount'})    

    lag = lag.rename(columns={'mean_category_amount': f'bf{i}m_mean_category_amount'})

    lag = lag.rename(columns={'mean_av_price': f'bf{i}m_mean_av_price'})
    lag = lag.rename(columns={'diff_price': f'bf{i}m_diff_price'})
    
    col = []
    for peripd in [3,6,12]:
      lag = lag.rename(columns={f'mv{peripd}m_amount': f'bf{i}m_mv{peripd}m_amount'})
      lag = lag.rename(columns={f'div_mv{peripd}m_amount': f'bf{i}m_div_mv{peripd}m_amount'})
      col.append(f'bf{i}m_mv{peripd}m_amount')
      col.append(f'bf{i}m_div_mv{peripd}m_amount')

    # lag = lag[['base_date_count', 'store_id', 'goods_id',f'bf{i}m_amount']
    lag = (lag[['base_date_count', 'store_id', 'goods_id'
            ,f'bf{i}m_amount',f'bf{i}m_av_price',f'bf{i}m_mean_goods_amount'
            ,f'bf{i}m_mean_category_amount',f'bf{i}m_div_mean_goods_amount'
            ,f'bf{i}m_mean_av_price',f'bf{i}m_diff_price'
            ]+col]
          )
    df = pd.merge(df, lag, on=['base_date_count', 'store_id', 'goods_id'], how='left', left_index=True)

  df = df.reset_index(drop=True)

 

  return df

In [46]:
def make_test_data(df_test,df_train,col_name):
  df_test = df_test.rename(columns=col_name)
  #初期値設定
  df_test['year'] = 2019
  df_test['month'] = 12
  df_test['amount'] = 0
  df_test['base_date_count'] = 23

  category = pd.read_csv('/content/drive/MyDrive/signate/competiton/210919_AIquest/item_categories.csv')
  category = category.rename(columns=col_name)
  df_test = pd.merge(df_test,category,on=['goods_id'],how='left')
  
  #traindata移動平均乖離率を算出
  #移動平均作成
  # df_train = df_train.sort_values(['base_date_count','goods_id','store_id'])

  period = [3,6,12]
  mv_col = []
  for i in period:
  #販売個数の移動平均用カラム追加
    mv_col.append(f'mv{i}m_amount')
  #移動平均乖離率のカラム追加
    mv_col.append(f'div_mv{i}m_amount')
 

  #店舗商品単位で過去販売数データを作成
  df_train = df_train[['store_id','goods_id','amount','base_date_count','av_price','mean_goods_amount','mean_category_amount','div_mean_goods_amount','mean_av_price','diff_price'] + mv_col]
  # print(df_train.columns)

  for i in range(2,13,1):
    lag = df_train.copy()
    lag['base_date_count'] = lag['base_date_count'] + i
    lag = lag.rename(columns={'amount': f'bf{i}m_amount'})
    lag = lag.rename(columns={'av_price': f'bf{i}m_av_price'})
    lag = lag.rename(columns={'mean_goods_amount': f'bf{i}m_mean_goods_amount'})
    lag = lag.rename(columns={'mean_category_amount': f'bf{i}m_mean_category_amount'})
    lag = lag.rename(columns={'div_mean_goods_amount': f'bf{i}m_div_mean_goods_amount'})  
    lag = lag.rename(columns={'mean_av_price': f'bf{i}m_mean_av_price'})  
    lag = lag.rename(columns={'diff_price': f'bf{i}m_diff_price'})

    col = []
    for peripd in [3,6,12]:
      lag = lag.rename(columns={f'mv{peripd}m_amount': f'bf{i}m_mv{peripd}m_amount'})
      lag = lag.rename(columns={f'div_mv{peripd}m_amount': f'bf{i}m_div_mv{peripd}m_amount'})
      col.append(f'bf{i}m_mv{peripd}m_amount')
      col.append(f'bf{i}m_div_mv{peripd}m_amount')

    # lag = lag[['base_date_count', 'store_id', 'goods_id',f'bf{i}m_amount']]
    lag = (lag[['base_date_count', 'store_id', 'goods_id'
            ,f'bf{i}m_amount',f'bf{i}m_av_price',f'bf{i}m_mean_goods_amount'
            ,f'bf{i}m_mean_category_amount',f'bf{i}m_div_mean_goods_amount'
            ,f'bf{i}m_mean_av_price',f'bf{i}m_diff_price'
            ]+col]
    )
  
    df_test = pd.merge(df_test, lag, on=['base_date_count', 'store_id', 'goods_id'], how='left', left_index=True)
  
  #発売経過月数追加
  tmp = df_test.groupby(['goods_id','base_date_count']).count().reset_index()[['goods_id','base_date_count']]
  tmp2 = df_train.groupby(['goods_id','base_date_count']).count().reset_index()[['goods_id','base_date_count']]
  tmp_m = pd.concat([tmp,tmp2])
  tmp_m['num_of_month'] = tmp_m.groupby(['goods_id'])['base_date_count'].rank()

  df_test = pd.merge(df_test,tmp_m,on=['base_date_count','goods_id'],how='left')
  del tmp,tmp2,tmp_m
  
  df_test = df_test.reset_index(drop=True)
  # #testデータに特徴量付与
  # df_test = df_test.merge(df_merge[['base_date_count', 'store_id', 'goods_id']+cols],how='left')

  return df_test

In [8]:
# def train(df,label_cols,features):
def trainCV(df,features):

  df = df.rename(columns = {'amount': 'y'})

  X =  df[features]
  y = df['y']

  for feature in features:
    X[feature] = X[feature].astype('float')

  # kf = KFold(n_splits=5,shuffle=True,random_state=0)
  kf = TimeSeriesSplit(n_splits=5)

  models = []
  for train_index, test_index in kf.split(X, y):
    X_train = X.iloc[train_index]
    y_train =  y.iloc[train_index]
    X_valid = X.iloc[test_index]
    y_valid = y.iloc[test_index]

    # X_train, y_train, X_valid, y_valid =  target_encording(X_train, y_train, X_valid, y_valid,label_cols)

    params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    # 'num_leaves': 64,
    # 'min_data_in_leaf': 20,
    # 'max_depth': 7,
    # 'verbose': 0,
  }

    train_data = lgb.Dataset(
      data=X_train, 
      label=y_train,
  )

    validation_data = lgb.Dataset(
      data=X_valid, 
      label=y_valid, 
  )

    model = lgb.train(
      params=params, 
      train_set=train_data, 
      num_boost_round=1000, 
      early_stopping_rounds=100,
      valid_sets=[validation_data],
  )

    models.append(model)

  return models 

In [9]:
# def train(df,label_cols,features):
def trainTS(df,features):

  df = df.rename(columns = {'amount': 'y'})

  idx_train={}
  idx_valid={}
  idx_train[0] = (df['base_date_count']<=9)
  idx_valid[0] = (df['base_date_count']==11)
  idx_train[1] = (df['base_date_count']<=19)
  idx_valid[1] = (df['base_date_count']==21)
  idx_train[2] = (df['base_date_count']>=12) & (df['base_date_count']<=19)
  idx_valid[2] = (df['base_date_count']==21)


  X =  df[features]
  y = df['y']

  for feature in features:
    X[feature] = X[feature].astype('float')

  # kf = KFold(n_splits=5,shuffle=True,random_state=0)
  # kf = TimeSeriesSplit(n_splits=5)

  models = []
  

  for i in range(len(idx_train)):
    print(i)
    X_train = X[idx_train[i]]
    y_train =  y[idx_train[i]]
    X_valid = X[idx_valid[i]]
    y_valid = y[idx_valid[i]]

    # X_train, y_train, X_valid, y_valid =  target_encording(X_train, y_train, X_valid, y_valid,label_cols)

    params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    # 'num_leaves': 64,
    # 'min_data_in_leaf': 20,
    # 'max_depth': 7,
    # 'verbose': 0,
  }

    train_data = lgb.Dataset(
      data=X_train, 
      label=y_train,
  )

    validation_data = lgb.Dataset(
      data=X_valid, 
      label=y_valid, 
  )

    model = lgb.train(
      params=params, 
      train_set=train_data, 
      num_boost_round=1000, 
      early_stopping_rounds=100,
      valid_sets=[validation_data],
  )

    models.append(model)

  return models 

In [10]:
def feature_importance(models,data,cols):
  importance = pd.DataFrame(columns=data[cols].columns)

  for model in models:
    imp_ = pd.Series(model.feature_importance(importance_type='gain'),index=data[cols].columns)
    importance = importance.append(imp_,ignore_index=True)

  importance = importance.mean()

  importance = pd.DataFrame(importance,columns={"importance"}).sort_values("importance",ascending=False)

  importance = pd.DataFrame(importance/importance.sum())
  return importance

In [11]:
def zero_padding_amount(df):
  all_combination = []

  # '月'、'商品ID'、 '店舗ID'の全組み合わせを作成する
  columns = ['base_date_count', 'goods_id', 'store_id']

  #product関数は指定された全組み合わせを作成する関数
  for i in range(22):
      train_block = df[df['base_date_count']==i]
      all_combination.append(np.array(list(product([i], train_block['goods_id'].unique(), train_block['store_id'].unique()))))
    
  all_combination = pd.DataFrame(np.vstack(all_combination), columns=columns)
  all_combination.sort_values(columns, inplace=True)

  return all_combination

前処理

In [12]:
df_sales_data , gp_time = preproces(sales_data,col_name)

In [44]:
df_sales_data.head()

Unnamed: 0,base_date_count,goods_id,store_id,year,month,base_date,amount,sales_amount,av_price,num_of_month,category_id
0,0,1000001,0,2018,1,201801.0,6.0,2520.0,420.0,1.0,100
1,0,1000001,1,2018,1,201801.0,2.0,650.0,325.0,1.0,100
2,0,1000001,2,2018,1,201801.0,1.0,420.0,420.0,1.0,100
3,0,1000001,3,2018,1,201801.0,2.0,840.0,420.0,1.0,100
4,0,1000001,4,2018,1,,0.0,,,1.0,100


In [15]:
df_train = make_features(df_sales_data)

In [16]:
df_train[(df_train['goods_id'].isin([1000001]))&(df_train['store_id'].isin([0]))][['base_date_count','goods_id','store_id','av_price','mean_av_price','bf2m_diff_price']]

Unnamed: 0,base_date_count,goods_id,store_id,av_price,mean_av_price,bf2m_diff_price
0,0,1000001,0,420.0,403.571429,
83124,1,1000001,0,356.666667,388.333333,
167076,2,1000001,0,420.0,420.0,-16.428571
252144,3,1000001,0,420.0,406.428571,31.666667
335664,4,1000001,0,,398.409091,0.0
418878,5,1000001,0,420.0,401.777778,-13.571429
503424,6,1000001,0,420.0,403.235294,
586656,7,1000001,0,420.0,406.428571,-18.222222
667116,8,1000001,0,420.0,420.0,-16.764706
745362,9,1000001,0,,417.948718,-13.571429


In [17]:
# df_train[(df_train['goods_id'].isin([1000001]))&(df_train['store_id'].isin([0]))][['base_date_count','goods_id','store_id','amount','mv3m_amount','div_mv3m_amount','bf2m_mv3m_amount','bf2m_div_mv3m_amount','div_mean_goods_amount','bf2m_div_mean_goods_amount','amount']]

In [47]:
df_test = make_test_data(test_data,df_train,col_name)

In [64]:
df_test.head()

Unnamed: 0,index,goods_id,store_id,year,month,amount,base_date_count,category_id,bf2m_amount,bf2m_av_price,bf2m_mean_goods_amount,bf2m_mean_category_amount,bf2m_div_mean_goods_amount,bf2m_mean_av_price,bf2m_diff_price,bf2m_mv3m_amount,bf2m_div_mv3m_amount,bf2m_mv6m_amount,bf2m_div_mv6m_amount,bf2m_mv12m_amount,bf2m_div_mv12m_amount,bf3m_amount,bf3m_av_price,bf3m_mean_goods_amount,bf3m_mean_category_amount,bf3m_div_mean_goods_amount,bf3m_mean_av_price,bf3m_diff_price,bf3m_mv3m_amount,bf3m_div_mv3m_amount,bf3m_mv6m_amount,bf3m_div_mv6m_amount,bf3m_mv12m_amount,bf3m_div_mv12m_amount,bf4m_amount,bf4m_av_price,bf4m_mean_goods_amount,bf4m_mean_category_amount,bf4m_div_mean_goods_amount,bf4m_mean_av_price,...,bf10m_amount,bf10m_av_price,bf10m_mean_goods_amount,bf10m_mean_category_amount,bf10m_div_mean_goods_amount,bf10m_mean_av_price,bf10m_diff_price,bf10m_mv3m_amount,bf10m_div_mv3m_amount,bf10m_mv6m_amount,bf10m_div_mv6m_amount,bf10m_mv12m_amount,bf10m_div_mv12m_amount,bf11m_amount,bf11m_av_price,bf11m_mean_goods_amount,bf11m_mean_category_amount,bf11m_div_mean_goods_amount,bf11m_mean_av_price,bf11m_diff_price,bf11m_mv3m_amount,bf11m_div_mv3m_amount,bf11m_mv6m_amount,bf11m_div_mv6m_amount,bf11m_mv12m_amount,bf11m_div_mv12m_amount,bf12m_amount,bf12m_av_price,bf12m_mean_goods_amount,bf12m_mean_category_amount,bf12m_div_mean_goods_amount,bf12m_mean_av_price,bf12m_diff_price,bf12m_mv3m_amount,bf12m_div_mv3m_amount,bf12m_mv6m_amount,bf12m_div_mv6m_amount,bf12m_mv12m_amount,bf12m_div_mv12m_amount,num_of_month
0,0,1000001,0,2019,12,0,23,100,0.0,,1.555556,406.833333,-1.555556,237.0,,0.333333,0.0,0.5,0.0,0.666667,0.0,1.0,250.0,0.833333,375.444444,0.166667,250.0,0.0,1.0,0.0,0.5,-0.5,0.666667,-0.333333,0.0,,1.944444,513.388889,-1.944444,237.0,...,1.0,420.0,1.444444,648.722222,-0.444444,415.681818,-4.318182,1.0,0.0,1.0,0.0,1.5,0.5,2.0,420.0,1.277778,726.444444,0.722222,420.0,0.0,1.333333,-0.333333,1.166667,-0.416667,1.666667,-0.166667,0.0,,2.111111,931.944444,-2.111111,420.0,,0.666667,0.0,1.333333,0.0,2.0,0.0,23.0
1,1,1000001,1,2019,12,0,23,100,0.0,,1.555556,406.833333,-1.555556,237.0,,0.0,,0.0,,0.0,,0.0,,0.833333,375.444444,-0.833333,250.0,,0.0,,0.0,,0.083333,0.0,0.0,,1.944444,513.388889,-1.944444,237.0,...,0.0,,1.444444,648.722222,-1.444444,415.681818,,0.0,,0.166667,0.0,0.5,0.0,0.0,,1.277778,726.444444,-1.277778,420.0,,0.0,,0.166667,0.0,0.583333,0.0,0.0,,2.111111,931.944444,-2.111111,420.0,,0.333333,0.0,0.333333,0.0,0.75,0.0,23.0
2,2,1000001,2,2019,12,0,23,100,2.0,250.0,1.555556,406.833333,0.444444,237.0,-13.0,1.666667,-0.166667,1.166667,-0.416667,1.0,-0.5,0.0,,0.833333,375.444444,-0.833333,250.0,,1.0,0.0,1.0,0.0,0.916667,0.0,3.0,250.0,1.944444,513.388889,1.055556,237.0,...,2.0,420.0,1.444444,648.722222,0.555556,415.681818,-4.318182,0.666667,-0.666667,1.0,-0.5,1.416667,-0.291667,0.0,,1.277778,726.444444,-1.277778,420.0,,0.333333,0.0,1.333333,0.0,1.25,0.0,0.0,,2.111111,931.944444,-2.111111,420.0,,0.666667,0.0,1.833333,0.0,1.333333,0.0,23.0
3,3,1000001,3,2019,12,0,23,100,2.0,250.0,1.555556,406.833333,0.444444,237.0,-13.0,3.666667,0.833333,4.166667,1.083333,4.416667,1.208333,3.0,250.0,0.833333,375.444444,2.166667,250.0,0.0,4.666667,0.555556,4.166667,0.388889,4.833333,0.611111,6.0,250.0,1.944444,513.388889,4.055556,237.0,...,2.0,420.0,1.444444,648.722222,0.555556,415.681818,-4.318182,5.333333,1.666667,6.5,2.25,5.25,1.625,5.0,420.0,1.277778,726.444444,3.722222,420.0,0.0,7.0,0.4,7.166667,0.433333,5.083333,0.016667,9.0,420.0,2.111111,931.944444,6.888889,420.0,0.0,7.666667,-0.148148,7.0,-0.222222,4.833333,-0.462963,23.0
4,4,1000001,4,2019,12,0,23,100,12.0,250.0,1.555556,406.833333,10.444444,237.0,-13.0,11.333333,-0.055556,9.0,-0.25,7.25,-0.395833,8.0,250.0,0.833333,375.444444,7.166667,250.0,0.0,10.333333,0.291667,7.5,-0.0625,6.75,-0.15625,14.0,250.0,1.944444,513.388889,12.055556,237.0,...,8.0,420.0,1.444444,648.722222,6.555556,415.681818,-4.318182,6.333333,-0.208333,5.333333,-0.333333,4.083333,-0.489583,4.0,420.0,1.277778,726.444444,2.722222,420.0,0.0,5.333333,0.333333,4.333333,0.083333,3.583333,-0.104167,7.0,420.0,2.111111,931.944444,4.888889,420.0,0.0,6.0,-0.142857,4.666667,-0.333333,3.25,-0.535714,23.0


In [65]:
print(len(df_sales_data))
print(len(df_train))
print(len(test_data))
print(len(df_test))

1647990
1647990
3060
3060


In [66]:
#特徴量編集結果確認
i = 5
print(df_train.groupby('base_date_count').agg({'bf2m_amount': 'count','bf2m_av_price': 'count',f'bf{i}m_amount': 'count',f'bf{i}m_av_price': 'count'}))
print(df_test.groupby('base_date_count').agg({'bf2m_amount': 'count','bf2m_av_price': 'count',f'bf{i}m_amount': 'count',f'bf{i}m_av_price': 'count'}))
# print(df_train.groupby('base_date_count').agg({'bf2m_amount': 'count',f'bf{i}m_amount': 'count'}))
# print(df_test.groupby('base_date_count').agg({'bf2m_amount': 'count',f'bf{i}m_amount': 'count'}))

                 bf2m_amount  bf2m_av_price  bf5m_amount  bf5m_av_price
base_date_count                                                        
0                          0              0            0              0
1                          0              0            0              0
2                      70920          25923            0              0
3                      69660          24215            0              0
4                      70686          25951            0              0
5                      70542          23198        64674          24422
6                      69660          22149        64368          22941
7                      67158          22678        61866          23816
8                      66006          23884        60372          21190
9                      64296          23742        58428          19886
10                     63864          21092        59904          21004
11                     65682          21130        60552        

モデル構築

In [67]:
lag = []
#2～6か月前と12か月前の特徴量を利用する
month = [x for x in range(2,7,1)]
month.append(12)
for i in month:
  lag.append(f'bf{i}m_amount')
  lag.append(f'bf{i}m_av_price')
  lag.append(f'bf{i}m_mean_goods_amount')
  # lag.append(f'bf{i}m_mean_av_price')  
  # lag.append(f'bf{i}m_diff_price')
  # lag.append(f'bf{i}m_mean_category_amount')
  # lag.append(f'bf{i}m_div_mean_goods_amount')

  for peripd in [3,6,12]:
    lag.append(f'bf{i}m_mv{peripd}m_amount')
    # lag.append(f'bf{i}m_div_mv{peripd}m_amount')

features = ['store_id','goods_id','category_id','year','month','num_of_month'] + lag

In [68]:
features

['store_id',
 'goods_id',
 'category_id',
 'year',
 'month',
 'num_of_month',
 'bf2m_amount',
 'bf2m_av_price',
 'bf2m_mean_goods_amount',
 'bf2m_mv3m_amount',
 'bf2m_mv6m_amount',
 'bf2m_mv12m_amount',
 'bf3m_amount',
 'bf3m_av_price',
 'bf3m_mean_goods_amount',
 'bf3m_mv3m_amount',
 'bf3m_mv6m_amount',
 'bf3m_mv12m_amount',
 'bf4m_amount',
 'bf4m_av_price',
 'bf4m_mean_goods_amount',
 'bf4m_mv3m_amount',
 'bf4m_mv6m_amount',
 'bf4m_mv12m_amount',
 'bf5m_amount',
 'bf5m_av_price',
 'bf5m_mean_goods_amount',
 'bf5m_mv3m_amount',
 'bf5m_mv6m_amount',
 'bf5m_mv12m_amount',
 'bf6m_amount',
 'bf6m_av_price',
 'bf6m_mean_goods_amount',
 'bf6m_mv3m_amount',
 'bf6m_mv6m_amount',
 'bf6m_mv12m_amount',
 'bf12m_amount',
 'bf12m_av_price',
 'bf12m_mean_goods_amount',
 'bf12m_mv3m_amount',
 'bf12m_mv6m_amount',
 'bf12m_mv12m_amount']

店舗ごとにモデル作成

In [80]:
store_ids = [x  for x in range(18)]
df_trains = {}
for store_id in store_ids:
  df_trains[store_id] = df_train[df_train['store_id']==store_id]

In [83]:
models={}
for store_id in store_ids:
  models[store_id] = trainTS(df_trains[store_id],features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
[1]	valid_0's rmse: 4.72721
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 4.66679
[3]	valid_0's rmse: 4.61863
[4]	valid_0's rmse: 4.57327
[5]	valid_0's rmse: 4.54202
[6]	valid_0's rmse: 4.50576
[7]	valid_0's rmse: 4.47657
[8]	valid_0's rmse: 4.45722
[9]	valid_0's rmse: 4.43527
[10]	valid_0's rmse: 4.41871
[11]	valid_0's rmse: 4.40496
[12]	valid_0's rmse: 4.39389
[13]	valid_0's rmse: 4.38673
[14]	valid_0's rmse: 4.37813
[15]	valid_0's rmse: 4.36636
[16]	valid_0's rmse: 4.36261
[17]	valid_0's rmse: 4.35729
[18]	valid_0's rmse: 4.3517
[19]	valid_0's rmse: 4.35104
[20]	valid_0's rmse: 4.34499
[21]	valid_0's rmse: 4.33695
[22]	valid_0's rmse: 4.33333
[23]	valid_0's rmse: 4.33353
[24]	valid_0's rmse: 4.33247
[25]	valid_0's rmse: 4.33158
[26]	valid_0's rmse: 4.33142
[27]	valid_0's rmse: 4.33064
[28]	valid_0's rmse: 4.32588
[29]	valid_0's rmse: 4.32124
[30]	valid_0's rmse: 4.31002
[31]	valid_0's rmse: 4.31007
[32]	valid_0's rmse: 4.30175
[33]	valid_0's rm

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
[1]	valid_0's rmse: 2.13481
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 2.09704
[3]	valid_0's rmse: 2.06725
[4]	valid_0's rmse: 2.03949
[5]	valid_0's rmse: 2.01642
[6]	valid_0's rmse: 1.99599
[7]	valid_0's rmse: 1.97838
[8]	valid_0's rmse: 1.96589
[9]	valid_0's rmse: 1.95152
[10]	valid_0's rmse: 1.94106
[11]	valid_0's rmse: 1.9306
[12]	valid_0's rmse: 1.92541
[13]	valid_0's rmse: 1.91457
[14]	valid_0's rmse: 1.90723
[15]	valid_0's rmse: 1.90109
[16]	valid_0's rmse: 1.89838
[17]	valid_0's rmse: 1.89694
[18]	valid_0's rmse: 1.89341
[19]	valid_0's rmse: 1.89058
[20]	valid_0's rmse: 1.88554
[21]	valid_0's rmse: 1.88658
[22]	valid_0's rmse: 1.88298
[23]	valid_0's rmse: 1.88546
[24]	valid_0's rmse: 1.8845
[25]	valid_0's rmse: 1.88461
[26]	valid_0's rmse: 1.88325
[27]	valid_0's rmse: 1.88094
[28]	valid_0's rmse: 1.87705
[29]	valid_0's rmse: 1.87568
[30]	valid_0's rmse: 1.87523
[31]	valid_0's rmse: 1.8755
[32]	valid_0's rmse: 1.87416
[33]	valid_0's rmse

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[1]	valid_0's rmse: 2.79978
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 2.75838
[3]	valid_0's rmse: 2.71852
[4]	valid_0's rmse: 2.68491
[5]	valid_0's rmse: 2.65488
[6]	valid_0's rmse: 2.63362
[7]	valid_0's rmse: 2.61397
[8]	valid_0's rmse: 2.59566
[9]	valid_0's rmse: 2.58161
[10]	valid_0's rmse: 2.5728
[11]	valid_0's rmse: 2.56554
[12]	valid_0's rmse: 2.55416
[13]	valid_0's rmse: 2.54444
[14]	valid_0's rmse: 2.54057
[15]	valid_0's rmse: 2.53583
[16]	valid_0's rmse: 2.52876
[17]	valid_0's rmse: 2.52631
[18]	valid_0's rmse: 2.52631
[19]	valid_0's rmse: 2.52195
[20]	valid_0's rmse: 2.52044
[21]	valid_0's rmse: 2.51894
[22]	valid_0's rmse: 2.5157
[23]	valid_0's rmse: 2.51449
[24]	valid_0's rmse: 2.51474
[25]	valid_0's rmse: 2.51325
[26]	valid_0's rmse: 2.50597
[27]	valid_0's rmse: 2.49697
[28]	valid_0's rmse: 2.49452
[29]	valid_0's rmse: 2.49544
[30]	valid_0's rmse: 2.49501
[31]	valid_0's rmse: 2.49469
[32]	valid_0's rmse: 2.4957
[33]	valid_0's rmse: 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
[1]	valid_0's rmse: 4.66648
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 4.59598
[3]	valid_0's rmse: 4.53871
[4]	valid_0's rmse: 4.48885
[5]	valid_0's rmse: 4.44495
[6]	valid_0's rmse: 4.40237
[7]	valid_0's rmse: 4.36782
[8]	valid_0's rmse: 4.33476
[9]	valid_0's rmse: 4.3133
[10]	valid_0's rmse: 4.30092
[11]	valid_0's rmse: 4.28058
[12]	valid_0's rmse: 4.26191
[13]	valid_0's rmse: 4.25355
[14]	valid_0's rmse: 4.24751
[15]	valid_0's rmse: 4.23875
[16]	valid_0's rmse: 4.22745
[17]	valid_0's rmse: 4.21889
[18]	valid_0's rmse: 4.20582
[19]	valid_0's rmse: 4.20458
[20]	valid_0's rmse: 4.19656
[21]	valid_0's rmse: 4.19306
[22]	valid_0's rmse: 4.19284
[23]	valid_0's rmse: 4.18615
[24]	valid_0's rmse: 4.18572
[25]	valid_0's rmse: 4.18306
[26]	valid_0's rmse: 4.18258
[27]	valid_0's rmse: 4.17849
[28]	valid_0's rmse: 4.17786
[29]	valid_0's rmse: 4.17277
[30]	valid_0's rmse: 4.17502
[31]	valid_0's rmse: 4.17485
[32]	valid_0's rmse: 4.17203
[33]	valid_0's rm

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
[1]	valid_0's rmse: 3.71735
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 3.64729
[3]	valid_0's rmse: 3.58357
[4]	valid_0's rmse: 3.53298
[5]	valid_0's rmse: 3.49049
[6]	valid_0's rmse: 3.44874
[7]	valid_0's rmse: 3.42143
[8]	valid_0's rmse: 3.38853
[9]	valid_0's rmse: 3.36954
[10]	valid_0's rmse: 3.34503
[11]	valid_0's rmse: 3.32484
[12]	valid_0's rmse: 3.31038
[13]	valid_0's rmse: 3.29852
[14]	valid_0's rmse: 3.28583
[15]	valid_0's rmse: 3.27098
[16]	valid_0's rmse: 3.26101
[17]	valid_0's rmse: 3.25439
[18]	valid_0's rmse: 3.25037
[19]	valid_0's rmse: 3.24243
[20]	valid_0's rmse: 3.23911
[21]	valid_0's rmse: 3.23226
[22]	valid_0's rmse: 3.22886
[23]	valid_0's rmse: 3.22202
[24]	valid_0's rmse: 3.21991
[25]	valid_0's rmse: 3.21358
[26]	valid_0's rmse: 3.20862
[27]	valid_0's rmse: 3.20468
[28]	valid_0's rmse: 3.2026
[29]	valid_0's rmse: 3.20489
[30]	valid_0's rmse: 3.20171
[31]	valid_0's rmse: 3.20097
[32]	valid_0's rmse: 3.19842
[33]	valid_0's rm

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[1]	valid_0's rmse: 3.20366
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 3.15304
[3]	valid_0's rmse: 3.11442
[4]	valid_0's rmse: 3.07995
[5]	valid_0's rmse: 3.05124
[6]	valid_0's rmse: 3.02864
[7]	valid_0's rmse: 3.00423
[8]	valid_0's rmse: 2.98481
[9]	valid_0's rmse: 2.96744
[10]	valid_0's rmse: 2.95371
[11]	valid_0's rmse: 2.94304
[12]	valid_0's rmse: 2.92743
[13]	valid_0's rmse: 2.91833
[14]	valid_0's rmse: 2.91249
[15]	valid_0's rmse: 2.8995
[16]	valid_0's rmse: 2.89315
[17]	valid_0's rmse: 2.88739
[18]	valid_0's rmse: 2.87952
[19]	valid_0's rmse: 2.87478
[20]	valid_0's rmse: 2.87386
[21]	valid_0's rmse: 2.86545
[22]	valid_0's rmse: 2.86247
[23]	valid_0's rmse: 2.8616
[24]	valid_0's rmse: 2.86089
[25]	valid_0's rmse: 2.86092
[26]	valid_0's rmse: 2.85628
[27]	valid_0's rmse: 2.85756
[28]	valid_0's rmse: 2.853
[29]	valid_0's rmse: 2.85089
[30]	valid_0's rmse: 2.8506
[31]	valid_0's rmse: 2.84954
[32]	valid_0's rmse: 2.85108
[33]	valid_0's rmse: 2.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
[1]	valid_0's rmse: 5.63115
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 5.53586
[3]	valid_0's rmse: 5.45861
[4]	valid_0's rmse: 5.4005
[5]	valid_0's rmse: 5.33229
[6]	valid_0's rmse: 5.28157
[7]	valid_0's rmse: 5.23752
[8]	valid_0's rmse: 5.20274
[9]	valid_0's rmse: 5.18151
[10]	valid_0's rmse: 5.16466
[11]	valid_0's rmse: 5.14878
[12]	valid_0's rmse: 5.1298
[13]	valid_0's rmse: 5.11112
[14]	valid_0's rmse: 5.10521
[15]	valid_0's rmse: 5.09156
[16]	valid_0's rmse: 5.07155
[17]	valid_0's rmse: 5.06462
[18]	valid_0's rmse: 5.06675
[19]	valid_0's rmse: 5.05692
[20]	valid_0's rmse: 5.04302
[21]	valid_0's rmse: 5.03488
[22]	valid_0's rmse: 5.02039
[23]	valid_0's rmse: 5.0231
[24]	valid_0's rmse: 5.01418
[25]	valid_0's rmse: 5.00599
[26]	valid_0's rmse: 5.01066
[27]	valid_0's rmse: 5.00442
[28]	valid_0's rmse: 5.00362
[29]	valid_0's rmse: 4.99875
[30]	valid_0's rmse: 5.00534
[31]	valid_0's rmse: 5.0021
[32]	valid_0's rmse: 4.99988
[33]	valid_0's rmse:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
[1]	valid_0's rmse: 5.86475
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 5.7668
[3]	valid_0's rmse: 5.70492
[4]	valid_0's rmse: 5.63386
[5]	valid_0's rmse: 5.58269
[6]	valid_0's rmse: 5.54995
[7]	valid_0's rmse: 5.51872
[8]	valid_0's rmse: 5.47403
[9]	valid_0's rmse: 5.45099
[10]	valid_0's rmse: 5.43488
[11]	valid_0's rmse: 5.42086
[12]	valid_0's rmse: 5.40086
[13]	valid_0's rmse: 5.37196
[14]	valid_0's rmse: 5.35744
[15]	valid_0's rmse: 5.35006
[16]	valid_0's rmse: 5.34573
[17]	valid_0's rmse: 5.33761
[18]	valid_0's rmse: 5.33087
[19]	valid_0's rmse: 5.31843
[20]	valid_0's rmse: 5.30261
[21]	valid_0's rmse: 5.29772
[22]	valid_0's rmse: 5.294
[23]	valid_0's rmse: 5.28928
[24]	valid_0's rmse: 5.2852
[25]	valid_0's rmse: 5.27596
[26]	valid_0's rmse: 5.27488
[27]	valid_0's rmse: 5.26964
[28]	valid_0's rmse: 5.26514
[29]	valid_0's rmse: 5.26143
[30]	valid_0's rmse: 5.2621
[31]	valid_0's rmse: 5.25944
[32]	valid_0's rmse: 5.25762
[33]	valid_0's rmse: 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[1]	valid_0's rmse: 1.90529
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 1.87326
[3]	valid_0's rmse: 1.84833
[4]	valid_0's rmse: 1.82868
[5]	valid_0's rmse: 1.8093
[6]	valid_0's rmse: 1.79694
[7]	valid_0's rmse: 1.78445
[8]	valid_0's rmse: 1.77653
[9]	valid_0's rmse: 1.77041
[10]	valid_0's rmse: 1.76336
[11]	valid_0's rmse: 1.75413
[12]	valid_0's rmse: 1.74612
[13]	valid_0's rmse: 1.74221
[14]	valid_0's rmse: 1.73714
[15]	valid_0's rmse: 1.73559
[16]	valid_0's rmse: 1.7319
[17]	valid_0's rmse: 1.72912
[18]	valid_0's rmse: 1.72616
[19]	valid_0's rmse: 1.72118
[20]	valid_0's rmse: 1.72299
[21]	valid_0's rmse: 1.72131
[22]	valid_0's rmse: 1.72133
[23]	valid_0's rmse: 1.72046
[24]	valid_0's rmse: 1.71878
[25]	valid_0's rmse: 1.71675
[26]	valid_0's rmse: 1.71543
[27]	valid_0's rmse: 1.71595
[28]	valid_0's rmse: 1.71596
[29]	valid_0's rmse: 1.71665
[30]	valid_0's rmse: 1.71514
[31]	valid_0's rmse: 1.71598
[32]	valid_0's rmse: 1.71323
[33]	valid_0's rmse:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
[1]	valid_0's rmse: 3.77427
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 3.71405
[3]	valid_0's rmse: 3.64256
[4]	valid_0's rmse: 3.60325
[5]	valid_0's rmse: 3.51078
[6]	valid_0's rmse: 3.48171
[7]	valid_0's rmse: 3.4382
[8]	valid_0's rmse: 3.43601
[9]	valid_0's rmse: 3.42579
[10]	valid_0's rmse: 3.42807
[11]	valid_0's rmse: 3.43272
[12]	valid_0's rmse: 3.43292
[13]	valid_0's rmse: 3.43617
[14]	valid_0's rmse: 3.44841
[15]	valid_0's rmse: 3.45265
[16]	valid_0's rmse: 3.45708
[17]	valid_0's rmse: 3.48082
[18]	valid_0's rmse: 3.49978
[19]	valid_0's rmse: 3.50761
[20]	valid_0's rmse: 3.51777
[21]	valid_0's rmse: 3.52684
[22]	valid_0's rmse: 3.53722
[23]	valid_0's rmse: 3.55257
[24]	valid_0's rmse: 3.56203
[25]	valid_0's rmse: 3.55728
[26]	valid_0's rmse: 3.56494
[27]	valid_0's rmse: 3.56191
[28]	valid_0's rmse: 3.56804
[29]	valid_0's rmse: 3.5865
[30]	valid_0's rmse: 3.5927
[31]	valid_0's rmse: 3.60091
[32]	valid_0's rmse: 3.6192
[33]	valid_0's rmse:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
[1]	valid_0's rmse: 5.23843
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 5.15134
[3]	valid_0's rmse: 5.08606
[4]	valid_0's rmse: 5.03442
[5]	valid_0's rmse: 4.98485
[6]	valid_0's rmse: 4.93537
[7]	valid_0's rmse: 4.90171
[8]	valid_0's rmse: 4.87154
[9]	valid_0's rmse: 4.84702
[10]	valid_0's rmse: 4.82382
[11]	valid_0's rmse: 4.80667
[12]	valid_0's rmse: 4.78077
[13]	valid_0's rmse: 4.76871
[14]	valid_0's rmse: 4.75479
[15]	valid_0's rmse: 4.74715
[16]	valid_0's rmse: 4.7425
[17]	valid_0's rmse: 4.73848
[18]	valid_0's rmse: 4.72224
[19]	valid_0's rmse: 4.72279
[20]	valid_0's rmse: 4.71715
[21]	valid_0's rmse: 4.71407
[22]	valid_0's rmse: 4.70348
[23]	valid_0's rmse: 4.70235
[24]	valid_0's rmse: 4.69553
[25]	valid_0's rmse: 4.69706
[26]	valid_0's rmse: 4.68912
[27]	valid_0's rmse: 4.68781
[28]	valid_0's rmse: 4.68293
[29]	valid_0's rmse: 4.68575
[30]	valid_0's rmse: 4.68314
[31]	valid_0's rmse: 4.68281
[32]	valid_0's rmse: 4.6789
[33]	valid_0's rms

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
[1]	valid_0's rmse: 2.61575
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 2.58071
[3]	valid_0's rmse: 2.54839
[4]	valid_0's rmse: 2.52097
[5]	valid_0's rmse: 2.49948
[6]	valid_0's rmse: 2.47934
[7]	valid_0's rmse: 2.46391
[8]	valid_0's rmse: 2.45009
[9]	valid_0's rmse: 2.43881
[10]	valid_0's rmse: 2.43065
[11]	valid_0's rmse: 2.42548
[12]	valid_0's rmse: 2.41805
[13]	valid_0's rmse: 2.41028
[14]	valid_0's rmse: 2.40282
[15]	valid_0's rmse: 2.39691
[16]	valid_0's rmse: 2.3943
[17]	valid_0's rmse: 2.38875
[18]	valid_0's rmse: 2.38194
[19]	valid_0's rmse: 2.37978
[20]	valid_0's rmse: 2.37732
[21]	valid_0's rmse: 2.37434
[22]	valid_0's rmse: 2.37044
[23]	valid_0's rmse: 2.369
[24]	valid_0's rmse: 2.36815
[25]	valid_0's rmse: 2.367
[26]	valid_0's rmse: 2.36346
[27]	valid_0's rmse: 2.35937
[28]	valid_0's rmse: 2.35979
[29]	valid_0's rmse: 2.35806
[30]	valid_0's rmse: 2.35386
[31]	valid_0's rmse: 2.35199
[32]	valid_0's rmse: 2.34935
[33]	valid_0's rmse: 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
[1]	valid_0's rmse: 1.90897
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 1.88741
[3]	valid_0's rmse: 1.87362
[4]	valid_0's rmse: 1.86174
[5]	valid_0's rmse: 1.8502
[6]	valid_0's rmse: 1.84264
[7]	valid_0's rmse: 1.83662
[8]	valid_0's rmse: 1.83157
[9]	valid_0's rmse: 1.82807
[10]	valid_0's rmse: 1.82498
[11]	valid_0's rmse: 1.82253
[12]	valid_0's rmse: 1.81908
[13]	valid_0's rmse: 1.81499
[14]	valid_0's rmse: 1.81003
[15]	valid_0's rmse: 1.80445
[16]	valid_0's rmse: 1.80462
[17]	valid_0's rmse: 1.80275
[18]	valid_0's rmse: 1.79978
[19]	valid_0's rmse: 1.79789
[20]	valid_0's rmse: 1.79333
[21]	valid_0's rmse: 1.79203
[22]	valid_0's rmse: 1.79075
[23]	valid_0's rmse: 1.78897
[24]	valid_0's rmse: 1.78888
[25]	valid_0's rmse: 1.78655
[26]	valid_0's rmse: 1.78654
[27]	valid_0's rmse: 1.78451
[28]	valid_0's rmse: 1.78468
[29]	valid_0's rmse: 1.78356
[30]	valid_0's rmse: 1.78514
[31]	valid_0's rmse: 1.78605
[32]	valid_0's rmse: 1.78763
[33]	valid_0's rm

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[1]	valid_0's rmse: 5.65827
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 5.59106
[3]	valid_0's rmse: 5.5321
[4]	valid_0's rmse: 5.48414
[5]	valid_0's rmse: 5.43782
[6]	valid_0's rmse: 5.40779
[7]	valid_0's rmse: 5.37351
[8]	valid_0's rmse: 5.34231
[9]	valid_0's rmse: 5.32052
[10]	valid_0's rmse: 5.29782
[11]	valid_0's rmse: 5.27952
[12]	valid_0's rmse: 5.26364
[13]	valid_0's rmse: 5.24976
[14]	valid_0's rmse: 5.23478
[15]	valid_0's rmse: 5.236
[16]	valid_0's rmse: 5.22728
[17]	valid_0's rmse: 5.21906
[18]	valid_0's rmse: 5.2113
[19]	valid_0's rmse: 5.20117
[20]	valid_0's rmse: 5.19186
[21]	valid_0's rmse: 5.18461
[22]	valid_0's rmse: 5.17871
[23]	valid_0's rmse: 5.17277
[24]	valid_0's rmse: 5.17014
[25]	valid_0's rmse: 5.1615
[26]	valid_0's rmse: 5.15659
[27]	valid_0's rmse: 5.15228
[28]	valid_0's rmse: 5.15013
[29]	valid_0's rmse: 5.14811
[30]	valid_0's rmse: 5.14701
[31]	valid_0's rmse: 5.14401
[32]	valid_0's rmse: 5.14161
[33]	valid_0's rmse: 5.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
[1]	valid_0's rmse: 2.63952
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 2.58883
[3]	valid_0's rmse: 2.54994
[4]	valid_0's rmse: 2.51024
[5]	valid_0's rmse: 2.48116
[6]	valid_0's rmse: 2.45032
[7]	valid_0's rmse: 2.42148
[8]	valid_0's rmse: 2.40813
[9]	valid_0's rmse: 2.38909
[10]	valid_0's rmse: 2.37454
[11]	valid_0's rmse: 2.36112
[12]	valid_0's rmse: 2.349
[13]	valid_0's rmse: 2.34244
[14]	valid_0's rmse: 2.33678
[15]	valid_0's rmse: 2.33449
[16]	valid_0's rmse: 2.32991
[17]	valid_0's rmse: 2.32618
[18]	valid_0's rmse: 2.32561
[19]	valid_0's rmse: 2.32254
[20]	valid_0's rmse: 2.32009
[21]	valid_0's rmse: 2.31658
[22]	valid_0's rmse: 2.31922
[23]	valid_0's rmse: 2.31646
[24]	valid_0's rmse: 2.31368
[25]	valid_0's rmse: 2.31133
[26]	valid_0's rmse: 2.30851
[27]	valid_0's rmse: 2.30509
[28]	valid_0's rmse: 2.30552
[29]	valid_0's rmse: 2.30449
[30]	valid_0's rmse: 2.3032
[31]	valid_0's rmse: 2.30563
[32]	valid_0's rmse: 2.30251
[33]	valid_0's rmse

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
[1]	valid_0's rmse: 2.81316
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 2.77396
[3]	valid_0's rmse: 2.74154
[4]	valid_0's rmse: 2.7092
[5]	valid_0's rmse: 2.68025
[6]	valid_0's rmse: 2.6592
[7]	valid_0's rmse: 2.63848
[8]	valid_0's rmse: 2.62224
[9]	valid_0's rmse: 2.60649
[10]	valid_0's rmse: 2.59252
[11]	valid_0's rmse: 2.58006
[12]	valid_0's rmse: 2.56798
[13]	valid_0's rmse: 2.55663
[14]	valid_0's rmse: 2.54948
[15]	valid_0's rmse: 2.54255
[16]	valid_0's rmse: 2.53255
[17]	valid_0's rmse: 2.52724
[18]	valid_0's rmse: 2.52333
[19]	valid_0's rmse: 2.52174
[20]	valid_0's rmse: 2.51921
[21]	valid_0's rmse: 2.51577
[22]	valid_0's rmse: 2.51459
[23]	valid_0's rmse: 2.51468
[24]	valid_0's rmse: 2.51237
[25]	valid_0's rmse: 2.51104
[26]	valid_0's rmse: 2.51073
[27]	valid_0's rmse: 2.51027
[28]	valid_0's rmse: 2.50966
[29]	valid_0's rmse: 2.50725
[30]	valid_0's rmse: 2.50715
[31]	valid_0's rmse: 2.50481
[32]	valid_0's rmse: 2.50392
[33]	valid_0's rms

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[1]	valid_0's rmse: 2.04515
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 1.99699
[3]	valid_0's rmse: 1.95733
[4]	valid_0's rmse: 1.92021
[5]	valid_0's rmse: 1.89253
[6]	valid_0's rmse: 1.86678
[7]	valid_0's rmse: 1.84444
[8]	valid_0's rmse: 1.82825
[9]	valid_0's rmse: 1.81004
[10]	valid_0's rmse: 1.79498
[11]	valid_0's rmse: 1.78537
[12]	valid_0's rmse: 1.77626
[13]	valid_0's rmse: 1.76994
[14]	valid_0's rmse: 1.76038
[15]	valid_0's rmse: 1.75557
[16]	valid_0's rmse: 1.75019
[17]	valid_0's rmse: 1.74504
[18]	valid_0's rmse: 1.74174
[19]	valid_0's rmse: 1.74116
[20]	valid_0's rmse: 1.74034
[21]	valid_0's rmse: 1.7375
[22]	valid_0's rmse: 1.73599
[23]	valid_0's rmse: 1.73444
[24]	valid_0's rmse: 1.73306
[25]	valid_0's rmse: 1.73123
[26]	valid_0's rmse: 1.73079
[27]	valid_0's rmse: 1.72931
[28]	valid_0's rmse: 1.72818
[29]	valid_0's rmse: 1.72947
[30]	valid_0's rmse: 1.72706
[31]	valid_0's rmse: 1.72752
[32]	valid_0's rmse: 1.72672
[33]	valid_0's rmse

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[1]	valid_0's rmse: 3.23475
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 3.17755
[3]	valid_0's rmse: 3.12708
[4]	valid_0's rmse: 3.08357
[5]	valid_0's rmse: 3.04691
[6]	valid_0's rmse: 3.01449
[7]	valid_0's rmse: 2.98651
[8]	valid_0's rmse: 2.96663
[9]	valid_0's rmse: 2.9449
[10]	valid_0's rmse: 2.9276
[11]	valid_0's rmse: 2.91539
[12]	valid_0's rmse: 2.90104
[13]	valid_0's rmse: 2.88959
[14]	valid_0's rmse: 2.88045
[15]	valid_0's rmse: 2.87001
[16]	valid_0's rmse: 2.86657
[17]	valid_0's rmse: 2.86163
[18]	valid_0's rmse: 2.85155
[19]	valid_0's rmse: 2.85061
[20]	valid_0's rmse: 2.84636
[21]	valid_0's rmse: 2.8437
[22]	valid_0's rmse: 2.84206
[23]	valid_0's rmse: 2.83379
[24]	valid_0's rmse: 2.83278
[25]	valid_0's rmse: 2.83128
[26]	valid_0's rmse: 2.828
[27]	valid_0's rmse: 2.82452
[28]	valid_0's rmse: 2.82381
[29]	valid_0's rmse: 2.82079
[30]	valid_0's rmse: 2.81942
[31]	valid_0's rmse: 2.81898
[32]	valid_0's rmse: 2.81766
[33]	valid_0's rmse: 2.

In [100]:
importance = {}
for store_id in store_ids:
  importance[store_id] = feature_importance(models[store_id],df_train,features)

importance = pd.concat([importance[store_id] for store_id in store_ids ])
importance = importance.reset_index().rename(columns={'index':'feature'}).groupby('feature').mean().sort_values('importance',ascending=False).reset_index()

In [101]:
importance[importance['importance']>0].head(30)

Unnamed: 0,importance
bf2m_mean_goods_amount,0.330086
goods_id,0.259345
num_of_month,0.135208
month,0.086632
bf2m_amount,0.046484
bf2m_av_price,0.028024
category_id,0.026998
bf3m_mean_goods_amount,0.016925
bf3m_mv3m_amount,0.011806
bf2m_mv3m_amount,0.007387


In [103]:
rmses=[]
for store_id in store_ids:
  for model in models[store_id]:
    rmses.append(model.best_score["valid_0"]["rmse"])
np.mean(rmses)

2.795561498711899

予測

In [104]:
def predict(models,df,features):
  X = df[features]
  preds = np.zeros((len(df),len(models)))
  for n , model in enumerate(models):
    # pred = model.predict(df_test[cols],num_iteration=model.best_iteration)
    pred = model.predict(X,num_iteration=model.best_iteration)
    preds[:,n] = pred

  score = df.copy()
  score["pred"] = np.mean(preds,axis=1)
  
  return score

In [105]:
df_tests = {}
for store_id in store_ids:
  df_tests[store_id] = df_test[df_train['store_id']==store_id]

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separa

In [111]:
scores = {}
for store_id in store_ids:
  scores[store_id] = predict(models[store_id],df_tests[store_id],features)

In [112]:
score = pd.concat([scores[store_id] for store_id in store_ids ])

In [116]:
score = score.sort_values('index')

In [117]:
print(score['pred'].min())
print(score['pred'].max())

0.020236833248020544
17.519070494930045


In [118]:
score = score[['index','pred']]

In [119]:
score.to_csv("/content/drive/MyDrive/signate/competiton/210919_AIquest/submit.csv",header=False,index=False)