<a href="https://colab.research.google.com/github/yuto-kobayashi-1/signate-AIQuest/blob/develop/20210922_AIquest_StockPredict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import lightgbm as lgb
from copy import deepcopy
import numpy as np
from datetime import datetime , date 
from sklearn.inspection import permutation_importance
from sklearn.model_selection import TimeSeriesSplit

In [7]:
# Mount Google Drive so the competition files under /content/drive are readable.
# NOTE(review): this cell only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Mapping from the Japanese column headers to the English column names
# used by the functions in this notebook (applied via DataFrame.rename).
col_name = {
    '日付': 'date',
    '店舗ID': 'store_id',
    '商品ID': 'goods_id',
    '商品価格': 'price',
    '売上個数': 'amount',
    '商品カテゴリID': 'category_id',
}

In [9]:
# Load the raw sales history (training source) and the test file from Drive.
# Both frames still carry the original column headers; they are renamed later via col_name.
sales_data = pd.read_csv("/content/drive/MyDrive/signate/competiton/210919_AIquest/sales_history.csv",encoding="utf_8")
test_data = pd.read_csv("/content/drive/MyDrive/signate/competiton/210919_AIquest/test.csv",encoding="utf_8")

In [10]:
def preproces(df,col_name,category_path='/content/drive/MyDrive/signate/competiton/210919_AIquest/item_categories.csv'):
  """Clean the raw sales history and aggregate it to one row per month/store/goods.

  Steps:
    1. Rename columns with ``col_name`` and drop irregular rows
       (``price == 0`` or ``amount <= 0``).
    2. Aggregate ``amount`` and ``sales_amount`` (price * amount) per
       ``base_date`` (YYYYMM), ``store_id`` and ``goods_id``; derive ``av_price``.
    3. Number the months consecutively (``base_date_count``) and register the
       evaluation month 2019-12 as block 23 in the month table.
    4. Zero-pad missing month/goods/store combinations via
       ``zero_padding_amount`` (``amount`` and ``sales_amount`` become 0,
       ``av_price`` and ``base_date`` stay NaN for padded rows).
    5. Attach the category information read from ``category_path``.

  Args:
    df: Raw sales history with the original (pre-rename) column headers.
    col_name: Mapping from original column headers to the English names.
    category_path: CSV file holding the goods-to-category mapping.

  Returns:
    Tuple ``(df, gp_time)``: the monthly feature base table and the
    year/month -> ``base_date_count`` lookup table.
  """
  df = df.rename(columns=col_name)

  # Drop irregular records. ``.copy()`` detaches the filtered rows so the
  # column assignments below do not raise SettingWithCopyWarning.
  df = df[(df['price'] != 0) & (df['amount'] > 0)].copy()

  df['date'] = pd.to_datetime(df['date'])
  df['base_date'] = df['date'].dt.strftime("%Y%m")
  df['sales_amount'] = (df['price'] * df['amount']).astype('int')

  # Aggregate to monthly granularity. The summed columns are selected
  # explicitly instead of passing a list to ``sum()`` (where it would be
  # interpreted as the ``numeric_only`` flag).
  df = (
    df.groupby(['base_date','store_id','goods_id'])[['amount','sales_amount']]
      .sum()
      .reset_index()
  )
  df['av_price'] = df['sales_amount'] / df['amount']

  # Split the YYYYMM string into year / month (both kept as strings).
  df['year'] = df['base_date'].str[:4]
  df['month'] = df['base_date'].str[4:]

  # Month-block table: one row per observed year/month, numbered from 0.
  gp_time = (
    df[['year', 'month']]
      .drop_duplicates()
      .sort_values(['year', 'month'])
      .reset_index(drop=True)
  )
  gp_time['base_date_count'] = range(len(gp_time))

  # Add the evaluation period 2019-12 (month block 23) for the test data.
  # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
  eval_month = pd.DataFrame([{
    'base_date_count': 23,
    'year': '2019',
    'month': '12'
  }])
  gp_time = pd.concat([gp_time, eval_month], ignore_index=True)

  # Attach the month block number to the aggregated sales.
  df = pd.merge(df, gp_time, on=['year', 'month'], how='left')

  # Build the month x goods x store grid and left-join the observed sales onto it.
  all_combination = zero_padding_amount(df)
  all_combination = pd.merge(all_combination, gp_time, on=['base_date_count'], how='left')
  df = pd.merge(all_combination, df, on=['base_date_count', 'goods_id', 'store_id','year','month'], how='left')

  # Combinations without any sale get zero quantity / zero revenue.
  fillna_cols = ['amount','sales_amount']
  df[fillna_cols] = df[fillna_cols].fillna(0)

  # Attach category data.
  category = pd.read_csv(category_path).rename(columns=col_name)
  df = pd.merge(df,category,on=['goods_id'],how='left')

  return df , gp_time

In [11]:
def make_features(df, periods=(3,6,12), lags=range(2,13)):
  """Add moving-average and lag features to the monthly sales table.

  For every ``period`` a rolling mean of ``amount`` over the last ``period``
  rows of each (store_id, goods_id) group is stored as ``mv{period}m_amount``.
  For every lag ``n`` the columns ``amount``, ``av_price`` and all moving
  averages of ``n`` month blocks earlier are joined in as ``bf{n}m_amount``,
  ``bf{n}m_av_price`` and ``bf{n}m_mv{period}m_amount``.

  NOTE(review): the rolling window counts rows, not calendar months; it
  equals a calendar window only if every group has a row for every month.

  Args:
    df: Frame with ``base_date_count``, ``store_id``, ``goods_id``,
      ``amount`` and ``av_price`` columns. Not modified.
    periods: Window lengths of the moving averages.
    lags: Offsets (in month blocks) of the lag features.

  Returns:
    A new frame sorted by (base_date_count, goods_id, store_id) with a fresh
    0..n-1 index and the additional feature columns.
  """
  keys = ['base_date_count', 'store_id', 'goods_id']

  # Sort so that rows inside each store/goods group are in month order, and
  # reset the index so the rolling result can be aligned back unambiguously.
  df = df.sort_values(['base_date_count','goods_id','store_id']).reset_index(drop=True)

  # Moving averages of the sold quantity per store/goods.
  mv_cols = []
  for period in periods:
    rolled = (
      df.groupby(['store_id','goods_id'])['amount']
        .rolling(window=period)
        .mean()
        # Drop the two group-key levels; the remaining level is df's row index.
        .reset_index(level=[0, 1], drop=True)
    )
    df[f'mv{period}m_amount'] = rolled
    mv_cols.append(f'mv{period}m_amount')

  # Lag features: shift the month block forward by n and join on the keys.
  source_cols = ['amount', 'av_price'] + mv_cols
  base = df[keys + source_cols]
  for n in lags:
    lag = base.rename(columns={c: f'bf{n}m_{c}' for c in source_cols})
    lag = lag.assign(base_date_count=lag['base_date_count'] + n)
    # Join on columns only: combining ``on`` with ``left_index=True`` is
    # rejected by current pandas with a MergeError.
    df = pd.merge(df, lag, on=keys, how='left')

  return df.reset_index(drop=True)

In [12]:
def make_test_data(df_test,df_train,col_name,category_path='/content/drive/MyDrive/signate/competiton/210919_AIquest/item_categories.csv',periods=(3,6,12),lags=range(2,13)):
  """Build the prediction frame for the evaluation month from the test file.

  Every test row is placed at month block 23 (year 2019, month 12) with a
  placeholder ``amount`` of 0, gets its category from ``category_path`` and
  receives the same lag columns as the training frame
  (``bf{n}m_amount``, ``bf{n}m_av_price``, ``bf{n}m_mv{period}m_amount``),
  looked up from ``df_train`` rows ``n`` month blocks earlier.

  Args:
    df_test: Raw test frame with the original (pre-rename) column headers.
    df_train: Training frame containing ``store_id``, ``goods_id``,
      ``amount``, ``base_date_count``, ``av_price`` and the
      ``mv{period}m_amount`` columns.
    col_name: Mapping from original column headers to the English names.
    category_path: CSV file holding the goods-to-category mapping.
    periods: Moving-average window lengths present in ``df_train``.
    lags: Offsets (in month blocks) of the lag features.

  Returns:
    The test frame with lag features and a fresh 0..n-1 index.
  """
  keys = ['base_date_count', 'store_id', 'goods_id']

  df_test = df_test.rename(columns=col_name)
  # Initial values: the evaluation month and a placeholder target.
  df_test['year'] = 2019
  df_test['month'] = 12
  df_test['amount'] = 0
  df_test['base_date_count'] = 23

  category = pd.read_csv(category_path).rename(columns=col_name)
  df_test = pd.merge(df_test,category,on=['goods_id'],how='left')

  # Columns of the training frame that are carried over as lag features.
  mv_cols = [f'mv{period}m_amount' for period in periods]
  source_cols = ['amount', 'av_price'] + mv_cols
  base = df_train[keys + source_cols]

  # Past sales per store/goods: shift the month block forward by n and join.
  for n in lags:
    lag = base.rename(columns={c: f'bf{n}m_{c}' for c in source_cols})
    lag = lag.assign(base_date_count=lag['base_date_count'] + n)
    # Join on columns only: combining ``on`` with ``left_index=True`` is
    # rejected by current pandas with a MergeError.
    df_test = pd.merge(df_test, lag, on=keys, how='left')

  return df_test.reset_index(drop=True)

In [13]:
# def train(df,label_cols,features):
def trainCV(df,features):
  """Train one LightGBM regressor per ``TimeSeriesSplit`` fold.

  The target is the ``amount`` column; ``features`` are cast to float.
  ``TimeSeriesSplit`` splits by row position, so ``df`` is expected to be
  ordered chronologically already.

  Args:
    df: Training frame containing ``amount`` and all ``features``.
    features: Names of the feature columns.

  Returns:
    List of trained boosters, one per fold (5 folds).
  """
  df = df.rename(columns = {'amount': 'y'})

  # Cast in one step on a fresh frame; assigning column by column to the
  # ``df[features]`` slice raises SettingWithCopyWarning.
  X = df[features].astype('float')
  y = df['y']

  params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
  }

  kf = TimeSeriesSplit(n_splits=5)

  models = []
  for train_index, test_index in kf.split(X, y):
    train_data = lgb.Dataset(
      data=X.iloc[train_index],
      label=y.iloc[train_index],
    )
    validation_data = lgb.Dataset(
      data=X.iloc[test_index],
      label=y.iloc[test_index],
    )

    model = lgb.train(
      params=params,
      train_set=train_data,
      num_boost_round=1000,
      valid_sets=[validation_data],
      # Early stopping via callback: the ``early_stopping_rounds`` keyword
      # of ``lgb.train`` was removed in LightGBM 4.0.
      callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )

    models.append(model)

  return models

In [14]:
# def train(df,label_cols,features):
def trainTS(df,features):
  """Train LightGBM regressors on three hand-picked time-based folds.

  Folds (by ``base_date_count``):
    0: train on blocks <= 9,      validate on block 11
    1: train on blocks <= 19,     validate on block 21
    2: train on blocks 12..19,    validate on block 21

  The target is the ``amount`` column; ``features`` are cast to float.

  Args:
    df: Training frame containing ``amount``, ``base_date_count`` and all
      ``features``.
    features: Names of the feature columns.

  Returns:
    List of trained boosters, one per fold.
  """
  df = df.rename(columns = {'amount': 'y'})

  month_block = df['base_date_count']
  # (train mask, validation mask) per fold. Each validation block lies two
  # blocks after the last training block.
  folds = [
    (month_block <= 9, month_block == 11),
    (month_block <= 19, month_block == 21),
    ((month_block >= 12) & (month_block <= 19), month_block == 21),
  ]

  # Cast in one step on a fresh frame; assigning column by column to the
  # ``df[features]`` slice raises SettingWithCopyWarning.
  X = df[features].astype('float')
  y = df['y']

  params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
  }

  models = []
  for i, (train_mask, valid_mask) in enumerate(folds):
    print(i)

    train_data = lgb.Dataset(
      data=X[train_mask],
      label=y[train_mask],
    )
    validation_data = lgb.Dataset(
      data=X[valid_mask],
      label=y[valid_mask],
    )

    model = lgb.train(
      params=params,
      train_set=train_data,
      num_boost_round=1000,
      valid_sets=[validation_data],
      # Early stopping via callback: the ``early_stopping_rounds`` keyword
      # of ``lgb.train`` was removed in LightGBM 4.0.
      callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )

    models.append(model)

  return models

In [15]:
def feature_importance(models,data,cols):
  """Average the gain-based feature importance over several models.

  Args:
    models: Objects exposing ``feature_importance(importance_type='gain')``
      that returns one value per column in ``cols``.
    data: Frame containing the feature columns (used for the column labels).
    cols: Names of the feature columns, in the order used for training.

  Returns:
    Single-column frame (``importance``) indexed by feature name, sorted in
    descending order and normalised so that the values sum to 1.
  """
  feature_names = data[cols].columns

  # One row per model, built in a single constructor call instead of
  # growing a frame with DataFrame.append (removed in pandas 2.0).
  per_model = pd.DataFrame(
    [model.feature_importance(importance_type='gain') for model in models],
    columns=feature_names,
    dtype=float,
  )

  mean_importance = per_model.mean().sort_values(ascending=False)
  importance = mean_importance.to_frame("importance")

  return importance / importance.sum()

In [16]:
from itertools import product

In [17]:
def zero_padding_amount(df, n_blocks=22):
  """Build the full month x goods x store grid used for zero padding.

  For each month block ``0 .. n_blocks-1`` the cartesian product of the
  goods and stores that appear in that block is generated.

  Args:
    df: Frame with ``base_date_count``, ``goods_id`` and ``store_id`` columns.
    n_blocks: Number of month blocks to cover, starting at 0.

  Returns:
    Frame with columns ``base_date_count``, ``goods_id``, ``store_id``,
    sorted by these columns. Empty (columns only) if no block has rows.
  """
  columns = ['base_date_count', 'goods_id', 'store_id']

  grids = []
  for block in range(n_blocks):
    train_block = df[df['base_date_count']==block]
    # A block without rows would yield a 1-D empty array and break np.vstack.
    if train_block.empty:
      continue
    # itertools.product enumerates every (block, goods, store) combination.
    combos = product([block], train_block['goods_id'].unique(), train_block['store_id'].unique())
    grids.append(np.array(list(combos)))

  if not grids:
    return pd.DataFrame(columns=columns)

  all_combination = pd.DataFrame(np.vstack(grids), columns=columns)
  return all_combination.sort_values(columns)

前処理

In [18]:
df_sales_data , gp_time = preproces(sales_data,col_name)

In [186]:
df_train = make_features(df_sales_data)

In [188]:
df_train

Unnamed: 0,base_date_count,goods_id,store_id,year,month,base_date,amount,sales_amount,av_price,category_id,mv3m_amount,mv6m_amount,mv12m_amount,bf2m_amount,bf2m_av_price,bf2m_mv3m_amount,bf2m_mv6m_amount,bf2m_mv12m_amount,bf3m_amount,bf3m_av_price,bf3m_mv3m_amount,bf3m_mv6m_amount,bf3m_mv12m_amount,bf4m_amount,bf4m_av_price,bf4m_mv3m_amount,bf4m_mv6m_amount,bf4m_mv12m_amount,bf5m_amount,bf5m_av_price,bf5m_mv3m_amount,bf5m_mv6m_amount,bf5m_mv12m_amount,bf6m_amount,bf6m_av_price,bf6m_mv3m_amount,bf6m_mv6m_amount,bf6m_mv12m_amount,bf7m_amount,bf7m_av_price,bf7m_mv3m_amount,bf7m_mv6m_amount,bf7m_mv12m_amount,bf8m_amount,bf8m_av_price,bf8m_mv3m_amount,bf8m_mv6m_amount,bf8m_mv12m_amount,bf9m_amount,bf9m_av_price,bf9m_mv3m_amount,bf9m_mv6m_amount,bf9m_mv12m_amount,bf10m_amount,bf10m_av_price,bf10m_mv3m_amount,bf10m_mv6m_amount,bf10m_mv12m_amount,bf11m_amount,bf11m_av_price,bf11m_mv3m_amount,bf11m_mv6m_amount,bf11m_mv12m_amount,bf12m_amount,bf12m_av_price,bf12m_mv3m_amount,bf12m_mv6m_amount,bf12m_mv12m_amount
0,0,1000001,0,2018,01,201801,6.0,2520.0,420.0,100,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,0,1000001,1,2018,01,201801,2.0,650.0,325.0,100,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,0,1000001,2,2018,01,201801,1.0,420.0,420.0,100,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,0,1000001,3,2018,01,201801,2.0,840.0,420.0,100,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,0,1000001,4,2018,01,,0.0,0.0,,100,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1647985,21,3500073,13,2019,10,,0.0,0.0,,350,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1647986,21,3500073,14,2019,10,,0.0,0.0,,350,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1647987,21,3500073,15,2019,10,,0.0,0.0,,350,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1647988,21,3500073,16,2019,10,,0.0,0.0,,350,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [195]:
df_train[(df_train['goods_id'].isin([1000001]))&(df_train['store_id'].isin([0]))][['base_date_count','goods_id','store_id','amount','mv3m_amount','bf2m_mv3m_amount','bf3m_mv3m_amount']]

Unnamed: 0,base_date_count,goods_id,store_id,amount,mv3m_amount,bf2m_mv3m_amount,bf3m_mv3m_amount
0,0,1000001,0,6.0,,,
83124,1,1000001,0,3.0,,,
167076,2,1000001,0,1.0,3.333333,,
252144,3,1000001,0,2.0,2.0,,
335664,4,1000001,0,0.0,1.0,3.333333,
418878,5,1000001,0,4.0,2.0,2.0,3.333333
503424,6,1000001,0,3.0,2.333333,1.0,2.0
586656,7,1000001,0,2.0,3.0,2.0,1.0
667116,8,1000001,0,1.0,2.0,2.333333,2.0
745362,9,1000001,0,0.0,1.0,3.0,2.333333


In [222]:
df_test = make_test_data(test_data,df_train,col_name)

In [223]:
df_test

Unnamed: 0,index,goods_id,store_id,year,month,amount,base_date_count,category_id,bf2m_amount,bf2m_av_price,bf2m_mv3m_amount,bf2m_mv6m_amount,bf2m_mv12m_amount,bf3m_amount,bf3m_av_price,bf3m_mv3m_amount,bf3m_mv6m_amount,bf3m_mv12m_amount,bf4m_amount,bf4m_av_price,bf4m_mv3m_amount,bf4m_mv6m_amount,bf4m_mv12m_amount,bf5m_amount,bf5m_av_price,bf5m_mv3m_amount,bf5m_mv6m_amount,bf5m_mv12m_amount,bf6m_amount,bf6m_av_price,bf6m_mv3m_amount,bf6m_mv6m_amount,bf6m_mv12m_amount,bf7m_amount,bf7m_av_price,bf7m_mv3m_amount,bf7m_mv6m_amount,bf7m_mv12m_amount,bf8m_amount,bf8m_av_price,bf8m_mv3m_amount,bf8m_mv6m_amount,bf8m_mv12m_amount,bf9m_amount,bf9m_av_price,bf9m_mv3m_amount,bf9m_mv6m_amount,bf9m_mv12m_amount,bf10m_amount,bf10m_av_price,bf10m_mv3m_amount,bf10m_mv6m_amount,bf10m_mv12m_amount,bf11m_amount,bf11m_av_price,bf11m_mv3m_amount,bf11m_mv6m_amount,bf11m_mv12m_amount,bf12m_amount,bf12m_av_price,bf12m_mv3m_amount,bf12m_mv6m_amount,bf12m_mv12m_amount
0,0,1000001,0,2019,12,0,23,100,0.0,,0.333333,0.500000,0.666667,1.0,250.0,1.000000,0.500000,0.666667,0.0,,0.666667,0.333333,0.666667,2.0,250.0,0.666667,0.500000,0.833333,0.0,,0.000000,0.500000,0.916667,0.0,,0.000000,0.500000,1.250000,0.0,,0.333333,0.833333,1.250000,0.0,,1.000000,0.833333,1.416667,1.0,420.0,1.000000,1.000000,1.500000,2.0,420.0,1.333333,1.166667,1.666667,0.0,,0.666667,1.333333,2.000000
1,1,1000001,1,2019,12,0,23,100,0.0,,0.000000,0.000000,0.000000,0.0,,0.000000,0.000000,0.083333,0.0,,0.000000,0.000000,0.083333,0.0,,0.000000,0.000000,0.083333,0.0,,0.000000,0.000000,0.166667,0.0,,0.000000,0.000000,0.333333,0.0,,0.000000,0.000000,0.333333,0.0,,0.000000,0.166667,0.500000,0.0,,0.000000,0.166667,0.500000,0.0,,0.000000,0.166667,0.583333,0.0,,0.333333,0.333333,0.750000
2,2,1000001,2,2019,12,0,23,100,2.0,250.0,1.666667,1.166667,1.000000,0.0,,1.000000,1.000000,0.916667,3.0,250.0,1.666667,1.166667,1.083333,0.0,,0.666667,1.000000,1.166667,2.0,250.0,1.000000,1.000000,1.416667,0.0,,0.666667,0.666667,1.416667,1.0,250.0,1.333333,0.833333,1.500000,1.0,420.0,1.000000,0.833333,1.416667,2.0,420.0,0.666667,1.000000,1.416667,0.0,,0.333333,1.333333,1.250000,0.0,,0.666667,1.833333,1.333333
3,3,1000001,3,2019,12,0,23,100,2.0,250.0,3.666667,4.166667,4.416667,3.0,250.0,4.666667,4.166667,4.833333,6.0,250.0,5.000000,4.166667,5.333333,5.0,250.0,4.666667,3.500000,5.333333,4.0,250.0,3.666667,3.500000,5.250000,5.0,250.0,3.333333,4.333333,5.000000,2.0,250.0,2.333333,4.666667,4.750000,3.0,420.0,3.333333,5.500000,5.250000,2.0,420.0,5.333333,6.500000,5.250000,5.0,420.0,7.000000,7.166667,5.083333,9.0,420.0,7.666667,7.000000,4.833333
4,4,1000001,4,2019,12,0,23,100,12.0,250.0,11.333333,9.000000,7.250000,8.0,250.0,10.333333,7.500000,6.750000,14.0,250.0,9.666667,7.166667,6.250000,9.0,250.0,6.666667,6.166667,5.250000,6.0,250.0,4.666667,5.333333,5.000000,5.0,250.0,4.666667,5.500000,4.833333,3.0,250.0,5.666667,5.500000,4.583333,6.0,420.0,6.000000,6.000000,4.500000,8.0,420.0,6.333333,5.333333,4.083333,4.0,420.0,5.333333,4.333333,3.583333,7.0,420.0,6.000000,4.666667,3.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3055,3055,3500001,13,2019,12,0,23,350,2.0,420.0,2.000000,1.666667,1.416667,3.0,420.0,1.666667,1.333333,1.416667,1.0,420.0,1.000000,1.000000,1.166667,1.0,420.0,1.333333,1.166667,1.333333,1.0,420.0,1.000000,1.000000,1.333333,2.0,420.0,1.000000,1.333333,1.416667,0.0,,1.000000,1.166667,1.500000,1.0,420.0,1.000000,1.500000,1.500000,2.0,420.0,1.666667,1.333333,1.500000,0.0,,1.333333,1.500000,1.500000,3.0,420.0,2.000000,1.666667,1.583333
3056,3056,3500001,14,2019,12,0,23,350,0.0,,0.000000,0.000000,0.000000,0.0,,0.000000,0.000000,0.000000,0.0,,0.000000,0.000000,0.000000,0.0,,0.000000,0.000000,0.000000,0.0,,0.000000,0.000000,0.000000,0.0,,0.000000,0.000000,0.083333,0.0,,0.000000,0.000000,0.083333,0.0,,0.000000,0.000000,0.083333,0.0,,0.000000,0.000000,0.083333,0.0,,0.000000,0.000000,0.166667,0.0,,0.000000,0.000000,0.166667
3057,3057,3500001,15,2019,12,0,23,350,0.0,,0.000000,0.166667,0.083333,0.0,,0.000000,0.166667,0.083333,0.0,,0.000000,0.166667,0.083333,0.0,,0.333333,0.166667,0.083333,0.0,,0.333333,0.166667,0.083333,1.0,200.0,0.333333,0.166667,0.083333,0.0,,0.000000,0.000000,0.000000,0.0,,0.000000,0.000000,0.000000,0.0,,0.000000,0.000000,0.000000,0.0,,0.000000,0.000000,0.000000,0.0,,0.000000,0.000000,0.000000
3058,3058,3500001,16,2019,12,0,23,350,0.0,,0.000000,0.000000,0.083333,0.0,,0.000000,0.000000,0.083333,0.0,,0.000000,0.166667,0.166667,0.0,,0.000000,0.166667,0.166667,0.0,,0.000000,0.166667,0.166667,0.0,,0.333333,0.166667,0.166667,0.0,,0.333333,0.166667,0.166667,1.0,420.0,0.333333,0.166667,0.166667,0.0,,0.000000,0.166667,0.083333,0.0,,0.000000,0.166667,0.083333,0.0,,0.000000,0.166667,0.083333


In [224]:
df_test['bf3m_mv3m_amount']

0        1.000000
1        0.000000
2        1.000000
3        4.666667
4       10.333333
          ...    
3055     1.666667
3056     0.000000
3057     0.000000
3058     0.000000
3059     0.000000
Name: bf3m_mv3m_amount, Length: 3060, dtype: float64

In [207]:
df_test['bf3m_mv3m_amount']

0        1.000000
1        0.000000
2        1.000000
3        4.666667
4       10.333333
          ...    
3055     1.666667
3056     0.000000
3057     0.000000
3058     0.000000
3059     0.000000
Name: bf3m_mv3m_amount, Length: 3060, dtype: float64

In [208]:
print(len(df_sales_data))
print(len(df_train))
print(len(test_data))
print(len(df_test))

1647990
1647990
3060
3060


In [209]:
# Check the engineered features: non-null counts of the 2-month and i-month
# lag columns per month block, for both the training and the test frame.
i = 5
lag_count_spec = {
    'bf2m_amount': 'count',
    'bf2m_av_price': 'count',
    f'bf{i}m_amount': 'count',
    f'bf{i}m_av_price': 'count',
}
for frame in (df_train, df_test):
  print(frame.groupby('base_date_count').agg(lag_count_spec))

                 bf2m_amount  bf2m_av_price  bf5m_amount  bf5m_av_price
base_date_count                                                        
0                          0              0            0              0
1                          0              0            0              0
2                      70920          25923            0              0
3                      69660          24215            0              0
4                      70686          25951            0              0
5                      70542          23198        64674          24422
6                      69660          22149        64368          22941
7                      67158          22678        61866          23816
8                      66006          23884        60372          21190
9                      64296          23742        58428          19886
10                     63864          21092        59904          21004
11                     65682          21130        60552        

モデル構築

In [210]:
# Names of all lag features: for each lag of 2..12 month blocks the lagged
# amount, average price and the three lagged moving averages.
lag = []
for i in range(2, 13):
  lag.extend([f'bf{i}m_amount', f'bf{i}m_av_price'])
  lag.extend(f'bf{i}m_mv{peripd}m_amount' for peripd in [3, 6, 12])

# Model inputs: identifier / calendar columns followed by the lag features.
features = ['store_id', 'goods_id', 'category_id', 'year', 'month'] + lag

In [211]:
features

['store_id',
 'goods_id',
 'category_id',
 'year',
 'month',
 'bf2m_amount',
 'bf2m_av_price',
 'bf2m_mv3m_amount',
 'bf2m_mv6m_amount',
 'bf2m_mv12m_amount',
 'bf3m_amount',
 'bf3m_av_price',
 'bf3m_mv3m_amount',
 'bf3m_mv6m_amount',
 'bf3m_mv12m_amount',
 'bf4m_amount',
 'bf4m_av_price',
 'bf4m_mv3m_amount',
 'bf4m_mv6m_amount',
 'bf4m_mv12m_amount',
 'bf5m_amount',
 'bf5m_av_price',
 'bf5m_mv3m_amount',
 'bf5m_mv6m_amount',
 'bf5m_mv12m_amount',
 'bf6m_amount',
 'bf6m_av_price',
 'bf6m_mv3m_amount',
 'bf6m_mv6m_amount',
 'bf6m_mv12m_amount',
 'bf7m_amount',
 'bf7m_av_price',
 'bf7m_mv3m_amount',
 'bf7m_mv6m_amount',
 'bf7m_mv12m_amount',
 'bf8m_amount',
 'bf8m_av_price',
 'bf8m_mv3m_amount',
 'bf8m_mv6m_amount',
 'bf8m_mv12m_amount',
 'bf9m_amount',
 'bf9m_av_price',
 'bf9m_mv3m_amount',
 'bf9m_mv6m_amount',
 'bf9m_mv12m_amount',
 'bf10m_amount',
 'bf10m_av_price',
 'bf10m_mv3m_amount',
 'bf10m_mv6m_amount',
 'bf10m_mv12m_amount',
 'bf11m_amount',
 'bf11m_av_price',
 'bf11m_mv3m_amo

In [212]:
models = trainTS(df_train,features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
[1]	valid_0's rmse: 3.86147
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 3.81042
[3]	valid_0's rmse: 3.7682
[4]	valid_0's rmse: 3.72445
[5]	valid_0's rmse: 3.69439
[6]	valid_0's rmse: 3.66751
[7]	valid_0's rmse: 3.64117
[8]	valid_0's rmse: 3.62066
[9]	valid_0's rmse: 3.60141
[10]	valid_0's rmse: 3.58613
[11]	valid_0's rmse: 3.57277
[12]	valid_0's rmse: 3.55905
[13]	valid_0's rmse: 3.54725
[14]	valid_0's rmse: 3.53763
[15]	valid_0's rmse: 3.52715
[16]	valid_0's rmse: 3.52161
[17]	valid_0's rmse: 3.51578
[18]	valid_0's rmse: 3.50806
[19]	valid_0's rmse: 3.50204
[20]	valid_0's rmse: 3.4977
[21]	valid_0's rmse: 3.49616
[22]	valid_0's rmse: 3.49315
[23]	valid_0's rmse: 3.49175
[24]	valid_0's rmse: 3.48485
[25]	valid_0's rmse: 3.48462
[26]	valid_0's rmse: 3.48082
[27]	valid_0's rmse: 3.46996
[28]	valid_0's rmse: 3.46511
[29]	valid_0's rmse: 3.46203
[30]	valid_0's rmse: 3.46131
[31]	valid_0's rmse: 3.45611
[32]	valid_0's rmse: 3.45432
[33]	valid_0's rms

In [226]:
importance = feature_importance(models,df_train,features)  

In [227]:
importance

Unnamed: 0,importance
bf2m_amount,0.392751
goods_id,0.163697
month,0.071586
bf2m_av_price,0.048971
store_id,0.044998
bf3m_amount,0.044026
category_id,0.042834
bf2m_mv3m_amount,0.036535
bf3m_mv3m_amount,0.024404
bf3m_av_price,0.016528


In [215]:
# Mean of the best validation RMSE over all trained models.
rmses = [model.best_score["valid_0"]["rmse"] for model in models]
np.mean(rmses)

3.079433117013808

予測

In [216]:
def predict(models,df,features):
  """Predict with every model and average the predictions.

  Args:
    models: Trained models exposing ``predict(X, num_iteration=...)`` and
      ``best_iteration``.
    df: Frame containing the ``features`` columns. Not modified.
    features: Names of the feature columns passed to the models.

  Returns:
    Copy of ``df`` with an extra ``pred`` column holding the mean prediction
    across all models.
  """
  inputs = df[features]

  # One column of predictions per model, each taken at its best iteration.
  stacked = np.zeros((len(df), len(models)))
  for col, booster in enumerate(models):
    stacked[:, col] = booster.predict(inputs, num_iteration=booster.best_iteration)

  result = df.copy()
  result["pred"] = np.mean(stacked, axis=1)
  return result

In [217]:
score = predict(models,df_test,features)

In [218]:
print(score['pred'].min())
print(score['pred'].max())

0.005807073753910024
20.879685766842623


In [219]:
score = score[['index','pred']]

In [220]:
score.to_csv("/content/drive/MyDrive/signate/competiton/210919_AIquest/submit.csv",header=False,index=False)