<a href="https://colab.research.google.com/github/yuto-kobayashi-1/signate-AIQuest/blob/develop/20210922_AIquest_StockPredict_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import lightgbm as lgb
from copy import deepcopy
import numpy as np
from datetime import datetime , date 
from sklearn.inspection import permutation_importance
from sklearn.model_selection import TimeSeriesSplit

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
col_name = {'日付':'date',
          '店舗ID':'store_id',
          '商品ID':'goods_id',
          '商品価格':'price',
          '売上個数':'amount',
          '商品カテゴリID':'category_id'
          }

In [4]:
sales_data = pd.read_csv("/content/drive/MyDrive/signate/competiton/210919_AIquest/sales_history.csv",encoding="utf_8")
test_data = pd.read_csv("/content/drive/MyDrive/signate/competiton/210919_AIquest/test.csv",encoding="utf_8")

In [5]:
def preproces(df,col_name):
  df = df.rename(columns=col_name)

  category = pd.read_csv('/content/drive/MyDrive/signate/competiton/210919_AIquest/item_categories.csv')
  category = category.rename(columns=col_name)

  # イレギュラーデータを削除する
  df = df[df['price']!=0]
  df = df[df['amount']>0]
  
  df = pd.merge(df,category,on=['goods_id'],how='left')

  df['date'] = pd.to_datetime(df['date'])
  df['base_date'] = df['date'].dt.strftime("%Y%m")

  df['sales_amount'] = (df['price'] * df['amount']).astype('int')
 
  #月次単位にデータを集計
  df = df.drop(['price'],axis=1).groupby(['base_date','store_id','goods_id']).sum(['amount','sales_amount']).reset_index()
  df['av_price'] = (df['sales_amount'] /  df['amount'] )

   # 「日付」カラムの文字列から、「年」「月」の情報を抽出する
  df['year'] = df['base_date'].apply(lambda x: x[:4]) 
  df['month'] = df['base_date'].apply(lambda x: x[4:])
  
  #月ブロックの作成
  gp_time = df.groupby(['year', 'month']).count().reset_index()[['year', 'month']]
  # カラム名「月ブロック」として、通し番号をつける
  gp_time['base_date_count'] = list(range(len(gp_time)))

  # testデータ用に、評価対象期間である2019年12月(月ブロック: 23)のレコードを追加する
  gp_time = gp_time.append({
    'base_date_count': 23,
    'year': '2019',
    'month': '12'
  }, ignore_index=True)

  # 「月ブロック」カラムをsalesに統合する
  df = pd.merge(df, gp_time, on=['year', 'month'], how='left')


  return df , gp_time

In [None]:
def make_features(df):
  
  # for i in [2,12]:
  for i in range(2,13,1):
    lag = df.copy()
    lag['base_date_count'] = lag['base_date_count'] + i
    lag = lag.rename(columns={'amount': f'bf{i}m_amount'})
    lag = lag.rename(columns={'av_price': f'bf{i}m_av_price'})

    # lag = lag[['base_date_count', 'store_id', 'goods_id',f'bf{i}m_amount']
    lag = lag[['base_date_count', 'store_id', 'goods_id',f'bf{i}m_amount',f'bf{i}m_av_price']]

    df = pd.merge(df, lag, on=['base_date_count', 'store_id', 'goods_id'], how='left', left_index=True)

  df = df.reset_index(drop=True)

  return df

In [None]:
def make_test_data(df_test,df_train,col_name):
  df_test = df_test.rename(columns=col_name)
  #初期値設定
  df_test['year'] = 2019
  df_test['month'] = 12
  df_test['amount'] = 0
  df_test['base_date_count'] = 23

  category = pd.read_csv('/content/drive/MyDrive/signate/competiton/210919_AIquest/item_categories.csv')
  category = category.rename(columns=col_name)
  df_test = pd.merge(df_test,category,on=['goods_id'],how='left')
  
  #店舗商品単位で過去販売数データを作成
  df_train = df_train[['store_id','goods_id','amount','base_date_count','av_price']]
  print(df_train.columns)
  # for i in [2,12]:
  for i in range(2,13,1):
    lag = df_train.copy()
    lag['base_date_count'] = lag['base_date_count'] + i
    lag = lag.rename(columns={'amount': f'bf{i}m_amount'})
    lag = lag.rename(columns={'av_price': f'bf{i}m_av_price'})

    # lag = lag[['base_date_count', 'store_id', 'goods_id',f'bf{i}m_amount']]
    lag = lag[['base_date_count', 'store_id', 'goods_id',f'bf{i}m_amount',f'bf{i}m_av_price']]

    df_test = pd.merge(df_test, lag, on=['base_date_count', 'store_id', 'goods_id'], how='left', left_index=True)
  
  df_test = df_test.reset_index(drop=True)
  # #testデータに特徴量付与
  # df_test = df_test.merge(df_merge[['base_date_count', 'store_id', 'goods_id']+cols],how='left')

  return df_test

In [None]:
# def train(df,label_cols,features):
def trainCV(df,features):

  df = df.rename(columns = {'amount': 'y'})

  X =  df[features]
  y = df['y']

  for feature in features:
    X[feature] = X[feature].astype('float')

  # kf = KFold(n_splits=5,shuffle=True,random_state=0)
  kf = TimeSeriesSplit(n_splits=5)

  models = []
  for train_index, test_index in kf.split(X, y):
    X_train = X.iloc[train_index]
    y_train =  y.iloc[train_index]
    X_valid = X.iloc[test_index]
    y_valid = y.iloc[test_index]

    # X_train, y_train, X_valid, y_valid =  target_encording(X_train, y_train, X_valid, y_valid,label_cols)

    params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    # 'num_leaves': 64,
    # 'min_data_in_leaf': 20,
    # 'max_depth': 7,
    # 'verbose': 0,
  }

    train_data = lgb.Dataset(
      data=X_train, 
      label=y_train,
  )

    validation_data = lgb.Dataset(
      data=X_valid, 
      label=y_valid, 
  )

    model = lgb.train(
      params=params, 
      train_set=train_data, 
      num_boost_round=1000, 
      early_stopping_rounds=100,
      valid_sets=[validation_data],
  )

    models.append(model)

  return models 

In [None]:
# def train(df,label_cols,features):
def trainTS(df,features):

  df = df.rename(columns = {'amount': 'y'})

  idx_train={}
  idx_valid={}
  idx_train[0] = (df['base_date_count']<=9)
  idx_valid[0] = (df['base_date_count']==11)
  idx_train[1] = (df['base_date_count']<=19)
  idx_valid[1] = (df['base_date_count']==21)
  idx_train[2] = (df['base_date_count']>=12) & (df['base_date_count']<=19)
  idx_valid[2] = (df['base_date_count']==21)


  X =  df[features]
  y = df['y']

  for feature in features:
    X[feature] = X[feature].astype('float')

  # kf = KFold(n_splits=5,shuffle=True,random_state=0)
  # kf = TimeSeriesSplit(n_splits=5)

  models = []
  

  for i in range(len(idx_train)):
    print(i)
    X_train = X[idx_train[i]]
    y_train =  y[idx_train[i]]
    X_valid = X[idx_valid[i]]
    y_valid = y[idx_valid[i]]

    # X_train, y_train, X_valid, y_valid =  target_encording(X_train, y_train, X_valid, y_valid,label_cols)

    params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    # 'num_leaves': 64,
    # 'min_data_in_leaf': 20,
    # 'max_depth': 7,
    # 'verbose': 0,
  }

    train_data = lgb.Dataset(
      data=X_train, 
      label=y_train,
  )

    validation_data = lgb.Dataset(
      data=X_valid, 
      label=y_valid, 
  )

    model = lgb.train(
      params=params, 
      train_set=train_data, 
      num_boost_round=1000, 
      early_stopping_rounds=100,
      valid_sets=[validation_data],
  )

    models.append(model)

  return models 

In [None]:
def feature_importance(models,data,cols):
  importance = pd.DataFrame(columns=data[cols].columns)

  for model in models:
    imp_ = pd.Series(model.feature_importance(),index=data[cols].columns)
    importance = importance.append(imp_,ignore_index=True)

  importance = importance.mean()

  importance = pd.DataFrame(importance,columns={"importance"}).sort_values("importance",ascending=False)

  importance = pd.DataFrame(importance/importance.sum())
  return importance

In [None]:
df_sales_data , gp_time = preproces(sales_data,col_name)

In [None]:
df_train = make_features(df_sales_data)

In [None]:
df_test = make_test_data(test_data,df_train,col_name)

Index(['store_id', 'goods_id', 'amount', 'base_date_count', 'av_price'], dtype='object')


In [None]:
df_test

Unnamed: 0,index,goods_id,store_id,year,month,amount,base_date_count,category_id,bf2m_amount,bf2m_av_price,bf3m_amount,bf3m_av_price,bf4m_amount,bf4m_av_price,bf5m_amount,bf5m_av_price,bf6m_amount,bf6m_av_price,bf7m_amount,bf7m_av_price,bf8m_amount,bf8m_av_price,bf9m_amount,bf9m_av_price,bf10m_amount,bf10m_av_price,bf11m_amount,bf11m_av_price,bf12m_amount,bf12m_av_price
0,0,1000001,0,2019,12,0,23,100,,,1.0,250.0,,,2.0,250.0,,,,,,,,,1.0,420.0,2.0,420.0,,
1,1,1000001,1,2019,12,0,23,100,,,,,,,,,,,,,,,,,,,,,,
2,2,1000001,2,2019,12,0,23,100,2.0,250.0,,,3.0,250.0,,,2.0,250.0,,,1.0,250.0,1.0,420.0,2.0,420.0,,,,
3,3,1000001,3,2019,12,0,23,100,2.0,250.0,3.0,250.0,6.0,250.0,5.0,250.0,4.0,250.0,5.0,250.0,2.0,250.0,3.0,420.0,2.0,420.0,5.0,420.0,9.0,420.0
4,4,1000001,4,2019,12,0,23,100,12.0,250.0,8.0,250.0,14.0,250.0,9.0,250.0,6.0,250.0,5.0,250.0,3.0,250.0,6.0,420.0,8.0,420.0,4.0,420.0,7.0,420.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3055,3055,3500001,13,2019,12,0,23,350,2.0,420.0,3.0,420.0,1.0,420.0,1.0,420.0,1.0,420.0,2.0,420.0,,,1.0,420.0,2.0,420.0,,,3.0,420.0
3056,3056,3500001,14,2019,12,0,23,350,,,,,,,,,,,,,,,,,,,,,,
3057,3057,3500001,15,2019,12,0,23,350,,,,,,,,,,,1.0,200.0,,,,,,,,,,
3058,3058,3500001,16,2019,12,0,23,350,,,,,,,,,,,,,,,1.0,420.0,,,,,,


In [None]:
print(len(df_sales_data))
print(len(df_train))
print(len(test_data))
print(len(df_test))

494432
494432
3060
3060


In [None]:
#特徴量編集結果確認
i = 5
print(df_train.groupby('base_date_count').agg({'bf2m_amount': 'count','bf2m_av_price': 'count',f'bf{i}m_amount': 'count',f'bf{i}m_av_price': 'count'}))
print(df_test.groupby('base_date_count').agg({'bf2m_amount': 'count','bf2m_av_price': 'count',f'bf{i}m_amount': 'count',f'bf{i}m_av_price': 'count'}))
# print(df_train.groupby('base_date_count').agg({'bf2m_amount': 'count',f'bf{i}m_amount': 'count'}))
# print(df_test.groupby('base_date_count').agg({'bf2m_amount': 'count',f'bf{i}m_amount': 'count'}))

                 bf2m_amount  bf2m_av_price  bf5m_amount  bf5m_av_price
base_date_count                                                        
0                          0              0            0              0
1                          0              0            0              0
2                      12933          12933            0              0
3                      11336          11336            0              0
4                      12119          12119            0              0
5                      11574          11574        10056          10056
6                      11738          11738        10322          10322
7                      12303          12303        10953          10953
8                      11800          11800         9146           9146
9                      11284          11284         8393           8393
10                     10247          10247         8623           8623
11                     12120          12120        10744        

モデル構築

In [None]:
lag = []
for i in range(2,13,1):
  lag.append(f'bf{i}m_amount')
  lag.append(f'bf{i}m_av_price')

features = ['store_id','goods_id','category_id','year','month'] + lag

In [None]:
features

['store_id',
 'goods_id',
 'category_id',
 'year',
 'month',
 'bf2m_amount',
 'bf2m_av_price',
 'bf3m_amount',
 'bf3m_av_price',
 'bf4m_amount',
 'bf4m_av_price',
 'bf5m_amount',
 'bf5m_av_price',
 'bf6m_amount',
 'bf6m_av_price',
 'bf7m_amount',
 'bf7m_av_price',
 'bf8m_amount',
 'bf8m_av_price',
 'bf9m_amount',
 'bf9m_av_price',
 'bf10m_amount',
 'bf10m_av_price',
 'bf11m_amount',
 'bf11m_av_price',
 'bf12m_amount',
 'bf12m_av_price']

In [None]:
models = trainTS(df_train,features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
[1]	valid_0's rmse: 5.70157
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 5.2073
[3]	valid_0's rmse: 4.76627
[4]	valid_0's rmse: 4.38451
[5]	valid_0's rmse: 4.03222
[6]	valid_0's rmse: 3.73997
[7]	valid_0's rmse: 3.49536
[8]	valid_0's rmse: 3.28528
[9]	valid_0's rmse: 3.09045
[10]	valid_0's rmse: 2.94915
[11]	valid_0's rmse: 2.82123
[12]	valid_0's rmse: 2.71951
[13]	valid_0's rmse: 2.61894
[14]	valid_0's rmse: 2.52599
[15]	valid_0's rmse: 2.45019
[16]	valid_0's rmse: 2.39154
[17]	valid_0's rmse: 2.34642
[18]	valid_0's rmse: 2.30649
[19]	valid_0's rmse: 2.24572
[20]	valid_0's rmse: 2.21274
[21]	valid_0's rmse: 2.16379
[22]	valid_0's rmse: 2.15023
[23]	valid_0's rmse: 2.12931
[24]	valid_0's rmse: 2.11082
[25]	valid_0's rmse: 2.09743
[26]	valid_0's rmse: 2.09143
[27]	valid_0's rmse: 2.08042
[28]	valid_0's rmse: 2.06914
[29]	valid_0's rmse: 2.05758
[30]	valid_0's rmse: 2.04348
[31]	valid_0's rmse: 2.0453
[32]	valid_0's rmse: 2.04171
[33]	valid_0's rms

In [None]:
importance = feature_importance(models,df_train,features)  

In [None]:
importance

Unnamed: 0,importance
category_id,0.257333
goods_id,0.163579
bf2m_amount,0.078667
store_id,0.078596
month,0.063158
bf3m_amount,0.048561
bf2m_av_price,0.041965
bf4m_amount,0.031649
bf6m_amount,0.020351
bf3m_av_price,0.02007


In [None]:
rmses=[]
for model in models:
  rmses.append(model.best_score["valid_0"]["rmse"])
np.mean(rmses)

2.853881467507956

予測

In [None]:
def predict(models,df,features):
  X = df[features]
  preds = np.zeros((len(df),len(models)))
  for n , model in enumerate(models):
    # pred = model.predict(df_test[cols],num_iteration=model.best_iteration)
    pred = model.predict(X,num_iteration=model.best_iteration)
    preds[:,n] = pred

  score = df.copy()
  score["pred"] = np.mean(preds,axis=1)
  
  return score

In [None]:
score = predict(models,df_test,features)

In [None]:
print(score['pred'].min())
print(score['pred'].max())

0.7211484900113123
1.6879753617791036


In [None]:
score = score[['index','pred']]

In [None]:
score.to_csv("/content/drive/MyDrive/signate/competiton/210919_AIquest/submit.csv",header=False,index=False)