### ベースラインを作成するためのスクリプト

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


from sklearn.ensemble import ExtraTreesRegressor
import xgboost as xgb

import sys
sys.path.append('../')

from src.functions import Functions
from src.modeling import Modeling

#### 用いる自作関数(src/functions.py)

In [2]:
print(Functions.appendHolidayFlag.__doc__)
print(Functions.datetimeConverter.__doc__)
print(Functions.getColumnsDiff.__doc__)

assigning features related to holidays
        Args:
            df : DataFrame
        Return:
            DataFrame
        
        
convert specific col to datetime
        Args:
            df : DataFrame
            col_name : str
        Return:
            pd.Series
        
        
get diff of columns between df A and df B
        Args:
            A : DataFrame
            B : DataFrame
        Return:
            list
        
        


#### データセットの呼び出し

In [3]:
features = pd.read_csv('../data/raw_data/features.csv')
sales_train = pd.read_csv('../data/raw_data/train.csv')
stores = pd.read_csv('../data/raw_data/stores.csv')
sales_test = pd.read_csv('../data/raw_data/test.csv')

# Date列の型変換
features['Date'] = Functions.datetimeConverter(features, 'Date')
sales_train['Date'] = Functions.datetimeConverter(sales_train, 'Date')
sales_test['Date'] = Functions.datetimeConverter(sales_test, 'Date')

#### データセットの結合

In [4]:
df_all_train = sales_train.merge(features, how="left", on=["Store","Date","IsHoliday"]).merge(stores, how="left", on=["Store"])
df_all_test = sales_test.merge(features, how="left", on=["Store","Date","IsHoliday"]).merge(stores, how="left", on=["Store"])

#### ラベル変更

In [5]:
df_all_train = pd.get_dummies(df_all_train, columns=["Type"])
df_all_test = pd.get_dummies(df_all_test, columns=["Type"])

LE = LabelEncoder()
df_all_train['IsHoliday'] = LE.fit_transform(df_all_train['IsHoliday'])
df_all_test['IsHoliday'] = LE.fit_transform(df_all_test['IsHoliday'])

#### 特徴量の追加

In [6]:
df_all_train = Functions.appendHolidayFlag(df_all_train)
df_all_test = Functions.appendHolidayFlag(df_all_test)

df_all_train["Month"] = df_all_train['Date'].dt.month
df_all_train["week"] = df_all_train['Date'].dt.week
df_all_train["Day"] = df_all_train['Date'].dt.day
df_all_train = df_all_train.drop(["Date"], axis=1)

df_all_test["Month"] = df_all_test['Date'].dt.month
df_all_test["week"] = df_all_test['Date'].dt.week
df_all_test["Day"] = df_all_test['Date'].dt.day
df_all_test = df_all_test.drop(["Date"], axis=1)

  df_all_train["week"] = df_all_train['Date'].dt.week
  df_all_test["week"] = df_all_test['Date'].dt.week


#### 欠損方法の方針決め(xgbの使用時に用いるデータは欠損の補完を行わない)
- ['mean', 'median', 'zero']それぞれで欠損を補完し、validationデータで算出したWMAEを比較する
- -> 結果的に大きな差がなかった

In [7]:
def compute(train, test, testSize=0.3):
    
    d = Functions.getColumnsDiff(train, test)
    
    X_train, X_test, y_train, y_test = Functions.getData(train, Drops=d, testSize=0.3)
    etr = ExtraTreesRegressor(bootstrap=False, criterion="mse", max_depth=None,
                                          max_features="auto", max_leaf_nodes=None,
                                          min_impurity_decrease=0.0, min_impurity_split=None,
                                          min_samples_leaf=2, min_samples_split=5,
                                          min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=30,
                                          oob_score=False, random_state=2021, warm_start=False)
    etr.fit(X_train, y_train)

    y_pred = etr.predict(X_test)
    Modeling.computeMetrics(X_test, y_test, y_pred)

In [8]:
# trainデータとtestデータを結合し、一括変換
drops = Functions.getColumnsDiff(df_all_train, df_all_test)
df_all_train_alpha = df_all_train.drop(drops, axis=1)
df_all_train_alpha['flg'] = 'Train'
df_all_test['flg'] = 'Test'
stack_df = pd.concat([df_all_train_alpha, df_all_test])

#### 欠損方法の方針決め(xgbの使用時に用いるデータは欠損の補完を行わない)
- ['mean', 'median', 'zero']それぞれで欠損を補完し、validationデータで算出した精度を比較する
- -> 結果的に大きな差がなかった

In [9]:
#options = ['mean', 'median', 'mode', 'zero']
#for idx, fill in enumerate(options):
#    if fill == 'mean':
#        print('-'*5, 'mean', '-'*5,)
#        import copy
#        stack_df１ = stack_df.copy()
#        stack_df1['MarkDown1'].fillna(stack_df1['MarkDown1'].mean(), inplace=True)
#        stack_df1['MarkDown2'].fillna(stack_df1['MarkDown2'].mean(), inplace=True)
#        stack_df1['MarkDown3'].fillna(stack_df1['MarkDown3'].mean(), inplace=True)
#        stack_df1['MarkDown4'].fillna(stack_df1['MarkDown4'].mean(), inplace=True)
#        stack_df1['MarkDown5'].fillna(stack_df1['MarkDown5'].mean(), inplace=True)
#        stack_df1['CPI'].fillna(stack_df1['CPI'].mean(), inplace=True)
#        stack_df1['Unemployment'].fillna(stack_df1['Unemployment'].mean(), inplace=True)
#        
#        train_df = stack_df1.groupby('flg').get_group('Train').drop('flg', axis=1)
#        train_df['Weekly_Sales'] = df_all_train['Weekly_Sales']
#        test_df = stack_df1.groupby('flg').get_group('Test').drop('flg', axis=1)
#        
#        compute(train_df, test_df)
#        
#    if fill == 'median':
#        print('-'*5, 'median', '-'*5,)
#        stack_df2 = stack_df.copy()
#        stack_df2['MarkDown1'].fillna(stack_df2['MarkDown1'].median(), inplace=True)
#        stack_df2['MarkDown2'].fillna(stack_df2['MarkDown2'].median(), inplace=True)
#        stack_df2['MarkDown3'].fillna(stack_df2['MarkDown3'].median(), inplace=True)
#        stack_df2['MarkDown4'].fillna(stack_df2['MarkDown4'].median(), inplace=True)
#        stack_df2['MarkDown5'].fillna(stack_df2['MarkDown5'].median(), inplace=True)
#        stack_df2['CPI'].fillna(stack_df2['CPI'].median(), inplace=True)
#        stack_df2['Unemployment'].fillna(stack_df2['Unemployment'].median(), inplace=True)
#        
#        train_df = stack_df2.groupby('flg').get_group('Train').drop('flg', axis=1)
#        train_df['Weekly_Sales'] = df_all_train['Weekly_Sales']
#        test_df = stack_df2.groupby('flg').get_group('Test').drop('flg', axis=1)
#        
#        compute(train_df, test_df)
#        
#    if fill == 'zero':
#        print('-'*5, 'zero', '-'*5,)
#        stack_df4 = stack_df.copy()
#        stack_df4['MarkDown1'].fillna(0, inplace=True)
#        stack_df4['MarkDown2'].fillna(0, inplace=True)
#        stack_df4['MarkDown3'].fillna(0, inplace=True)
#        stack_df4['MarkDown4'].fillna(0, inplace=True)
#        stack_df4['MarkDown5'].fillna(0, inplace=True)
#        stack_df4['CPI'].fillna(0, inplace=True)
#        stack_df4['Unemployment'].fillna(0, inplace=True)
#
#        train_df = stack_df4.groupby('flg').get_group('Train').drop('flg', axis=1)
#        train_df['Weekly_Sales'] = df_all_train['Weekly_Sales']
#        test_df = stack_df4.groupby('flg').get_group('Test').drop('flg', axis=1)
#
#        compute(train_df, test_df)
#

### xgbについてsub用のcsvファイルの作成

In [10]:
drops = Functions.getColumnsDiff(df_all_train, df_all_test)

X_train = df_all_train.drop(drops, axis=1)
y_train = df_all_train['Weekly_Sales']

XGB = xgb.XGBRegressor(objective="reg:squarederror", random_state=2021)
XGB.fit(X_train, y_train)

y_pred = XGB.predict(df_all_test.drop('flg', axis=1))
ss = pd.read_csv('../data/raw_data/sampleSubmission.csv')
ss.loc[:, 'Weekly_Sales'] = y_pred
ss.to_csv('../files/submissions/xgb_baseline.csv', index=False)

#### ExtraTreesRegressorについてsub用のcsvファイルの作成

In [11]:
stack_df = stack_df.fillna(stack_df.mean())

df_all_train_ = stack_df.groupby('flg').get_group('Train').drop('flg', axis=1)
df_all_train_['Weekly_Sales'] = df_all_train['Weekly_Sales']
df_all_test = stack_df.groupby('flg').get_group('Test').drop('flg', axis=1)

drops = Functions.getColumnsDiff(df_all_train_, df_all_test)

X_train = df_all_train_.drop(drops, axis=1)
y_train = df_all_train_['Weekly_Sales']
etr = ExtraTreesRegressor(bootstrap=False, criterion="mse", max_depth=None,
                                      max_features="auto", max_leaf_nodes=None,
                                      min_impurity_decrease=0.0, min_impurity_split=None,
                                      min_samples_leaf=2, min_samples_split=5,
                                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=30,
                                      oob_score=False, random_state=2021, warm_start=False)
etr.fit(X_train, y_train)

y_pred = etr.predict(df_all_test)
ss = pd.read_csv('../data/raw_data/sampleSubmission.csv')
ss.loc[:, 'Weekly_Sales'] = y_pred
ss.to_csv('../files/submissions/ExtraTreesRegressor_baseline.csv', index=False)