# Train-Test Validation

In [2]:
from pathlib import Path

import pandas as pd
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import train_test_validation

In [3]:
# List the files in the local data/ directory (IPython shell escape).
!ls data

walmart_ts_6_fcst_grp_test.csv  walmart_ts_6_fcst_grp_train.csv


In [4]:
# Read the train and test splits from CSV, then show both shapes.
df_train, df_test = map(
    pd.read_csv,
    (
        'data/walmart_ts_6_fcst_grp_train.csv',
        'data/walmart_ts_6_fcst_grp_test.csv',
    ),
)
(df_train.shape, df_test.shape)

((702, 11), (36, 11))

In [5]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,IsHoliday,sample_weight
0,4,4,2010/2/5,59554.57,-1.0,-1.0,-1.0,-1.0,-1.0,0,1
1,4,4,2010/2/12,54069.82,-1.0,-1.0,-1.0,-1.0,-1.0,1,5
2,4,4,2010/2/19,53939.17,-1.0,-1.0,-1.0,-1.0,-1.0,0,1
3,4,4,2010/2/26,54687.08,-1.0,-1.0,-1.0,-1.0,-1.0,0,1
4,4,4,2010/3/5,56959.02,-1.0,-1.0,-1.0,-1.0,-1.0,0,1


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the training frame.
df_train.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Store,Dept,Weekly_Sales,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,IsHoliday,sample_weight
count,702.0,702.0,702.0,702.0,702.0,702.0,702.0,702.0,702.0,702.0
mean,12.666667,9.833333,40117.380071,1978.311709,1386.535114,701.248519,951.741097,1408.076553,0.076923,1.307692
std,10.084663,3.438371,9670.895006,6000.890548,6444.609039,6962.930247,4587.991037,3647.650457,0.266659,1.066637
min,1.0,4.0,20190.54,-1.0,-265.76,-1.0,-1.0,-1.0,0.0,1.0
25%,4.0,8.0,35323.58,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.0
50%,10.5,10.5,39868.815,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.0
75%,19.0,13.0,44023.645,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.0
max,31.0,13.0,72179.92,60394.73,89121.94,109030.75,57817.56,31844.2,1.0,5.0


In [7]:
# Number of training rows per date, most frequent first.
df_train['Date'].value_counts(sort=True, ascending=False)

2010/2/5     6
2011/7/8     6
2011/9/30    6
2011/9/23    6
2011/9/16    6
            ..
2010/10/8    6
2010/10/1    6
2010/9/24    6
2010/9/17    6
2012/4/27    6
Name: Date, Length: 117, dtype: int64

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the test frame.
df_test.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Store,Dept,Weekly_Sales,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,IsHoliday,sample_weight
count,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0
mean,12.666667,9.833333,41219.515833,10495.218056,73.827778,199.830556,2763.526389,5088.3,0.0,1.0
std,10.220428,3.48466,10722.794463,4433.167101,141.509609,268.832696,1964.497878,3010.273333,0.0,0.0
min,1.0,4.0,23516.13,2989.94,-1.0,1.01,148.75,2044.72,0.0,1.0
25%,4.0,8.0,35972.9125,7394.11,-1.0,36.015,1399.1,3254.9075,0.0,1.0
50%,10.5,10.5,41275.32,10395.235,-1.0,78.44,2366.66,4376.86,0.0,1.0
75%,19.0,13.0,44199.89,12662.3925,86.0,279.78,3452.015,5906.93,0.0,1.0
max,31.0,13.0,62758.52,21290.13,532.0,1095.85,7557.14,17903.64,0.0,1.0


In [9]:
# Number of test rows per date, most frequent first.
df_test['Date'].value_counts(sort=True, ascending=False)

2012/5/4     6
2012/5/11    6
2012/5/18    6
2012/5/25    6
2012/6/1     6
2012/6/8     6
Name: Date, dtype: int64

In [22]:
# Column roles used when wrapping the frames in deepchecks Datasets.
target_col = 'Weekly_Sales'
time_col = 'Date'  # the time variable cannot be treated as a feature
categorical_cols = ['Store', 'Dept', 'IsHoliday']
feature_cols = [
    'Store',
    'Dept',
    'MarkDown1',
    'MarkDown2',
    'MarkDown3',
    'MarkDown4',
    'MarkDown5',
    'IsHoliday',
]

In [23]:
# Wrap the training frame in a deepchecks Dataset, declaring which columns
# are the label, the features, the categorical features and the datetime.
ds_train = Dataset(
    df_train,
    label=target_col,
    features=feature_cols,
    cat_features=categorical_cols,
    datetime_name=time_col,
)
ds_train

Unnamed: 0,Column,DType,Kind,Additional Info
0,Date,datetime64,Datetime,
1,Weekly_Sales,floating,,
2,Store,integer,Categorical Feature,
3,Dept,integer,Categorical Feature,
4,MarkDown1,floating,Numerical Feature,
5,MarkDown2,floating,Numerical Feature,
6,MarkDown3,floating,Numerical Feature,
7,MarkDown4,floating,Numerical Feature,
8,MarkDown5,floating,Numerical Feature,
9,IsHoliday,integer,Categorical Feature,

Unnamed: 0,Date,Weekly_Sales,Store,Dept,...,MarkDown4,MarkDown5,IsHoliday,sample_weight
0,2010-02-05,59554.57,4,4,...,-1.00,-1.00,0,1
1,2010-02-12,54069.82,4,4,...,-1.00,-1.00,1,5
2,2010-02-19,53939.17,4,4,...,-1.00,-1.00,0,1
3,2010-02-26,54687.08,4,4,...,-1.00,-1.00,0,1
4,2010-03-05,56959.02,4,4,...,-1.00,-1.00,0,1
...,...,...,...,...,...,...,...,...,...
697,2012-03-30,44506.97,31,13,...,1334.23,3691.10,0,1
698,2012-04-06,45709.61,31,13,...,3623.01,16629.10,0,1
699,2012-04-13,43005.37,31,13,...,1482.03,3076.80,0,1
700,2012-04-20,47289.81,31,13,...,311.94,7709.86,0,1


In [24]:
# Wrap the test frame in a deepchecks Dataset with the same column roles
# as the training Dataset.
ds_test = Dataset(
    df_test,
    label=target_col,
    features=feature_cols,
    cat_features=categorical_cols,
    datetime_name=time_col,
)
ds_test

Unnamed: 0,Column,DType,Kind,Additional Info
0,Date,datetime64,Datetime,
1,Weekly_Sales,floating,,
2,Store,integer,Categorical Feature,
3,Dept,integer,Categorical Feature,
4,MarkDown1,floating,Numerical Feature,
5,MarkDown2,floating,Numerical Feature,
6,MarkDown3,floating,Numerical Feature,
7,MarkDown4,floating,Numerical Feature,
8,MarkDown5,floating,Numerical Feature,
9,IsHoliday,integer,Categorical Feature,

Unnamed: 0,Date,Weekly_Sales,Store,Dept,...,MarkDown4,MarkDown5,IsHoliday,sample_weight
0,2012-05-04,60576.41,4,4,...,6993.97,4541.89,0,1
1,2012-05-11,57583.94,4,4,...,5005.96,3815.16,0,1
2,2012-05-18,60832.91,4,4,...,2290.18,2521.84,0,1
3,2012-05-25,57781.87,4,4,...,3268.41,3749.53,0,1
4,2012-06-01,62758.52,4,4,...,2676.33,4896.34,0,1
...,...,...,...,...,...,...,...,...,...
31,2012-05-11,41087.84,31,13,...,2960.85,3794.93,0,1
32,2012-05-18,44323.20,31,13,...,719.88,17903.64,0,1
33,2012-05-25,44310.38,31,13,...,2066.74,5802.72,0,1
34,2012-06-01,43161.56,31,13,...,2706.46,3236.46,0,1


In [16]:
# Build the train-test validation suite with its default arguments and run it
# on the train/test Dataset pair.
validation_suite = train_test_validation()
suite_result = validation_suite.run(
    train_dataset=ds_train,
    test_dataset=ds_test,
)

In [17]:
# Render the suite result inline in the notebook.
suite_result.show()

Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_OBM7U5ETP1ZSPWQ2FASOCLHC9">Train Test Validat…

In [16]:
# Save the suite result as an HTML report.
# Create the target directory first so the save does not depend on an
# outputs/ folder already existing (e.g. on a fresh checkout).
report_path = 'outputs/TrainTestValidation_walmart.html'
Path(report_path).parent.mkdir(parents=True, exist_ok=True)
suite_result.save_as_html(report_path)

# To view the results, open the saved HTML file.

'outputs/TrainTestValidation_walmart.html'