In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

### 5-1　レコードデータにおけるモデル検証用のデータ分割

In [2]:
# Load the production records from the CSV file in the working directory.
production_tb = pd.read_csv(filepath_or_buffer='production.csv')

# Report (rows, columns), then preview the first five records.
print(production_tb.shape)
production_tb.head(n=5)

(1000, 4)


Unnamed: 0,type,length,thickness,fault_flg
0,E,274.027383,40.241131,False
1,D,86.319269,16.906715,False
2,E,123.940388,1.018462,False
3,B,175.554886,16.414924,False
4,B,244.93474,29.061081,False


In [3]:
# Hold out 20% of the rows as test data. The explanatory columns (everything
# except fault_flg) and the fault_flg target column are split together, so the
# same rows end up on the same side in both.
# No random_state is passed, so the selected rows differ from run to run.
train_data, test_data, train_target, test_target = train_test_split(
    production_tb.drop(columns='fault_flg'),
    production_tb[['fault_flg']],
    test_size=0.2,
)

# Show the (rows, columns) of each of the four pieces, in the order
# train_data, test_data, train_target, test_target.
for _part in (train_data, test_data, train_target, test_target):
    print(_part.shape)

(800, 3)
(200, 3)
(800, 1)
(200, 1)


In [4]:
# Preview the first five training rows; the index still carries the row labels
# of production_tb, in the shuffled order produced by the split.
train_data.head(n=5)

Unnamed: 0,type,length,thickness
97,A,192.243279,37.002319
217,E,202.499888,12.930346
864,E,246.032309,28.175128
957,E,144.875658,20.563811
500,D,116.075474,7.646799


In [5]:
# Renumber the index of every split from 0 (in place), discarding the old
# labels instead of keeping them as a column. This keeps positional row
# numbers and index labels identical for the steps that follow.
for _frame in (train_data, test_data, train_target, test_target):
    _frame.reset_index(drop=True, inplace=True)

In [6]:
# Preview the first five training rows again to check the index after the reset.
train_data.head(n=5)

Unnamed: 0,type,length,thickness
0,A,192.243279,37.002319
1,E,202.499888,12.930346
2,E,246.032309,28.175128
3,E,144.875658,20.563811
4,D,116.075474,7.646799


In [7]:
# Build the list of row numbers 0 .. len(train_target) - 1.
row_no_list = [*range(len(train_target))]

# Show how many row numbers there are, plus the head and tail of the list.
print(len(row_no_list))
print('row_no_list', row_no_list[:10], '...',row_no_list[791:800])

800
row_no_list [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] ... [791, 792, 793, 794, 795, 796, 797, 798, 799]


In [8]:
# 4-fold cross-validation splitter. Rows are shuffled before being assigned to
# folds; no random_state is given, so fold membership differs between runs.
k_fold = KFold(shuffle=True, n_splits=4)

# One iteration per fold: split() yields the row positions used for training
# and the row positions held out for validation.
for train_cv_no, test_cv_no in k_fold.split(row_no_list):
    # Rows of train_data used as training data in this fold
    train_cv = train_data.iloc[train_cv_no]

    # Rows of train_data used as validation data in this fold
    test_cv = train_data.iloc[test_cv_no]

### 5-2 時系列データにおけるモデル検証用のデータ分割

In [9]:
# Load the monthly index table from the CSV file in the working directory.
monthly_index_tb = pd.read_csv(filepath_or_buffer='monthly_index.csv')

# Report (rows, columns), then preview the first five months.
print(monthly_index_tb.shape)
monthly_index_tb.head(n=5)

(120, 3)


Unnamed: 0,year_month,sales_amount,customer_number
0,2010-01,7191240,6885
1,2010-02,6253663,6824
2,2010-03,6868320,7834
3,2010-04,7147388,8552
4,2010-05,8755929,8171


In [10]:
# Preview the last five rows to see where the series ends.
monthly_index_tb.tail(n=5)

Unnamed: 0,year_month,sales_amount,customer_number
115,2019-08,80528792,97059
116,2019-09,90191653,86560
117,2019-10,95749954,99507
118,2019-11,86998004,85215
119,2019-12,104401092,93620


In [11]:
# Settings for the sliding-window split of the monthly data.

# First and last training row number of the initial window
train_window_start, train_window_end = 1, 24

# horizon: number of rows in each validation window
# skip:    number of rows the windows are shifted by on each step
horizon, skip = 12, 12

In [12]:
# Preview the first five rows in year_month order. sort_values returns a new
# frame that is only displayed here; monthly_index_tb itself is left as it is.
monthly_index_tb.sort_values('year_month').head(n=5)

Unnamed: 0,year_month,sales_amount,customer_number
0,2010-01,7191240,6885
1,2010-02,6253663,6824
2,2010-03,6868320,7834
3,2010-04,7147388,8552
4,2010-05,8755929,8171


In [13]:
# Walk-forward split of the monthly series: a training window followed directly
# by a validation window of `horizon` rows, both shifted by `skip` rows per step.
#
# train_window_start / train_window_end are treated as 1-based, inclusive row
# numbers (1..24 = the first 24 months). .iloc slices are 0-based with an
# exclusive stop, so the bounds are converted below; using the numbers directly
# as slice bounds would drop the first row of every training window and the
# first row of every validation window.
#
# Note: the loop advances train_window_start / train_window_end themselves, so
# re-run the cell that sets them before re-running this cell.
while True:

    # Last row number (1-based, inclusive) of the validation window
    test_window_end = train_window_end + horizon

    # Training rows train_window_start .. train_window_end
    train = monthly_index_tb.iloc[train_window_start - 1:train_window_end]

    # The `horizon` rows immediately after the training window
    test = monthly_index_tb.iloc[train_window_end:test_window_end]

    # Stop once the validation window has reached the last row of the table
    if test_window_end >= len(monthly_index_tb.index):
        break

    # Shift both ends of the training window forward by `skip` rows
    train_window_start += skip
    train_window_end += skip

    # Everything under `while True` is repeated for the next window

In [14]:
# Number of rows (months) in the table, for comparison with the window settings.
print(monthly_index_tb.shape[0])

120


学習データ（train）を2年分（24か月）、検証データ（test）をその直後の1年分（12か月）として分割し、両方のウィンドウをskip分（12か月）ずつ後ろにずらしながら、データの末尾に達するまで繰り返している。