# データセットの分割数の違いを感じる
- 学習データで学習 → テストデータでテスト → 誤り率を計算する
- 上記を繰り返して、誤り率の平均と分散を算出する
- CVの分割数 と ホールドアウト割合 を変えて算出する

# 実装のメモ

### アルゴリズムの流れ

1. データセットを取得

2. データセットを学習データと教師データに分割

3. 学習データを元にパラメータを学習

4. 教師データで誤り率を算出

5. 2〜4 を繰り返して、誤り率の標本データを取得し、平均分散を算出

6. 5 を CVの分割数 と ホールドアウトの割合 を変えて算出する


In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold

import matplotlib.pyplot as plt
%matplotlib inline

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
def get_data() -> pd.DataFrame:
    '''
    データセットを作成します
    '''
    origin_data = load_breast_cancer()
    df_origin = pd.DataFrame(origin_data.data, columns=origin_data.feature_names)
    df_origin['target'] = origin_data.target
    return df_origin

# test
df_origin = get_data()
df_origin.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [3]:
from abc import ABCMeta, abstractmethod


class Spliter(metaclass=ABCMeta):
    '''
    データの分割器です。
    '''
    def __init__(self, df, num):
        '''
        分割器を初期化します。
        '''
        self.df = df
        self.num = num

    @abstractmethod
    def run(self):
        '''
        データの分割を実行します。
        
        :return TrainDataSet, TestDataSet
        '''
        raise NotImplementedError()
        
    def to_str(self):
        return self.__class__.__name__ + '#' + str(self.num)


class FOSpliter(Spliter): 
    '''
    Fold Out法でデータを分割する分割器です。
    '''
    def __init__(self, df, num):
        '''
        :param df 分割したいデータ
        :param num データを分割するレート
        '''
        super().__init__(df, num)
    
    def run(self) -> pd.DataFrame:
        #print(f"run FOSpliter number : {self.num}")
        df_train = self.df.sample(frac=self.num)
        df_test = self.df.drop(df_train.index)
        yield df_train, df_test


class CVSpliter(Spliter):
    '''
    Cross Varidation法でデータを分割する分割器です。
    '''
    def __init__(self, df, num):
        '''
        :param df 分割したいデータ
        :param num データの分割する個数
        '''
        super().__init__(df, num)
        
    def run(self) -> pd.DataFrame:
        #print(f"run CVSpliter number : {self.num}")
        fold = KFold(n_splits=self.num, shuffle=True, random_state=1)
        ids_train, ids_test = next(fold.split(self.df))
        df_train, df_test = self.df.iloc[ids_train], self.df.iloc[ids_test]
        yield df_train, df_test

#  test
df = get_data()
spliter_fo = FOSpliter(df=df, num=0.5)
spliter_cv = CVSpliter(df=df, num=4)
tasks = [spliter_fo, spliter_cv]
for task in tasks:
    print(task.to_str())
    df_train, df_test = next(task.run())
    display(df_train.head())
    display(df_test.head())


FOSpliter#0.5


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
514,15.05,19.07,97.26,701.9,0.09215,0.08597,0.07486,0.04335,0.1561,0.05915,...,28.06,113.8,967.0,0.1246,0.2101,0.2866,0.112,0.2282,0.06954,0
149,13.74,17.91,88.12,585.0,0.07944,0.06376,0.02881,0.01329,0.1473,0.0558,...,22.46,97.19,725.9,0.09711,0.1824,0.1564,0.06019,0.235,0.07014,1
28,15.3,25.27,102.4,732.4,0.1082,0.1697,0.1683,0.08751,0.1926,0.0654,...,36.71,149.3,1269.0,0.1641,0.611,0.6335,0.2024,0.4027,0.09876,0
523,13.71,18.68,88.73,571.0,0.09916,0.107,0.05385,0.03783,0.1714,0.06843,...,25.63,99.43,701.9,0.1425,0.2566,0.1935,0.1284,0.2849,0.09031,1
346,12.06,18.9,76.66,445.3,0.08386,0.05794,0.00751,0.008488,0.1555,0.06048,...,27.06,86.54,562.6,0.1289,0.1352,0.04506,0.05093,0.288,0.08083,1


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
7,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,...,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151,0
8,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,...,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072,0
9,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,...,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075,0
10,16.02,23.24,102.7,797.8,0.08206,0.06669,0.03299,0.03323,0.1528,0.05697,...,33.88,123.8,1150.0,0.1181,0.1551,0.1459,0.09975,0.2948,0.08452,0


CVSpliter#4


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
6,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742,...,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368,0
7,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,...,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151,0
8,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,...,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072,0


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0
5,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,...,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244,0
9,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,...,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075,0


In [4]:
def col_error(df_train: pd.DataFrame, df_test: pd.DataFrame):
    '''
    ロジスティック回帰を用いてテストデータの誤り率を計算します。
    '''
    df_train_x = df_train.drop('target', axis=1)
    df_train_y = df_train['target']
    lr = LogisticRegression()
    lr.fit(df_train_x, df_train_y)
    
    df_test_x = df_test.drop('target', axis=1)
    df_test_y = df_test['target']
    score = lr.score(df_test_x, df_test_y)
    
    return 1- score

# test
df = get_data()
print(col_error(df, df))

0.040421792618629215


In [5]:
class Aggregator():
    '''
    情報を貯蓄して解析する集積器です。
    内部でキーに対応したリストを保持しています。
    '''
    def __init__(self):
        self.values = {}

    def savings(self, key, value):
        '''
        値をキーに対応したリストに保存します。
        '''
        if key not in self.values:
            self.values[key] = []
        self.values[key].append(value)
        pass
    
    def total(self):
        '''
        貯蓄した値を集計します。
        '''
        dict_result = {}
        for key in self.values:
            info = {}
            info['mean'] = np.array(self.values[key]).mean()
            info['var'] = np.array(self.values[key]).var()
            dict_result[key] = info
        return pd.DataFrame.from_dict(dict_result).T
    
    def destory(self):
        '''
        貯蔵した値を破棄します。
        '''
        self.values = {}

memory = Aggregator()
memory.savings('key1', 1)
memory.savings('key1', 2)
memory.savings('key2', 3)
print(memory.values)

{'key1': [1, 2], 'key2': [3]}


In [6]:
def run_task(memory, task):
    df_train, df_test = next(task.run())
    error = col_error(df_train, df_test)
    memory.savings(task.to_str(), error)
    return memory

def run_tasks(memory, tasks):
    for task in tasks:
        df_train, df_test = next(task.run())
        error = col_error(df_train, df_test)
        memory.savings(task.to_str(), error)
    return memory

def main():
    memory = Aggregator()
    for i in range(100):
        df_origin = get_data()
        tasks_fo = [FOSpliter(df=df_origin, num=(1- num*0.1)) for num in range(1, 6)]
        tasks_cv = [CVSpliter(df=df_origin, num=(num)) for num in range(2, 10)]
        tasks = tasks_fo + tasks_cv
        memory = run_tasks(memory, tasks)
    display(memory.total())

main()

Unnamed: 0,mean,var
FOSpliter#0.9,0.048947,0.000734349
FOSpliter#0.8,0.050877,0.0002954755
FOSpliter#0.7,0.049123,0.0002564892
FOSpliter#0.6,0.050614,0.0001836796
FOSpliter#0.5,0.052035,0.0001159877
CVSpliter#2,0.073684,1.92593e-34
CVSpliter#3,0.047368,4.814825e-35
CVSpliter#4,0.048951,4.814825e-35
CVSpliter#5,0.04386,1.92593e-34
CVSpliter#6,0.042105,4.814825e-35
