In [1]:
# データの用意
import numpy as np
import pandas as pd
import seaborn as sns

# Load the Iris dataset through seaborn.
df = sns.load_dataset("iris")

# Deliberately inject one missing value (first row, sepal_width) so the
# later cells have a NaN to detect and handle.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df.loc[0, "sepal_width"] = np.nan

# Show the first rows; the injected NaN is visible in row 0.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [2]:
# pandas-profilingのパッケージをインストール
!pip install pandas-profiling

# データのprofileを作成
import pandas_profiling as pdp
from IPython.display import HTML

profile = pdp.ProfileReport(df)
profile.to_file(outputfile="profile.html")
HTML(filename='profile.html')

Collecting pandas-profiling
  Using cached pandas_profiling-3.0.0-py2.py3-none-any.whl (248 kB)
Collecting htmlmin>=0.1.12
  Using cached htmlmin-0.1.12-py3-none-any.whl
Collecting visions[type_image_path]==0.7.1
  Using cached visions-0.7.1-py3-none-any.whl (102 kB)
Collecting pydantic>=1.8.1
  Using cached pydantic-1.8.2-cp38-cp38-win_amd64.whl (2.0 MB)
Collecting missingno>=0.4.2
  Using cached missingno-0.4.2-py3-none-any.whl (9.7 kB)
Collecting tangled-up-in-unicode==0.1.0
  Using cached tangled_up_in_unicode-0.1.0-py3-none-any.whl (3.1 MB)
Collecting requests>=2.24.0
  Using cached requests-2.25.1-py2.py3-none-any.whl (61 kB)
Collecting phik>=0.11.1
  Using cached phik-0.11.2-py3-none-any.whl
Collecting networkx>=2.4
  Using cached networkx-2.5.1-py3-none-any.whl (1.6 MB)
Collecting multimethod==1.4
  Using cached multimethod-1.4-py2.py3-none-any.whl (7.3 kB)
Collecting bottleneck
  Using cached Bottleneck-1.3.2.tar.gz (88 kB)
  Installing build dependencies: started
  Installing

ModuleNotFoundError: No module named 'pandas_profiling'

In [0]:
# ノートブック上にグラフを描画するように設定
%matplotlib inline

# リスト5.2で欠損値にした箇所に対する計算エラーの警告を出力させないように設定
import warnings
warnings.simplefilter('ignore')

sns.boxplot(x='species', y='sepal_width', hue='species', data=df)
sns.pairplot(df, hue='species')

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold

# 将来的にデフォルト値が変更される警告が出力されないように設定
import warnings
warnings.simplefilter('ignore')

# 1. Preprocessing ------
# The modelling frame is derived from `df` in one non-mutating chain, so this
# cell can be re-run safely: `df` itself is left untouched (the previous
# in-place `del` / column assignment failed with KeyError on a second run).
SPECIES_CODES = {'setosa': 0, 'versicolor': 1, 'virginica': 2}
FEATURE_COLUMNS = ['sepal_length', 'sepal_width', 'petal_length']

iris_model = (
    df.dropna()                       # drop records that contain missing values
      .drop(columns=['petal_width'])  # dropped: the data check showed a strong correlation with petal_length
      # convert the categorical species label into a numeric code
      .assign(species=lambda frame: frame['species'].map(SPECIES_CODES))
)

# NumPy arrays for scikit-learn
y = iris_model['species'].to_numpy()
X = iris_model[FEATURE_COLUMNS].to_numpy()  # the three remaining feature columns

# 2. Set up nested cross-validation
# 2.1 Outer loop: 5 shuffled folds with a fixed seed
outer_loop = KFold(n_splits=5, shuffle=True, random_state=0)

# 2.2 Inner loop: 4 shuffled folds with a fixed seed
inner_loop = KFold(n_splits=4, shuffle=True, random_state=0)

# 2.3 Pipeline: standardise the features, then fit a logistic regression
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression()),
])

# 2.4 Grid search
# Hyperparameter values to compare (C of the 'logistic' step)
param_grid = dict(logistic__C=[1, 10, 100])

# Each candidate is scored by accuracy on the inner-loop folds.
gs = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=inner_loop)

# 3. Training
# One row of mean validation scores per outer fold, one column per
# hyperparameter candidate. Rows are collected in a list so the shape follows
# the actual fold count and grid size instead of a hard-coded (5, 3).
val_scores = []

# outer loop (the test part of each split is not needed in this step)
for ol_index, (train_val_index, _) in enumerate(outer_loop.split(X)):
    X_train_val = X[train_val_index]
    y_train_val = y[train_val_index]

    # inner loop: the grid search fits on train and evaluates on val
    gs.fit(X_train_val, y_train_val)
    val_scores.append(gs.cv_results_["mean_test_score"])

    print(" outer loopの結果その{}：{}".format(ol_index+1,
                                         val_scores[-1]))

val_result = np.array(val_scores)

# 4. Evaluation
print("--\n outer loopの平均結果 ：{}\n--".format(val_result.mean(axis=0)))
# According to the original note, the output shows that the third
# hyperparameter candidate, C=100, performs best.

# 5. Check performance on the test data
# C=100 is fixed here by hand, based on the averaged validation scores
# printed in step 4.
clf_pipe = Pipeline([('scaler', StandardScaler()),
                     ('logistic', LogisticRegression(C=100))])

# One accuracy per (outer fold, inner fold) pair. Collected in a list so the
# length follows the configured fold counts instead of a hard-coded 5*4.
test_scores = []

# outer loop
for train_val_index, test_index in outer_loop.split(X):
    X_train_val, X_test = X[train_val_index], X[test_index]
    y_train_val, y_test = y[train_val_index], y[test_index]

    # inner loop: only the training part of each inner split is used here
    for train_index, _ in inner_loop.split(X_train_val):
        # fit on the inner training part ...
        clf_pipe.fit(X_train_val[train_index], y_train_val[train_index])
        # ... and measure accuracy on the outer test fold
        test_scores.append(clf_pipe.score(X_test, y_test))

        print(" inner loopの結果その{}：{}".format(len(test_scores),
                                             test_scores[-1]))

test_result = np.array(test_scores)
print("--\n テストデータの平均正解率：{}".format(test_result.mean()))