# sprint 0 機械学習スクラッチ入門

# 【問題1】train_test_splitのスクラッチ

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline
import seaborn as sns

sns.set()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [14]:
def scratch_train_test_split(
    X, y, train_size=0.8,
):
    """
    検証用データを分割する。

    Parameters
    ----------
    X : 次の形のndarray, shape (n_samples, n_features)
      学習データ
    y : 次の形のndarray, shape (n_samples, )
      正解値
    train_size : float (0<train_size<1)
      何割をtrainとするか指定

    Returns
    ----------
    X_train : 次の形のndarray, shape (n_samples, n_features)
      学習データ
    X_test : 次の形のndarray, shape (n_samples, n_features)
      検証データ
    y_train : 次の形のndarray, shape (n_samples, )
      学習データの正解値
    y_test : 次の形のndarray, shape (n_samples, )
      検証データの正解値
    """
    # ここにコードを書
    n_sumple = X.shape[0]
    n_train = np.floor(train_size * n_sumple).astype(int)
    n_test = n_sumple - n_train

    X_train = X[:n_train]
    X_test = X[n_train:]
    y_train = y[:n_train]
    y_test = y[n_train:]

    return X_train, X_test, y_train, y_test

In [4]:
X, y = np.arange(10).reshape((5, 2)), range(5)

In [5]:
X

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

In [7]:
list(y)

[0, 1, 2, 3, 4]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
X_train

array([[8, 9],
       [4, 5],
       [0, 1],
       [6, 7]])

In [19]:
X_test

array([[8, 9]])

In [21]:
XX, yy = np.arange(10).reshape((5, 2)), range(5)

In [22]:
X_train, X_test, y_train, y_test = scratch_train_test_split(XX, yy, train_size=0.8)

In [23]:
X_train

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [24]:
X_test

array([[8, 9]])

# 【問題2】 分類問題を解くコードの作成

上記3種類の手法で3種類のデータセットを学習・推定するコードを作成してください。

In [26]:
# iris_dataset
from sklearn.datasets import load_iris

iris_dataset = load_iris()
feature = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
X = pd.DataFrame(iris_dataset.data)
X.columns = feature
y = pd.DataFrame(iris_dataset.target)
y.columns = ["Species"]

In [27]:
df = pd.concat([X, y], axis=1, sort=True)

In [28]:
df_selected = df.query("Species == [1, 2]")

In [29]:
from sklearn.model_selection import train_test_split

# シード値は random_state（データを分割する際の乱数のシード値）で指定できます。
X_train, X_test, y_train, y_test = train_test_split(
    df_selected.iloc[:, :-1],
    df_selected.loc[:, "Species"],
    test_size=0.25,
    random_state=42,
)

print("X_train.shape : {}, X_test.shape : {}".format(X_train.shape, X_test.shape))

X_train.shape : (75, 4), X_test.shape : (25, 4)


## ロジスティック回帰

In [30]:
# 学習
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(loss="log")
sgd.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [31]:
# 予測
y_pred = sgd.predict(X_test)

In [32]:
from sklearn import metrics

# 評価用の関数
def evaluate(y_test, y_pred):
    # 正解率
    print("正解率:", metrics.accuracy_score(y_test, y_pred))
    # 適合率
    print("適合率:", metrics.precision_score(y_test, y_pred))
    # 再現率
    print("再現率:", metrics.recall_score(y_test, y_pred))
    # F値
    print("F値:", metrics.f1_score(y_test, y_pred))


evaluate(y_test, y_pred)

正解率: 0.92
適合率: 1.0
再現率: 0.8571428571428571
F値: 0.923076923076923


## SMV

In [33]:
# 学習
from sklearn.svm import SVC

svm = SVC(kernel="linear", random_state=1)
svm.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=1,
    shrinking=True, tol=0.001, verbose=False)

In [34]:
# 予測
y_pred = svm.predict(X_test)

In [35]:
# 評価
evaluate(y_test, y_pred)

正解率: 0.92
適合率: 1.0
再現率: 0.8571428571428571
F値: 0.923076923076923


## 決定木

In [36]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [37]:
# 予測
y_pred = tree.predict(X_test)

In [39]:
# 評価
evaluate(y_test, y_pred)

正解率: 0.8
適合率: 0.7647058823529411
再現率: 0.9285714285714286
F値: 0.8387096774193549


## Simple1_datasetを使った３手法の機械学習

In [40]:
#シンプルデータセット１
np.random.seed(seed=0)
n_samples = 500
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0,0.8], [0.8, 1.0]]

f0 = np.random.multivariate_normal(f0, cov, int(n_samples/2))
f1 = np.random.multivariate_normal(f1, cov, int(n_samples/2))

X_simple1 = np.concatenate((f0, f1))
y_simple1 = np.concatenate((np.ones((int(n_samples/2))), np.ones((int(n_samples/2))) *(-1))).astype(np.int)

random_index = np.random.permutation(np.arange(n_samples))
X_s1 = X_simple1[random_index]
y_s1 = y_simple1[random_index]

In [41]:
X_s1_train, X_s1_test, y_s1_train, y_s1_test = train_test_split(
    X_s1, y_s1, test_size=0.25, random_state=42
)

print("X_train.shape : {}, X_test.shape : {}".format(X_s1_train.shape, X_s1_test.shape))

X_train.shape : (375, 2), X_test.shape : (125, 2)


## ロジスティック回帰

In [42]:
# 学習
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(loss="log")
sgd.fit(X_s1_train, y_s1_train)

# 予測
y_pred = sgd.predict(X_s1_test)

# 評価
evaluate(y_s1_test, y_pred)

正解率: 1.0
適合率: 1.0
再現率: 1.0
F値: 1.0


## SVM

In [43]:
# 学習
from sklearn.svm import SVC

svm = SVC(kernel="linear", random_state=1)
svm.fit(X_s1_train, y_s1_train)

# 予測
y_pred = svm.predict(X_s1_test)

# 評価
evaluate(y_s1_test, y_pred)

正解率: 1.0
適合率: 1.0
再現率: 1.0
F値: 1.0


## 決定木

In [44]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
tree.fit(X_s1_train, y_s1_train)

# 予測
y_pred = tree.predict(X_s1_test)

# 評価
evaluate(y_s1_test, y_pred)

正解率: 1.0
適合率: 1.0
再現率: 1.0
F値: 1.0


## Simple2_datasetを使った３手法の機械学習

In [45]:
# sample2_dataset
X_s2 = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
y_s2 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [46]:
X_s2_train, X_s2_test, y_s2_train, y_s2_test = train_test_split(
    X_s2, y_s2, test_size=0.25, random_state=42
)

print("X_train.shape : {}, X_test.shape : {}".format(X_s2_train.shape, X_s2_test.shape))

X_train.shape : (30, 2), X_test.shape : (10, 2)


## ロジスティック回帰

In [47]:
# 学習
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(loss="log")
sgd.fit(X_s2_train, y_s2_train)

# 予測
y_pred = sgd.predict(X_s2_test)

# 評価
evaluate(y_s2_test, y_pred)

正解率: 0.7
適合率: 0.6666666666666666
再現率: 0.5
F値: 0.5714285714285715


## SVM

In [48]:
# 学習
from sklearn.svm import SVC

svm = SVC(kernel="linear", random_state=1)
svm.fit(X_s2_train, y_s2_train)

# 予測
y_pred = svm.predict(X_s2_test)

# 評価
evaluate(y_s2_test, y_pred)

正解率: 0.6
適合率: 0.5
再現率: 0.5
F値: 0.5


## 決定木

In [49]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
tree.fit(X_s2_train, y_s2_train)

# 予測
y_pred = tree.predict(X_s2_test)

# 評価
evaluate(y_s2_test, y_pred)

正解率: 0.3
適合率: 0.2
再現率: 0.25
F値: 0.22222222222222224


# 【問題3】 回帰問題を解くコードの作成

線形回帰でHouse Pricesデータセットを学習・推定するコードを作成してください。

In [52]:
# Datasetの作成
df = pd.read_csv("/Users/shinoda/diveintocode-ml/ML2001_sprint/h_train.csv")
df_selected = df.loc[:, ["GrLivArea", "YearBuilt", "SalePrice"]]
df_selected.head()

Unnamed: 0,GrLivArea,YearBuilt,SalePrice
0,1710,2003,208500
1,1262,1976,181500
2,1786,2001,223500
3,1717,1915,140000
4,2198,2000,250000


In [53]:
X = df_selected.loc[:, ["GrLivArea", "YearBuilt"]]
y = df_selected['SalePrice']

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [56]:
from sklearn.linear_model import SGDRegressor

In [57]:
sgd_reg = SGDRegressor()
sgd_reg.fit(X_train, y_train)

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [58]:
y_pred = sgd_reg.predict(X_test)
print(sgd_reg.intercept_)
print(sgd_reg.coef_)

[3.60948549e+09]
[ 5.64263876e+11 -1.20555996e+12]


In [59]:
from sklearn.metrics import mean_squared_error
# 平均二乗誤差（標準偏差、ばらつき、MSE）を求める
print("MSE : {}".format(mean_squared_error(y_test, y_pred)))

MSE : 2.3365318268084125e+30
