# IRIS에서 class, sl, sw, pw를 통하여 pl 구하기
- pl = [w1, w2, w3, w4] [C, sl, sw, pw].T + bias
- test_size = 0.2

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.datasets import load_iris
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['C'] = iris.target
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),C
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [3]:
df = df.rename({'sepal length (cm)':'SL', 'sepal width (cm)':'SW', 'petal length (cm)':'PL', 'petal width (cm)':'PW'}, axis='columns')
df.head()

Unnamed: 0,SL,SW,PL,PW,C
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [4]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

## PL

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [6]:
X = df[['SL', 'SW', 'PW', 'C']]
y = df['PL']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2021
)

In [8]:
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
mse = mean_squared_error(y_test, pred)
r2 = r2_score(y_test, pred)
print(f'MSE: {mse:.2f}, RMSE: {np.sqrt(mse):.2f}')
print(f'R_squared: {r2:.4f}')

MSE: 0.10, RMSE: 0.31
R_squared: 0.9682


In [9]:
lr.coef_

array([ 0.68227323, -0.51925158,  1.11668415,  0.37500635])

In [10]:
lr.intercept_

-0.3669994939603347

#### PL의 회귀식
- PL = 0.68SL - 0.52SW + 1.12PW + 0.38C - 0.37

#### 교차검증

In [11]:
from sklearn.model_selection import cross_val_score
lr = LinearRegression()

In [12]:
# 5폴드 세트로 MSE를 구하기
neg_mean_scores = cross_val_score(lr, X, y, scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-neg_mean_scores)
avg_rmse = np.average(rmse_scores)

print('개별 MSE:', np.round(neg_mean_scores, 2))
print('개별 RMSE:', np.round(rmse_scores,2))
print(f'평균 RMSE: {avg_rmse:.4f}')

개별 MSE: [-0.06 -0.08 -0.14 -0.1  -0.21]
개별 RMSE: [0.25 0.28 0.38 0.32 0.46]
평균 RMSE: 0.3387


## SL

In [13]:
X = df[['SW', 'PL', 'PW', 'C']]
y = df['SL']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2021
)

In [15]:
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
mse = mean_squared_error(y_test, pred)
r2 = r2_score(y_test, pred)
print(f'MSE: {mse:.2f}, RMSE: {np.sqrt(mse):.2f}')
print(f'R_squared: {r2:.4f}')

MSE: 0.07, RMSE: 0.27
R_squared: 0.8808


In [16]:
lr.coef_

array([ 0.60733316,  0.77406606, -0.49825608, -0.21265098])

In [17]:
lr.intercept_

1.9046012133683519

#### SL의 회귀식
- SL = 0.61SW + 0.77PL - 0.50PW - 0.21C +1.90

#### 교차검증

In [18]:
lr = LinearRegression()
neg_mean_scores = cross_val_score(lr, X, y, scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-neg_mean_scores)
avg_rmse = np.average(rmse_scores)

print('개별 MSE:', np.round(neg_mean_scores, 2))
print('개별 RMSE:', np.round(rmse_scores,2))
print(f'평균 RMSE: {avg_rmse:.4f}')

개별 MSE: [-0.06 -0.08 -0.14 -0.1  -0.14]
개별 RMSE: [0.25 0.29 0.37 0.31 0.38]
평균 RMSE: 0.3199


## SW

In [19]:
X = df[['SL', 'PL', 'PW', 'C']]
y = df['SW']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2021
)

In [21]:
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
mse = mean_squared_error(y_test, pred)
r2 = r2_score(y_test, pred)
print(f'MSE: {mse:.2f}, RMSE: {np.sqrt(mse):.2f}')
print(f'R_squared: {r2:.4f}')

MSE: 0.05, RMSE: 0.22
R_squared: 0.5667


In [22]:
lr.coef_

array([ 0.61339502, -0.59499152,  0.69803975, -0.12580651])

In [23]:
lr.intercept_

0.9976767818176966

#### SW의 회귀식
- SW = 0.61SL - 0.59PL + 0.70PW - 0.13C + 1.00

#### 교차검증

In [24]:
lr = LinearRegression()
neg_mean_scores = cross_val_score(lr, X, y, scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-neg_mean_scores)
avg_rmse = np.average(rmse_scores)

print('개별 MSE:', np.round(neg_mean_scores, 2))
print('개별 RMSE:', np.round(rmse_scores,2))
print(f'평균 RMSE: {avg_rmse:.4f}')

개별 MSE: [-0.08 -0.11 -0.16 -0.08 -0.11]
개별 RMSE: [0.28 0.33 0.41 0.29 0.33]
평균 RMSE: 0.3289


## PW

In [25]:
X = df[['SL', 'SW', 'PL', 'C']]
y = df['PW']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2021
)

In [27]:
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
mse = mean_squared_error(y_test, pred)
r2 = r2_score(y_test, pred)
print(f'MSE: {mse:.2f}, RMSE: {np.sqrt(mse):.2f}')
print(f'R_squared: {r2:.4f}')

MSE: 0.04, RMSE: 0.19
R_squared: 0.9226


In [28]:
lr.coef_

array([-0.12876754,  0.17861614,  0.32741898,  0.37562097])

In [29]:
lr.intercept_

-0.1944501604164237

#### PW의 회귀식
- PW = -0.13SL + 0.18SW + 0.33PL + 0.38C - 0.19

In [30]:
lr = LinearRegression()
neg_mean_scores = cross_val_score(lr, X, y, scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-neg_mean_scores)
avg_rmse = np.average(rmse_scores)

print('개별 MSE:', np.round(neg_mean_scores, 2))
print('개별 RMSE:', np.round(rmse_scores,2))
print(f'평균 RMSE: {avg_rmse:.4f}')

개별 MSE: [-0.01 -0.02 -0.02 -0.04 -0.07]
개별 RMSE: [0.09 0.14 0.15 0.19 0.26]
평균 RMSE: 0.1667
