# iris - 다중회귀(Multiple Regression)

In [15]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [16]:
# Load the iris dataset bundled with scikit-learn.
iris = load_iris()

# Look at the available fields; the DESCR entry holds the dataset description.
iris.keys()

# Print the names of the target classes.
print(iris.target_names)

['setosa' 'versicolor' 'virginica']


In [17]:
# Convert the feature matrix into a DataFrame and attach the target values
# as a 'species' column, then preview the first 10 rows.
import pandas as pd
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    species=iris.target
)
iris_df.head(n=10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
5,5.4,3.9,1.7,0.4,0
6,4.6,3.4,1.4,0.3,0
7,5.0,3.4,1.5,0.2,0
8,4.4,2.9,1.4,0.2,0
9,4.9,3.1,1.5,0.1,0


In [24]:
# Split into training and test data: 20% is held out for testing, and the
# fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Apply multiple linear regression to the untransformed features.
from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(X_train, y_train)

# Print the training score first, then the test score (one per line).
print(lr.score(X_train, y_train), lr.score(X_test, y_test), sep="\n")
# NOTE: a test score higher than the training score may hint at underfitting
# (or simply a favorable split on a small test set) -- worth confirming.

0.9254199044989622
0.9468960016420045


### 사이킷런의 PolynomialFeatures 클래스 활용하기

In [26]:
# Apply PolynomialFeatures to 'X_train' to build the expanded feature
# matrix 'train_poly'.
from sklearn.preprocessing import PolynomialFeatures

# include_bias=False leaves out the constant column of ones (the bias term).
poly = PolynomialFeatures(include_bias=False)
# Fit the transformer on the training data and transform it in one step.
train_poly = poly.fit_transform(X_train)
train_poly[:5]  # inspect the added features on the first five rows

array([[4.600e+00, 3.600e+00, 1.000e+00, 2.000e-01, 2.116e+01, 1.656e+01,
        4.600e+00, 9.200e-01, 1.296e+01, 3.600e+00, 7.200e-01, 1.000e+00,
        2.000e-01, 4.000e-02],
       [5.700e+00, 4.400e+00, 1.500e+00, 4.000e-01, 3.249e+01, 2.508e+01,
        8.550e+00, 2.280e+00, 1.936e+01, 6.600e+00, 1.760e+00, 2.250e+00,
        6.000e-01, 1.600e-01],
       [6.700e+00, 3.100e+00, 4.400e+00, 1.400e+00, 4.489e+01, 2.077e+01,
        2.948e+01, 9.380e+00, 9.610e+00, 1.364e+01, 4.340e+00, 1.936e+01,
        6.160e+00, 1.960e+00],
       [4.800e+00, 3.400e+00, 1.600e+00, 2.000e-01, 2.304e+01, 1.632e+01,
        7.680e+00, 9.600e-01, 1.156e+01, 5.440e+00, 6.800e-01, 2.560e+00,
        3.200e-01, 4.000e-02],
       [4.400e+00, 3.200e+00, 1.300e+00, 2.000e-01, 1.936e+01, 1.408e+01,
        5.720e+00, 8.800e-01, 1.024e+01, 4.160e+00, 6.400e-01, 1.690e+00,
        2.600e-01, 4.000e-02]])

In [27]:
# scikit-learn transformer method that returns the names of the features
# the fitted transformer generates.
poly.get_feature_names_out()

array(['x0', 'x1', 'x2', 'x3', 'x0^2', 'x0 x1', 'x0 x2', 'x0 x3', 'x1^2',
       'x1 x2', 'x1 x3', 'x2^2', 'x2 x3', 'x3^2'], dtype=object)

In [28]:
# Transform the test data with the already-fitted transformer (no re-fit here).
test_poly = poly.transform(X_test)
# Show the first five untransformed test rows for reference.
X_test[:5]

array([[6.1, 2.8, 4.7, 1.2],
       [5.7, 3.8, 1.7, 0.3],
       [7.7, 2.6, 6.9, 2.3],
       [6. , 2.9, 4.5, 1.5],
       [6.8, 2.8, 4.8, 1.4]])

In [29]:
# Compare with X_test: the first five rows after the polynomial transform.
test_poly[:5]

array([[ 6.1 ,  2.8 ,  4.7 ,  1.2 , 37.21, 17.08, 28.67,  7.32,  7.84,
        13.16,  3.36, 22.09,  5.64,  1.44],
       [ 5.7 ,  3.8 ,  1.7 ,  0.3 , 32.49, 21.66,  9.69,  1.71, 14.44,
         6.46,  1.14,  2.89,  0.51,  0.09],
       [ 7.7 ,  2.6 ,  6.9 ,  2.3 , 59.29, 20.02, 53.13, 17.71,  6.76,
        17.94,  5.98, 47.61, 15.87,  5.29],
       [ 6.  ,  2.9 ,  4.5 ,  1.5 , 36.  , 17.4 , 27.  ,  9.  ,  8.41,
        13.05,  4.35, 20.25,  6.75,  2.25],
       [ 6.8 ,  2.8 ,  4.8 ,  1.4 , 46.24, 19.04, 32.64,  9.52,  7.84,
        13.44,  3.92, 23.04,  6.72,  1.96]])

#### 다중회귀모델 훈련

In [35]:
# Import scikit-learn's linear regression class.
from sklearn.linear_model import LinearRegression
# Create a fresh model instance.
lr = LinearRegression()
# Train it on the polynomial features; fit() stays the last expression so the
# notebook echoes the fitted estimator.
lr.fit(X=train_poly, y=y_train)

LinearRegression()

In [36]:
# Print the training score first, then the test score (one per line), for the
# model trained on the polynomial features.
print(lr.score(train_poly, y_train), lr.score(test_poly, y_test), sep="\n")

0.9521205991019994
0.9275633158732431


##### 결과
- 훈련·테스트 점수 모두 0.9 이상으로 높게 나왔음
- 특성이 늘어나면 선형회귀의 훈련 점수는 약간 높아지지만(0.925 → 0.952), 테스트 점수는 오히려 낮아짐(0.947 → 0.928) → 특성 추가로 인한 과대적합 경향이 보임