다중 선형회귀 분석

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# 선형회귀 관련 라이브러리 가져오기

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# statsmodel 가져오기

import statsmodels.api as sm

In [5]:
# 간단한 예제

print(np.array([[0,1],[1,2],[2,2.5]]))
print()
print(np.array([[0,1.2,1.5]]))

[[0.  1. ]
 [1.  2. ]
 [2.  2.5]]

[[0.  1.2 1.5]]


In [8]:
# 데이터

x = np.array([[0,1],[1,2],[2,2.5]])
y= np.array([0,1.2,1.6])

# 선형회귀 object 생성
reg = linear_model.LinearRegression()



# 훈련
reg.fit(x,y)

# training data 예측
pred_train = reg.predict(x)

# test
pred_test = reg.predict([[1.5,2]])

In [9]:
# test data 예측값
pred_test

array([1.])

In [10]:
# coefficient(기울기, data1, data2)
reg.coef_

array([-0.4,  1.6])

In [13]:
# 데이터 가져오기

path = '/content/drive/MyDrive/MLP_bigdata'

In [17]:
ad = pd.read_csv('/content/drive/MyDrive/MLP_bigdata/data/Advertising.csv',index_col = 0)
ad

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9
...,...,...,...,...
196,38.2,3.7,13.8,7.6
197,94.2,4.9,8.1,9.7
198,177.0,9.3,6.4,12.8
199,283.6,42.0,66.2,25.5


In [18]:
ad.shape

(200, 4)

In [20]:
ad.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200 entries, 1 to 200
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   Radio      200 non-null    float64
 2   Newspaper  200 non-null    float64
 3   Sales      200 non-null    float64
dtypes: float64(4)
memory usage: 7.8 KB


In [21]:
# 선형회귀 모델 적용
train = ad[:-20] # 200 - 20개
test = ad[-20:] # 20

# training data 의 feature / response 분리
train_x = train[['TV','Radio','Newspaper']]
train_y = train['Sales']

# test data 의 feature / response 분리
test_x = test[['TV','Radio','Newspaper']]
test_y = test['Sales']

In [23]:
# 선형회귀 객체 만들기
lr = linear_model.LinearRegression()

# training data --> 모델 적합
lr.fit(train_x,train_y)

# training data 이용해서 예측
pred_train = lr.predict(train_x)

# test data 이용해서 예측 (***)
pred_test = lr.predict(test_x)

In [24]:
#coefficients
lr.coef_

array([ 0.04638909,  0.18867512, -0.0024597 ])

In [25]:
# training MSE
mean_squared_error(train_y,pred_train)

2.827418881491677

In [27]:
# test MSE
mean_squared_error(test_y,pred_test)

2.4528179307176843

In [28]:
# r2_score

r2_score(train_y,pred_train)

0.8923555807586847

In [29]:
r2_score(test_y,pred_test)
# test 데이터가 더 높으면 과소적합을 의심 해야한다
# 이럴경우 N 을 늘려야 한다
# 지금의 경우는 train = 180, test = 20 이다


0.9288231093749743

In [31]:
# statsmodel (sm_train_x, sm_test_x) 이용
# 반드시 X0 feature 추가해야 함
sm_train_x = train_x
sm_train_x['x0'] = 1

# test에도 X0 feature 추가
sm_test_x = test_x
sm_test_x['x0'] = 1

In [34]:
sm_train_x.head()

Unnamed: 0,TV,Radio,Newspaper,x0
1,230.1,37.8,69.2,1
2,44.5,39.3,45.1,1
3,17.2,45.9,69.3,1
4,151.5,41.3,58.5,1
5,180.8,10.8,58.4,1


In [35]:
sm_test_x.head()

Unnamed: 0,TV,Radio,Newspaper,x0
181,156.6,2.6,8.3,1
182,218.5,5.4,27.4,1
183,56.2,5.7,29.7,1
184,287.6,43.0,71.8,1
185,253.8,21.3,30.0,1


In [36]:
train_y

1      22.1
2      10.4
3       9.3
4      18.5
5      12.9
       ... 
176    27.0
177    20.2
178    11.7
179    11.8
180    12.6
Name: Sales, Length: 180, dtype: float64

In [38]:
# training
result = sm.OLS(train_y,sm_train_x).fit()
                 # x0 없는 train_y, x0 있는 train_x

In [39]:
# training 결과 확인
result.summary()
# R-squared:	0.892  RSS/TSS
# Adj. R-squared:	0.891
# AIC : 낮을수록 좋음
# BIC : 낮을수록 좋음
# P_value : 0.05보다 작은지

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.892
Model:,OLS,Adj. R-squared:,0.891
Method:,Least Squares,F-statistic:,486.3
Date:,"Mon, 17 Jun 2024",Prob (F-statistic):,6.570000000000001e-85
Time:,05:20:52,Log-Likelihood:,-348.95
No. Observations:,180,AIC:,705.9
Df Residuals:,176,BIC:,718.7
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
TV,0.0464,0.001,31.154,0.000,0.043,0.049
Radio,0.1887,0.009,20.347,0.000,0.170,0.207
Newspaper,-0.0025,0.006,-0.395,0.693,-0.015,0.010
x0,2.8399,0.342,8.293,0.000,2.164,3.516

0,1,2,3
Omnibus:,56.196,Durbin-Watson:,2.104
Prob(Omnibus):,0.0,Jarque-Bera (JB):,140.467
Skew:,-1.343,Prob(JB):,3.15e-31
Kurtosis:,6.394,Cond. No.,467.0


다항 회귀분석

In [40]:
from sklearn.preprocessing import PolynomialFeatures

In [42]:
x = np.arange(6).reshape(3,2)
x

array([[0, 1],
       [2, 3],
       [4, 5]])

In [43]:
# [1,a,b,a^2,ab,b^2] feature 생성
poly = PolynomialFeatures(2)
poly.fit_transform(x)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In [46]:
# interaction feature만 생성하고 싶을 때
Poly = PolynomialFeatures(interaction_only=True)
Poly.fit_transform(x)

array([[ 1.,  0.,  1.,  0.],
       [ 1.,  2.,  3.,  6.],
       [ 1.,  4.,  5., 20.]])

In [99]:
auto = pd.read_csv('/content/drive/MyDrive/MLP_bigdata/data/Auto.csv')
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [50]:
auto.shape

(397, 9)

In [52]:
auto.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
year            0
origin          0
name            0
dtype: int64

In [51]:
auto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           397 non-null    float64
 1   cylinders     397 non-null    int64  
 2   displacement  397 non-null    float64
 3   horsepower    397 non-null    object 
 4   weight        397 non-null    int64  
 5   acceleration  397 non-null    float64
 6   year          397 non-null    int64  
 7   origin        397 non-null    int64  
 8   name          397 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.0+ KB


In [100]:
# horsepower '?' 값을 '0'으로 대체
auto['horsepower'] = auto['horsepower'].replace('?',0)

In [54]:
auto['horsepower'][:10]

0    130
1    165
2    150
3    150
4    140
5    198
6    220
7    215
8    225
9    190
Name: horsepower, dtype: object

In [101]:
# auto['horespower] -> 숫자로 변경

auto['horsepower'] = pd.to_numeric(auto['horsepower'])

In [104]:
auto = auto.drop(['name'], axis = 1)

In [56]:
auto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           397 non-null    float64
 1   cylinders     397 non-null    int64  
 2   displacement  397 non-null    float64
 3   horsepower    397 non-null    int64  
 4   weight        397 non-null    int64  
 5   acceleration  397 non-null    float64
 6   year          397 non-null    int64  
 7   origin        397 non-null    int64  
 8   name          397 non-null    object 
dtypes: float64(3), int64(5), object(1)
memory usage: 28.0+ KB


auto 다중선형회귀(sklearn)

In [86]:
# horsepower 다양한 feature 생성


# auto['weight_2'] = auto['weight'] ** 2
# auto['weight_3'] = auto['weight'] ** 3
# auto['weight_4'] = auto['weight'] ** 4
# auto['weight_5'] = auto['weight'] ** 5

In [87]:
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name,horsepower_2,horsepower_3,horsepower_4,horsepower_5,weight_2,weight_3,weight_4,weight_5
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu,16900,2197000,285610000,37129300000,12278016,43022168064,150749676896256,528226867844481024
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320,27225,4492125,741200625,122298103125,13638249,50366053557,186001835786001,686904779557701693
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite,22500,3375000,506250000,75937500000,11806096,40565745856,139383902761216,478923089887538176
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst,22500,3375000,506250000,75937500000,11785489,40459583737,138897750969121,476835979076992393
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino,19600,2744000,384160000,53782400000,11895601,41027927849,141505323151201,488051859548492249


In [105]:
# training / test 데이터 분리

train = auto[:-40]
test = auto[-40:]

# training data의 feature와 response 분리
train_x = train.iloc[:,1:]
train_y = train['mpg']

# test data의 feature와 response 분리
test_x = test.iloc[:,1:]
test_y = test['mpg']

In [106]:
# 선형회귀 객체 생성
lr = linear_model.LinearRegression()

# training data --> 모델 적합
lr.fit(train_x,train_y)

# training data 이용해서 예측
pred_train = lr.predict(train_x)

# test data 이용해서 예측
pred_test = lr.predict(test_x)

In [108]:
train_x.columns

Index(['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
       'year', 'origin'],
      dtype='object')

In [107]:
lr.coef_

array([-0.3736576 ,  0.01504545, -0.01430246, -0.00625177,  0.03568412,
        0.74444736,  1.44269524])

In [109]:
# training MSE
mean_squared_error(train_y,pred_train)

#값이 낮을수록 모델이 훈련 데이터에 대해 잘 맞는다는 것을 의미합니다.
#너무 낮으면 모델이 훈련 데이터에 과적합(overfitting)되었을 가능성이 있습니다.


10.48325290009115

In [110]:
# test MSE
mean_squared_error(test_y,pred_test)

# 값이 낮을수록 모델이 새로운 데이터(테스트 데이터)에 대해 잘 예측한다는 것을 의미합니다.
# 테스트 데이터의 MSE가 훈련 데이터의 MSE와 비슷하다면, 모델이 일반화(generalization) 능력이 좋다고 볼 수 있습니다.

14.531581917722567

In [114]:
# 훈련 데이터의 MSE << 테스트 데이터의 MSE:
# 모델이 훈련 데이터에 과적합되었을 가능성이 큽니다. 즉, 훈련 데이터에 너무 잘 맞춰져 있어서 새로운 데이터에 대해 잘 예측하지 못하는 경우입니다.
# 해결 방법: 모델의 복잡성을 줄이거나, 더 많은 데이터를 사용하거나, 정규화를 시도해 볼 수 있습니다.

# 훈련 데이터의 MSE ≈ 테스트 데이터의 MSE:
# 모델이 데이터에 대해 잘 일반화되고 있다는 것을 의미합니다. 이는 좋은 징후입니다.
# 일반적으로 이 상태를 목표로 합니다.

# 훈련 데이터의 MSE > 테스트 데이터의 MSE:
# 이런 경우는 드물지만, 모델이 훈련 데이터보다 테스트 데이터에 더 잘 맞는 경우입니다. 이는 데이터 셋의 특성이나 분포에 따라 발생할 수 있습니다.
# 이는 모델의 성능 평가에 오류가 있거나, 데이터의 샘플링 방법에 문제가 있을 수 있습니다.

In [111]:
# r2_score

r2_score(train_y,pred_train)

0.820018681848257

In [112]:
r2_score(test_y,pred_test)


0.5505537907484992

In [113]:
# train은 82이지만 test는 55라는 결과가 나온다
# 설명력이 약하다 --> 과적합이다