다중 선형 회귀 분석

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 

In [2]:
#선형 회귀 관련 라이브러리 가져오기 

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

import statsmodels.api as sm 


In [3]:
# Simple example: preview the toy feature matrix and the target vector,
# separated by one blank line.
print(
    np.array([[0, 1], [1, 2], [2, 2.5]]),
    np.array([0, 1.2, 1.6]),
    sep="\n\n",
)

[[0.  1. ]
 [1.  2. ]
 [2.  2.5]]

[0.  1.2 1.6]


In [4]:
# Toy data: three samples with two features each, plus their targets.
x = np.array([[0, 1], [1, 2], [2, 2.5]])
y = np.array([0, 1.2, 1.6])

# Create the linear-regression estimator and train it on the toy data
# (fit() hands back the fitted estimator, so the calls can be chained).
reg = linear_model.LinearRegression().fit(x, y)

# Predictions for the training samples themselves.
pred_train = reg.predict(x)

# Prediction for a single unseen sample.
pred_test = reg.predict([[1.5, 2]])


In [5]:
# Display the predicted value for the unseen test sample.
pred_test 

array([1.])

In [6]:
# coef_: the fitted slope for each independent variable.
reg.coef_ 

array([-0.4,  1.6])

데이터 분석

In [7]:
# Load the advertising data set, using the first CSV column as the row index,
# then preview the first five rows.
ad = pd.read_csv('../../data/data자료/Advertising.csv', index_col=0)
ad.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [8]:
# Before applying the linear regression model, check column dtypes and
# non-null counts.

ad.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200 entries, 1 to 200
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   Radio      200 non-null    float64
 2   Newspaper  200 non-null    float64
 3   Sales      200 non-null    float64
dtypes: float64(4)
memory usage: 7.8 KB


In [9]:
# (rows, columns) of the advertising data.
ad.shape

(200, 4)

In [10]:
# Train / test split by row position (done by hand, without sklearn).
# With the 200-row data set this is a 100 / 100 split: everything except the
# last 100 rows is used for training, the last 100 rows for testing.
train, test = ad[:-100], ad[-100:]

# Separate the features (TV, Radio, Newspaper) from the response (Sales)
# for each subset.
train_x, train_y = train[['TV', 'Radio', 'Newspaper']], train[['Sales']]
test_x, test_y = test[['TV', 'Radio', 'Newspaper']], test[['Sales']]


In [11]:
# Build the linear-regression estimator and fit it on the training split.
lr = linear_model.LinearRegression().fit(train_x, train_y)

# Predictions on the training data (in practice used to judge goodness of fit
# and to check for overfitting).
train_y_pred = lr.predict(train_x)

# Predictions on the test data (the important one).
test_y_pred = lr.predict(test_x)


In [12]:
# coef_: fitted coefficients of the advertising model, one per feature column.
lr.coef_

array([[ 0.04551295,  0.19208846, -0.01066575]])

In [13]:
# Mean squared error: training split on the first line, test split on the
# second line.
print(
    mean_squared_error(train_y, train_y_pred),
    mean_squared_error(test_y, test_y_pred),
    sep="\n",
)

2.323730395275743
3.3764700371877536


In [14]:
# R^2 score: training split on the first line, test split on the second line.
# NOTE(review): in the recorded output the train score (~0.911) is higher than
# the test score (~0.878), i.e. train > test, so this does not point to
# underfitting. The gap may simply come from the small sample size (a larger
# n would help).
print(
    r2_score(train_y, train_y_pred),
    r2_score(test_y, test_y_pred),
    sep="\n",
)


0.911047018243114
0.8779307177850375


In [15]:
# Using statsmodels: OLS does not add an intercept on its own, so a constant
# feature x0 = 1 must be appended to the design matrix.
# assign() returns a new DataFrame. The previous plain assignment only created
# an alias, so adding x0 also altered train_x / test_x (the inputs of the
# sklearn model) and triggered pandas' chained-assignment warning.
sm_train_x = train_x.assign(x0=1)
sm_test_x = test_x.assign(x0=1)



In [16]:
# Inspect the statsmodels design matrix, including the constant column x0.
sm_train_x

Unnamed: 0,TV,Radio,Newspaper,x0
1,230.1,37.8,69.2,1
2,44.5,39.3,45.1,1
3,17.2,45.9,69.3,1
4,151.5,41.3,58.5,1
5,180.8,10.8,58.4,1
...,...,...,...,...
96,163.3,31.6,52.9,1
97,197.6,3.5,5.9,1
98,184.9,21.0,22.0,1
99,289.7,42.3,51.2,1


In [17]:
# Train an ordinary-least-squares model: Sales (endog) regressed on
# TV, Radio, Newspaper and the constant column x0 (exog).
result = sm.OLS(endog=train_y, exog=sm_train_x).fit()

# Show the summary table of the fitted model.
result.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.911
Model:,OLS,Adj. R-squared:,0.908
Method:,Least Squares,F-statistic:,327.7
Date:,"Mon, 17 Jun 2024",Prob (F-statistic):,2.73e-50
Time:,17:29:19,Log-Likelihood:,-184.05
No. Observations:,100,AIC:,376.1
Df Residuals:,96,BIC:,386.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
TV,0.0455,0.002,24.061,0.000,0.042,0.049
Radio,0.1921,0.012,15.407,0.000,0.167,0.217
Newspaper,-0.0107,0.008,-1.295,0.198,-0.027,0.006
x0,3.3498,0.427,7.850,0.000,2.503,4.197

0,1,2,3
Omnibus:,18.516,Durbin-Watson:,2.365
Prob(Omnibus):,0.0,Jarque-Bera (JB):,22.148
Skew:,-1.026,Prob(JB):,1.55e-05
Kurtosis:,4.053,Cond. No.,475.0


다항 회귀분석 

In [18]:
from sklearn.preprocessing import PolynomialFeatures

# Small 3x2 integer matrix (values 0..5) used to demonstrate polynomial
# feature expansion. The cell's last expression displays it.
# (The duplicate, discarded `np.arange(6).reshape(3,2)` expression was removed.)
x = np.arange(6).reshape(3, 2)
x


array([[0, 1],
       [2, 3],
       [4, 5]])

In [19]:
# Generate the degree-2 feature set [1, a, b, a^2, ab, b^2] from the two
# input columns a and b.
poly = PolynomialFeatures(degree=2)
poly.fit_transform(x)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In [20]:
# When only interaction features are wanted: the result keeps the bias column,
# the original features and the cross term a*b, but no squared terms.
poly = PolynomialFeatures(degree=2, interaction_only=True)
poly.fit_transform(x)

array([[ 1.,  0.,  1.,  0.],
       [ 1.,  2.,  3.,  6.],
       [ 1.,  4.,  5., 20.]])

Auto 데이터를 활용한 회귀분석 실습

In [21]:
# Load the Auto data set and preview the first five rows.
auto = pd.read_csv('../../data/data자료/Auto.csv')
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [22]:
# (rows, columns) of the Auto data.
auto.shape

(397, 9)

In [23]:
# Count null values per column.
auto.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
year            0
origin          0
name            0
dtype: int64

In [24]:
# Check column dtypes and non-null counts.
auto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           397 non-null    float64
 1   cylinders     397 non-null    int64  
 2   displacement  397 non-null    float64
 3   horsepower    397 non-null    object 
 4   weight        397 non-null    int64  
 5   acceleration  397 non-null    float64
 6   year          397 non-null    int64  
 7   origin        397 non-null    int64  
 8   name          397 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.0+ KB


In [25]:
# horsepower: replace the '?' placeholder with the string '0' so the column
# can later be converted to a numeric dtype.
# NOTE(review): this encodes an unknown horsepower as 0, which then enters the
# regression as a real value - consider NaN plus dropping/imputing instead.
auto['horsepower'] = auto['horsepower'].replace({'?': '0'})

In [26]:
# Preview the first ten horsepower values.
auto['horsepower'][:10]

0    130
1    165
2    150
3    150
4    140
5    198
6    220
7    215
8    225
9    190
Name: horsepower, dtype: object

In [27]:
# Convert auto['horsepower'] from strings to numbers.
auto['horsepower'] = pd.to_numeric(auto['horsepower'])

In [28]:
# Re-check dtypes after the horsepower conversion.
auto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           397 non-null    float64
 1   cylinders     397 non-null    int64  
 2   displacement  397 non-null    float64
 3   horsepower    397 non-null    int64  
 4   weight        397 non-null    int64  
 5   acceleration  397 non-null    float64
 6   year          397 non-null    int64  
 7   origin        397 non-null    int64  
 8   name          397 non-null    object 
dtypes: float64(3), int64(5), object(1)
memory usage: 28.0+ KB


auto 다중선형회귀 (sklearn)

In [29]:
# Preview the first five rows of the cleaned Auto data.
auto.head(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [30]:
# Drop the free-text 'name' column so that only numeric columns remain.
# Rebinding instead of inplace=True, together with errors='ignore', lets this
# cell be re-run without raising KeyError once the column is already gone.
# The redundant axis=1 is omitted because columns= already selects the axis.
auto = auto.drop(columns='name', errors='ignore')

데이터에서 종속변수를 제외한 데이터 >> 독립변수 

In [31]:
# Train / test split by row position: the last 40 rows are held out for
# testing and the remaining 357 rows are used for training.
train, test = auto[:-40], auto[-40:]

# Separate features and response: the first column (mpg) is the response,
# every remaining column is a feature.
train_x, train_y = train.iloc[:, 1:], train[['mpg']]
test_x, test_y = test.iloc[:, 1:], test[['mpg']]

In [32]:
# Inspect the training feature matrix.
train_x

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,year,origin
0,8,307.0,130,3504,12.0,70,1
1,8,350.0,165,3693,11.5,70,1
2,8,318.0,150,3436,11.0,70,1
3,8,304.0,150,3433,12.0,70,1
4,8,302.0,140,3449,10.5,70,1
...,...,...,...,...,...,...,...
352,4,98.0,65,2380,20.7,81,1
353,4,105.0,74,2190,14.2,81,2
354,4,100.0,0,2320,15.8,81,2
355,4,107.0,75,2210,14.4,81,3


In [33]:
# Create the linear-regression estimator and fit it on the Auto training split.
lr = linear_model.LinearRegression().fit(train_x, train_y)

# Predict on the training split first, then on the test split.
train_y_pred, test_y_pred = (lr.predict(part) for part in (train_x, test_x))

In [34]:
# Fitted coefficients of the Auto model, one per feature column of train_x.
lr.coef_

array([[-0.3736576 ,  0.01504545, -0.01430246, -0.00625177,  0.03568412,
         0.74444736,  1.44269524]])

In [35]:
# Mean squared error on the training split.
mean_squared_error(train_y, train_y_pred)

10.483252900091152

In [36]:
# Mean squared error on the test split.
mean_squared_error(test_y, test_y_pred)


14.531581917722566

In [37]:
# R^2 score on the training split.
r2_score(train_y, train_y_pred)

0.820018681848257

In [38]:
# R^2 score on the test split.
# NOTE(review): the original note said the test score is not usually looked
# at, but the recorded test R^2 (~0.55) is far below the train R^2 (~0.82);
# that gap is worth investigating - confirm whether the positional split is
# representative.
r2_score(test_y, test_y_pred)


0.5505537907484992