<a href="https://colab.research.google.com/github/sysiphe0/exercise_ml/blob/main/LinearRegression_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### 참고(출처) : Carl's Tech Blog (https://wotres.tistory.com/)

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
# Load the diabetes dataset that ships with scikit-learn.
datas = datasets.load_diabetes()

# Assemble a single frame: the feature columns followed by the target as the last column 'y'.
df = pd.DataFrame(datas.data, columns=datas.feature_names)
y = pd.DataFrame({'y': datas.target})
df = pd.concat([df, y], axis=1)

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
train, test = train_test_split(df, test_size=0.3, random_state=123)

In [2]:
# Column labels of the training frame (feature names, with the target 'y' last);
# used later to map coefficient positions back to variable names.
cols = train.columns

In [4]:
from sklearn.linear_model import LinearRegression
# Independent variables == explanatory variables
# Linear regression with an intercept term
lr = LinearRegression()
# Variant without an intercept:
#lr = LinearRegression(fit_intercept = False)
# Fit using every column except the last ('y') as features and 'y' as the target.
lr.fit(train.iloc[:, :-1], train.y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [5]:
# Predict the target for the held-out test rows (all columns except the trailing 'y').
pred = lr.predict(test.iloc[:,:-1])

In [6]:
# Regression coefficients of the fitted model
lr.coef_

array([  10.45384922, -261.16601105,  538.84541221,  280.72544466,
       -855.21447839,  472.17305267,  166.51881384,  309.88763264,
        684.0489522 ,  102.37723262])

In [8]:
# Find the largest regression coefficient and the name of the variable it belongs to.
# NOTE: argmax selects the largest *signed* value, so a negative coefficient with a
# bigger magnitude is not picked; compare np.abs(lr.coef_) if magnitude is what matters.
top_idx = lr.coef_.argmax()
print(lr.coef_[top_idx])
print(cols[top_idx])

684.0489521950237
s5


In [9]:
# Intercept of the fitted model
lr.intercept_

152.61083063288848

In [10]:
# Problem: the features are [1,1,1,1,1,1,1,1,1,?] and y = 100 -- solve for '?'.
# Rearranging y = intercept + dot(known, coef[:-1]) + coef[-1] * ? gives the expression below.
arr = [1] * 9 + ['?']
arr = arr[:-1]  # keep only the nine known values
# Dot product against a column vector, so the result is a 1-element array.
known_part = np.dot(arr, lr.coef_[:-1].reshape(-1, 1))
ans = (100 - lr.intercept_ - known_part) / lr.coef_[-1]
print(ans)

[-13.66400969]


In [11]:
# Coefficient of determination (R^2)
# Used in linear regression to see how well the explanatory variables account for
# the dependent variable: the share of total variation attributable to the regression line.
# e.g. r2 = 0.4 means roughly 40% is explained
from sklearn.metrics import r2_score

# R^2 of the sklearn model's predictions against the true test targets
r2 = r2_score(test.y, pred)
print(r2)

0.507828558489374


### statsmodels 사용 (식 직접 적는 법)

In [13]:
# Linear regression via statsmodels.formula.api (formula-string interface)
from statsmodels.formula.api import ols

# Build the formula 'y ~ f1+f2+...' from every column except the trailing target 'y'.
s = 'y ~ ' + '+'.join(cols[:-1])

# In the summary below, the P>|t| column shows which terms are significant
# (a value below 0.05 is treated as significant).
# Note that this fit uses the full `df`, not the `train` split.
ols(s, df).fit().summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.518
Model:,OLS,Adj. R-squared:,0.507
Method:,Least Squares,F-statistic:,46.27
Date:,"Sat, 12 Dec 2020",Prob (F-statistic):,3.8299999999999998e-62
Time:,08:45:13,Log-Likelihood:,-2386.0
No. Observations:,442,AIC:,4794.0
Df Residuals:,431,BIC:,4839.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,152.1335,2.576,59.061,0.000,147.071,157.196
age,-10.0122,59.749,-0.168,0.867,-127.448,107.424
sex,-239.8191,61.222,-3.917,0.000,-360.151,-119.488
bmi,519.8398,66.534,7.813,0.000,389.069,650.610
bp,324.3904,65.422,4.958,0.000,195.805,452.976
s1,-792.1842,416.684,-1.901,0.058,-1611.169,26.801
s2,476.7458,339.035,1.406,0.160,-189.621,1143.113
s3,101.0446,212.533,0.475,0.635,-316.685,518.774
s4,177.0642,161.476,1.097,0.273,-140.313,494.442

0,1,2,3
Omnibus:,1.506,Durbin-Watson:,2.029
Prob(Omnibus):,0.471,Jarque-Bera (JB):,1.404
Skew:,0.017,Prob(JB):,0.496
Kurtosis:,2.726,Cond. No.,227.0


### statsmodels 사용 (바로 x,y 대입)

In [14]:
# Linear regression via statsmodels.api (pass the response and design matrix directly)
from statsmodels.api import OLS, add_constant

# add_constant adds a 'const' column so the model is fitted with an intercept.
# Without it, R^2 is computed in its uncentered form and comes out slightly different.
train_data = add_constant(train)
train_exog = train_data.drop(columns='y')
model = OLS(train_data.y, train_exog).fit()

In [16]:
# Keep the terms whose p-value is at or below the 0.05 significance level.
is_significant = model.pvalues <= 0.05
p_cols = model.pvalues.index[is_significant]
print(p_cols)

Index(['const', 'sex', 'bmi', 'bp', 's5'], dtype='object')


In [18]:
# Re-train using only the significant variables.
# 'const' is removed by name rather than by position, so a real feature is never
# dropped by mistake when 'const' is not the first (or not a) significant term;
# add_constant then puts the intercept column back.
significant_features = [c for c in p_cols if c != 'const']
train_data_x = train_data[significant_features + ['y']]
train_data_x = add_constant(train_data_x)
train_data_x

Unnamed: 0,const,sex,bmi,bp,s5,y
374,1.0,-0.044642,-0.034229,-0.067642,-0.000609,140.0
420,1.0,-0.044642,-0.036385,0.000068,-0.033249,146.0
204,1.0,0.050680,0.006728,0.028758,0.002008,277.0
263,1.0,0.050680,-0.077342,-0.046985,-0.072128,116.0
285,1.0,-0.044642,-0.020218,-0.015999,0.059881,233.0
...,...,...,...,...,...,...
230,1.0,0.050680,0.071397,-0.057314,0.050276,220.0
98,1.0,0.050680,-0.005128,-0.012556,-0.006080,92.0
322,1.0,0.050680,0.061696,0.062039,0.133396,242.0
382,1.0,-0.044642,0.060618,-0.022885,0.104138,132.0


In [19]:
# Refit OLS on the reduced frame ('y' as response, remaining columns as regressors);
# this rebinds `model` to the new fit.
model = OLS(train_data_x.y, train_data_x.drop(columns='y')).fit()

In [21]:
# Build the test design matrix from the terms the fitted model was actually trained on
# (model.params is indexed by term name) instead of re-deriving them from p_cols by
# position, so the test columns always line up with the model.
model_features = [c for c in model.params.index if c != 'const']
test_data_x = test[model_features + ['y']]
test_data_x = add_constant(test_data_x)
# NOTE: this rebinds `pred`, replacing the sklearn predictions computed earlier.
pred = model.predict(test_data_x.drop(columns='y'))

In [22]:
# Print the full regression report for the current `model`.
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.478
Model:                            OLS   Adj. R-squared:                  0.471
Method:                 Least Squares   F-statistic:                     69.53
Date:                Sat, 12 Dec 2020   Prob (F-statistic):           9.62e-42
Time:                        08:51:42   Log-Likelihood:                -1680.0
No. Observations:                 309   AIC:                             3370.
Df Residuals:                     304   BIC:                             3389.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        153.4090      3.189     48.105      0.0

In [23]:
# Extract the model coefficients
print(model.params)

const    153.408997
sex     -133.080599
bmi      636.798757
bp       263.286078
s5       513.571282
dtype: float64


In [24]:
# Extract the model p-values
print(model.pvalues)

const    3.540973e-144
sex       5.420117e-02
bmi       8.145613e-15
bp        5.677521e-04
s5        8.104906e-11
dtype: float64


In [26]:
# Model t-values
print(model.tvalues)

const    48.105306
sex      -1.932705
bmi       8.174046
bp        3.483424
s5        6.736875
dtype: float64


In [27]:
# Coefficient of determination (R^2) / adjusted R^2
print(model.rsquared, model.rsquared_adj)

0.47776198912466095 0.4708904363499855
