In [2]:
import numpy as np
from scipy import stats
from pandas import read_excel, DataFrame
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from statsmodels.regression import linear_model
from sklearn.metrics import mean_squared_error


In [3]:
origin = read_excel("fish2.xlsx")
origin.head()


Unnamed: 0,길이,높이,두께,무게
0,8.4,2.11,1.41,5.9
1,13.7,3.53,2.0,32.0
2,15.0,3.82,2.43,40.0
3,16.2,4.59,2.63,51.5
4,17.4,4.59,2.94,70.0


#### 머신러닝에 의한 회귀분석 수행

In [4]:
xnames = ['길이','높이','두께']
yname = '무게'
x= origin.filter(xnames)
y = origin[yname]

In [5]:
model = LinearRegression()
fit =model.fit(x,y)
coefficients = model.coef_
intercept = model.intercept_
print("계수: ", coefficients)
print("절편: ", intercept)

계수:  [ 2.9082713  67.20469902 67.26029602]
절편:  -546.4397914448659


#### 03. 결과보고에 필요한 값 구하기
##### 1) 절편과 계수를 하나의 배열로 결합

In [6]:
params = np.append(intercept, coefficients)
params


array([-546.43979144,    2.9082713 ,   67.20469902,   67.26029602])

##### 2) 상수항 추가하기

In [7]:
# 상수항 추가
designX = x.copy()
designX.insert(0, '상수', 1)
designX.head()

Unnamed: 0,상수,길이,높이,두께
0,1,8.4,2.11,1.41
1,1,13.7,3.53,2.0
2,1,15.0,3.82,2.43
3,1,16.2,4.59,2.63
4,1,17.4,4.59,2.94


##### 3) 행렬곱 구하기


In [8]:
dot = np.dot(designX.T,designX)
dot

array([[   56.    ,  1562.    ,   440.28  ,   265.75  ],
       [ 1562.    , 48045.12  , 13688.339 ,  8270.876 ],
       [  440.28  , 13688.339 ,  3917.2114,  2365.5425],
       [  265.75  ,  8270.876 ,  2365.5425,  1434.4117]])

##### 4) 행렬곱에 대한 역행렬

In [9]:
inv = np.linalg.inv(dot)
inv

array([[ 0.25997581, -0.02937614,  0.05587393,  0.02907514],
       [-0.02937614,  0.00811062, -0.0207489 , -0.00710593],
       [ 0.05587393, -0.0207489 ,  0.11758923, -0.08463348],
       [ 0.02907514, -0.00710593, -0.08463348,  0.17585582]])

In [10]:
predictions = model.predict(x)
MSE = (sum((y-predictions)**2)) / (len(designX)-len(designX.iloc[0])) # 관측치 56개 - (독립변수 개수 3개 + 1)
MSE

7374.273394715796

In [11]:
y_pred = model.predict(x)
mse = mean_squared_error(y, y_pred)
mse

6847.539580807524

#### 표준오차

In [14]:
var_b = MSE*(inv.diagonal())
se_b = np.sqrt(var_b)
se_b

array([43.78507388,  7.73368804, 29.44715768, 36.0112326 ])

#### 8) t-value 구하기

In [15]:
ts_b = params/se_b
ts_b

array([-12.48004726,   0.37605232,   2.28221344,   1.86775878])

#### 9) p-value 구하기

In [17]:
p_value = [2*(1-stats.t.cdf(np.abs(i), (len(designX.iloc[0])))) for i in ts_b]
p_value

[0.00023709485000988906,
 0.7259728743313012,
 0.08457983654778722,
 0.13518113364584705]

#### 10) VIF 구하기


In [18]:

vif = []

for i, v in enumerate(xnames):
    j = list(origin.columns).index(v)
    vif.append(variance_inflation_factor(origin, j))
    
vif

[338.76030542544714, 500.757055790855, 263.01505845905143]

#### 11) 결과표 구성하기

In [26]:
resultDf = DataFrame({
    "종속변수": [yname] * len(xnames),
    "독립변수": xnames,
    r"$$\hat{beta}$$": coefficients,
    "표준오차": se_b[1:],
    "β": 0,
    "t": ts_b[1:],
    "유의확률": p_value[1:],
    "VIF": vif,
})

resultDf

Unnamed: 0,종속변수,독립변수,$$\hat{beta}$$,표준오차,β,t,유의확률,VIF
0,무게,길이,2.908271,7.733688,0,0.376052,0.725973,338.760305
1,무게,높이,67.204699,29.447158,0,2.282213,0.08458,500.757056
2,무게,두께,67.260296,36.011233,0,1.867759,0.135181,263.015058
