# Kiểm định mô hình (cont.)

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrices

In [2]:
# Seed NumPy's legacy global random state so random draws are reproducible.
SEED = 42
np.random.seed(SEED)

## Kiểm định đa cộng tuyến

In [3]:
# Baseline data set: 15 observations of two predictors and one response.
x1 = np.array((9, 10, 8, 7, 10, 4, 5, 5, 6, 8, 7, 4, 9, 5, 8))
x2 = np.array((8, 13, 11, 10, 12, 16, 10, 10, 12, 14, 12, 16, 14, 10, 12))

# Response variable, aligned position-by-position with x1 and x2.
y = np.array((6, 8, 8, 7, 7, 12, 9, 8, 9, 10, 10, 11, 9, 10, 11))

In [4]:
# Gather the arrays into one table; the column names are the ones used in the model formula.
df = pd.DataFrame(dict(x1=x1, x2=x2, y=y))

In [5]:
# Ordinary least squares of y on x1 and x2; the formula interface adds the intercept term.
baseline_model = smf.ols(formula="y ~ x1 + x2", data=df)
result = baseline_model.fit()

In [6]:
# Render the fitted model's summary table as plain text.
summary_table = result.summary()
print(summary_table)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.693
Model:                            OLS   Adj. R-squared:                  0.642
Method:                 Least Squares   F-statistic:                     13.56
Date:                Sat, 10 Jun 2023   Prob (F-statistic):           0.000834
Time:                        09:56:09   Log-Likelihood:                -19.779
No. Observations:                  15   AIC:                             45.56
Df Residuals:                      12   BIC:                             47.68
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      6.2030      1.862      3.331      0.0



## Đa cộng tuyến

In [7]:
# Ten observations of the first predictor.
x1 = np.array((751.6, 779.2, 810.3, 864.7, 857.5, 874.9, 906.8, 942.9, 988.8, 1015.7))

# Second predictor is x1 shifted by a constant, i.e. an exact linear function of x1.
x2 = x1 + 10

# Response values, aligned position-by-position with x1.
y = np.array((672.1, 696.8, 737.1, 767.9, 762.8, 779.4, 823.1, 864.3, 903.2, 927.6))

In [8]:
# Gather the arrays into one table; the column names are the ones used in the model formula.
df = pd.DataFrame(dict(x1=x1, x2=x2, y=y))

In [9]:
# Same two-predictor specification as before, now fitted on the table built from x1, x2 and y.
collinear_model = smf.ols(formula="y ~ x1 + x2", data=df)
result = collinear_model.fit()

In [10]:
# Render the fitted model's summary table as plain text.
summary_table = result.summary()
print(summary_table)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.992
Model:                            OLS   Adj. R-squared:                  0.991
Method:                 Least Squares   F-statistic:                     959.9
Date:                Sat, 10 Jun 2023   Prob (F-statistic):           1.28e-09
Time:                        09:56:09   Log-Likelihood:                -34.107
No. Observations:                  10   AIC:                             72.21
Df Residuals:                       8   BIC:                             72.82
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -1.4211      0.550     -2.582      0.0



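Khi $x_2 = x_1 + 10$ (đa cộng tuyến hoàn hảo), $x_2$ là tổ hợp tuyến tính của $x_1$ và hằng số nên các biến không còn độc lập tuyến tính: mô hình chỉ còn ước lượng được một tham số độ dốc (Df Model $= 1$ thay vì $2$) và độ tin cậy của các hệ số ước lượng bị tụt.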

Trước khi chạy hồi qui:

- Kiểm tra các biến $x_1, x_2, .., x_D$ có đa cộng tuyến (multicollinearity) hay không.
    - Dùng hệ số tương quan (Pearson, Spearman hoặc Kendall) giữa từng cặp biến: nếu trị tuyệt đối cao thì phải bỏ bớt một trong hai biến (nên $|r| < 0.5$).
    - 3 biến trở lên: dùng VIF (dùng biến $x_i$ là $y$, chạy hồi qui trên $D - 1$ biến còn lại, xem hệ số xác định $R_i^2$; $\text{VIF}_i = \dfrac{1}{1 - R_i^2}$, VIF càng lớn thì đa cộng tuyến càng nặng)

# Hồi qui (cont.)

## Hồi qui với biến chuẩn hóa

In [11]:
# Raw (unscaled) predictors for the standardized regression: 15 observations each.
x1 = np.array((9, 10, 8, 7, 10, 4, 5, 5, 6, 8, 7, 4, 9, 5, 8))
x2 = np.array((8, 13, 11, 10, 12, 16, 10, 10, 12, 14, 12, 16, 14, 10, 12))

# Raw response, aligned position-by-position with x1 and x2.
y = np.array((6, 8, 8, 7, 7, 12, 9, 8, 9, 10, 10, 11, 9, 10, 11))

In [12]:
def _standardize(values):
    """Center ``values`` on its mean and divide by its standard deviation.

    Relies on ``values.std()`` with its default arguments (for NumPy arrays
    that is ddof=0, the population standard deviation).
    """
    return (values - values.mean()) / values.std()


# Put predictors and response on a common, unit-free scale before regressing.
x1_norm = _standardize(x1)
x2_norm = _standardize(x2)
y_norm = _standardize(y)

In [13]:
# Table of standardized variables; columns keep the plain names x1, x2, y used by the formula.
df = pd.DataFrame(dict(x1=x1_norm, x2=x2_norm, y=y_norm))

In [14]:
# Print the table of standardized values as text for a visual check.
standardized_table = str(df)
print(standardized_table)

     x1        x2         y
0   1.0 -1.800901 -1.837117
1   1.5  0.450225 -0.612372
2   0.5 -0.450225 -0.612372
3   0.0 -0.900450 -1.224745
4   1.5  0.000000 -1.224745
5  -1.5  1.800901  1.837117
6  -1.0 -0.900450  0.000000
7  -1.0 -0.900450 -0.612372
8  -0.5  0.000000  0.000000
9   0.5  0.900450  0.612372
10  0.0  0.000000  0.612372
11 -1.5  1.800901  1.224745
12  1.0  0.900450  0.000000
13 -1.0 -0.900450  0.612372
14  0.5  0.000000  1.224745


In [15]:
# Ordinary least squares of y on x1 and x2 using the columns currently held in df.
standardized_model = smf.ols(formula="y ~ x1 + x2", data=df)
result = standardized_model.fit()

In [16]:
# Render the fitted model's summary table as plain text.
summary_table = result.summary()
print(summary_table)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.693
Model:                            OLS   Adj. R-squared:                  0.642
Method:                 Least Squares   F-statistic:                     13.56
Date:                Sat, 10 Jun 2023   Prob (F-statistic):           0.000834
Time:                        09:56:09   Log-Likelihood:                -12.422
No. Observations:                  15   AIC:                             30.84
Df Residuals:                      12   BIC:                             32.97
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept  -1.388e-17      0.160  -8.68e-17      1.0



Vì các biến đã được chuẩn hóa về cùng thang đo nên có thể so sánh trực tiếp độ lớn các hệ số: $|\beta_2| > |\beta_1|$ -> biến $x_2$ có tác động lớn hơn.

## Hồi qui với biến categorical

In [33]:
# Numeric predictor (the notebook's equations call it years of service).
x1 = np.array((1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8))

# Binary 0/1 indicator; per the notebook's equations, 1 appears to denote "Nam" (male).
x2 = np.array((1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0))

# Response (the notebook's equations call it income), aligned with x1 and x2.
y = np.array((23.0, 19.5, 24.0, 21.0, 25.0, 22.0, 26.5, 23.1, 25.0, 28.0, 29.5, 26.0, 27.5, 31.5, 29.0))

In [34]:
# Gather the arrays into one table; the column names are the ones used in the model formula.
df = pd.DataFrame(dict(x1=x1, x2=x2, y=y))

In [35]:
# Ordinary least squares of y on x1 and the 0/1 column x2, which enters as a plain numeric term.
categorical_model = smf.ols(formula="y ~ x1 + x2", data=df)
result = categorical_model.fit()

In [36]:
# Render the fitted model's summary table as plain text.
summary_table = result.summary()
print(summary_table)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.993
Model:                            OLS   Adj. R-squared:                  0.992
Method:                 Least Squares   F-statistic:                     886.6
Date:                Sat, 10 Jun 2023   Prob (F-statistic):           9.23e-14
Time:                        10:20:48   Log-Likelihood:                -1.4621
No. Observations:                  15   AIC:                             8.924
Df Residuals:                      12   BIC:                             11.05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     17.9693      0.192     93.612      0.0



Nữ ($x_2 = 0$):

$$
\text{thu nhập} = 1.3707 \times \text{số năm công tác} + 17.9693
$$

Nam ($x_2 = 1$):

$$
\text{thu nhập} = 1.3707 \times \text{số năm công tác} + 3.3336 + 17.9693
$$

# Lựa chọn mô hình