## 11.9 二值选择模型的python实现

### 1. 导入数据，查看各变量的统计特征
- 'freq'字段表示，数据出现的频次

In [2]:
import pandas as pd
import numpy as np
from cq import describe_bcmodel
# 读取数据
df = pd.read_stata('../2_Data/Data-2e/titanic.dta')
 
des = describe_bcmodel(df, frequency='freq')

          count      mean       std  min  25%  50%  75%  max
class1   2201.0  0.147660  0.354843  0.0  0.0  0.0  0.0  1.0
class2   2201.0  0.129487  0.335814  0.0  0.0  0.0  0.0  1.0
class3   2201.0  0.320763  0.466876  0.0  0.0  0.0  1.0  1.0
class4   2201.0  0.402090  0.490431  0.0  0.0  0.0  1.0  1.0
child    2201.0  0.049523  0.217006  0.0  0.0  0.0  0.0  1.0
female   2201.0  0.213539  0.409898  0.0  0.0  0.0  0.0  1.0
survive  2201.0  0.323035  0.467742  0.0  0.0  0.0  1.0  1.0


> `np.repeat(df.index.values, df['freq'])`：
> - `df.index.values` 返回数据框的索引值，这是一个代表行号的数组。
> - `df['freq']` 返回'freq'列的值，这是一个代表信息重复次数的数组。
> - `np.repeat`函数将行索引根据'freq'列的值进行重复，以便在最终结果中重复出现对应次数。
> 
> `df.loc[]`：
> - `df.loc` 是用于按标签选择行和列的方法。在这里，它使用重复后的索引来选择数据框中的行。
>
> `.drop('freq', axis=1)`：
> - drop 方法用于删除数据框中的列。在这里，它删除了名为'freq'的列。参数axis=1表示删除列。
>
> `.reset_index(drop=True)`：
> - `reset_index` 方法用于重置索引。参数`drop=True`表示删除原始索引，使新索引从零开始。这样可以确保最终结果的索引是连续的整数序列。
>
> 综合起来，这行代码的作用是将数据框中的行根据'freq'列的值重复多次，然后丢弃'freq'列，并重置索引，以得到非'freq'列的信息按照出现次数重复的结果。

### 2.观察不同特征下的存活率

In [3]:
# 分别计算小孩、女士、各等舱的乘客存活率
for col in des.drop('survive', axis=1).columns:
    describe_bcmodel(df, 
                     'freq', 
                     target='survive', 
                     condition_col=col, 
                     condition=1)
    print('------------------------------------------------------------------')

when class1 is 1:
         count      mean       std  min  25%  50%  75%  max
survive  325.0  0.624615  0.484969  0.0  0.0  1.0  1.0  1.0
------------------------------------------------------------------
when class2 is 1:
         count      mean       std  min  25%  50%  75%  max
survive  285.0  0.414035  0.493421  0.0  0.0  0.0  1.0  1.0
------------------------------------------------------------------
when class3 is 1:
         count      mean      std  min  25%  50%  75%  max
survive  706.0  0.252125  0.43454  0.0  0.0  0.0  1.0  1.0
------------------------------------------------------------------
when class4 is 1:
         count      mean       std  min  25%  50%  75%  max
survive  885.0  0.239548  0.427049  0.0  0.0  0.0  0.0  1.0
------------------------------------------------------------------
when child is 1:
         count      mean       std  min  25%  50%  75%  max
survive  109.0  0.522936  0.501781  0.0  0.0  1.0  1.0  1.0
---------------------------------------------

### 3.构建OLS参照系

In [4]:
import statsmodels.api as sm

X = des[['class1','class2','class3','child','female']]
y = des['survive']
X = sm.add_constant(X)
model_ols = sm.OLS(y,X)
results_ols = model_ols.fit()
print(results_ols.summary())

                            OLS Regression Results                            
Dep. Variable:                survive   R-squared:                       0.253
Model:                            OLS   Adj. R-squared:                  0.251
Method:                 Least Squares   F-statistic:                     148.6
Date:                Mon, 29 Apr 2024   Prob (F-statistic):          3.55e-136
Time:                        22:55:24   Log-Likelihood:                -1129.3
No. Observations:                2201   AIC:                             2271.
Df Residuals:                    2195   BIC:                             2305.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2268      0.014     16.654      0.0

### 4.使用Logit模型进行估计
`sm.logit(endog, exog).fit(disp=0)`
-  `disp = 0` 不现实迭代过程，只显示结果
-  `disp = 1` 显示迭代过程

In [5]:
model_logit = sm.Logit(y, X)
result_logit = model_logit.fit(disp=1)
print(result_logit.summary())

Optimization terminated successfully.
         Current function value: 0.502058
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                survive   No. Observations:                 2201
Model:                          Logit   Df Residuals:                     2195
Method:                           MLE   Df Model:                            5
Date:                Mon, 29 Apr 2024   Pseudo R-squ.:                  0.2020
Time:                        22:55:24   Log-Likelihood:                -1105.0
converged:                       True   LL-Null:                       -1384.7
Covariance Type:            nonrobust   LLR p-value:                1.195e-118
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.2339      0.080    -15.329      0.000      -1.392      -1.076
class1         0.8577      0.

### 5.使用稳健标准误进行Logit估计

In [6]:
# print(model_logit.fit(cov_type='HC0').summary())
# print(model_logit.fit(cov_type='HC1').summary())
# print(model_logit.fit(cov_type='HC2').summary())
print(model_logit.fit(cov_type='HC3').summary())

Optimization terminated successfully.
         Current function value: 0.502058
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                survive   No. Observations:                 2201
Model:                          Logit   Df Residuals:                     2195
Method:                           MLE   Df Model:                            5
Date:                Mon, 29 Apr 2024   Pseudo R-squ.:                  0.2020
Time:                        22:55:24   Log-Likelihood:                -1105.0
converged:                       True   LL-Null:                       -1384.7
Covariance Type:                  HC3   LLR p-value:                1.195e-118
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.2339      0.080    -15.449      0.000      -1.390      -1.077
class1         0.8577      0.

### 6.显示Logit回归的几率比

In [7]:
import numpy as np

odds_ratios = np.exp(result_logit.params)
result_logit_or = pd.DataFrame({'odds ratio': odds_ratios, 
                                'std err': result_logit.bse,
                                'z':result_logit.tvalues,
                                'p>|z|':result_logit.pvalues,
                                }, 
                               index=result_logit.params.index)
pd.set_option('display.float_format', '{:.4f}'.format)
result_logit_or

Unnamed: 0,odds ratio,std err,z,p>|z|
const,0.2912,0.0805,-15.329,0.0
class1,2.3577,0.1573,5.4511,0.0
class2,0.8518,0.1738,-0.9231,0.356
class3,0.3985,0.1486,-6.1923,0.0
child,2.8908,0.244,4.3501,0.0
female,11.2465,0.1404,17.2357,0.0


### 7.计算Logit模型的平均边际效应

In [8]:
mfx = result_logit.get_margeff()
print(mfx.summary())

        Logit Marginal Effects       
Dep. Variable:                survive
Method:                          dydx
At:                           overall
                dy/dx    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
class1         0.1400      0.025      5.578      0.000       0.091       0.189
class2        -0.0262      0.028     -0.923      0.356      -0.082       0.029
class3        -0.1501      0.024     -6.300      0.000      -0.197      -0.103
child          0.1732      0.039      4.399      0.000       0.096       0.250
female         0.3949      0.017     22.965      0.000       0.361       0.429


### 8.计算均值处的平均边际效应

In [9]:
mfx = result_logit.get_margeff(at='mean')
print(mfx.summary())

        Logit Marginal Effects       
Dep. Variable:                survive
Method:                          dydx
At:                              mean
                dy/dx    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
class1         0.1796      0.033      5.405      0.000       0.114       0.245
class2        -0.0336      0.036     -0.924      0.356      -0.105       0.038
class3        -0.1927      0.031     -6.253      0.000      -0.253      -0.132
child          0.2223      0.051      4.353      0.000       0.122       0.322
female         0.5069      0.030     16.699      0.000       0.447       0.566


### 9.准确度测量

用模型预测值与实际值进行比较，计算预测值与实际值相符的比例

In [10]:
predicted_classes = result_logit.predict(X) > 0.5

# 计算准确率
accuracy = (predicted_classes == y).mean()

print(f"Accuracy of the model: {accuracy*100:.2f}%")

Accuracy of the model: 77.83%


### 10.数据预测

In [12]:
msrose = pd.DataFrame([1,1,0,0,0,1],
                      index=result_logit.params.index,columns=['MS-ROSE'])
# 两种不同的赋值方式
mrjack = pd.DataFrame({'const':1,
                       'class1':0,
                       'class2':0,
                       'class3':1,
                       'child':0,
                       'female':0},
                      index=['MR-Jack'],columns=result_logit.params.index.T)

print(result_logit.predict(msrose.T))
print(result_logit.predict(mrjack))

MS-ROSE   0.8853
dtype: float64
MR-Jack   0.1040
dtype: float64


### 11.使用Probit模型进行回归

In [None]:
model_probit = sm.Probit(y,X)
results_probit = model_probit.fit()
print(results_probit.summary())

Optimization terminated successfully.
         Current function value: 0.502642
         Iterations 5
                          Probit Regression Results                           
Dep. Variable:                survive   No. Observations:                 2201
Model:                         Probit   Df Residuals:                     2195
Method:                           MLE   Df Model:                            5
Date:                Mon, 29 Apr 2024   Pseudo R-squ.:                  0.2011
Time:                        11:15:44   Log-Likelihood:                -1106.3
converged:                       True   LL-Null:                       -1384.7
Covariance Type:            nonrobust   LLR p-value:                4.286e-118
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.7530      0.047    -16.063      0.000      -0.845      -0.661
class1         0.5399      0.

In [None]:
mfx_probit = results_probit.get_margeff()
print(mfx_probit.summary())

       Probit Marginal Effects       
Dep. Variable:                survive
Method:                          dydx
At:                           overall
                dy/dx    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
class1         0.1526      0.026      5.802      0.000       0.101       0.204
class2        -0.0254      0.029     -0.873      0.383      -0.082       0.032
class3        -0.1378      0.022     -6.175      0.000      -0.182      -0.094
child          0.1640      0.039      4.246      0.000       0.088       0.240
female         0.4097      0.018     23.050      0.000       0.375       0.445


In [None]:
predicted_classes_probit = results_probit.predict(X) > 0.5
# 计算准确率
accuracy = (predicted_classes_probit == y).mean()

print(f"Accuracy of the model: {accuracy*100:.2f}%")

Accuracy of the model: 77.83%


In [None]:
df = pd.DataFrame(np.corrcoef(predicted_classes,predicted_classes_probit),index=['logit','probit'],columns=['logit','probit'])
df

Unnamed: 0,logit,probit
logit,1.0,1.0
probit,1.0,1.0
