## 11.9 二值选择模型的 Python 实现

In [92]:
import pandas as pd
import numpy as np

# Load the Titanic data set from the Stata file
df = pd.read_stata('../2_Data/Data-2e/titanic.dta')

def describe_bcmodel(df, frequency, target=None, condition_col=None, condition=None):
    '''Print descriptive statistics for frequency-weighted binary-choice data.

    Each row of ``df`` is repeated as many times as its value in the
    ``frequency`` column, the frequency column is dropped, and the summary
    statistics of the expanded data are printed.

    Arguments:
        df:dataframe -- data set that contains a frequency column
        frequency:str -- name of the frequency column

    Keyword Arguments:
        target:str -- name of the column to describe (default: {None})
        condition_col:str -- name of the conditioning column (default: {None})
        condition:any -- value ``condition_col`` must equal (default: {None})

    Returns:
        dataframe -- the expanded data set when neither ``target`` nor
        ``condition_col`` is given; otherwise only the ``target`` column of
        the expanded rows where ``condition_col == condition``.

    Raises:
        ValueError -- if exactly one of ``target`` / ``condition_col`` is given.
    '''
    # The conditional summary needs both names; reject a half-specified call
    # up front instead of failing later with an opaque KeyError on None.
    if (target is None) != (condition_col is None):
        raise ValueError('target and condition_col must be provided together')

    # Repeat every row label `frequency` times, select those rows, then drop
    # the frequency column and renumber the rows from zero.
    expanded = (
        df.loc[np.repeat(df.index.values, df[frequency])]
        .drop(frequency, axis=1)
        .reset_index(drop=True)
    )

    if target is None:
        print(expanded.describe().T)
        return expanded

    # Keep only the target column for the rows that satisfy the condition.
    matches = expanded[condition_col] == condition
    subset = expanded.loc[matches, [target]].reset_index(drop=True)
    print(f'when {condition_col} is {condition}:')
    print(subset.describe().T)
    return subset
    
# Expand the frequency table (one row per observation) and print summary
# statistics for every column; `des` keeps the expanded data for later cells.
des = describe_bcmodel(df, frequency='freq')

          count      mean       std  min  25%  50%  75%  max
class1   2201.0  0.147660  0.354843  0.0  0.0  0.0  0.0  1.0
class2   2201.0  0.129487  0.335814  0.0  0.0  0.0  0.0  1.0
class3   2201.0  0.320763  0.466876  0.0  0.0  0.0  1.0  1.0
class4   2201.0  0.402090  0.490431  0.0  0.0  0.0  1.0  1.0
child    2201.0  0.049523  0.217006  0.0  0.0  0.0  0.0  1.0
female   2201.0  0.213539  0.409898  0.0  0.0  0.0  0.0  1.0
survive  2201.0  0.323035  0.467742  0.0  0.0  0.0  1.0  1.0


下面解释 `describe_bcmodel` 中按频次还原数据的那一行代码里，每个函数的作用：

`np.repeat(df.index.values, df['freq'])`：
- `df.index.values` 返回数据框的索引值，这是一个代表行号的数组。
- `df['freq']` 返回'freq'列的值，这是一个代表信息重复次数的数组。
- `np.repeat`函数将行索引根据'freq'列的值进行重复，以便在最终结果中重复出现对应次数。

`df.loc[]`：
- `df.loc` 是基于标签选择行和列的索引器。在这里，它使用重复后的索引标签来选择数据框中的行，因此每一行会按其重复次数出现在结果中。

`.drop('freq', axis=1)`：
- `drop` 方法用于删除数据框中的行或列。在这里，它删除了名为'freq'的列。参数`axis=1`表示按列删除。

`.reset_index(drop=True)`：
- `reset_index` 方法用于重置索引。参数`drop=True`表示删除原始索引，使新索引从零开始。这样可以确保最终结果的索引是连续的整数序列。

综合起来，这行代码的作用是将数据框中的行根据'freq'列的值重复多次，然后丢弃'freq'列，并重置索引，以得到非'freq'列的信息按照出现次数重复的结果。

In [93]:
# Survival rate for children, women, and each class group: every column
# other than 'survive' is used in turn as the conditioning dummy (== 1).
for col in des.columns.drop('survive'):
    describe_bcmodel(df, 'freq', target='survive', condition_col=col, condition=1)
    print('\n')

when class1 is 1:
         count      mean       std  min  25%  50%  75%  max
survive  325.0  0.624615  0.484969  0.0  0.0  1.0  1.0  1.0


when class2 is 1:
         count      mean       std  min  25%  50%  75%  max
survive  285.0  0.414035  0.493421  0.0  0.0  0.0  1.0  1.0


when class3 is 1:
         count      mean      std  min  25%  50%  75%  max
survive  706.0  0.252125  0.43454  0.0  0.0  0.0  1.0  1.0


when class4 is 1:
         count      mean       std  min  25%  50%  75%  max
survive  885.0  0.239548  0.427049  0.0  0.0  0.0  0.0  1.0


when child is 1:
         count      mean       std  min  25%  50%  75%  max
survive  109.0  0.522936  0.501781  0.0  0.0  1.0  1.0  1.0


when female is 1:
         count      mean       std  min  25%  50%  75%  max
survive  470.0  0.731915  0.443434  0.0  0.0  1.0  1.0  1.0




In [95]:
import statsmodels.api as sm

# Linear probability model: OLS of the survival dummy on the class, child
# and female dummies. class4 is not among the regressors, so it serves as
# the omitted reference group.
y = des['survive']
X = sm.add_constant(des[['class1', 'class2', 'class3', 'child', 'female']])  # adds the 'const' column
model_ols = sm.OLS(y, X)
results_ols = model_ols.fit()
print(results_ols.summary())

                            OLS Regression Results                            
Dep. Variable:                survive   R-squared:                       0.253
Model:                            OLS   Adj. R-squared:                  0.251
Method:                 Least Squares   F-statistic:                     148.6
Date:                Sun, 28 Apr 2024   Prob (F-statistic):          3.55e-136
Time:                        05:39:22   Log-Likelihood:                -1129.3
No. Observations:                2201   AIC:                             2271.
Df Residuals:                    2195   BIC:                             2305.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2268      0.014     16.654      0.0

In [97]:
# Logit model of survival on the same y and X (constant included) used for OLS.
model_logit = sm.Logit(y, X)
# fit() prints the optimizer's convergence message before returning the results.
result_logit = model_logit.fit()
print(result_logit.summary())

Optimization terminated successfully.
         Current function value: 0.502058
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                survive   No. Observations:                 2201
Model:                          Logit   Df Residuals:                     2195
Method:                           MLE   Df Model:                            5
Date:                Sun, 28 Apr 2024   Pseudo R-squ.:                  0.2020
Time:                        05:41:53   Log-Likelihood:                -1105.0
converged:                       True   LL-Null:                       -1384.7
Covariance Type:            nonrobust   LLR p-value:                1.195e-118
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.2339      0.080    -15.329      0.000      -1.392      -1.076
class1         0.8577      0.

In [101]:
# Refit the logit model under each heteroskedasticity-robust covariance
# estimator and print the summary, so the standard errors can be compared.
for robust_cov in ('HC0', 'HC1', 'HC2', 'HC3'):
    print(model_logit.fit(cov_type=robust_cov).summary())

Optimization terminated successfully.
         Current function value: 0.502058
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                survive   No. Observations:                 2201
Model:                          Logit   Df Residuals:                     2195
Method:                           MLE   Df Model:                            5
Date:                Sun, 28 Apr 2024   Pseudo R-squ.:                  0.2020
Time:                        05:42:50   Log-Likelihood:                -1105.0
converged:                       True   LL-Null:                       -1384.7
Covariance Type:                  HC0   LLR p-value:                1.195e-118
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.2339      0.080    -15.449      0.000      -1.390      -1.077
class1         0.8577      0.

In [108]:

# `res` is not assigned anywhere else in the notebook, so fit the logit model
# here; otherwise this cell raises NameError after Restart & Run All.
res = model_logit.fit()
# Estimated logit coefficients (log-odds scale).
res.params


Optimization terminated successfully.
         Current function value: 0.502058
         Iterations 6


const    -1.233899
class1    0.857676
class2   -0.160419
class3   -0.920086
child     1.061542
female    2.420060
dtype: float64