# 回归
## StatsModels
Anaconda自带StatsModels包。

In [1]:
%matplotlib inline
from __future__ import print_function, division

import numpy as np
import pandas
import thinkstats2
import thinkplot
import nsfg

In [2]:
def MakeFrames():
    """Loads the pregnancy data and splits the live births by birth order.

    Reads the pregnancy frame with nsfg.ReadFemPreg, keeps the rows whose
    outcome equals 1, and divides those into rows with birthord == 1 and
    rows with birthord != 1.  The row counts of the three frames are
    asserted as a sanity check on the data that was loaded.

    returns: tuple of DataFrames (live births, first babies, others)
    """
    pregnancies = nsfg.ReadFemPreg()
    live = pregnancies[pregnancies.outcome == 1]

    birth_order = live.birthord
    firsts = live[birth_order == 1]
    others = live[birth_order != 1]

    # Guard against reading a different (or differently cleaned) data file.
    expected_sizes = ((live, 9148), (firsts, 4413), (others, 4735))
    for frame, size in expected_sizes:
        assert len(frame) == size

    return live, firsts, others

In [3]:
import statsmodels.formula.api as smf

# Load the three frames once; later cells reuse `live`.
live, firsts, others = MakeFrames()
# Boolean column marking rows where birthord == 1; the regression formulas
# below refer to it as `isfirst`.
live['isfirst'] = (live.birthord == 1)

def SummarizeResults(results):
    """Prints the most important parts of regression results.

    Prints one line per estimated parameter (name, estimate, p-value).
    When the results have an `rsquared` attribute, R^2 and the standard
    deviations of the dependent variable and of the residuals follow.
    Otherwise `prsquared` is printed instead, under the same 'R^2' label.

    results: RegressionResults-like object providing `params` and `pvalues`
    """
    # Series.items() is used instead of Series.iteritems(), which current
    # pandas releases no longer provide.
    for name, param in results.params.items():
        pvalue = results.pvalues[name]
        print('%s   %0.3g   (%.3g)' % (name, param, pvalue))

    # Only the attribute lookup is guarded, so an AttributeError raised
    # further down is not mistaken for "this model has no rsquared".
    try:
        rsquared = results.rsquared
    except AttributeError:
        print('R^2 %.4g' % results.prsquared)
    else:
        print('R^2 %.4g' % rsquared)
        ys = results.model.endog
        print('Std(ys) %.4g' % ys.std())
        print('Std(res) %.4g' % results.resid.std())

def RunSimpleRegression(live):
    """Fits birth weight against mother's age two ways and cross-checks them.

    The fit is computed once with the thinkstats2 helpers (LeastSquares,
    Residuals, CoefDetermination) and once with statsmodels OLS.  The
    statsmodels summary is printed, and the intercept, slope and R^2 of the
    two fits are asserted to agree to within 1e-6.

    live: DataFrame of live births
    """
    # thinkstats2 fit, on the rows where both variables are present
    complete = live.dropna(subset=['agepreg', 'totalwgt_lb'])
    xs, ys = complete.agepreg, complete.totalwgt_lb
    inter, slope = thinkstats2.LeastSquares(xs, ys)
    residuals = thinkstats2.Residuals(xs, ys, inter, slope)
    r2 = thinkstats2.CoefDetermination(ys, residuals)

    # statsmodels fit of the same relationship
    results = smf.ols('totalwgt_lb ~ agepreg', data=live).fit()
    SummarizeResults(results)

    # the two approaches must produce the same numbers
    tolerance = 1e-6
    comparisons = [
        (results.params['Intercept'], inter),
        (results.params['agepreg'], slope),
        (results.rsquared, r2),
    ]
    for from_statsmodels, from_thinkstats in comparisons:
        assert abs(from_statsmodels - from_thinkstats) < tolerance
RunSimpleRegression(live)

Intercept   6.83   (0)
agepreg   0.0175   (5.72e-11)
R^2 0.004738
Std(ys) 1.408
Std(res) 1.405


## 多重回归与非线性关系
由于agepreg对新生儿体重的影响可能不是线性的，所以可以考虑非线性模型(增加变量agepreg2)。

In [4]:
def FormatRow(results, columns):
    """Formats regression results as a list of table cells.

    Each requested column yields one cell: '--' when the model has no
    (non-NaN) estimate for it, the estimate followed by '(*)' when the
    p-value is below 0.001, and the estimate followed by the p-value
    otherwise.  A final cell holds `rsquared`, or `prsquared` for results
    that have no `rsquared` attribute.

    results: RegressionResults object
    columns: sequence of parameter names to report, in order

    returns: list of strings
    """
    def format_cell(col):
        coef = results.params.get(col, np.nan)
        pval = results.pvalues.get(col, np.nan)
        if np.isnan(coef):
            return '--'
        if pval < 0.001:
            return '%0.3g (*)' % (coef)
        return '%0.3g (%0.2g)' % (coef, pval)

    cells = [format_cell(col) for col in columns]

    try:
        goodness = results.rsquared
    except AttributeError:
        goodness = results.prsquared
    cells.append('%.2g' % goodness)

    return cells

def RunModels(live):
    """Runs a sequence of regressions that predict birth weight.

    Fits four ordinary-least-squares models of totalwgt_lb: on isfirst, on
    agepreg, on both, and on both plus agepreg squared.  For every model
    the formula is printed, followed by a summary of the fit.

    Side effect: an 'agepreg2' column (agepreg squared) is added to `live`
    before the last model is fitted.

    live: DataFrame of pregnancy records
    """
    columns = ['isfirst[T.True]', 'agepreg', 'agepreg2']
    header = ['isfirst', 'agepreg', 'agepreg2']

    # `rows` (and `header`) are assembled here but not consumed within
    # this function.
    rows = []

    def fit_and_report(formula):
        results = smf.ols(formula, data=live).fit()
        rows.append(FormatRow(results, columns))
        print(formula)
        SummarizeResults(results)

    linear_formulas = (
        'totalwgt_lb ~ isfirst',
        'totalwgt_lb ~ agepreg',
        'totalwgt_lb ~ isfirst + agepreg',
    )
    for formula in linear_formulas:
        fit_and_report(formula)

    # The quadratic term has to exist as a column before the formula can
    # refer to it.
    live['agepreg2'] = live.agepreg**2
    fit_and_report('totalwgt_lb ~ isfirst + agepreg + agepreg2')
RunModels(live)

totalwgt_lb ~ isfirst
Intercept   7.33   (0)
isfirst[T.True]   -0.125   (2.55e-05)
R^2 0.00196
Std(ys) 1.408
Std(res) 1.407
totalwgt_lb ~ agepreg
Intercept   6.83   (0)
agepreg   0.0175   (5.72e-11)
R^2 0.004738
Std(ys) 1.408
Std(res) 1.405
totalwgt_lb ~ isfirst + agepreg
Intercept   6.91   (0)
isfirst[T.True]   -0.0698   (0.0253)
agepreg   0.0154   (3.93e-08)
R^2 0.005289
Std(ys) 1.408
Std(res) 1.405
totalwgt_lb ~ isfirst + agepreg + agepreg2
Intercept   5.69   (1.38e-86)
isfirst[T.True]   -0.0504   (0.109)
agepreg   0.112   (3.23e-07)
agepreg2   -0.00185   (8.8e-06)
R^2 0.007462
Std(ys) 1.408
Std(res) 1.403


## 预测
办公室赌局：假设一位待产同事，在孩子出生几周前，预测新生儿体重。

In [6]:
import chap01soln
import re

def JoinFemResp(df):
    """Joins the female respondent file onto df by caseid.

    The respondent frame is read with chap01soln.ReadFemResp and indexed by
    its caseid column.  Respondent columns whose names collide with columns
    of df receive the suffix '_r'.  The screentime column of the joined
    frame is converted with pandas.to_datetime.

    df: DataFrame with a caseid column

    returns: the joined DataFrame
    """
    respondents = chap01soln.ReadFemResp()
    respondents.index = respondents.caseid

    merged = df.join(respondents, on='caseid', rsuffix='_r')

    # screentime is read as text (presumably colon-separated time strings);
    # turn it into datetimes
    merged.screentime = pandas.to_datetime(merged.screentime)

    return merged

def GoMining(df):
    """Searches for variables that predict birth weight.

    For every column of df, fits `totalwgt_lb ~ agepreg + <column>` by
    ordinary least squares and records the resulting R^2.  A column is
    skipped when its variance is below 1e-7, when the model would use fewer
    than half of the rows of df, or when building or fitting the model
    raises ValueError or TypeError.

    df: DataFrame of pregnancy records

    returns: list of (rsquared, variable name) pairs

    raises: ValueError if building the model raises patsy.PatsyError
    """
    variables = []
    for name in df.columns:
        try:
            # a (nearly) constant column cannot explain anything
            if df[name].var() < 1e-7:
                continue

            formula = 'totalwgt_lb ~ agepreg + ' + name
            # On Python 2 a unicode column name makes the formula unicode,
            # so it is converted to a native str.  On Python 3 the formula
            # already is a str; encoding it unconditionally would turn it
            # into bytes, which is not a formula string.
            if not isinstance(formula, str):
                formula = formula.encode('ascii')

            model = smf.ols(formula, data=df)
            # ignore variables that are missing for most of the rows
            if model.nobs < len(df)/2:
                continue

            results = model.fit()
        except (ValueError, TypeError):
            continue
        except patsy.PatsyError:
            # NOTE(review): neither `patsy` nor `MESSAGE` is defined in the
            # visible part of this file -- confirm both are provided
            # elsewhere, otherwise reaching this handler raises NameError.
            raise ValueError(MESSAGE)

        variables.append((results.rsquared, name))

    return variables

def MiningReport(variables, n=30):
    """Prints the variables with the highest R^2.

    Each printed line holds the variable name, its R^2 and, when the
    variable is found in the table returned by ReadVariables, its
    description.  A trailing '_r' is stripped from the name before the
    lookup.

    Note: `variables` is sorted in place, in descending order.

    variables: list of (R^2, variable name) pairs
    n: number of pairs to print
    """
    all_vars = ReadVariables()

    variables.sort(reverse=True)
    for rsquared, name in variables[:n]:
        key = re.sub('_r$', '', name)
        try:
            desc = all_vars.loc[key].desc
        except KeyError:
            # no description available for this variable
            print(name, rsquared)
            continue

        # A key that occurs more than once yields a Series of descriptions;
        # take the first one by position.  .iloc is used because the Series
        # is indexed by variable name, so desc[0] would be a label lookup
        # that only works through the deprecated positional fallback.
        if isinstance(desc, pandas.Series):
            desc = desc.iloc[0]
        print(name, rsquared, desc)

def ReadVariables():
    """Reads Stata dictionary files for NSFG data.

    Reads the `variables` tables of '2002FemPreg.dct' and '2002FemResp.dct'
    (both looked up relative to the current working directory), stacks
    them, and indexes the result by variable name.  Names that occur in
    both files therefore appear twice in the index.

    returns: DataFrame that maps variables names to descriptions
    """
    vars1 = thinkstats2.ReadStataDct('2002FemPreg.dct').variables
    vars2 = thinkstats2.ReadStataDct('2002FemResp.dct').variables

    # pandas.concat replaces DataFrame.append, which was removed in
    # pandas 2.0; the rows of vars1 still come first.
    all_vars = pandas.concat([vars1, vars2])
    all_vars.index = all_vars.name
    return all_vars

def PredictBirthWeight(live):
    """Mines for predictors of birth weight and fits a prediction model.

    Restricts the data to pregnancies with prglngth > 30, joins the
    respondent file, prints the variables ranked by the R^2 they achieve
    together with agepreg, and finally prints a summary of one
    hand-picked OLS model.

    live: DataFrame of live births
    """
    late_term = live[live.prglngth>30]
    joined = JoinFemResp(late_term)

    MiningReport(GoMining(joined))

    predictors = ['agepreg', 'C(race)', 'babysex==1',
                  'nbrnaliv>1', 'paydu==1', 'totincr']
    formula = 'totalwgt_lb ~ ' + ' + '.join(predictors)
    fitted = smf.ols(formula, data=joined).fit()
    SummarizeResults(fitted)
PredictBirthWeight(live)

totalwgt_lb 1.0
birthwgt_lb 0.949812730598 BD-3 BIRTHWEIGHT IN POUNDS - 1ST BABY FROM THIS PREGNANCY
lbw1 0.300824078447 LOW BIRTHWEIGHT - BABY 1
prglngth 0.130125194886 DURATION OF COMPLETED PREGNANCY IN WEEKS
wksgest 0.123400413634 GESTATIONAL LENGTH OF COMPLETED PREGNANCY (IN WEEKS)
agecon 0.102031499282 AGE AT TIME OF CONCEPTION
mosgest 0.0271442746396 GESTATIONAL LENGTH OF COMPLETED PREGNANCY (IN MONTHS)
babysex 0.0185509252939 BD-2 SEX OF 1ST LIVEBORN BABY FROM THIS PREGNANCY
race_r 0.0161995035863 RACE
race 0.0161995035863 RACE
nbrnaliv 0.0160177527098 BC-2 NUMBER OF BABIES BORN ALIVE FROM THIS PREGNANCY
paydu 0.0140037955781 IB-10 CURRENT LIVING QUARTERS OWNED/RENTED, ETC
rmarout03 0.0134300664657 INFORMAL MARITAL STATUS WHEN PREGNANCY ENDED - 3RD
birthwgt_oz 0.0131024576157 BD-3 BIRTHWEIGHT IN OUNCES - 1ST BABY FROM THIS PREGNANCY
anynurse 0.0125290225418 BH-1 WHETHER R BREASTFED THIS CHILD AT ALL - 1ST FROM THIS PREG
bfeedwks 0.0121936884045 DURATION OF BREASTFEEDING IN WEEKS

### 列表中变量分析
- totalwgt_lb和birthwgt_lb，显然新生儿体重不能用来预测新生儿体重。
- prglngth 具有一些预测能力，但是在这个赌局里，妊娠时间(以及相关变量)还属于未知。
- babysex，代表新生儿性别。
- race，种族。
- nbrnaliv，是否是多胞胎。
- paydu，参与调查者是否拥有自己的住宅。
- 有些变量的信息要到生产后才会知道，例如婴儿母乳喂养周数bfeedwks。这些变量无法用于预测，但是你会猜想为什么bfeedwks会与新生儿体重有关。

### 测试模型
找到有潜力的解释变量后，测试模型，其中一个是：

    formula = ('totalwgt_lb ~ agepreg + C(race) + babysex==1 + '
               'nbrnaliv>1 + paydu==1 + totincr')

- C(race):将race作为分类变量。
- babysex:男性编码为1，女性编码为2。表达式babysex==1即是否为男性。
- paydu==1:是否拥有自己的住宅。
- nbrnaliv>1:是否是多胞胎。
- totincr:编码值为1-14，值每增加1代表年收入增加约5000美元。

### 解释模型结果
- race:值为1代表黑人，2代表白人，3代表其他种族。模型以黑人(race=1)为基准类别：白人母亲产下的孩子比黑人母亲产下的孩子重约0.36磅，其他种族则重约0.27磅。
- babysex:男孩比女孩重大概0.3磅。
- nbrnaliv:多胞胎比其他新生儿轻1.4磅。
- paydu:拥有自己住宅的母亲产下的孩子比其他新生儿重0.12磅。
- agepreg:母亲年龄的参数比前一节结果小，说明其他一些变量与年龄相关，这些变量可能包括paydu和totincr。

所有这些变量都是统计显著的，其中有些p值非常低，但$R^2$值只有0.06。不使用这个模型时，均方根误差为1.27磅；使用这个模型时，均方根误差为1.23磅。因此这个模型对你的帮助并不明显。

## 逻辑回归
- 优势(odds):一个事件的“优势”就是该事件发生概率与不发生概率的比值。 o=p/(1-p)
- Logistic模型： $\log o = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \epsilon$
- 办公室赌局：预测新生儿性别。

In [8]:
def LogisticRegressionExample():
    """Walks through logistic regression on a four-row toy data set.

    For a fixed parameter vector the log-odds, odds, probabilities,
    per-row likelihoods and the overall likelihood are printed in that
    order.  The same data are then fitted with statsmodels logit and the
    fit summary is printed.
    """
    outcomes = np.array([0, 1, 0, 1])
    x1 = np.array([0, 0, 0, 1])
    x2 = np.array([0, 1, 1, 1])

    # hand-picked parameters: intercept and one coefficient per variable
    intercept, coef1, coef2 = -1.5, 2.8, 1.1

    log_odds = intercept + coef1 * x1 + coef2 * x2
    odds = np.exp(log_odds)
    prob = odds / (odds+1)

    # likelihood of each row: p where the outcome is 1, 1-p where it is 0
    likelihood = outcomes * prob + (1-outcomes) * (1-prob)

    for quantity in (log_odds, odds, prob, likelihood, np.prod(likelihood)):
        print(quantity)

    frame = pandas.DataFrame({'y': outcomes, 'x1': x1, 'x2': x2})
    fitted = smf.logit('y ~ x1 + x2', data=frame).fit()
    print(fitted.summary())
LogisticRegressionExample()

[-1.5 -0.4 -0.4  2.4]
[  0.22313016   0.67032005   0.67032005  11.02317638]
[ 0.18242552  0.40131234  0.40131234  0.9168273 ]
[ 0.81757448  0.40131234  0.59868766  0.9168273 ]
0.180093352967
         Current function value: 0.346574
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                    4
Model:                          Logit   Df Residuals:                        1
Method:                           MLE   Df Model:                            2
Date:                Sat, 28 Oct 2017   Pseudo R-squ.:                  0.5000
Time:                        08:37:51   Log-Likelihood:                -1.3863
converged:                      False   LL-Null:                       -2.7726
                                        LLR p-value:                    0.2500
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
--------------------------------



In [10]:
def RunLogisticModels(live):
    """Runs logistic regressions that predict the sex of the baby.

    Restricts the data to pregnancies with prglngth > 30 and fits two
    logit models of `boy` (babysex == 1): one on agepreg alone and one on
    agepreg, hpagelb, birthord and race as a categorical variable.  For
    each model the number of observations, the type of the results object
    and a summary are printed.  For the second model the function also
    prints the accuracy of predicting "boy" whenever the fitted
    probability is at least 0.5, next to the baseline (the fraction of
    boys), and the predicted probability for one hypothetical case.

    live: DataFrame of pregnancy records
    """
    # Work on an independent copy so that the derived columns are not
    # written into a slice of the caller's frame.
    df = live[live.prglngth>30].copy()

    df['boy'] = (df.babysex==1).astype(int)
    df['isyoung'] = (df.agepreg<20).astype(int)
    # mothers older than 35; the comparison used to be `<35`, which flagged
    # the younger mothers instead
    df['isold'] = (df.agepreg>35).astype(int)
    df['season'] = (((df.datend+1) % 12) / 3).astype(int)

    # run the simple model
    model = smf.logit('boy ~ agepreg', data=df)
    results = model.fit()
    print('nobs', results.nobs)
    print(type(results))
    SummarizeResults(results)

    # run the complex model
    model = smf.logit('boy ~ agepreg + hpagelb + birthord + C(race)', data=df)
    results = model.fit()
    print('nobs', results.nobs)
    print(type(results))
    SummarizeResults(results)

    # compute accuracy on the rows the complex model was fitted to
    endog = pandas.DataFrame(model.endog, columns=[model.endog_names])
    actual = endog['boy']
    baseline = actual.mean()

    predict = (results.predict() >= 0.5)
    true_pos = predict * actual
    true_neg = (1 - predict) * (1 - actual)

    acc = (sum(true_pos) + sum(true_neg)) / len(actual)
    print(acc, baseline)

    # predict for a single hypothetical case
    columns = ['agepreg', 'hpagelb', 'birthord', 'race']
    new = pandas.DataFrame([[35, 39, 3, 1]], columns=columns)
    y = results.predict(new)
    print(y)
# Re-assign the isfirst column (same expression as in the earlier cell)
# before running the logistic models on `live`.
live['isfirst'] = (live.birthord == 1)
RunLogisticModels(live)

Optimization terminated successfully.
         Current function value: 0.693015
         Iterations 3
nobs 8884
<class 'statsmodels.discrete.discrete_model.BinaryResultsWrapper'>
Intercept   0.00579   (0.953)
agepreg   0.00105   (0.783)
R^2 6.144e-06
Optimization terminated successfully.
         Current function value: 0.692944
         Iterations 3
nobs 8782
<class 'statsmodels.discrete.discrete_model.BinaryResultsWrapper'>
Intercept   -0.0301   (0.772)
C(race)[T.2]   -0.0224   (0.66)
C(race)[T.3]   -0.000457   (0.996)
agepreg   -0.00267   (0.629)
hpagelb   0.0047   (0.266)
birthord   0.00501   (0.821)
R^2 0.000144
0.511500797085 0.507173764518
[ 0.51868037]


- 模型1：
    - agepreg的参数为正，说明年龄较大母亲生男孩的可能性更大，但p值为0.783，说明这一直观效应可能是偶然发生。
    - 决定系数$R^2$不适用于Logistic回归，但有几个可作为“伪$R^2$值”的度量。这些值可以用于进行模型的比较。
- 模型2：
    - 除了母亲的年龄，这个模型还使用了孩子出生时父亲的年龄(hpagelb)、孩子的出生排行(birthord)以及种族作为分类变量。
    - 这些参数都不是统计显著的。结果中伪$R^2$略高，但也可能是偶然导致的。

## 准确度
计算准确度：

    acc = (sum(true_pos) + sum(true_neg)) / len(actual)
结果为0.512，仅略高于基准值0.507(样本中男孩所占的比例)。输出最后一行的0.519是模型对示例(母亲35岁、父亲39岁、第三胎、race=1)预测的生男孩概率。