In [1]:
import numpy as np
import pandas as pd
import altair as alt
from sklearn.linear_model import LinearRegression
from scipy.stats import norm
from scipy import stats

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
grid_df = pd.DataFrame({'n_meteorites': np.arange(50)})
grid_df['negative binomial'] = stats.nbinom.pmf(k = grid_df.n_meteorites, n = 12, p = 0.4)
grid_df['poisson'] = stats.poisson.pmf(k = grid_df.n_meteorites, mu = 15)


grid_df = grid_df.melt(
    id_vars = 'n_meteorites',
    var_name = 'distribution',
    value_name = 'pmf'
)

dists = alt.Chart(grid_df).mark_line(point = True).encode(
    x = 'n_meteorites',
    y = 'pmf',
    color = 'distribution'
)

np.random.seed(80621)
toy_data = pd.DataFrame({'n_meteorites': np.random.negative_binomial(12, 0.4, 225)})

hist = alt.Chart(toy_data).transform_bin(
    as_ = 'bin', 
    field = 'n_meteorites', 
    bin = alt.Bin(step = 3)
).transform_aggregate(
    Count = 'count()',
    groupby = ['bin']
).transform_calculate(
    density = 'datum.Count/(3*225)',
    binshift = 'datum.bin + 1.5'
).mark_bar(size = 12, opacity = 0.8).encode(
    x = alt.X('binshift:Q', title = 'number of meteorites'),
    y = 'density:Q'
)

dists + hist

In [3]:
toy_data.sum()

n_meteorites    3907
dtype: int64

In [4]:
# import grade-aggregated seda data from hw2
seda = pd.read_csv('data/seda.csv')
seda.head()

Unnamed: 0,id,log_income,subject,gap
0,600001,11.392048,math,-0.562855
1,600006,11.607236,math,0.061163
2,600011,10.70457,math,-0.015417
3,600012,10.589787,math,
4,600013,11.399662,math,0.054454


In [5]:
# plot from hw2
base = alt.Chart(seda).mark_point(opacity = 0.5).encode(
    y = alt.Y('gap', title = 'estimated gender gap (m - f)'),
    x = alt.X('log_income', scale = alt.Scale(zero = False), title = 'log(median income)'),
    color = 'subject'
)

hw2_plot = base + base.transform_regression('log_income', 'gap', groupby = ['subject']).mark_line()

hw2_plot

In [6]:
# filter to math and remove NaNs
regdata = seda[seda.subject == 'math'].dropna()
regdata.head()

Unnamed: 0,id,log_income,subject,gap
0,600001,11.392048,math,-0.562855
1,600006,11.607236,math,0.061163
2,600011,10.70457,math,-0.015417
4,600013,11.399662,math,0.054454
5,600014,10.826107,math,0.020526


In [7]:
# simple scatterplot of math gap vs district income
scatter = alt.Chart(regdata).mark_point().encode(
    x = alt.X('log_income', scale = alt.Scale(zero = False)),
    y = 'gap'
)

# show
scatter

In [8]:
# save explanatory variable and response variable separately as arrays
x = regdata.log_income.values
y = regdata.gap.values

# check dimensions of x -- must be n x 1 for regression
x.shape

(625,)

In [9]:
# add axis
x = x[:, np.newaxis]
x.shape

(625, 1)

In [10]:
# configure regression module
slr = LinearRegression()

# fit slr model
slr.fit(x, y)

LinearRegression()

In [11]:
# store estimates
slope, intercept = slr.coef_, slr.intercept_

estimates = np.append(intercept, slope)
estimates

array([-1.35616996,  0.12105696])

In [28]:
# ols solution, by hand
x_mx = np.vstack([np.repeat(1, len(x)), x[:, 0]]).transpose() # X
xtx = x_mx.transpose().dot(x_mx) # X'X
xtx_inv = np.linalg.inv(xtx) # (X'X)^{-1}
xtx_inv.dot(x_mx.transpose()).dot(y) # (X'X)^{-1} X'y

array([-1.35616996,  0.12105696])

In [12]:
# fitted values
fitted = slr.predict(x)

In [13]:
# residuals
resid = y - fitted

In [14]:
# store data with fitted values and residuals
fit_df = pd.DataFrame({'log_income': x[:, 0], 
              'gap': y,
              'fitted': fitted,
              'residuals': resid})

# base chart
base = alt.Chart(fit_df).encode(
    x = alt.X('log_income', scale = alt.Scale(zero = False))
)

# data scatter
points = base.mark_point(opacity = 0.5).encode(y = 'gap')

# grid of values along regression line
line_df = pd.DataFrame({'log_income': np.linspace(x.min(), x.max(), 500)})
line_df['gap'] = line_df.log_income*slope + intercept

# plot line
line = alt.Chart(line_df).mark_line(
    color = 'red',
    opacity = 0.4
).encode(
    x = 'log_income',
    y = 'gap'
)

# show residuals as vertical lines
resids = base.mark_errorbar(opacity = 0.3).encode(
    y = 'gap',
    y2 = 'fitted'
)

# display
(points + line).properties(title = 'line + scatter') | (points + line + resids).properties(title = 'residuals shown in grey')

In [31]:
(points + line).properties(title = 'by hand') | (points.transform_regression('log_income', 'gap').mark_line() + points).properties(title = 'using Altair')

In [15]:
fit_df.shape

(625, 4)

In [16]:
# plot distribution of residuals
hist = alt.Chart(fit_df).transform_bin(
    as_ = 'bin', 
    field = 'residuals', 
    bin = alt.Bin(step = 0.05)
).transform_aggregate(
    Count = 'count()',
    groupby = ['bin']
).transform_calculate(
    density = 'datum.Count/(0.05*625)',
    binshift = 'datum.bin + 0.025'
).mark_bar(size = 12, opacity = 0.8).encode(
    x = alt.X('binshift:Q', title = 'residuals'),
    y = 'density:Q'
)

smooth = alt.Chart(fit_df).transform_density(
    density = 'residuals',
    as_ = ['residuals', 'density'],
    bandwidth = 0.05,
    extent = [-0.8, 0.6],
    steps = 500
).mark_line(color = 'black', opacity = 0.6).encode(
    x = 'residuals:Q',
    y = 'density:Q'
)

hist + smooth

In [17]:
# residual mean, variance
resid.var(), resid.mean()

(0.013129022870165687, -1.744382416291046e-16)

In [18]:
# residual SE
n = len(x)
p = 2
resid_se = np.sqrt(resid.var()*(n - 1)/(n - p))

resid_se

0.11467387123120727

In [19]:
pdf_df = pd.DataFrame({'residual': np.linspace(-0.8, 0.6, 500)})
pdf_df['density'] = norm.pdf(pdf_df.residual, loc = 0, scale = resid_se)

normal_density = alt.Chart(pdf_df).mark_line(
    color = 'red',
    opacity = 0.4
).encode(
    y = 'density', 
    x = 'residual'
)

(hist + smooth + normal_density).properties(title = 'normal density in red')

In [20]:
# coefficient variances/covariances
x_mx = np.vstack([np.repeat(1, n), x[:, 0]]).transpose()
coef_vcov = np.linalg.inv(x_mx.transpose().dot(x_mx))*(resid_se**2)

# coefficient standard errors
coef_se = np.sqrt(coef_vcov.diagonal())

# coefficient intervals
np.vstack([estimates + 2*coef_se, estimates - 2*coef_se])

array([[-1.09498451,  0.14472448],
       [-1.61735541,  0.09738944]])

In [21]:
def subsample_reg():
    # subsample data
    subsamp = regdata.sample(n = 200)

    # save explanatory variable and response variable separately as arrays
    x = subsamp.log_income.values
    y = subsamp.gap.values

    # add axis
    x = x[:, np.newaxis]

    # configure regression module
    slr = LinearRegression()

    # fit slr model
    slr.fit(x, y)

    # store estimates
    slope, intercept = slr.coef_, slr.intercept_

    estimates = np.append(intercept, slope)

    # fitted values
    fitted = slr.predict(x)

    # residuals
    resid = y - fitted

    # store data with fitted values and residuals
    fit_df = pd.DataFrame({'log_income': x[:, 0], 
                  'gap': y,
                  'fitted': fitted,
                  'residuals': resid})

    # base chart
    base = alt.Chart(fit_df).encode(
        x = alt.X('log_income', scale = alt.Scale(domain = (10, 12.4)))
    )

    # data scatter
    points = base.mark_point(opacity = 0.5).encode(y = alt.Y('gap', scale = alt.Scale(domain = (-1, 0.5))))

    # grid of values along regression line
    line_df = pd.DataFrame({'log_income': np.linspace(10, 12.4, 500)})
    line_df['gap'] = line_df.log_income*slope + intercept

    # plot line
    line = alt.Chart(line_df).mark_line(
        color = 'red',
        opacity = 0.4
    ).encode(
        x = 'log_income',
        y = 'gap'
    )


    # display
    plot = (points + line)

    return plot

In [22]:
(subsample_reg() + subsample_reg() + subsample_reg() + subsample_reg() + subsample_reg()).properties(title = 'lines fit to 5 subsamples')

In [32]:
# fitted variances
xtx_inv = np.linalg.inv(x_mx.transpose().dot(x_mx))
x0_mx = np.vstack([np.repeat(1, len(line_df)), np.linspace(x.min(), x.max(), 500)]).transpose()
fit_var = x0_mx.dot(xtx_inv).dot(x0_mx.transpose()).diagonal()*(resid_se**2)
fit_se = np.sqrt(fit_var)

line_df['lwr'] = line_df.gap - 2*fit_se
line_df['upr'] = line_df.gap + 2*fit_se

band = alt.Chart(line_df).mark_errorband(color = 'grey').encode(
    x = 'log_income',
    y = alt.Y('lwr', title = 'gap'),
    y2 = 'upr'
)

(points + line + band).properties(title = 'regression line with uncertainty band') 

In [45]:
# prediction
newobs = np.array([1, np.log(86000)])
pred = estimates.dot(newobs)

pred_se = np.sqrt((resid_se**2)*(1 + newobs.dot(xtx_inv).dot(newobs)))

In [50]:
pred_df = pd.DataFrame({'log_income': [np.log(86000)], 'lwr': [pred - 2*pred_se], 'upr': [pred + 2*pred_se], 'gap': pred})
pred_bar = alt.Chart(pred_df).mark_errorbar(opacity = 0.6).encode(
    x = 'log_income',
    y = alt.Y('lwr', title = 'gap'),
    y2 = 'upr'
)

pred_pt = alt.Chart(pred_df).mark_circle(color = 'black', opacity = 0.6, size = 50).encode(
    x = 'log_income',
    y = 'gap'
)

points + line + band + pred_bar + pred_pt