# Project III: Classical Non-linear Models and Police use of force
This notebook uses the Police Public Contact Survey (PPCS) dataset: `ppcs_cc.csv`.

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import probit as probit
import logit as logit
import estimation as est
from scipy.stats import norm
from scipy.stats import t

### Load the data 

In [None]:
# Read the PPCS extract into a DataFrame
dat = pd.read_csv('ppcs_cc.csv')

# Relative frequency of each value of the target variable
print("\nDistribution of 'anyuseofforce_coded':")
outcome_shares = dat['anyuseofforce_coded'].value_counts(normalize=True)
print(outcome_shares)

# Frequency tables for the categorical variables, one per column
categorical_vars = [
    "sblack", "shisp", "swhite", "sother", "smale",
    "omajblack", "omajhisp", "omajwhite", "omajother", "osplit",
    "inctype_lin", "sbehavior",
]

for var in categorical_vars:
    print(f"\nValue Counts for {var}:")
    counts = dat[var].value_counts()
    print(counts)

Table with summary statistics

In [None]:
# Indicator columns (value 1 = member) that define the subsamples
group_vars = ["swhite", "sblack", "shisp", "sother"]

# Means are only defined for numeric/boolean columns. Restricting to them keeps
# a stray text column from raising or from shifting rows out of alignment.
overall_means = dat.mean(numeric_only=True)
all_vars = overall_means.index

# Build the table one column at a time. Each column is a Series indexed by
# variable name, so rows are matched by label instead of by position.
summary_columns = {"Full Sample": overall_means}
group_sizes = {"Full Sample": dat.shape[0]}
for group in group_vars:
    in_group = dat[group] == 1
    summary_columns[group.capitalize()] = dat.loc[in_group, all_vars].mean()
    group_sizes[group.capitalize()] = int(in_group.sum())

# reindex pins the row order to the column order of the data
summary_table = pd.DataFrame(summary_columns).reindex(all_vars)

# Final row: number of observations behind each column of means
summary_table.loc["Number of Observations"] = pd.Series(group_sizes)
summary_table.index.name = "Variable"
print(summary_table)

# Optional: Save the table to a CSV for further analysis
summary_table.to_csv('grouped_summary_statistics_with_observations.csv')


In [None]:
# Outcome and regressor labels. The shorter specifications are kept for reference;
# swap one in for x_lab to estimate a smaller model.
y_lab = 'anyuseofforce_coded'
#x_lab = ['const', 'sblack', 'shisp', 'sother']
#x_lab = ['const', 'sblack', 'shisp', 'sother', 'smale', 'sempl', 'sincome', 'spop', 'sage', 'sagesq']
#x_lab = ['const', 'sblack', 'shisp', 'sother', 'smale', 'sempl', 'sincome', 'spop', 'sage', 'sagesq', 'daytime', 'inctype_lin', 'omajblack', 'omajhisp', 'omajother']
x_lab = ['const', 'sblack', 'shisp', 'sother', 'smale', 'sempl', 'sincome', 'spop', 'sage', 'sagesq', 'daytime', 'inctype_lin', 'omajblack', 'omajhisp', 'omajother', 'sbehavior']

N = dat.shape[0]

# Derived columns, evaluated in order: age divided by 10, its square (computed
# from the rescaled age), and a column of ones for the intercept.
# Then keep only the outcome and the regressors, in the order given by x_lab.
dat = dat.assign(
    sage=lambda d: d['sage'] / 10,
    sagesq=lambda d: d['sage'] * d['sage'],
    const=np.ones((N,)),
)[[y_lab] + x_lab].copy()

# Stop here if any cell of the estimation sample is missing
assert not dat.isnull().values.any(), 'Missing values detected. Clean your data!'

dat.tail(5)

In [None]:
# Outcome vector and regressor matrix as NumPy arrays (columns ordered as in x_lab)
y = dat[y_lab].to_numpy()
x = dat[x_lab].to_numpy()

# Number of regressors, including the constant
K = np.shape(x)[1]

print(K)
print(x.shape)

In [None]:
# Number of rows where the outcome equals 1
count_violent_1 = dat['anyuseofforce_coded'].eq(1).sum()
print(f"Number of 1s in 'anyuseofforce_coded': {count_violent_1}")

## Estimate using Probit

In [None]:
# Initialize starting values
# (computed by the project's probit module from y and x)
theta0 = probit.starting_values(y, x)

# Estimate model with probit
# probit.q is handed to the estimator together with the starting values and data;
# presumably it is the per-observation criterion -- confirm in probit.py.
# cov_type='Sandwich' selects the covariance estimator (see estimation.py for details).
# Later cells read probit_results['theta'] and probit_results['cov'].
probit_results = est.estimate(probit.q, theta0, y, x, cov_type='Sandwich')

In [None]:
# Build the probit results table, passing x_lab as the variable labels.
# Later cells read the coefficients from it via probit_tab.theta.
probit_tab = est.print_table(x_lab, probit_results, title=f'Probit, y = {y_lab}')
# Bare name on the last line: the table is shown as the cell output
probit_tab

## Estimate using Logit

In [None]:
# Initialize starting values
# (computed by the project's logit module; this overwrites the probit theta0)
theta0 = logit.starting_values(y, x)

# Estimate model with logit
# Same estimator call as for the probit, with logit.q in place of probit.q.
# Later cells read logit_results['theta'] and logit_results['cov'].
logit_results = est.estimate(logit.q, theta0, y, x, cov_type='Sandwich')

In [None]:
# Build the logit results table, passing x_lab as the variable labels.
# Later cells read the coefficients from it via logit_tab.theta.
logit_tab = est.print_table(x_lab, logit_results, title=f'Logit, y = {y_lab}')
# Bare name on the last line: the table is shown as the cell output
logit_tab

## Average partial effects

### Probit

In [None]:
# Average partial effects of the three race indicators, probit model.
# Column positions are looked up by name in x_lab; `labels` is what gets printed.
race_vars = ['sblack', 'shisp', 'sother']
indices = [x_lab.index(name) for name in race_vars]
labels = ['sblack', 'shispanic', 'sother']
probit.properties(
    x,
    probit_results['theta'],
    print_out=True,
    se=True,
    indices=indices,
    labels=labels,
)

### Logit

In [None]:
# Average partial effects of the three race indicators, logit model.
# Column positions are looked up by name in x_lab; `labels` is what gets printed.
race_vars = ['sblack', 'shisp', 'sother']
indices = [x_lab.index(name) for name in race_vars]
labels = ['sblack', 'shispanic', 'sother']
logit.properties(
    x,
    logit_results['theta'],
    print_out=True,
    se=True,
    indices=indices,
    labels=labels,
)

## Partial Effects

#### Defining different fixed vectors

In [None]:
# Sample means of selected regressors, printed to two decimals (one per line)
for col in ('sage', 'sagesq', 'sincome', 'spop'):
    print(f"{np.mean(dat[col]):.2f}")

In [None]:
# Baseline individual at which the partial effects are evaluated.
#
# Values are keyed by variable name and then laid out in the order of x_lab,
# i.e. the column order used to build x and estimate the coefficients.
# x_lab is deliberately NOT re-declared here: a vector written in any other order
# would pair each value with the wrong coefficient in x_me @ theta (and the wrong
# rows/columns of the covariance matrix) without raising an error.
#
# The values for sage, sagesq, sincome and spop appear to be the rounded sample
# means printed in the previous cell; sagesq is therefore the mean of age squared,
# not 4.10 ** 2.
x_me_values = {
    'const': 1,
    'sblack': 0,
    'shisp': 0,
    'sother': 0,
    'smale': 1,
    'sage': 4.10,
    'sempl': 0,
    'sincome': 2.16,
    'spop': 1.36,
    'daytime': 0,
    'inctype_lin': 1,
    'omajblack': 0,
    'omajhisp': 0,
    'omajother': 0,
    'sbehavior': 0,
    'sagesq': 19.42,
}

# Fail loudly if the model specification contains a regressor without a value
missing = [name for name in x_lab if name not in x_me_values]
assert not missing, f'x_me_values has no entry for: {missing}'

x_me = np.array([x_me_values[name] for name in x_lab], dtype=float).reshape(1, -1)
pd.DataFrame(x_me, columns=x_lab, index=['x_me'])


In [15]:
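### BEHAVIOR = 1 ###
# Let us make a vector of the values we want to investigate
# NOTE: the entries below follow the order const, sblack, shisp, sother, smale, sage, sempl,
# sincome, spop, daytime, inctype_lin, omajblack, omajhisp, omajother, sbehavior, sagesq,
# whereas the coefficients were estimated with sempl, sincome, spop before sage and with
# sagesq directly after sage. Reorder the values to the estimation order before using them.
#x_me= np.array([1, 0, 0, 0, 1, 4.1, 0, 2.16, 1.36,0,1,0,0,0,1,19.42]).reshape(1, -1)
#pd.DataFrame(x_me, columns=x_lab, index=['x_behavior'])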


In [16]:
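###DAYTIME = 1###
# Let us make a vector of the values we want to investigate
# NOTE: the entries below follow the order const, sblack, shisp, sother, smale, sage, sempl,
# sincome, spop, daytime, inctype_lin, omajblack, omajhisp, omajother, sbehavior, sagesq,
# whereas the coefficients were estimated with sempl, sincome, spop before sage and with
# sagesq directly after sage. Reorder the values to the estimation order before using them.
#x_me = np.array([1, 0, 0, 0, 1, 4.1, 0, 2.16, 1.36,1,1,0,0,0,0,19.42]).reshape(1, -1)
#pd.DataFrame(x_me, columns=x_lab, index=['x_daytime'])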


#### Switching race from white to black, hispanic and other

In [None]:
# Position of the race indicator to switch on:
#   1 = black, 2 = hispanic, 3 = other
k = 1

# Copy of the baseline vector with indicator k set to one; x_me itself is untouched
x_me2 = np.array(x_me, copy=True)
x_me2[:, k] = 1
pd.DataFrame(x_me2, index=['x_me2'], columns=x_lab)

### Probit

In [18]:
# Probit coefficients taken from the results table
b_pr = probit_tab.theta.values

# Discrete effect: probit.G at the switched vector minus probit.G at the baseline
# (probit.G is presumably the standard normal CDF -- confirm in probit.py)
xb_base_pr = x_me @ b_pr
xb_alt_pr = x_me2 @ b_pr
me_race_pr = probit.G(xb_alt_pr) - probit.G(xb_base_pr)

In [19]:
# Standard normal density at the baseline and at the switched linear index
gx0 = norm.pdf(x_me @ b_pr)
gx2 = norm.pdf(x_me2 @ b_pr)

# Gradient of the discrete effect with respect to the coefficients:
#   pdf(x2 b) * x2 - pdf(x0 b) * x0
grad_d_pr = x_me2 * gx2 - x_me * gx0

In [20]:
def get_se(grad, cov):
    """Delta-method standard errors for one or more functions of the parameters.

    Parameters
    ----------
    grad : array_like, shape (M, K) or (K,)
        Gradient of each of the M functions with respect to the K parameters.
        A 1-D gradient is treated as a single function (M = 1).
    cov : array_like, shape (K, K)
        Covariance matrix of the parameter estimates.

    Returns
    -------
    numpy.ndarray, shape (M,)
        Square roots of the diagonal of ``grad @ cov @ grad.T``.
    """
    # Promote a flat gradient to a single row; otherwise grad @ cov @ grad.T
    # collapses to a scalar and np.diag rejects it.
    grad = np.atleast_2d(np.asarray(grad, dtype=float))
    cov = np.asarray(cov, dtype=float)
    cov_me = grad @ cov @ grad.T
    return np.sqrt(np.diag(cov_me))

# Standard error of the probit discrete effect, from its gradient and the
# covariance matrix stored under probit_results['cov']
se_d_pr = get_se(grad_d_pr, probit_results['cov'])

In [None]:
# Table with the probit discrete effect, its standard error and their ratio
me_dict = {
    'Marginal Effect': me_race_pr[0],
    's.e.': se_d_pr,
}
tab = pd.DataFrame(me_dict)
tab['t'] = tab['Marginal Effect'].div(tab['s.e.'])
tab = tab.rename_axis('Var')
tab.round(6)

### Logit

In [22]:
# Logit coefficients taken from the results table
b_lg = logit_tab.theta.values

# Discrete effect: logit.G at the switched vector minus logit.G at the baseline
# (logit.G is presumably the logistic CDF -- confirm in logit.py)
xb_base_lg = x_me @ b_lg
xb_alt_lg = x_me2 @ b_lg
me_race_lg = logit.G(xb_alt_lg) - logit.G(xb_base_lg)

In [23]:
# exp(-x b) at the baseline and at the switched vector
exp_x0_b = np.exp(-(x_me @ b_lg))
exp_x2_b = np.exp(-(x_me2 @ b_lg))

# Gradient of the discrete effect with respect to the coefficients. Each term is
# x * exp(-x b) / (1 + exp(-x b))^2, i.e. x times the logistic density at x b.
term_alt = (x_me2 * exp_x2_b) / np.square(1 + exp_x2_b)
term_base = (x_me * exp_x0_b) / np.square(1 + exp_x0_b)
grad_d_lg = term_alt - term_base

# Standard error from the gradient and the logit covariance matrix
se_d_lg = get_se(grad_d_lg, logit_results['cov'])

In [None]:
# Table with the logit discrete effect, its standard error and their ratio
me_dict = {
    'Marginal Effect': me_race_lg[0],
    's.e.': se_d_lg,
}
tab = pd.DataFrame(me_dict)
tab['t'] = tab['Marginal Effect'].div(tab['s.e.'])
tab = tab.rename_axis('Var')
tab.round(6)