# Project III: Classical Non-linear Models and Police use of force
This notebook uses the Police Public Contact Survey (PPCS) dataset: `ppcs_cc.csv`.

In [51]:
%load_ext autoreload
%autoreload 2

import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import probit as probit
import logit as logit
import estimation as est


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load the data 

In [52]:
# Read the PPCS extract into memory
dat = pd.read_csv('ppcs_cc.csv')

# Relative frequency of the outcome variable
print("\nDistribution of 'anyuseofforce_coded':")
print(dat['anyuseofforce_coded'].value_counts(normalize=True))

# Discrete regressors whose category counts are listed below
categorical_vars = [
    "sblack", "shisp", "swhite", "sother", "smale",
    "omajblack", "omajhisp", "omajwhite", "omajother", "osplit",
    "inctype_lin", "sbehavior",
]

# One header line followed by the raw counts for each variable
for var in categorical_vars:
    print(f"\nValue Counts for {var}:", dat[var].value_counts(), sep="\n")


Distribution of 'anyuseofforce_coded':
anyuseofforce_coded
0    0.994999
1    0.005001
Name: proportion, dtype: float64

Value Counts for sblack:
sblack
0    3379
1     420
Name: count, dtype: int64

Value Counts for shisp:
shisp
0    3413
1     386
Name: count, dtype: int64

Value Counts for swhite:
swhite
1    2808
0     991
Name: count, dtype: int64

Value Counts for sother:
sother
0    3614
1     185
Name: count, dtype: int64

Value Counts for smale:
smale
1    2012
0    1787
Name: count, dtype: int64

Value Counts for omajblack:
omajblack
0    3568
1     231
Name: count, dtype: int64

Value Counts for omajhisp:
omajhisp
0    3708
1      91
Name: count, dtype: int64

Value Counts for omajwhite:
omajwhite
1    3433
0     366
Name: count, dtype: int64

Value Counts for omajother:
omajother
0    3755
1      44
Name: count, dtype: int64

Value Counts for osplit:
osplit
0    3799
Name: count, dtype: int64

Value Counts for inctype_lin:
inctype_lin
2    3641
1     158
Name: count, dtype

In [53]:
# Outcome and regressor names
y_lab = 'anyuseofforce_coded'

# Smaller specifications tried along the way (each one extends the previous):
#   1. const, sblack, shisp, sother
#   2. + smale, sempl, sincome, sage
#   3. + spop
#   4. + daytime, inctype_lin, omajblack, omajhisp, omajother
#   5. + sbehavior
# The active specification below uses agesq in place of sage and adds year.
#
# Notes carried over from the original cell:
#   * The four s* race dummies (sblack, shisp, swhite, sother) are multicollinear
#     when all are included, and likewise the four omaj* dummies, so one of each
#     group is left out. Here swhite and omajwhite are the omitted categories;
#     an alternative is to keep the white dummies and drop sother / omajother.
#   * osplit is not used (the value counts above show it only contains 0).
x_lab = [
    'const',
    'sblack', 'shisp', 'sother',
    'smale', 'sempl', 'sincome', 'spop', 'agesq',
    'daytime', 'inctype_lin',
    'omajblack', 'omajhisp', 'omajother',
    'sbehavior', 'year',
]

# Derived / rescaled columns, built in one pass:
#   sage  -> sage / 10
#   agesq -> square of the rescaled sage
#   year  -> year / 1000
#   const -> column of ones (intercept)
dat = dat.assign(
    sage=lambda d: d['sage'] / 10,
    agesq=lambda d: d['sage'] * d['sage'],
    year=lambda d: d['year'] / 1000,
    const=1.0,
)

# Number of observations
N = len(dat)

# Keep only the outcome and the regressors used in the model
dat = dat[[y_lab] + x_lab].copy()

# The estimation below needs a complete-case sample
assert not dat.isnull().any().any(), 'Missing values detected. Clean your data!'

dat.head()

Unnamed: 0,anyuseofforce_coded,const,sblack,shisp,sother,smale,sempl,sincome,spop,agesq,daytime,inctype_lin,omajblack,omajhisp,omajother,sbehavior,year
0,0,1.0,1,0,0,1,0,1,1,3.24,1,2,0,0,0,0,2.011
1,0,1.0,1,0,0,1,1,2,4,4.0,0,2,0,0,0,0,2.011
2,0,1.0,1,0,0,1,1,2,3,4.84,1,2,0,0,0,0,2.011
3,0,1.0,1,0,0,1,1,3,1,4.84,1,2,0,0,0,0,2.011
4,0,1.0,1,0,0,1,1,1,1,4.84,1,2,0,0,0,0,2.011


In [54]:
# Pull the outcome vector and the regressor matrix out of the frame as NumPy arrays
y = dat[y_lab].to_numpy()
x = dat[x_lab].to_numpy()

# Number of regressors (columns of x, including the constant)
K = x.shape[-1]

Updated Correlation Matrix:
              const    sblack     shisp    sother     smale     sempl  \
const          NaN       NaN       NaN       NaN       NaN       NaN   
sblack         NaN  1.000000 -0.118565 -0.079767 -0.007463 -0.005631   
shisp          NaN -0.118565  1.000000 -0.076088  0.037650 -0.014089   
sother         NaN -0.079767 -0.076088  1.000000  0.014754 -0.004405   
smale          NaN -0.007463  0.037650  0.014754  1.000000  0.105154   
sempl          NaN -0.005631 -0.014089 -0.004405  0.105154  1.000000   
sincome        NaN -0.095218 -0.066364  0.015163  0.026403  0.159589   
spop           NaN  0.135596  0.127457  0.034981  0.008400  0.020678   
agesq          NaN -0.047676 -0.096659 -0.046398 -0.011562 -0.228068   
daytime        NaN -0.033697 -0.017122 -0.031908 -0.043603 -0.003481   
inctype_lin    NaN  0.014580 -0.008493  0.004252 -0.048396  0.071278   
omajblack      NaN  0.145640 -0.019946 -0.011510  0.012489  0.017596   
omajhisp       NaN  0.049076  0.101

## Estimate using Probit

In [55]:
# Starting parameter vector supplied by the probit module, computed from (y, x)
theta0 = probit.starting_values(y, x)
print("Starting values for theta:", theta0)

# Run the estimator with probit.q as the function handed to est.estimate.
# NOTE(review): cov_type='Sandwich' presumably selects the robust sandwich
# covariance estimator - confirm in estimation.py.
probit_results = est.estimate(
    probit.q, theta0, y, x,
    cov_type='Sandwich',
)

Starting values for theta: [ 0.13601146  0.00505802  0.02219694 -0.00087277  0.01072022 -0.01502908
  0.00227683  0.01026461 -0.00045621 -0.00437033 -0.07026753 -0.01362392
 -0.00322422 -0.01596695  0.08838557  0.00214257]
Optimization terminated successfully.
         Current function value: 0.022410
         Iterations: 142
         Function evaluations: 2669
         Gradient evaluations: 157


In [56]:
# Tabulate the probit estimates, one row per regressor in x_lab
probit_tab = est.print_table(
    x_lab,
    probit_results,
    title=f'Probit, y = {y_lab}',
)
probit_tab

Optimizer succeded after 142 iter. (2669 func. evals.). Final criterion:  0.02241.
Probit, y = anyuseofforce_coded


Unnamed: 0,theta,se,t
const,-0.261,0.2907,-0.8976
sblack,0.2411,0.3073,0.7846
shisp,0.421,0.2385,1.7655
sother,0.088,0.4594,0.1915
smale,0.5454,0.2019,2.7009
sempl,-0.4332,0.2093,-2.07
sincome,0.0994,0.1244,0.7993
spop,0.2046,0.075,2.7265
agesq,-0.018,0.0084,-2.1507
daytime,-0.1235,0.1976,-0.6251


## Estimate using Logit

In [57]:
# Same estimation call as for the probit, now with logit.q; theta0 is the
# starting vector computed in the probit section above.
logit_results = est.estimate(
    logit.q, theta0, y, x,
    cov_type='Sandwich',
)

Optimization terminated successfully.
         Current function value: 0.022553
         Iterations: 182
         Function evaluations: 3264
         Gradient evaluations: 192


In [58]:
# Tabulate the logit estimates, one row per regressor in x_lab
logit_tab = est.print_table(
    x_lab,
    logit_results,
    title=f'Logit, y = {y_lab}',
)
logit_tab

Optimizer succeded after 182 iter. (3264 func. evals.). Final criterion:  0.02255.
Logit, y = anyuseofforce_coded


Unnamed: 0,theta,se,t
const,-0.6257,2.4039,-0.2603
sblack,0.4809,0.8365,0.575
shisp,0.9381,0.6061,1.5477
sother,-0.1989,1.3272,-0.1498
smale,1.1174,0.5537,2.018
sempl,-1.0192,0.6102,-1.6703
sincome,0.2054,0.3422,0.6003
spop,0.5118,0.1908,2.6819
agesq,-0.0523,0.0254,-2.0604
daytime,-0.3363,0.5309,-0.6335


## Average partial effects

### Probit

In [59]:
# Average partial effects (probit) for the three race dummies in the model.
# Column positions are looked up by name in x_lab rather than hard-coded.
indices = list(map(x_lab.index, ('sblack', 'shisp', 'sother')))
labels = ['sblack', 'shispanic', 'sother']  # row labels, same order as indices
probit.properties(
    x,
    probit_results['theta'],
    probit_results['cov'],
    print_out=True,
    se=True,
    indices=indices,
    labels=labels,
)

Unnamed: 0,Estimate,SE,t-value,p-value
sblack,0.003,0.005,0.657,0.511
shispanic,0.006,0.005,1.338,0.181
sother,0.001,0.006,0.177,0.86


### Logit

In [60]:
# Average partial effects (logit) for the three race dummies in the model.
# Column positions are looked up by name in x_lab rather than hard-coded.
indices = list(map(x_lab.index, ('sblack', 'shisp', 'sother')))
labels = ['sblack', 'shispanic', 'sother']  # row labels, same order as indices
logit.properties(
    x,
    logit_results['theta'],
    logit_results['cov'],
    print_out=True,
    se=True,
    indices=indices,
    labels=labels,
)

Unnamed: 0,Estimate,SE,t-value,p-value
sblack,0.003,0.005,0.492,0.622
shispanic,0.005,0.005,1.186,0.236
sother,-0.001,0.005,-0.162,0.871
