# Project III: Classical Non-linear Models and Police use of force
This notebook uses the Police Public Contact Survey (PPCS) dataset: `ppcs_cc.csv`.

In [121]:
%load_ext autoreload
%autoreload 2

import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import probit as probit
import logit as logit
import estimation as est


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load the data 

In [122]:
# Load the dataset
dat = pd.read_csv('ppcs_cc.csv')

# Inspect distribution of the target variable
print("\nDistribution of 'anyuseofforce_coded':")
print(dat['anyuseofforce_coded'].value_counts(normalize=True))

# Inspect value counts for categorical variables
categorical_vars = ["sblack", "shisp", "swhite", "sother", "smale", "omajblack", 
                    "omajhisp", "omajwhite", "omajother", "osplit", "inctype_lin", "sbehavior"]

for var in categorical_vars:
    print(f"\nValue Counts for {var}:")
    print(dat[var].value_counts())


Distribution of 'anyuseofforce_coded':
anyuseofforce_coded
0    0.994999
1    0.005001
Name: proportion, dtype: float64

Value Counts for sblack:
sblack
0    3379
1     420
Name: count, dtype: int64

Value Counts for shisp:
shisp
0    3413
1     386
Name: count, dtype: int64

Value Counts for swhite:
swhite
1    2808
0     991
Name: count, dtype: int64

Value Counts for sother:
sother
0    3614
1     185
Name: count, dtype: int64

Value Counts for smale:
smale
1    2012
0    1787
Name: count, dtype: int64

Value Counts for omajblack:
omajblack
0    3568
1     231
Name: count, dtype: int64

Value Counts for omajhisp:
omajhisp
0    3708
1      91
Name: count, dtype: int64

Value Counts for omajwhite:
omajwhite
1    3433
0     366
Name: count, dtype: int64

Value Counts for omajother:
omajother
0    3755
1      44
Name: count, dtype: int64

Value Counts for osplit:
osplit
0    3799
Name: count, dtype: int64

Value Counts for inctype_lin:
inctype_lin
2    3641
1     158
Name: count, dtype

In [123]:
# Declare labels    
y_lab = 'anyuseofforce_coded'
x_lab = ['const', 'sblack', 'shisp', 'sother']

#x_lab = ['const', 'sblack', 'shisp', 'sother', 'smale', 'sempl', 'sincome', 'sage']
#x_lab = ['const', 'sblack', 'shisp', 'sother', 'smale', 'sempl', 'sincome', 'spop', 'sage']
#x_lab = ['const', 'sblack', 'shisp', 'sother', 'smale', 'sempl', 'sincome', 'spop', 'sage', 'daytime', 'inctype_lin', 'omajblack', 'omajhisp', 'omajother']
#x_lab = ['const', 'sblack', 'shisp', 'sother', 'smale', 'sempl', 'sincome', 'spop', 'sage', 'daytime', 'inctype_lin', 'omajblack', 'omajhisp', 'omajother', 'sbehavior']
#x_lab = ['const', 'sblack', 'shisp', 'sother', 'smale', 'sempl', 'sincome', 'spop', 'sage', 'daytime', 'inctype_lin', 'omajblack', 'omajhisp', 'omajother', 'sbehavior', 'year']

dat['sage'] = dat['sage'] / 10
dat['year'] = dat['year'] / 1000

#['const', 'sblack', 'shisp', 'sother', 'smale', 'sage', 'sempl', 'sincome', 'spop', 'daytime', 'inctype_lin', 'omajblack', 'omajhisp', 'omajother', 'sbehavior', 'year']  # Dropped 'osplit' and year // Multikollinearitet mellem hhv. "omajblack", "omajhisp" "omajwhite" og "omajother" +  "sblack", "shisp", "swhite", "sother"

## Declare labels, her er sother og omajother taget ud, men I kan bare rette til white
#y_lab = 'anyuseofforce_coded'
#x_lab = ["const", "sblack", "shisp", "swhite",  # Drop 'sother' giver Multikollinearitet  sammen med "sblack", "shisp", "swhite"
#         "smale", "sage", "sempl", "sincome", 
#         "spop", "daytime", "inctype_lin", 
#         "omajblack", "omajhisp", "omajwhite",  # Drop 'omajother' giver Multikollinearitet sammen med "omajblack", "omajhisp", "omajwhite"
#         "sbehavior"] # dropped year and osplit (indeholder kun 0?)

# create extra variables 
N = dat.shape[0]
dat['const'] = np.ones((N,))

# Rebuild the dataset without 'osplit'
dat = dat[[y_lab] + x_lab].copy()

# Check for missing data
assert dat.notnull().all(axis=1).all(), 'Missing values detected. Clean your data!'

dat.head(5)

Unnamed: 0,anyuseofforce_coded,const,sblack,shisp,sother
0,0,1.0,1,0,0
1,0,1.0,1,0,0
2,0,1.0,1,0,0
3,0,1.0,1,0,0
4,0,1.0,1,0,0


In [124]:
# Extract y and X
y = dat[y_lab].values
x = dat[x_lab].values
K = x.shape[1]

# Correlation matrix to check multicollinearity
corr_matrix = pd.DataFrame(dat[x_lab]).corr()
print("Updated Correlation Matrix:\n", corr_matrix)


Updated Correlation Matrix:
         const    sblack     shisp    sother
const     NaN       NaN       NaN       NaN
sblack    NaN  1.000000 -0.118565 -0.079767
shisp     NaN -0.118565  1.000000 -0.076088
sother    NaN -0.079767 -0.076088  1.000000


## Estimate using Probit

In [125]:
# Initialize starting values
theta0 = probit.starting_values(y, x)

# Display starting values
print("Starting values for theta:", theta0)

probit_results = est.estimate(probit.q, theta0, y, x)

Starting values for theta: [0.00801282 0.00984432 0.03084728 0.00550069]
Optimization terminated successfully.
         Current function value: 0.030440
         Iterations: 37
         Function evaluations: 195
         Gradient evaluations: 39


In [126]:
probit_tab = est.print_table(x_lab, probit_results, title=f'Probit, y = {y_lab}')
probit_tab

Optimizer succeded after 37 iter. (195 func. evals.). Final criterion:  0.03044.
Probit, y = anyuseofforce_coded


Unnamed: 0,theta,se,t
const,-2.7262,0.1098,-24.8201
sblack,0.2759,0.2344,1.1767
shisp,0.5703,0.1951,2.9232
sother,0.179,0.3649,0.4904


## Estimate using Logit

In [127]:
logit_results = est.estimate(logit.q, theta0, y, x)

Optimization terminated successfully.
         Current function value: 0.030440
         Iterations: 51
         Function evaluations: 265
         Gradient evaluations: 53


In [128]:
logit_tab = est.print_table(x_lab, logit_results, title=f'Logit, y = {y_lab}')
logit_tab

Optimizer succeded after 51 iter. (265 func. evals.). Final criterion:  0.03044.
Logit, y = anyuseofforce_coded


Unnamed: 0,theta,se,t
const,-5.7413,0.3339,-17.1962
sblack,0.8058,0.6687,1.2049
shisp,1.5933,0.5299,3.007
sother,0.5342,1.0568,0.5054


## Average partial effects

### Probit

In [129]:
# Estimating the average partial effects 
indices = [x_lab.index('sblack'), x_lab.index('shisp'), x_lab.index('sother')]  
labels = ['sblack', 'shispanic', 'sother'] 
probit.properties(x, probit_results['theta'],probit_results['cov'],print_out = True,se=True,indices=indices, labels = labels)

Unnamed: 0,Estimate,SE,t-value,p-value
sblack,0.005,0.006,0.919,0.358
shispanic,0.014,0.007,1.878,0.061
sother,0.003,0.008,0.405,0.685


### Logit

In [130]:
# Estimating the average partial effects 
indices = [x_lab.index('sblack'), x_lab.index('shisp'), x_lab.index('sother')]  
labels = ['sblack', 'shispanic', 'sother']  
logit.properties(x, logit_results['theta'],logit_results['cov'],print_out = True,se=True,indices=indices, labels = labels)

Unnamed: 0,Estimate,SE,t-value,p-value
sblack,0.006,0.006,0.892,0.373
shispanic,0.014,0.008,1.805,0.071
sother,0.003,0.009,0.401,0.689
