# Project III: Classical Non-linear Models and Police use of force
This notebook uses the Police Public Contact Survey (PPCS) dataset: `ppcs_cc.csv`.

In [26]:
%load_ext autoreload
%autoreload 2

import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import probit as probit
import logit as logit
import estimation as est


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load the data 

In [27]:
# Load the dataset
dat = pd.read_csv('ppcs_cc.csv')

# Inspect distribution of the target variable
print("\nDistribution of 'anyuseofforce_coded':")
print(dat['anyuseofforce_coded'].value_counts(normalize=True))

# Inspect value counts for categorical variables
categorical_vars = ["sblack", "shisp", "swhite", "sother", "smale", "omajblack", 
                    "omajhisp", "omajwhite", "omajother", "osplit", "inctype_lin", "sbehavior"]

for var in categorical_vars:
    print(f"\nValue Counts for {var}:")
    print(dat[var].value_counts())


Distribution of 'anyuseofforce_coded':
anyuseofforce_coded
0    0.994999
1    0.005001
Name: proportion, dtype: float64

Value Counts for sblack:
sblack
0    3379
1     420
Name: count, dtype: int64

Value Counts for shisp:
shisp
0    3413
1     386
Name: count, dtype: int64

Value Counts for swhite:
swhite
1    2808
0     991
Name: count, dtype: int64

Value Counts for sother:
sother
0    3614
1     185
Name: count, dtype: int64

Value Counts for smale:
smale
1    2012
0    1787
Name: count, dtype: int64

Value Counts for omajblack:
omajblack
0    3568
1     231
Name: count, dtype: int64

Value Counts for omajhisp:
omajhisp
0    3708
1      91
Name: count, dtype: int64

Value Counts for omajwhite:
omajwhite
1    3433
0     366
Name: count, dtype: int64

Value Counts for omajother:
omajother
0    3755
1      44
Name: count, dtype: int64

Value Counts for osplit:
osplit
0    3799
Name: count, dtype: int64

Value Counts for inctype_lin:
inctype_lin
2    3641
1     158
Name: count, dtype

In [37]:
# Declare labels
y_lab = 'anyuseofforce_coded'
x_lab = ['const', 'sblack', 'shisp', 'sother', 'smale', 'sage']

#['const', 'sblack', 'shisp', 'sother', 'smale', 'sage', 'sempl', 'sincome', 'spop', 'daytime', 'inctype_lin', 'omajblack', 'omajhisp', 'omajother', 'sbehavior', 'year']  # Dropped 'osplit' and year // Multikollinearitet mellem hhv. "omajblack", "omajhisp" "omajwhite" og "omajother" +  "sblack", "shisp", "swhite", "sother"

## Declare labels, her er sother og omajother taget ud, men I kan bare rette til white
#y_lab = 'anyuseofforce_coded'
#x_lab = ["const", "sblack", "shisp", "swhite",  # Drop 'sother' giver Multikollinearitet  sammen med "sblack", "shisp", "swhite"
#         "smale", "sage", "sempl", "sincome", 
#         "spop", "daytime", "inctype_lin", 
#         "omajblack", "omajhisp", "omajwhite",  # Drop 'omajother' giver Multikollinearitet sammen med "omajblack", "omajhisp", "omajwhite"
#         "sbehavior"] # dropped year and osplit (indeholder kun 0?)

# create extra variables 
N = dat.shape[0]
dat['const'] = np.ones((N,))

# Rebuild the dataset without 'osplit'
dat = dat[[y_lab] + x_lab].copy()

# Check for missing data
assert dat.notnull().all(axis=1).all(), 'Missing values detected. Clean your data!'

dat.head(5)

NameError: name 'x_vars' is not defined

In [40]:
# Extract y and X
y = dat[y_lab].values
x = dat[x_lab].values
K = x.shape[1]

# Correlation matrix to check multicollinearity
corr_matrix = pd.DataFrame(dat[x_lab]).corr()
print("Updated Correlation Matrix:\n", corr_matrix)

Updated Correlation Matrix:
         const    sblack     shisp    sother     smale      sage
const     NaN       NaN       NaN       NaN       NaN       NaN
sblack    NaN  1.000000 -0.118565 -0.079767 -0.007463 -0.039889
shisp     NaN -0.118565  1.000000 -0.076088  0.037650 -0.099664
sother    NaN -0.079767 -0.076088  1.000000  0.014754 -0.044000
smale     NaN -0.007463  0.037650  0.014754  1.000000 -0.018036
sage      NaN -0.039889 -0.099664 -0.044000 -0.018036  1.000000


## Estimate using Probit

In [32]:
# Initialize starting values
theta0 = probit.starting_values(y, x)

# Display starting values
print("Starting values for theta:", theta0)

# Calculate log-likelihood at the starting values
ll = probit.loglikelihood(theta0, y, x)
print("Log-likelihood at starting values:", np.mean(ll))

# Verify if it matches the expected result
expected_ll_mean = -1.0411283428047824
print("Log-likelihood check:", np.isclose(np.mean(ll), expected_ll_mean))

probit_results = est.estimate(probit.q, theta0, y, x)

probit_tab = est.print_table(x_lab, probit_results, title=f'Probit, y = {y_lab}')


Starting values for theta: [ 0.01950952  0.00861707  0.02755749  0.00319188  0.01218717 -0.00042353]
Log-likelihood at starting values: -0.7030183991613362
Log-likelihood check: False
Optimization terminated successfully.
         Current function value: 0.092128
         Iterations: 1
         Function evaluations: 14
         Gradient evaluations: 2


LinAlgError: Singular matrix

Jeg får Singular Matrix selvom jeg prøver at tage højde for Multikollinearitet, chatten foreslår at lave om i "estimation.py"

In [31]:
probit_tab = est.print_table(x_lab, probit_results, title=f'Probit, y = {y_lab}')
probit_tab

NameError: name 'probit_results' is not defined

## Estimate using Logit

In [33]:
logit_results = est.estimate(logit.q, theta0, y, x)

Optimization terminated successfully.
         Current function value: 0.028891
         Iterations: 65
         Function evaluations: 588
         Gradient evaluations: 84


In [36]:
ll = logit.loglikelihood(theta0, y, x)
np.isclose(np.mean(ll),-0.9974267061091704)

False

In [34]:
logit_tab = est.print_table(x_lab, logit_results, title=f'Logit, y = {y_lab}')
logit_tab

Optimizer succeded after 65 iter. (588 func. evals.). Final criterion:  0.02889.
Logit, y = anyuseofforce_coded


Unnamed: 0,theta,se,t
const,-4.8365,0.8876,-5.449
sblack,0.7384,0.6865,1.0757
shisp,1.3266,0.5357,2.4765
sother,0.3254,1.33,0.2447
smale,1.1243,0.6258,1.7966
sage,-0.0446,0.0288,-1.5499


## Partial and Average Effects

## Standard errors of the marginal effects with the Delta Method
