# Project III: Classical Non-linear Models and Police use of force
This notebook uses the Police Public Contact Survey (PPCS) dataset: `ppcs_cc.csv`.

In [209]:
%load_ext autoreload
%autoreload 2

import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import probit as probit
import logit as logit
import estimation as est


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [210]:

# Load the dataset
dat = pd.read_csv('ppcs_cc.csv')

# Inspect distribution of the target variable
print("\nDistribution of 'anyuseofforce_coded':")
print(dat['anyuseofforce_coded'].value_counts(normalize=True))

# Inspect value counts for categorical variables
categorical_vars = ["sblack", "shisp", "swhite", "sother", "smale", "omajblack", 
                    "omajhisp", "omajwhite", "omajother", "osplit", "inctype_lin", "sbehavior"]

for var in categorical_vars:
    print(f"\nValue Counts for {var}:")
    print(dat[var].value_counts())


Distribution of 'anyuseofforce_coded':
anyuseofforce_coded
0    0.994999
1    0.005001
Name: proportion, dtype: float64

Value Counts for sblack:
sblack
0    3379
1     420
Name: count, dtype: int64

Value Counts for shisp:
shisp
0    3413
1     386
Name: count, dtype: int64

Value Counts for swhite:
swhite
1    2808
0     991
Name: count, dtype: int64

Value Counts for sother:
sother
0    3614
1     185
Name: count, dtype: int64

Value Counts for smale:
smale
1    2012
0    1787
Name: count, dtype: int64

Value Counts for omajblack:
omajblack
0    3568
1     231
Name: count, dtype: int64

Value Counts for omajhisp:
omajhisp
0    3708
1      91
Name: count, dtype: int64

Value Counts for omajwhite:
omajwhite
1    3433
0     366
Name: count, dtype: int64

Value Counts for omajother:
omajother
0    3755
1      44
Name: count, dtype: int64

Value Counts for osplit:
osplit
0    3799
Name: count, dtype: int64

Value Counts for inctype_lin:
inctype_lin
2    3641
1     158
Name: count, dtype

In [None]:
# Declare labels
y_lab = 'anyuseofforce_coded'
x_lab = ["const", "sblack", "shisp", "swhite", "sother", "smale", "sage", "sempl", 
         "sincome", "spop", "daytime", "inctype_lin", "omajblack", "omajhisp", 
         "omajwhite", "omajother", "sbehavior"]  # Dropped 'osplit' and year // Multikollinearitet mellem hhv. "omajblack", "omajhisp" "omajwhite" og "omajother" +  "sblack", "shisp", "swhite", "sother"

## Declare labels, her er sother og omajother taget ud, men I kan bare rette til white
#y_lab = 'anyuseofforce_coded'
#x_lab = ["const", "sblack", "shisp", "swhite",  # Drop 'sother' giver Multikollinearitet  sammen med "sblack", "shisp", "swhite"
#         "smale", "sage", "sempl", "sincome", 
#         "spop", "daytime", "inctype_lin", 
#         "omajblack", "omajhisp", "omajwhite",  # Drop 'omajother' giver Multikollinearitet sammen med "omajblack", "omajhisp", "omajwhite"
#         "sbehavior"] # dropped year and osplit (indeholder kun 0?)


N = dat.shape[0]
dat['const'] = np.ones((N,))


# Rebuild the dataset without 'osplit'
dat = dat[[y_lab] + x_lab].copy()

# Check for missing data
assert dat.notnull().all(axis=1).all(), 'Missing values detected. Clean your data!'

# Extract y and X
y = dat[y_lab].values
x = dat[x_lab].values
K = x.shape[1]
N = x.shape[0]


# Correlation matrix to check multicollinearity
corr_matrix = pd.DataFrame(dat[x_lab]).corr()
print("Updated Correlation Matrix:\n", corr_matrix)



Updated Correlation Matrix:
              const    sblack     shisp    swhite    sother     smale  \
const          NaN       NaN       NaN       NaN       NaN       NaN   
sblack         NaN  1.000000 -0.118565 -0.593461 -0.079767 -0.007463   
shisp          NaN -0.118565  1.000000 -0.566092 -0.076088  0.037650   
swhite         NaN -0.593461 -0.566092  1.000000 -0.380850 -0.027808   
sother         NaN -0.079767 -0.076088 -0.380850  1.000000  0.014754   
smale          NaN -0.007463  0.037650 -0.027808  0.014754  1.000000   
sage           NaN -0.039889 -0.099664  0.118629 -0.044000 -0.018036   
sempl          NaN -0.005631 -0.014089  0.015875 -0.004405  0.105154   
sincome        NaN -0.095218 -0.066364  0.106229  0.015163  0.026403   
spop           NaN  0.135596  0.127457 -0.201679  0.034981  0.008400   
daytime        NaN -0.033697 -0.017122  0.051486 -0.031908 -0.043603   
inctype_lin    NaN  0.014580 -0.008493 -0.006652  0.004252 -0.048396   
omajblack      NaN  0.145640 -0.019

In [212]:
# Initialize starting values
theta0 = probit.starting_values(y, x)

# Display starting values
print("Starting values for theta:", theta0)

# Calculate log-likelihood at the starting values
ll = probit.loglikelihood(theta0, y, x)
print("Log-likelihood at starting values:", np.mean(ll))

# Verify if it matches the expected result
expected_ll_mean = -1.0411283428047824
print("Log-likelihood check:", np.isclose(np.mean(ll), expected_ll_mean))

probit_results = est.estimate(probit.q, theta0, y, x)

probit_tab = est.print_table(x_lab, probit_results, title=f'Probit, y = {y_lab}')


Starting values for theta: [-0.06336431  0.07218228  0.08920596  0.0668541   0.06618924  0.01054008
 -0.00039334 -0.0139809   0.00237788  0.01027833 -0.00428795 -0.07021874
  0.12946404  0.13957977  0.14302527  0.12698641  0.08837279]
Log-likelihood at starting values: -0.7027469326475566
Log-likelihood check: False
Optimization terminated successfully.
         Current function value: 0.092128
         Iterations: 1
         Function evaluations: 36
         Gradient evaluations: 2


LinAlgError: Singular matrix

Jeg får Singular Matrix selvom jeg prøver at tage højde for Multikollinearitet, chatten foreslår at lave om i "estimation.py"