# Load modules

In [1]:
import pandas as pd
import seaborn as sns

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from sklearn.linear_model import LogisticRegression as logistic
from sklearn.linear_model import LinearRegression as ols

from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve

# Load data


In [2]:
universe = pd.read_csv('https://www.dropbox.com/scl/fi/5wg86ttvvx9cu15zn8xae/lab7_inst414.csv?rlkey=d9qinlmgn0q2n5qzrw8jib314&dl=1')
universe['filing_date'] = pd.to_datetime(universe.filing_date)
universe.head()

Unnamed: 0,arrest_id,person_id,filing_date,age_at_arrest,sex,race,charge_degree_felony,num_arr_last_year,offense_category_drug,offense_category_property,offense_category_violent,num_prop_arr_last_2yrs
0,4150651,3245369,2018-03-20,29.117808,M,Black,True,3.0,False,True,False,1.0
1,7518413,589868,2016-01-30,26.394521,M,Black,True,1.0,True,False,False,0.0
2,6709912,910296,2016-08-24,28.830137,M,Black,True,1.0,False,False,True,0.0
3,4951609,2849904,2017-10-25,44.254795,M,White,False,0.0,False,False,False,0.0
4,7444510,2539404,2016-02-20,28.115068,F,White,False,0.0,True,False,False,0.0


In [3]:
arrest_events = pd.read_csv('https://www.dropbox.com/scl/fi/wv9kthwbj4ahzli3edrd7/arrest_events_lab6.feather?rlkey=mhxozpazqjgmo6qqahc2vd0xp&dl=1')
arrest_events['filing_date'] = pd.to_datetime(arrest_events.filing_date)
arrest_events.head()

Unnamed: 0,person_id,arrest_id,filing_date,charge_degree,offense_category
0,364573,9911130,2013-12-11,felony,other
1,54187,6600129,2016-09-23,misdemeanor,violent
2,39291,2575499,2018-12-07,felony,other
3,39291,2938529,2018-09-30,felony,other
4,1026471,4327311,2018-02-15,felony,other


# Create outcome 

In [4]:
temp_df = universe[['person_id', 'arrest_id', 'filing_date']].merge(arrest_events, 
                                                                    on=['person_id'],
                                                                    how='left',
                                                                    suffixes=['_univ', '_arr'])
temp_df.shape

(6188, 7)

## Limit to dates that happened within one year

First, limit to dates that happened less than one year after the current arrest date (filing date is same thing as arrest date)


In [5]:
temp_df = temp_df[temp_df.filing_date_arr.between(temp_df.filing_date_univ + pd.DateOffset(days=1),
                                                  temp_df.filing_date_univ + pd.DateOffset(years=1))]
temp_df.head()
temp_df.shape


Unnamed: 0,person_id,arrest_id_univ,filing_date_univ,arrest_id_arr,filing_date_arr,charge_degree,offense_category
0,3245369,4150651,2018-03-20,3408539,2018-07-18,felony,property
1,3245369,4150651,2018-03-20,3395731,2018-07-20,felony,property
2,3245369,4150651,2018-03-20,3903975,2018-04-19,felony,property
4,3245369,4150651,2018-03-20,3869073,2018-04-25,felony,property
7,3245369,4150651,2018-03-20,3579637,2018-06-19,felony,other


(1088, 7)

Second, limit to arrests that happened after the current arrest date

## Create a the outcome column in the universe table
The dataframe we just created `temp_df` contains everyone who had an arrest within one year of their filingdate listed in the universe table. 

We will use the `isin` function to figure out if someone had an arrest in the next year.

In [6]:
universe['outcome_any_arrest'] = universe.arrest_id.isin(temp_df.arrest_id_univ)

universe.head()


Unnamed: 0,arrest_id,person_id,filing_date,age_at_arrest,sex,race,charge_degree_felony,num_arr_last_year,offense_category_drug,offense_category_property,offense_category_violent,num_prop_arr_last_2yrs,outcome_any_arrest
0,4150651,3245369,2018-03-20,29.117808,M,Black,True,3.0,False,True,False,1.0,True
1,7518413,589868,2016-01-30,26.394521,M,Black,True,1.0,True,False,False,0.0,True
2,6709912,910296,2016-08-24,28.830137,M,Black,True,1.0,False,False,True,0.0,True
3,4951609,2849904,2017-10-25,44.254795,M,White,False,0.0,False,False,False,0.0,False
4,7444510,2539404,2016-02-20,28.115068,F,White,False,0.0,True,False,False,0.0,True


The share of people who were rearrested, or the base rate 

In [7]:
universe['outcome_any_arrest'].mean()

0.434

# Training an OLS model
First instantiate a version of the ols model.

In [8]:
ols_model = ols()

Then, create a list that has the names of the columns we want for features.

In [9]:
universe.columns

Index(['arrest_id', 'person_id', 'filing_date', 'age_at_arrest', 'sex', 'race',
       'charge_degree_felony', 'num_arr_last_year', 'offense_category_drug',
       'offense_category_property', 'offense_category_violent',
       'num_prop_arr_last_2yrs', 'outcome_any_arrest'],
      dtype='object')

In [10]:
features = ['charge_degree_felony', 'num_arr_last_year']
universe[features]

Unnamed: 0,charge_degree_felony,num_arr_last_year
0,True,3.0
1,True,1.0
2,True,1.0
3,False,0.0
4,False,0.0
...,...,...
995,True,0.0
996,True,0.0
997,False,2.0
998,False,4.0


In [11]:
ols_model = ols_model.fit(X=universe[features], y=universe['outcome_any_arrest'])


We will now look at the $\beta$s (Betas)/coefficients 

In [12]:
ols_model.intercept_
ols_model.coef_

0.47845748384387865

array([-0.1464595 ,  0.08076016])

The model that was found was:

$outcome.any.arrest = -.48 - 0.14 * felony.charge + 0.08 * num.arr.last.year $

Now we will predict

In [13]:
universe['pred_ols'] = ols_model.predict(X=universe[features])
universe.head()

Unnamed: 0,arrest_id,person_id,filing_date,age_at_arrest,sex,race,charge_degree_felony,num_arr_last_year,offense_category_drug,offense_category_property,offense_category_violent,num_prop_arr_last_2yrs,outcome_any_arrest,pred_ols
0,4150651,3245369,2018-03-20,29.117808,M,Black,True,3.0,False,True,False,1.0,True,0.574278
1,7518413,589868,2016-01-30,26.394521,M,Black,True,1.0,True,False,False,0.0,True,0.412758
2,6709912,910296,2016-08-24,28.830137,M,Black,True,1.0,False,False,True,0.0,True,0.412758
3,4951609,2849904,2017-10-25,44.254795,M,White,False,0.0,False,False,False,0.0,False,0.478457
4,7444510,2539404,2016-02-20,28.115068,F,White,False,0.0,True,False,False,0.0,True,0.478457


## Compute performance metrics
First, we will set the threshold to be the outcome rate. 

In [14]:
universe['yhat'] = universe['pred_ols'] > universe['outcome_any_arrest'].mean()

Let's compute the True Positive Rate. 

In [15]:
universe[universe['outcome_any_arrest']==1]['yhat'].mean()

0.5483870967741935

Let's compute the Positive Predictive Value

In [16]:
universe[universe['yhat']==1]['outcome_any_arrest'].mean()

0.6485013623978202

## Create out-of-sample dataset

In [17]:
train, test = train_test_split(universe, 
                               test_size=.5, 
                               shuffle=True, 
                               stratify=universe['outcome_any_arrest'])
train.shape
test.shape

(500, 15)

(500, 15)

Stratifying by the outcome means that both train and test sets should have equal (or very similar) base rates.

In [18]:
train['outcome_any_arrest'].mean()
test['outcome_any_arrest'].mean()

0.434

0.434

### Run OLS model on train

In [19]:
ols_model = ols_model.fit(X=train[features], y=train['outcome_any_arrest'])


In [20]:
ols_model.intercept_
ols_model.coef_

0.4759149616895392

array([-0.1502265 ,  0.08425118])

The model that was found was:

$outcome.any.arrest = -.48 - 0.15 * felony.charge + 0.08 * num.arr.last.year $

In [21]:
# predict for the test set
test['pred_ols'] = ols_model.predict(test[features])

In [22]:
# convert to yhat using base rate of the outcome as threshold
test['yhat'] = test['pred_ols'] > train['outcome_any_arrest'].mean()

In [23]:
# compute TPR
test[test['outcome_any_arrest']==1]['yhat'].mean()

0.543778801843318

In [24]:
# compute PPV
test[test['yhat']]['outcome_any_arrest'].mean()

0.6519337016574586

## Run Logistic Regression model on train

In [25]:
logistic_model = logistic(C=.1)

In [26]:
logistic_model = logistic_model.fit(X=train[features], y=train['outcome_any_arrest'])

In [27]:
test['pred_logistic'] = logistic_model.predict_proba(test[features])[:, 1]

In [28]:
test['yhat'] = test['pred_logistic'] > train['outcome_any_arrest'].mean()

In [29]:
test[test['outcome_any_arrest']==1]['yhat'].mean()

0.5898617511520737

In [30]:
test[test['yhat']]['outcome_any_arrest'].mean()

0.7191011235955056

# Lab Task

1. Perform the following steps:
 - Run logistic regression using C = 10 on the training set.
 - Predict for the test set using a new column called pred_logistic2
 - Create a new yhat column (yhat2) and convert predictions to yhats (predicted outcomes) using the base rate of the training set as the threshold
 - Compute TPR and PPV
 - Which C gives better performance? 

2. Create a new outcome called `outcome_violent_arrest`

3. Run OLS on the training set using the following predictors:
 - charge_degree_felony	
 - num_arr_last_year	
 - offense_category_violent	
 - num_prop_arr_last_2yrs
 
The outcome will be `outcome_violent_arrest` 

Make sure to first create a new train and test set

Predict for the test set

Compute TPR and PPV


4. Using the last model you trained, compute $P(predicton | sex)$
Which sex gets higher predictions on average?



5. Using the last model you trained, compute $P(predicton | race)$
Which race gets higher predictions on average?

