**Before you start:** Click **File → Save a copy in Drive** so you have your own version of this notebook. If you skip this step, your work will not be saved.

# Load modules

In [1]:
# pandas is the only analysis library this notebook needs
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell

# Show every column of a DataFrame, but cap long frames at 20 rows.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 20)

# Display the result of every expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"

# Load DataFrames

In [2]:
# Download both tables from Dropbox and parse them as CSV.
# NOTE(review): the file names end in `.feather` but are read with `read_csv` —
# confirm the hosted files really are CSV.
universe = pd.read_csv(
    'https://www.dropbox.com/scl/fi/69syqjo6pfrt9123rubio/universe_lab6.feather?rlkey=h2gt4o6z9r5649wo6h6ud6dce&dl=1'
)
arrest_events = pd.read_csv(
    'https://www.dropbox.com/scl/fi/wv9kthwbj4ahzli3edrd7/arrest_events_lab6.feather?rlkey=mhxozpazqjgmo6qqahc2vd0xp&dl=1'
)

# `filing_date` is loaded as a string; turn it into a real datetime in both tables.
# (The filing date is the same thing as the arrest date.)
universe['filing_date'] = pd.to_datetime(universe['filing_date'])
arrest_events['filing_date'] = pd.to_datetime(arrest_events['filing_date'])

In [3]:
# Peek at the first five rows of the universe table.
universe.head(n=5)

Unnamed: 0,arrest_id,person_id,filing_date,age_at_arrest,sex,race
0,7268817,25928,2016-04-06,59.336986,M,Black
1,5958672,1448386,2017-03-29,44.989041,M,Black
2,5014551,1571572,2017-10-14,39.394521,F,Black
3,3573863,126282,2018-06-20,36.465753,M,Black
4,5502020,883298,2017-07-16,18.547945,M,White


In [4]:
# Peek at the first five rows of the arrest events table.
arrest_events.head(n=5)

Unnamed: 0,person_id,arrest_id,filing_date,charge_degree,offense_category
0,78786,3835604,2018-05-01,felony,property
1,78786,11999735,2018-05-01,felony,property
2,1064849,3442497,2018-07-12,misdemeanor,other
3,78786,4205882,2018-03-09,felony,property
4,78786,12006352,2018-03-09,felony,property


# Current Incident Feature
## Create charge degree dummies 

In [5]:
# One-hot encode `charge_degree`: the column is replaced by one
# `charge_degree_<value>` indicator column per distinct value.
arrest_events = pd.get_dummies(arrest_events, columns=['charge_degree'])
arrest_events.head(n=5)

Unnamed: 0,person_id,arrest_id,filing_date,offense_category,charge_degree_felony,charge_degree_misdemeanor
0,78786,3835604,2018-05-01,property,1,0
1,78786,11999735,2018-05-01,property,1,0
2,1064849,3442497,2018-07-12,other,0,1
3,78786,4205882,2018-03-09,property,1,0
4,78786,12006352,2018-03-09,property,1,0


## Merge with universe
We are going to merge on `arrest_id` because this is a feature about the **CURRENT ARREST**.

In [6]:
# Attach the felony indicator of the current arrest to each universe row.
# A left join keeps every universe row even if no arrest event matches.
universe = pd.merge(
    universe,
    arrest_events[['arrest_id', 'charge_degree_felony']],
    how='left',
    on='arrest_id',
)
universe.head(n=5)

Unnamed: 0,arrest_id,person_id,filing_date,age_at_arrest,sex,race,charge_degree_felony
0,7268817,25928,2016-04-06,59.336986,M,Black,1
1,5958672,1448386,2017-03-29,44.989041,M,Black,0
2,5014551,1571572,2017-10-14,39.394521,F,Black,0
3,3573863,126282,2018-06-20,36.465753,M,Black,0
4,5502020,883298,2017-07-16,18.547945,M,White,0


# Prior History Feature
## Create number of arrests in last year column


In [7]:
# Pair each universe arrest with every arrest on record for the same person.
# Columns present in both tables get `_univ` / `_arr` suffixes so we can tell
# the current arrest apart from the historical ones.
temp_df = pd.merge(
    universe[['arrest_id', 'person_id', 'filing_date']],
    arrest_events,
    how='left',
    on='person_id',
    suffixes=('_univ', '_arr'),
)
temp_df.shape
temp_df.head(n=5)


(6519, 8)

Unnamed: 0,arrest_id_univ,person_id,filing_date_univ,arrest_id_arr,filing_date_arr,offense_category,charge_degree_felony,charge_degree_misdemeanor
0,7268817,25928,2016-04-06,10695699,2013-03-18,property,0,1
1,7268817,25928,2016-04-06,10584440,2013-04-26,property,0,1
2,7268817,25928,2016-04-06,5224389,2017-09-09,property,1,0
3,7268817,25928,2016-04-06,7268817,2016-04-06,violent,1,0
4,5958672,1448386,2017-03-29,11383129,2012-07-14,property,0,1


## Filter to rows where arrest date is less than arrest date in universe table

In [8]:
# Keep only arrests filed strictly before the universe (current) arrest.
temp_df = temp_df.loc[temp_df['filing_date_arr'].lt(temp_df['filing_date_univ'])]
temp_df.shape
temp_df.head(n=5)


(3045, 8)

Unnamed: 0,arrest_id_univ,person_id,filing_date_univ,arrest_id_arr,filing_date_arr,offense_category,charge_degree_felony,charge_degree_misdemeanor
0,7268817,25928,2016-04-06,10695699,2013-03-18,property,0,1
1,7268817,25928,2016-04-06,10584440,2013-04-26,property,0,1
4,5958672,1448386,2017-03-29,11383129,2012-07-14,property,0,1
6,5958672,1448386,2017-03-29,21440843,2012-04-09,property,0,1
7,5958672,1448386,2017-03-29,11566425,2012-05-15,property,0,1


## Filter to rows where arrest is within one year of arrest date in universe table


In [9]:
# Of the prior arrests, keep those filed later than one calendar year
# before the universe arrest.
temp_df = temp_df.loc[
    temp_df['filing_date_arr'].gt(temp_df['filing_date_univ'] - pd.DateOffset(years=1))
]
temp_df.shape
temp_df.head(n=5)

(904, 8)

Unnamed: 0,arrest_id_univ,person_id,filing_date_univ,arrest_id_arr,filing_date_arr,offense_category,charge_degree_felony,charge_degree_misdemeanor
14,3573863,126282,2018-06-20,4778011,2017-11-20,other,0,1
15,3573863,126282,2018-06-20,4516013,2018-01-12,property,1,0
22,3573863,126282,2018-06-20,4762793,2017-11-24,property,0,1
29,3573863,126282,2018-06-20,4406759,2018-02-01,drug,0,1
33,5502020,883298,2017-07-16,6080238,2017-02-23,other,0,1


## Create a DataFrame that has the number of arrests in the last year

In [10]:
# Count the remaining rows for each (universe arrest, person) pair.
temp_df.groupby(by=['arrest_id_univ', 'person_id']).size()

arrest_id_univ  person_id
2472356         242203       1
2500555         4931         1
2511968         358578       2
2532252         2582921      1
2547343         122092       2
                            ..
7563566         49472        4
7587118         997983       1
7591648         2326         2
7598842         1531743      1
12005804        1065717      1
Length: 373, dtype: int64

In [11]:
# Same count as above, but turned into a regular DataFrame whose count
# column is named `num_arr_last_year`.
temp_df = (
    temp_df
    .groupby(by=['arrest_id_univ', 'person_id'])
    .size()
    .rename("num_arr_last_year")
    .reset_index()
)
temp_df.shape
temp_df.head(n=5)

(373, 3)

Unnamed: 0,arrest_id_univ,person_id,num_arr_last_year
0,2472356,242203,1
1,2500555,4931,1
2,2511968,358578,2
3,2532252,2582921,1
4,2547343,122092,2


## Merge temp_df into Universe table

**Note:** Here we are using the parameters:
 - `left_on`
 - `right_on`
 
We are doing this because the universe DataFrame does not have a column called `arrest_id_univ`; that column was only created when we merged universe with arrest_events.

But the `arrest_id_univ` column in temp_df refers to the `arrest_id` column in the universe DataFrame.

Therefore, we use `left_on` and `right_on` to tell Pandas which column names refer to the same keys (i.e. relevant columns) in the different DataFrames we want to merge.

In [12]:
# Show the column names of both frames so the merge keys can be compared.
universe.columns
temp_df.columns

Index(['arrest_id', 'person_id', 'filing_date', 'age_at_arrest', 'sex', 'race',
       'charge_degree_felony'],
      dtype='object')

Index(['arrest_id_univ', 'person_id', 'num_arr_last_year'], dtype='object')

In [13]:
# Bring the per-arrest counts into the universe table. The key columns are
# named differently on each side, hence `left_on` / `right_on`.
# A left join keeps universe rows that have no count (they get NaN).
universe = pd.merge(
    universe,
    temp_df,
    how='left',
    left_on=['arrest_id', 'person_id'],
    right_on=['arrest_id_univ', 'person_id'],
)
universe.shape
universe.head(n=5)

(1000, 9)

Unnamed: 0,arrest_id,person_id,filing_date,age_at_arrest,sex,race,charge_degree_felony,arrest_id_univ,num_arr_last_year
0,7268817,25928,2016-04-06,59.336986,M,Black,1,,
1,5958672,1448386,2017-03-29,44.989041,M,Black,0,,
2,5014551,1571572,2017-10-14,39.394521,F,Black,0,,
3,3573863,126282,2018-06-20,36.465753,M,Black,0,3573863.0,4.0
4,5502020,883298,2017-07-16,18.547945,M,White,0,5502020.0,1.0


## Fill NaN with 0

In [14]:
# Universe rows without a match in the left merge have NaN here; record
# them as 0 by assigning the filled column back onto itself.
universe['num_arr_last_year'] = universe['num_arr_last_year'].fillna(0)
universe.head(n=5)

Unnamed: 0,arrest_id,person_id,filing_date,age_at_arrest,sex,race,charge_degree_felony,arrest_id_univ,num_arr_last_year
0,7268817,25928,2016-04-06,59.336986,M,Black,1,,0.0
1,5958672,1448386,2017-03-29,44.989041,M,Black,0,,0.0
2,5014551,1571572,2017-10-14,39.394521,F,Black,0,,0.0
3,3573863,126282,2018-06-20,36.465753,M,Black,0,3573863.0,4.0
4,5502020,883298,2017-07-16,18.547945,M,White,0,5502020.0,1.0


Also note that we can achieve the same effect by using the `inplace` parameter in `fillna` rather than assigning the result back to the same column. Here is the syntax:

In [15]:
# In-place variant: call `fillna` on the DataFrame itself with a
# {column: value} mapping. Calling it on `universe['num_arr_last_year']`
# with `inplace=True` acts on an intermediate Series; recent pandas versions
# warn about that pattern, and under Copy-on-Write it no longer writes the
# filled values back to `universe`.
universe.fillna({'num_arr_last_year': 0}, inplace=True)

## Drop the `arrest_id_univ` column
The merge resulted in the `universe` DataFrame having:
  - `arrest_id` column (this was always there)
  - `arrest_id_univ` column (from the temp_df DataFrame). 

We no longer need `arrest_id_univ` so we are going to drop it (as it will cause issues below). 




In [16]:
# Show the columns, remove the leftover merge key, then show the columns again.
universe.columns
universe = universe.drop(columns='arrest_id_univ')
universe.columns

Index(['arrest_id', 'person_id', 'filing_date', 'age_at_arrest', 'sex', 'race',
       'charge_degree_felony', 'arrest_id_univ', 'num_arr_last_year'],
      dtype='object')

Index(['arrest_id', 'person_id', 'filing_date', 'age_at_arrest', 'sex', 'race',
       'charge_degree_felony', 'num_arr_last_year'],
      dtype='object')

# Lab Task

## Adding in one-hot encoded columns
- Use `get_dummies` to one-hot encode the `offense_category` column in `arrest_events`

- Left-merge the following offense categories into the Universe table: 
     - drug
     - property
     - violent
     
(We won't merge in `other` because it will be redundant --- if drug, property, and violent are all False, it will mean that the charge was `other`)

- How many rows are in `universe` after the resulting merge?

- What share of charges are felonies for these 1,000 defendants?

## Adding in a prior history column

- Now we want to create a feature which is the number of property arrests in the last two years
  - First create a temporary DataFrame by merging in the relevant columns from the universe and arrest_events tables
  - Remember that we are using this temporary DataFrame to figure out which arrests came before and after the current arrest.
  - Make sure to use the `suffixes` parameter

- Limit to rows where the arrest date is less than the universe date

- Limit to rows where the arrest date is within two years of the universe date

- Limit to rows where the offense_category is property

- Use groupby with reset_index to create a new DataFrame which has the number of property arrests in last two years. Call this column `num_prop_arr_last_2yrs`

- Merge this into the Universe DataFrame

- Fill Null values in the new column with zeros


- What is the average number of property arrests in the last 2 years for these 1,000 defendants?