In [75]:
import pandas as pd
import matplotlib.pyplot as plt

In [76]:
hf = pd.read_csv('police.csv') #Hartford state can be found here https://openpolicing.stanford.edu/data/
hf.head()

Unnamed: 0,raw_row_number,date,time,location,lat,lng,district,subject_age,subject_race,subject_sex,...,outcome,contraband_found,search_conducted,search_vehicle,search_basis,reason_for_stop,raw_subject_race_code,raw_subject_ethnicity_code,raw_search_authorization_code,raw_intervention_disposition_code
0,1,2013-10-13,15:21:00,LINNMORE ST AT ROGER,41.732189,-72.69976,SOUTH WEST,38.0,hispanic,female,...,citation,,False,False,,Stop Sign,W,H,N,I
1,2,2013-10-24,01:12:00,HAWTHORN STREET @ IMLAY STREET,41.764621,-72.695362,ASYLUM HILL,20.0,black,male,...,citation,,False,False,,Defective Lights,B,N,N,I
2,3,2013-10-26,10:06:00,NEW PARK MIRRILL,41.747837,-72.712933,PARKVILLE,26.0,white,female,...,citation,,False,False,,Traffic Control Signal,W,N,N,I
3,4,2013-10-26,18:06:00,nfew park at merrill st,41.748069,-72.712641,PARKVILLE,26.0,white,female,...,citation,,False,False,,Traffic Control Signal,W,N,N,I
4,5,2013-10-26,19:56:00,SUMMIT ST AT ZION ST,41.753945,-72.693278,FROG HOLLOW,39.0,white,male,...,citation,,False,False,,Stop Sign,W,N,N,I


In [77]:
print(hf.shape)
hf.isna().sum()

(18439, 26)


raw_row_number                           0
date                                     4
time                                     4
location                                 0
lat                                    210
lng                                    210
district                              1231
subject_age                              4
subject_race                             0
subject_sex                              0
officer_id_hash                          0
department_name                          0
type                                     0
arrest_made                              5
citation_issued                          0
outcome                               2522
contraband_found                     13254
search_conducted                         0
search_vehicle                           2
search_basis                         13255
reason_for_stop                          0
raw_subject_race_code                    0
raw_subject_ethnicity_code               0
raw_search_

## 1-Cleaning and verification of the data

In [78]:
# Dropping lat and lng cause location values exist and all columns starting with raw
hf.drop(['raw_row_number', 'lat','lng','raw_subject_race_code','raw_subject_ethnicity_code','raw_search_authorization_code','raw_intervention_disposition_code'], axis='columns', inplace=True)

In [79]:
hf.head(10)

Unnamed: 0,date,time,location,district,subject_age,subject_race,subject_sex,officer_id_hash,department_name,type,arrest_made,citation_issued,warning_issued,outcome,contraband_found,search_conducted,search_vehicle,search_basis,reason_for_stop
0,2013-10-13,15:21:00,LINNMORE ST AT ROGER,SOUTH WEST,38.0,hispanic,female,9623b482ee,Hartford,vehicular,False,True,False,citation,,False,False,,Stop Sign
1,2013-10-24,01:12:00,HAWTHORN STREET @ IMLAY STREET,ASYLUM HILL,20.0,black,male,616e63d3fb,Hartford,vehicular,False,True,False,citation,,False,False,,Defective Lights
2,2013-10-26,10:06:00,NEW PARK MIRRILL,PARKVILLE,26.0,white,female,9623b482ee,Hartford,vehicular,False,True,False,citation,,False,False,,Traffic Control Signal
3,2013-10-26,18:06:00,nfew park at merrill st,PARKVILLE,26.0,white,female,5ac7b58482,Hartford,vehicular,False,True,False,citation,,False,False,,Traffic Control Signal
4,2013-10-26,19:56:00,SUMMIT ST AT ZION ST,FROG HOLLOW,39.0,white,male,9623b482ee,Hartford,vehicular,False,True,False,citation,,False,False,,Stop Sign
5,2013-10-26,21:57:00,LAUREL ST AT CAPITOL AV,ASYLUM HILL,51.0,hispanic,male,9623b482ee,Hartford,vehicular,False,True,False,citation,,False,False,,Traffic Control Signal
6,2013-10-26,22:08:00,RUSS ST AT PUTNAM S T,FROG HOLLOW,34.0,white,male,9623b482ee,Hartford,vehicular,False,True,False,citation,,False,False,,Stop Sign
7,2013-10-29,13:18:00,FLATBUSH AV AT HILLSIDE,BEHIND THE ROCKS,24.0,hispanic,male,9623b482ee,Hartford,vehicular,False,True,False,citation,,False,False,,Moving Violation
8,2013-10-29,13:50:00,HUDSON ST @ JEFFERSON,SOUTH GREEN,37.0,white,female,172bfdfefb,Hartford,vehicular,False,True,False,citation,,False,False,,Cell Phone
9,2013-10-30,11:57:00,BROAD AT MADISON,FROG HOLLOW,42.0,black,female,9623b482ee,Hartford,vehicular,False,True,False,citation,,False,False,,Stop Sign


In [80]:
# getting rid of the 4 missing values for date and time along with the 4 missing subject_age
hf.dropna(subset=['date','time','subject_age'], inplace=True)

In [81]:
# display shape 
print(hf.shape)
hf['contraband_found'].unique()

(18431, 19)


array([nan, False, True], dtype=object)

In [82]:
# checking and setting types of the attributes
print(hf.info())
#Store the columns to convert and the appropriate type in dict
type_cols={'arrest_made':bool,'search_vehicle':bool,'contraband_found':bool,'subject_age':int}
hf=hf.astype(type_cols)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18431 entries, 0 to 18438
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              18431 non-null  object 
 1   time              18431 non-null  object 
 2   location          18431 non-null  object 
 3   district          17200 non-null  object 
 4   subject_age       18431 non-null  float64
 5   subject_race      18431 non-null  object 
 6   subject_sex       18431 non-null  object 
 7   officer_id_hash   18431 non-null  object 
 8   department_name   18431 non-null  object 
 9   type              18431 non-null  object 
 10  arrest_made       18426 non-null  object 
 11  citation_issued   18431 non-null  bool   
 13  outcome           15910 non-null  object 
 14  contraband_found  5183 non-null   object 
 15  search_conducted  18431 non-null  bool   
 16  search_vehicle    18429 non-null  object 
 17  search_basis      5182 non-null   object

In [83]:
hf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18431 entries, 0 to 18438
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   date              18431 non-null  object
 1   time              18431 non-null  object
 2   location          18431 non-null  object
 3   district          17200 non-null  object
 4   subject_age       18431 non-null  int32 
 5   subject_race      18431 non-null  object
 6   subject_sex       18431 non-null  object
 7   officer_id_hash   18431 non-null  object
 8   department_name   18431 non-null  object
 9   type              18431 non-null  object
 10  arrest_made       18431 non-null  bool  
 11  citation_issued   18431 non-null  bool  
 13  outcome           15910 non-null  object
 14  contraband_found  18431 non-null  bool  
 15  search_conducted  18431 non-null  bool  
 16  search_vehicle    18431 non-null  bool  
 17  search_basis      5182 non-null   object
 18  reason_for_s

In [84]:
#Combine date and time into one col and converting it to datetime
combined=hf.date.str.cat(hf.time,sep=' ')
hf['stop_datetime'] = pd.to_datetime(combined)
#Setting the new column as index
hf.set_index('stop_datetime', inplace=True)

In [85]:
hf.drop(['date','time'],axis='columns',inplace=True)
hf.head(9)

Unnamed: 0_level_0,location,district,subject_age,subject_race,subject_sex,officer_id_hash,department_name,type,arrest_made,citation_issued,warning_issued,outcome,contraband_found,search_conducted,search_vehicle,search_basis,reason_for_stop
stop_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2013-10-13 15:21:00,LINNMORE ST AT ROGER,SOUTH WEST,38,hispanic,female,9623b482ee,Hartford,vehicular,False,True,False,citation,True,False,False,,Stop Sign
2013-10-24 01:12:00,HAWTHORN STREET @ IMLAY STREET,ASYLUM HILL,20,black,male,616e63d3fb,Hartford,vehicular,False,True,False,citation,True,False,False,,Defective Lights
2013-10-26 10:06:00,NEW PARK MIRRILL,PARKVILLE,26,white,female,9623b482ee,Hartford,vehicular,False,True,False,citation,True,False,False,,Traffic Control Signal
2013-10-26 18:06:00,nfew park at merrill st,PARKVILLE,26,white,female,5ac7b58482,Hartford,vehicular,False,True,False,citation,True,False,False,,Traffic Control Signal
2013-10-26 19:56:00,SUMMIT ST AT ZION ST,FROG HOLLOW,39,white,male,9623b482ee,Hartford,vehicular,False,True,False,citation,True,False,False,,Stop Sign
2013-10-26 21:57:00,LAUREL ST AT CAPITOL AV,ASYLUM HILL,51,hispanic,male,9623b482ee,Hartford,vehicular,False,True,False,citation,True,False,False,,Traffic Control Signal
2013-10-26 22:08:00,RUSS ST AT PUTNAM S T,FROG HOLLOW,34,white,male,9623b482ee,Hartford,vehicular,False,True,False,citation,True,False,False,,Stop Sign
2013-10-29 13:18:00,FLATBUSH AV AT HILLSIDE,BEHIND THE ROCKS,24,hispanic,male,9623b482ee,Hartford,vehicular,False,True,False,citation,True,False,False,,Moving Violation
2013-10-29 13:50:00,HUDSON ST @ JEFFERSON,SOUTH GREEN,37,white,female,172bfdfefb,Hartford,vehicular,False,True,False,citation,True,False,False,,Cell Phone


## 2- Exploring relationships 


In [86]:
hf['outcome'].value_counts()

citation    11752
arrest        712
Name: outcome, dtype: int64