In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from pathlib import Path

In [2]:
%matplotlib inline
pd.set_option('display.max_columns', 500)

In [3]:
# All six city extracts live in one directory and share one file-name pattern.
DATA_DIR = Path('data')


def _load_city(city):
    """Read one city's CSV extract from DATA_DIR into a DataFrame.

    Parameters
    ----------
    city : str
        City slug exactly as it appears in the file name (e.g. 'winston-salem').

    Returns
    -------
    pandas.DataFrame
        Contents of ``data/nc_<city>_2020_04_01.csv``.
    """
    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk, which avoids mixed-type columns.
    # NOTE(review): the warning residue recorded under this cell looks like the
    # tail of a pandas DtypeWarning -- confirm it disappears on re-run.
    return pd.read_csv(DATA_DIR / f'nc_{city}_2020_04_01.csv', low_memory=False)


charlotte = _load_city('charlotte')
durham = _load_city('durham')
fayetteville = _load_city('fayetteville')
greensboro = _load_city('greensboro')
raleigh = _load_city('raleigh')
winston_salem = _load_city('winston-salem')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# Stack the six city frames row-wise; ignore_index=True gives the result a
# fresh 0..n-1 index instead of repeating each city's own row labels.
city_frames = [charlotte, durham, fayetteville, greensboro, raleigh, winston_salem]
combined = pd.concat(city_frames, axis=0, ignore_index=True)
combined

Unnamed: 0,raw_row_number,date,time,location,county_name,subject_age,subject_race,subject_sex,officer_id_hash,department_name,type,arrest_made,citation_issued,warning_issued,outcome,contraband_found,contraband_drugs,contraband_weapons,frisk_performed,search_conducted,search_person,search_vehicle,search_basis,reason_for_frisk,reason_for_search,reason_for_stop,raw_Ethnicity,raw_Race,raw_action_description
0,3254,2000-01-01,17:20:00,Unknown,,38.0,black,female,22e35044ed,UNC Charlotte University Police Department,vehicular,False,False,True,warning,,,,False,False,False,False,,,,Safe Movement Violation,N,B,Verbal Warning
1,3259,2000-01-02,11:40:00,Unknown,,33.0,hispanic,male,22e35044ed,UNC Charlotte University Police Department,vehicular,False,True,False,citation,,,,False,False,False,False,,,,Vehicle Regulatory Violation,H,W,Citation Issued
2,3281,2000-01-02,02:41:00,Unknown,,23.0,white,male,22e35044ed,UNC Charlotte University Police Department,vehicular,False,True,False,citation,,,,False,False,False,False,,,,Vehicle Equipment Violation,N,W,Citation Issued
3,3282,2000-01-23,03:35:00,Unknown,,23.0,white,female,22e35044ed,UNC Charlotte University Police Department,vehicular,False,True,False,citation,,,,False,False,False,False,,,,Vehicle Regulatory Violation,N,W,Citation Issued
4,3283,2000-01-02,16:45:00,Unknown,,23.0,white,female,22e35044ed,UNC Charlotte University Police Department,vehicular,False,False,True,warning,,,,False,False,False,False,,,,Safe Movement Violation,N,W,Verbal Warning
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4320461,20278399,2015-12-19,02:27:00,"nan, Forsyth County",Forsyth County,24.0,black,male,0a7f06db99,Winston-Salem Police Department,vehicular,False,True,False,citation,True,True,False,False,True,True,True,probable cause,,Observation of Suspected Contraband,Other Motor Vehicle Violation,N,B,Citation Issued
4320462,20278400,2015-12-09,23:38:00,"nan, Forsyth County",Forsyth County,37.0,black,male,223001a5be,Winston-Salem Police Department,vehicular,False,True,False,citation,True,True,False,False,True,True,True,probable cause,,Other Official Information,Vehicle Equipment Violation,N,B,Citation Issued
4320463,20278401,2015-12-11,01:24:00,"nan, Forsyth County",Forsyth County,21.0,hispanic,male,b17198e1b4,Winston-Salem Police Department,vehicular,True,False,False,arrest,False,False,False,False,True,True,True,other,,Observation of Suspected Contraband,Vehicle Equipment Violation,H,W,On-View Arrest
4320464,20278402,2015-12-11,02:21:00,"nan, Forsyth County",Forsyth County,27.0,white,male,0f4172ddc4,Winston-Salem Police Department,vehicular,True,False,False,arrest,False,False,False,False,True,True,True,probable cause,,Other Official Information,Driving While Impaired,N,W,On-View Arrest


## Split into train and test sets and save them as CSV files

In [5]:
# Fixed seed so that Restart & Run All reproduces the same train/test rows
# (and therefore the same train.csv / test.csv) on every run.
RANDOM_STATE = 42

# Hold out 20% of the rows for testing. Stratifying on department_name keeps
# each police department's share of rows the same in both splits.
train, test = train_test_split(
    combined,
    test_size=0.2,
    stratify=combined['department_name'],
    random_state=RANDOM_STATE,
)

In [6]:
# Destination files for the raw (unprocessed) splits.
train_filepath, test_filepath = Path('train.csv'), Path('test.csv')

# Write each split to its file, creating the parent directory first if needed.
# index=False keeps the shuffled row labels out of the CSV.
for frame, target in ((train, train_filepath), (test, test_filepath)):
    target.parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(target, index=False)

## Drop columns with a missing rate above 95%

In [7]:
def drop_high_missing_rate(df, threshold=95):
    """Return a copy of ``df`` without the columns that are mostly missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune; it is not modified.
    threshold : float, default 95
        Percentage of null values (0-100). Columns whose null percentage is
        strictly greater than this are dropped.

    Returns
    -------
    pandas.DataFrame
        New frame containing only the columns at or below the threshold.
    """
    # Per-column percentage of null cells. Only this Series is needed to pick
    # the columns; a full describe() pass over the frame is not.
    percent_missing = df.isnull().sum() * 100 / len(df)

    # NaN percentages (empty frame) compare False and are therefore kept.
    high_missing_cols = percent_missing[percent_missing > threshold].index

    return df.drop(columns=high_missing_cols)

# Prune the mostly-missing columns from the training split, then renumber the
# rows 0..n-1 (the split left the original, shuffled row labels in place).
processed_train = train.pipe(drop_high_missing_rate).reset_index(drop=True)
processed_train

Unnamed: 0,raw_row_number,date,time,location,county_name,subject_age,subject_race,subject_sex,officer_id_hash,department_name,type,arrest_made,citation_issued,warning_issued,outcome,frisk_performed,search_conducted,search_person,search_vehicle,reason_for_stop,raw_Ethnicity,raw_Race,raw_action_description
0,13898103,2011-10-14,13:10:00,"Charlotte Area, Mecklenburg County",Mecklenburg County,41.0,white,female,d1e8da930a,Charlotte-Mecklenburg Police Department,vehicular,False,True,False,citation,False,False,False,False,Vehicle Regulatory Violation,N,W,Citation Issued
1,16635989,2013-01-06,18:48:00,"nan, Forsyth County",Forsyth County,36.0,black,male,da4269670d,Winston-Salem Police Department,vehicular,False,False,True,warning,False,False,False,False,Safe Movement Violation,N,B,Verbal Warning
2,3644747,2003-07-25,,", Forsyth County",Forsyth County,20.0,hispanic,male,40f7ef4db6,Winston-Salem Police Department,vehicular,False,True,False,citation,False,False,False,False,Vehicle Regulatory Violation,H,U,Citation Issued
3,4231260,2004-06-23,11:28:00,"RALEIGH, Wake County",Wake County,29.0,black,female,db839dfc32,Raleigh Police Department,vehicular,False,True,False,citation,False,False,False,False,Speed Limit Violation,N,B,Citation Issued
4,19082891,2015-01-18,01:51:00,"RALEIGH, Wake County",Wake County,27.0,white,male,b152422867,Raleigh Police Department,vehicular,True,False,False,arrest,False,True,True,True,Driving While Impaired,N,W,On-View Arrest
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3456367,19335192,2015-04-18,09:45:00,"RALEIGH, Wake County",Wake County,58.0,white,male,599f5d18f7,Raleigh Police Department,vehicular,False,False,True,warning,False,False,False,False,Vehicle Regulatory Violation,N,W,Verbal Warning
3456368,14991256,2012-05-18,11:10:00,"nan, Durham County",Durham County,63.0,black,male,7a89a58beb,Durham Police Department,vehicular,False,True,False,citation,False,False,False,False,Speed Limit Violation,N,B,Citation Issued
3456369,6127358,2006-07-12,21:33:00,"Charlotte Area, Mecklenburg County",Mecklenburg County,22.0,black,male,11b0ef1d58,Charlotte-Mecklenburg Police Department,vehicular,True,False,False,arrest,False,True,True,True,Other Motor Vehicle Violation,N,B,On-View Arrest
3456370,2773870,2003-03-24,14:35:00,"GREE04102, Guilford County",Guilford County,30.0,black,female,8f589487ed,Greensboro Police Department,vehicular,False,True,False,citation,False,False,False,False,Speed Limit Violation,N,B,Citation Issued


## Preprocessing - combine date and time

In [8]:
def to_date_time(df):
    """Merge the 'date' and 'time' string columns into one 'date_time' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns 'date' (YYYY-MM-DD) and 'time' (HH:MM:SS).
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame without 'date' and 'time' and with a datetime column
        'date_time' appended, so stops can be compared chronologically.
        A row whose time is missing gets NaT.
    """
    # The two strings are concatenated without a separator, so the format
    # string has none between the date and time parts either.
    date_time = pd.to_datetime(df['date'] + df['time'],
                               format='%Y-%m-%d%H:%M:%S')

    # drop() returns a new frame and assign() adds the column to that new
    # frame, so the caller's frame is left untouched.
    return df.drop(columns=['date', 'time']).assign(date_time=date_time)

# Replace the separate date/time columns with a single date_time column.
processed_train = processed_train.pipe(to_date_time)
processed_train

Unnamed: 0,raw_row_number,location,county_name,subject_age,subject_race,subject_sex,officer_id_hash,department_name,type,arrest_made,citation_issued,warning_issued,outcome,frisk_performed,search_conducted,search_person,search_vehicle,reason_for_stop,raw_Ethnicity,raw_Race,raw_action_description,date_time
0,13898103,"Charlotte Area, Mecklenburg County",Mecklenburg County,41.0,white,female,d1e8da930a,Charlotte-Mecklenburg Police Department,vehicular,False,True,False,citation,False,False,False,False,Vehicle Regulatory Violation,N,W,Citation Issued,2011-10-14 13:10:00
1,16635989,"nan, Forsyth County",Forsyth County,36.0,black,male,da4269670d,Winston-Salem Police Department,vehicular,False,False,True,warning,False,False,False,False,Safe Movement Violation,N,B,Verbal Warning,2013-01-06 18:48:00
2,3644747,", Forsyth County",Forsyth County,20.0,hispanic,male,40f7ef4db6,Winston-Salem Police Department,vehicular,False,True,False,citation,False,False,False,False,Vehicle Regulatory Violation,H,U,Citation Issued,NaT
3,4231260,"RALEIGH, Wake County",Wake County,29.0,black,female,db839dfc32,Raleigh Police Department,vehicular,False,True,False,citation,False,False,False,False,Speed Limit Violation,N,B,Citation Issued,2004-06-23 11:28:00
4,19082891,"RALEIGH, Wake County",Wake County,27.0,white,male,b152422867,Raleigh Police Department,vehicular,True,False,False,arrest,False,True,True,True,Driving While Impaired,N,W,On-View Arrest,2015-01-18 01:51:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3456367,19335192,"RALEIGH, Wake County",Wake County,58.0,white,male,599f5d18f7,Raleigh Police Department,vehicular,False,False,True,warning,False,False,False,False,Vehicle Regulatory Violation,N,W,Verbal Warning,2015-04-18 09:45:00
3456368,14991256,"nan, Durham County",Durham County,63.0,black,male,7a89a58beb,Durham Police Department,vehicular,False,True,False,citation,False,False,False,False,Speed Limit Violation,N,B,Citation Issued,2012-05-18 11:10:00
3456369,6127358,"Charlotte Area, Mecklenburg County",Mecklenburg County,22.0,black,male,11b0ef1d58,Charlotte-Mecklenburg Police Department,vehicular,True,False,False,arrest,False,True,True,True,Other Motor Vehicle Violation,N,B,On-View Arrest,2006-07-12 21:33:00
3456370,2773870,"GREE04102, Guilford County",Guilford County,30.0,black,female,8f589487ed,Greensboro Police Department,vehicular,False,True,False,citation,False,False,False,False,Speed Limit Violation,N,B,Citation Issued,2003-03-24 14:35:00


## Fill NA

In [9]:
def fillna(df, mean_cols=('subject_age',)):
    """Impute missing values column by column and return a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; it is not modified.
    mean_cols : collection of str, default ('subject_age',)
        Columns filled with their mean. Every other column is filled with its
        most frequent value (the first mode).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with missing values filled. A column with no non-null
        value has no mode and is left as it is.
    """
    fill_values = {}
    for col in df.columns:
        if col in mean_cols:
            fill_values[col] = df[col].mean()  # numeric column: fill with mean
        else:
            modes = df[col].mode()
            if modes.empty:
                # All-missing column: there is nothing to impute with.
                continue
            fill_values[col] = modes.iloc[0]  # otherwise: fill with mode

    # One DataFrame-level fillna instead of df[col].fillna(..., inplace=True):
    # the chained in-place form does not update df under pandas Copy-on-Write.
    return df.fillna(value=fill_values)
# Impute the remaining missing values in the training frame.
processed_train = processed_train.pipe(fillna)
processed_train

Unnamed: 0,raw_row_number,location,county_name,subject_age,subject_race,subject_sex,officer_id_hash,department_name,type,arrest_made,citation_issued,warning_issued,outcome,frisk_performed,search_conducted,search_person,search_vehicle,reason_for_stop,raw_Ethnicity,raw_Race,raw_action_description,date_time
0,13898103,"Charlotte Area, Mecklenburg County",Mecklenburg County,41.0,white,female,d1e8da930a,Charlotte-Mecklenburg Police Department,vehicular,False,True,False,citation,False,False,False,False,Vehicle Regulatory Violation,N,W,Citation Issued,2011-10-14 13:10:00
1,16635989,"nan, Forsyth County",Forsyth County,36.0,black,male,da4269670d,Winston-Salem Police Department,vehicular,False,False,True,warning,False,False,False,False,Safe Movement Violation,N,B,Verbal Warning,2013-01-06 18:48:00
2,3644747,", Forsyth County",Forsyth County,20.0,hispanic,male,40f7ef4db6,Winston-Salem Police Department,vehicular,False,True,False,citation,False,False,False,False,Vehicle Regulatory Violation,H,U,Citation Issued,2008-07-22 12:06:29
3,4231260,"RALEIGH, Wake County",Wake County,29.0,black,female,db839dfc32,Raleigh Police Department,vehicular,False,True,False,citation,False,False,False,False,Speed Limit Violation,N,B,Citation Issued,2004-06-23 11:28:00
4,19082891,"RALEIGH, Wake County",Wake County,27.0,white,male,b152422867,Raleigh Police Department,vehicular,True,False,False,arrest,False,True,True,True,Driving While Impaired,N,W,On-View Arrest,2015-01-18 01:51:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3456367,19335192,"RALEIGH, Wake County",Wake County,58.0,white,male,599f5d18f7,Raleigh Police Department,vehicular,False,False,True,warning,False,False,False,False,Vehicle Regulatory Violation,N,W,Verbal Warning,2015-04-18 09:45:00
3456368,14991256,"nan, Durham County",Durham County,63.0,black,male,7a89a58beb,Durham Police Department,vehicular,False,True,False,citation,False,False,False,False,Speed Limit Violation,N,B,Citation Issued,2012-05-18 11:10:00
3456369,6127358,"Charlotte Area, Mecklenburg County",Mecklenburg County,22.0,black,male,11b0ef1d58,Charlotte-Mecklenburg Police Department,vehicular,True,False,False,arrest,False,True,True,True,Other Motor Vehicle Violation,N,B,On-View Arrest,2006-07-12 21:33:00
3456370,2773870,"GREE04102, Guilford County",Guilford County,30.0,black,female,8f589487ed,Greensboro Police Department,vehicular,False,True,False,citation,False,False,False,False,Speed Limit Violation,N,B,Citation Issued,2003-03-24 14:35:00


## Save processed_train as csv

In [10]:
# Persist the fully preprocessed training frame next to the raw splits.
train_filepath = Path('processed_train.csv')
output_dir = train_filepath.parent
output_dir.mkdir(parents=True, exist_ok=True)  # make sure the target directory exists
processed_train.to_csv(train_filepath, index=False)  # index=False: do not write row labels