In [16]:
import pandas as pd
import numpy as np

In [17]:
df = pd.read_csv('SF.csv') # Large file

  exec(code_obj, self.user_global_ns, self.user_ns)


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 905070 entries, 0 to 905069
Data columns (total 22 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   raw_row_number                     905070 non-null  object 
 1   date                               905070 non-null  object 
 2   time                               905035 non-null  object 
 3   location                           905027 non-null  object 
 4   lat                                903373 non-null  float64
 5   lng                                903373 non-null  float64
 6   district                           852883 non-null  object 
 7   subject_age                        846182 non-null  float64
 8   subject_race                       905070 non-null  object 
 9   subject_sex                        905070 non-null  object 
 10  type                               905070 non-null  object 
 11  arrest_made                        9050

#### Remove columns that are uninformative or all nulls

In [19]:
# Location down to district is informative enough 
# Type is 'vehicular' for all 
# Raw result included in other columns
# Raw search outcome included in contraband found
# Reason for stop encoded 
# Removing outcome as includes target information (arrest)
# Removing search basis since few values and only (consent, other)

In [20]:
cols_to_drop = ['raw_row_number', 'lat', 'lng', 'location', 'type', 
       'raw_result_of_contact_description', 'raw_search_vehicle_description', 'reason_for_stop', 'outcome', 'search_basis']

data = df.drop(columns=cols_to_drop)

#### Replace time with hour and date with a month and year column

In [21]:
data['time'] = data['time'].apply(lambda x: str(x).split(':')[0])
data['year'] = data['date'].apply(lambda x: str(x).split('-')[0])
data['month'] = data['date'].apply(lambda x: str(x).split('-')[1])
data.drop(['date'], inplace=True, axis=1)

#### Filling NaNs with false in contraband found 

In [22]:
data['contraband_found'].value_counts()

False    45405
True      7976
Name: contraband_found, dtype: int64

In [23]:
data['contraband_found'].fillna(False, inplace=True)

#### Change boolean to numerical binary labels

In [24]:
bool_cols = ['contraband_found', 'search_conducted', 'warning_issued', 'citation_issued',
             'search_vehicle', 'arrest_made']
for col in bool_cols:
    data[col] = data[col].apply(lambda x: 1 if x == True else 0)
    
data['subject_sex'] = data['subject_sex'].apply(lambda x: 0 if x == 'female' else 1)

#### Impute categorical and numerical features 

In [25]:
categorical = ['district', 'subject_race']

In [26]:
def replace_nan_categorical(df,column):
    prob = dict(df[column].value_counts()/len(df))
    keys = list(prob.keys())
    sum_prob = sum(prob.values())
    for k,v in prob.items():
        prob[k] = prob[k]/sum_prob
    prob_list = list(prob.values())
    to_fill = np.random.choice(keys, len(df[column].loc[df[column].isnull()]), p = prob_list) 
    df[column].loc[df[column].isnull()] = to_fill   
    return df

In [27]:
for c in categorical:
    data = replace_nan_categorical(data,c)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [28]:
data.drop(['subject_age'], axis=1, inplace=True)

In [29]:
categorical = categorical + ['year', 'month', 'time']
dummies = pd.get_dummies(data,columns=categorical, )
dummies.to_csv('SF_cleaned.csv',index=False)

In [30]:
dummies

Unnamed: 0,subject_sex,arrest_made,citation_issued,warning_issued,contraband_found,search_conducted,search_vehicle,district_A,district_B,district_C,...,time_15,time_16,time_17,time_18,time_19,time_20,time_21,time_22,time_23,time_nan
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
905065,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
905066,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
905067,1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
905068,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
