In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
%matplotlib inline

In [2]:
os.getcwd()

'/home/vizziwo/capcookie/notebooks'

In [3]:
df = pd.read_csv('../data/processed/fatalwrangling.csv', index_col=0)
df.head()

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,2015-01-02,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,2015-01-02,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,2015-01-03,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,2015-01-04,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,2015-01-04,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False


In [4]:
df.drop(columns='id', inplace=True)

#### Dummy Encoding will drop one category, so there's less collinearity, but it's very important to me that the models are readable, so I will perform One-Hot encoding instead.

#### Drop 'Nonbinary' because there is just 1 person in this category. 


In [5]:
df.gender.unique()

array(['M', 'F', 'Nonbinary'], dtype=object)

In [6]:
df[df['gender'].str.contains('M|F')==False]

Unnamed: 0,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
2543,Scout Schultz,2017-09-16,shot,knife,21.0,Nonbinary,W,Atlanta,GA,True,other,Not fleeing,False


In [7]:
df = pd.get_dummies(df.drop(index=2543), columns=['gender','threat_level','flee','manner_of_death', 'state'])

In [8]:
df.columns

Index(['name', 'date', 'armed', 'age', 'race', 'city',
       'signs_of_mental_illness', 'body_camera', 'gender_F', 'gender_M',
       'threat_level_attack', 'threat_level_other',
       'threat_level_undetermined', 'flee_Car', 'flee_Foot',
       'flee_Not fleeing', 'flee_Other', 'manner_of_death_shot',
       'manner_of_death_shot and Tasered', 'state_AK', 'state_AL', 'state_AR',
       'state_AZ', 'state_CA', 'state_CO', 'state_CT', 'state_DC', 'state_DE',
       'state_FL', 'state_GA', 'state_HI', 'state_IA', 'state_ID', 'state_IL',
       'state_IN', 'state_KS', 'state_KY', 'state_LA', 'state_MA', 'state_MD',
       'state_ME', 'state_MI', 'state_MN', 'state_MO', 'state_MS', 'state_MT',
       'state_NC', 'state_ND', 'state_NE', 'state_NH', 'state_NJ', 'state_NM',
       'state_NV', 'state_NY', 'state_OH', 'state_OK', 'state_OR', 'state_PA',
       'state_RI', 'state_SC', 'state_SD', 'state_TN', 'state_TX', 'state_UT',
       'state_VA', 'state_VT', 'state_WA', 'state_WI', 'state_

#### Transform Boolean cols to binary.

In [9]:
df.body_camera = df.body_camera *1
df.signs_of_mental_illness = df.signs_of_mental_illness*1

#### Rename columns for readability and codeability

In [10]:
df.rename(columns={'manner_of_death_shot and Tasered':'manner_of_death_shot_and_tasered', 'flee_Other':'flee_unspecified', 'threat_level_other':'threat_level_not_attack', 'threat_level_undetermined':'threat_level_unspecified'}, inplace=True)

#### Use Datetime functions to add 'day of week', 'month' and 'season' features.

In [11]:
df['day_of_week'] = pd.to_datetime(df.date).dt.day_name()

In [12]:
df['month'] = pd.to_datetime(df.date).dt.month_name()

In [13]:
df['season']=((pd.to_datetime(df.date).dt.month % 12 + 3) // 3)

In [14]:
df.season.unique()

array([1, 2, 3, 4])

In [15]:
df.season = df.season.map({1:'winter',2:'spring',3:'summer',4:'fall'})

In [16]:
df.season.unique()

array(['winter', 'spring', 'summer', 'fall'], dtype=object)

In [17]:
df = pd.get_dummies(df, columns=['season','month','day_of_week'])

#### Bin less common weapons into 'Other' category for 'armed' column

In [18]:
df.armed.value_counts().head(20)

gun                2759
knife               707
undetermined        352
unarmed             347
toy weapon          172
vehicle             120
unknown weapon       66
machete              39
Taser                24
sword                22
ax                   21
baseball bat         16
gun and knife        15
hammer               14
metal pipe           12
screwdriver          12
sharp object         11
hatchet              11
box cutter           11
gun and vehicle      10
Name: armed, dtype: int64

In [19]:
counts = df.armed.value_counts()
mask = df.armed.isin(counts[counts<21].index)
df['armed'][mask]='other'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [20]:
df.armed.value_counts()

gun               2759
knife              707
undetermined       352
unarmed            347
other              270
toy weapon         172
vehicle            120
unknown weapon      66
machete             39
Taser               24
sword               22
ax                  21
Name: armed, dtype: int64

In [21]:
df = pd.get_dummies(df, columns=['armed'])

In [22]:
df.columns

Index(['name', 'date', 'age', 'race', 'city', 'signs_of_mental_illness',
       'body_camera', 'gender_F', 'gender_M', 'threat_level_attack',
       ...
       'armed_gun', 'armed_knife', 'armed_machete', 'armed_other',
       'armed_sword', 'armed_toy weapon', 'armed_unarmed',
       'armed_undetermined', 'armed_unknown weapon', 'armed_vehicle'],
      dtype='object', length=104)

#### Making 2 Different Target Columns: 
#### I want to see if the models perform better if I look for African-American victims only or both African-American and Hispanic victims.

#### One target column will be 'black_or_not' and the other will be 'blackhispanic_or_not.'

In [23]:
df.race.str.contains('B').sum()

1298

In [24]:
df['black_or_not'] = 0
df['blackhispanic_or_not'] = 0
df.loc[df['race'].str.contains('B'), 'black_or_not'] = 1
df.loc[df['race'].str.contains('H|B'), 'blackhispanic_or_not'] = 1

In [25]:
df.head()

Unnamed: 0,name,date,age,race,city,signs_of_mental_illness,body_camera,gender_F,gender_M,threat_level_attack,...,armed_machete,armed_other,armed_sword,armed_toy weapon,armed_unarmed,armed_undetermined,armed_unknown weapon,armed_vehicle,black_or_not,blackhispanic_or_not
0,Tim Elliot,2015-01-02,53.0,A,Shelton,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,Lewis Lee Lembke,2015-01-02,47.0,W,Aloha,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,John Paul Quintero,2015-01-03,23.0,H,Wichita,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
3,Matthew Hoffman,2015-01-04,32.0,W,San Francisco,1,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0
4,Michael Rodriguez,2015-01-04,39.0,H,Evans,0,0,0,1,1,...,0,1,0,0,0,0,0,0,0,1


#### Train_test Split with 'Black or Not' as y 

In [26]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['name','date','race','city','black_or_not','blackhispanic_or_not'])
y1= df.black_or_not

X1_train, X1_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.25, random_state=42)

#### Train_test Split with 'Blackhispanic or Not' as y.

Since the random_state is the same, this training data will be the same as the previous training data, but for consistency I will label it X2_train

In [27]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['name','date','race','city','black_or_not','blackhispanic_or_not'])
y2= df.blackhispanic_or_not

X2_train, X2_test, y2_train, y2_test = train_test_split(X, y2, test_size=0.25, random_state=42)

#### Scaling 'Age' Column in Training Data

In [28]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X1_train[['age']])
X1_train['standardized_age'] = scaler.transform(X1_train[['age']])
X1_train.drop(columns='age', inplace=True)

In [29]:
X1_train.head()

Unnamed: 0,signs_of_mental_illness,body_camera,gender_F,gender_M,threat_level_attack,threat_level_not_attack,threat_level_unspecified,flee_Car,flee_Foot,flee_Not fleeing,...,armed_knife,armed_machete,armed_other,armed_sword,armed_toy weapon,armed_unarmed,armed_undetermined,armed_unknown weapon,armed_vehicle,standardized_age
2836,0,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0.740107
1157,1,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0.504791
744,0,0,0,1,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0.269475
1448,0,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0.58323
3339,0,1,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,-0.907106


In [30]:
#y1_train.head()

#### Scaling 'Age' Column in Test Data

In [31]:
X1_test['standardized_age'] = scaler.transform(X1_test[['age']])
X1_test.drop(columns='age', inplace=True)

In [32]:
X1_test.head()

Unnamed: 0,signs_of_mental_illness,body_camera,gender_F,gender_M,threat_level_attack,threat_level_not_attack,threat_level_unspecified,flee_Car,flee_Foot,flee_Not fleeing,...,armed_knife,armed_machete,armed_other,armed_sword,armed_toy weapon,armed_unarmed,armed_undetermined,armed_unknown weapon,armed_vehicle,standardized_age
4657,1,0,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,-0.04428
3660,0,1,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,-0.04428
907,0,0,0,1,1,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1.367617
4353,0,0,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.112598
3272,0,0,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,-0.279596


In [33]:
#y1_test.head()

#### Scaling X2 Training Data

In [34]:
scaler2=StandardScaler()
scaler2.fit(X2_train[['age']])
X2_train['standardized_age'] = scaler2.transform(X2_train[['age']])
X2_train.drop(columns='age', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [35]:
X2_train.head()

Unnamed: 0,signs_of_mental_illness,body_camera,gender_F,gender_M,threat_level_attack,threat_level_not_attack,threat_level_unspecified,flee_Car,flee_Foot,flee_Not fleeing,...,armed_knife,armed_machete,armed_other,armed_sword,armed_toy weapon,armed_unarmed,armed_undetermined,armed_unknown weapon,armed_vehicle,standardized_age
2836,0,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0.740107
1157,1,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0.504791
744,0,0,0,1,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0.269475
1448,0,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0.58323
3339,0,1,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,-0.907106


In [36]:
#y2_train.head()

#### Scaling X2 Test Data

In [37]:
X2_test['standardized_age'] = scaler.transform(X2_test[['age']])
X2_test.drop(columns='age', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [38]:
X2_test.head()

Unnamed: 0,signs_of_mental_illness,body_camera,gender_F,gender_M,threat_level_attack,threat_level_not_attack,threat_level_unspecified,flee_Car,flee_Foot,flee_Not fleeing,...,armed_knife,armed_machete,armed_other,armed_sword,armed_toy weapon,armed_unarmed,armed_undetermined,armed_unknown weapon,armed_vehicle,standardized_age
4657,1,0,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,-0.04428
3660,0,1,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,-0.04428
907,0,0,0,1,1,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1.367617
4353,0,0,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.112598
3272,0,0,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,-0.279596


In [39]:
#y2_test.head()

#### Saving my Xs and ys

In [40]:
X1_train.to_csv('../data/processed/X1_train.csv')
X1_test.to_csv('../data/processed/X1_test.csv')
y1_train.to_csv('../data/processed/y1_train.csv')
y1_test.to_csv('../data/processed/y1_test.csv')
X2_train.to_csv('../data/processed/X2_train.csv')
X2_test.to_csv('../data/processed/X2_test.csv')
y2_train.to_csv('../data/processed/y2_train.csv')
y2_test.to_csv('../data/processed/y2_test.csv')