In [29]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
%matplotlib inline

# cntl ? makes something a comment
# shift R enter runs all above (on chromebook)
#esc cmd shift R runs all cells
# esc 1, 2, 3, 4, 5 makes something a kind of header
# esc X cuts a cell

In [30]:
os.getcwd()

'/home/vizziwo/capcookie/notebooks'

In [31]:
df = pd.read_csv('/home/vizziwo/capcookie/data/processed/fatalwrangling.csv', index_col=0)
df.head()

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,2015-01-02,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,2015-01-02,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,2015-01-03,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,2015-01-04,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,2015-01-04,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False


In [32]:
df.drop(columns='id', inplace=True)

#### Dummy Encoding will drop one category, so there's less collinearity. 
#### But it's very important to me that the models are readable, so I will perform One-Hot encoding instead.

In [33]:
df = pd.get_dummies(df, columns=['gender','threat_level','flee','manner_of_death']).drop(columns=['gender_Nonbinary'])

##### Dropped 'Nonbinary' because there was just 1 person in this category. 

In [34]:
df.body_camera = df.body_camera *1
df.signs_of_mental_illness = df.signs_of_mental_illness*1

In [35]:
df.rename(columns={'manner_of_death_shot and Tasered':'manner_of_death_shot_and_tasered', 'flee_Other':'flee_unspecified', 'threat_level_other':'threat_level_not_attack', 'threat_level_undetermined':'threat_level_unspecified'}, inplace=True)

In [36]:
df['day_of_week'] = pd.to_datetime(df.date).dt.dayofweek

In [37]:
df['month'] = pd.to_datetime(df.date).dt.month

In [38]:
df['season']=((df.month % 12 + 3) // 3)

In [39]:
df.armed.value_counts().head(20)

gun                2759
knife               708
undetermined        352
unarmed             347
toy weapon          172
vehicle             120
unknown weapon       66
machete              39
Taser                24
sword                22
ax                   21
baseball bat         16
gun and knife        15
hammer               14
screwdriver          12
metal pipe           12
box cutter           11
sharp object         11
hatchet              11
gun and vehicle      10
Name: armed, dtype: int64

In [40]:
counts = df.armed.value_counts()
mask = df.armed.isin(counts[counts<21].index)
df['armed'][mask]='other'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [41]:
df.armed.value_counts()

gun               2759
knife              708
undetermined       352
unarmed            347
other              270
toy weapon         172
vehicle            120
unknown weapon      66
machete             39
Taser               24
sword               22
ax                  21
Name: armed, dtype: int64

In [42]:
df = pd.get_dummies(df, columns=['armed'])

In [43]:
df.columns

Index(['name', 'date', 'age', 'race', 'city', 'state',
       'signs_of_mental_illness', 'body_camera', 'gender_F', 'gender_M',
       'threat_level_attack', 'threat_level_not_attack',
       'threat_level_unspecified', 'flee_Car', 'flee_Foot', 'flee_Not fleeing',
       'flee_unspecified', 'manner_of_death_shot',
       'manner_of_death_shot_and_tasered', 'day_of_week', 'month', 'season',
       'armed_Taser', 'armed_ax', 'armed_gun', 'armed_knife', 'armed_machete',
       'armed_other', 'armed_sword', 'armed_toy weapon', 'armed_unarmed',
       'armed_undetermined', 'armed_unknown weapon', 'armed_vehicle'],
      dtype='object')

### Changed 'State' to a number. I'm not sure about this. Is this a good idea?

In [44]:
df.state = pd.Categorical(df.state)
df['state_code'] = df.state.cat.codes
df.drop(columns=['state'], inplace=True)

### I'm not sure if my target variable should be "black' and 'not black' or 'black-or-hispanic' and 'not-black-or-hispanic' so I'll make a column for each so maybe i can try both in my modeling.

In [45]:
df.race.str.contains('B').sum()

1298

In [46]:
df['black_or_not'] = 0
df['blackhispanic_or_not'] = 0
df.loc[df['race'].str.contains('B'), 'black_or_not'] = 1
df.loc[df['race'].str.contains('H|B'), 'blackhispanic_or_not'] = 1

In [47]:
df.head()

Unnamed: 0,name,date,age,race,city,signs_of_mental_illness,body_camera,gender_F,gender_M,threat_level_attack,...,armed_other,armed_sword,armed_toy weapon,armed_unarmed,armed_undetermined,armed_unknown weapon,armed_vehicle,state_code,black_or_not,blackhispanic_or_not
0,Tim Elliot,2015-01-02,53.0,A,Shelton,1,0,0,1,1,...,0,0,0,0,0,0,0,47,0,0
1,Lewis Lee Lembke,2015-01-02,47.0,W,Aloha,0,0,0,1,1,...,0,0,0,0,0,0,0,37,0,0
2,John Paul Quintero,2015-01-03,23.0,H,Wichita,0,0,0,1,0,...,0,0,0,1,0,0,0,16,0,1
3,Matthew Hoffman,2015-01-04,32.0,W,San Francisco,1,0,0,1,1,...,0,0,1,0,0,0,0,4,0,0
4,Michael Rodriguez,2015-01-04,39.0,H,Evans,0,0,0,1,1,...,1,0,0,0,0,0,0,5,0,1


### Additionally, I know I have to do test_train split in this step. And that I need to fit the scaler on the training data, and transform on the test data. 
### BUT since I have separate notebooks for Preprocessing and Modeling, how do I 'save' my trained Scaler so I can 'transform' in my Modeling notebook? I'm just wondering about File Organization. Is it possible or desireable to save my Scaler somewhere?
### The same goes for my data, am I saving my 'y_train' and 'y_test' as separate CSV files in this notebook? And then load those CSV files in my Modeling notebook?

### Train_test Split with 'Black or Not' as y 

In [48]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['name','date','race','city','black_or_not','blackhispanic_or_not'])
y1= df.black_or_not

X1_train, X1_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.25, random_state=42)

### Train_test Split with 'Blackhispanic or Not' as y.
#### Since the random_state is the same, I actually this X_train will be the same as X2_train but I'm not sure so I'm still going to distinguish them here.

In [49]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['name','date','race','city','black_or_not','blackhispanic_or_not'])
y2= df.blackhispanic_or_not

X2_train, X2_test, y2_train, y2_test = train_test_split(X, y2, test_size=0.25, random_state=42)

##### How should I scale age? Should I normalize it? StandardScaler is normalization, right? Or should I bin it? Or transform it in another way?
##### Does it depend on my model performance? Should I try it several ways and see what best works in the model?
##### Or does it depend on the kind of model? I definitely want to do classification. RandomForest/Dec Tree doesn't need to be scaled, but Logistic Regression does need to be scaled, I think

In [50]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X1_train[['age']])
X1_train['standardized_age'] = scaler.transform(X1_train[['age']])
X1_train.drop(columns='age', inplace=True)

In [51]:
X1_train.head()

Unnamed: 0,signs_of_mental_illness,body_camera,gender_F,gender_M,threat_level_attack,threat_level_not_attack,threat_level_unspecified,flee_Car,flee_Foot,flee_Not fleeing,...,armed_machete,armed_other,armed_sword,armed_toy weapon,armed_unarmed,armed_undetermined,armed_unknown weapon,armed_vehicle,state_code,standardized_age
2835,1,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,16,-1.533369
1157,1,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,27,0.501482
744,0,0,0,1,1,0,0,0,0,1,...,0,1,0,0,0,0,0,0,22,0.266692
1448,0,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,50,0.579746
3339,0,1,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,42,-0.907261


In [52]:
y1_train.head()

2835    0
1157    0
744     0
1448    0
3339    1
Name: black_or_not, dtype: int64

##### also transforming X1_test

In [53]:
X1_test['standardized_age'] = scaler.transform(X1_test[['age']])
X1_test.drop(columns='age', inplace=True)

In [54]:
X1_test.head()

Unnamed: 0,signs_of_mental_illness,body_camera,gender_F,gender_M,threat_level_attack,threat_level_not_attack,threat_level_unspecified,flee_Car,flee_Foot,flee_Not fleeing,...,armed_machete,armed_other,armed_sword,armed_toy weapon,armed_unarmed,armed_undetermined,armed_unknown weapon,armed_vehicle,state_code,standardized_age
4657,1,0,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,23,-0.046362
3539,0,0,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,22,-0.907261
907,0,0,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,4,1.362381
4353,0,0,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,43,0.110165
3745,1,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,3,0.814536


In [55]:
y1_test.head()

4657    0
3539    0
907     0
4353    0
3745    0
Name: black_or_not, dtype: int64

##### Just in case it is necessary to differentiate X2_train, I'd transfrom that separately:

In [115]:
scaler2=StandardScaler()
scaler2.fit(X2_train[['age']])
X2_train['standardized_age'] = scaler2.transform(X2_train[['age']])
X2_train.drop(columns='age', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [119]:
X2_train.head()

Unnamed: 0,signs_of_mental_illness,body_camera,gender_F,gender_M,threat_level_attack,threat_level_not_attack,threat_level_unspecified,flee_Car,flee_Foot,flee_Not fleeing,...,armed_machete,armed_other,armed_sword,armed_toy weapon,armed_unarmed,armed_undetermined,armed_unknown weapon,armed_vehicle,state_code,standardized_age
2835,1,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,16,-1.533369
1157,1,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,27,0.501482
744,0,0,0,1,1,0,0,0,0,1,...,0,1,0,0,0,0,0,0,22,0.266692
1448,0,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,50,0.579746
3339,0,1,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,42,-0.907261


In [121]:
y2_train.head()

2835    0
1157    0
744     0
1448    1
3339    1
Name: blackhispanic_or_not, dtype: int64

##### also transforming X2_test 

In [56]:
X2_test['standardized_age'] = scaler.transform(X2_test[['age']])
X2_test.drop(columns='age', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [57]:
X2_test.head()

Unnamed: 0,signs_of_mental_illness,body_camera,gender_F,gender_M,threat_level_attack,threat_level_not_attack,threat_level_unspecified,flee_Car,flee_Foot,flee_Not fleeing,...,armed_machete,armed_other,armed_sword,armed_toy weapon,armed_unarmed,armed_undetermined,armed_unknown weapon,armed_vehicle,state_code,standardized_age
4657,1,0,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,23,-0.046362
3539,0,0,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,22,-0.907261
907,0,0,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,4,1.362381
4353,0,0,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,43,0.110165
3745,1,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,3,0.814536


In [58]:
y2_test.head()

4657    0
3539    0
907     0
4353    1
3745    0
Name: blackhispanic_or_not, dtype: int64

### Do I need to standardize my coded States too?

# Saving my Xs and ys

In [59]:
X1_train.to_csv('/home/vizziwo/capcookie/data/processed/X1_train.csv')
X1_test.to_csv('/home/vizziwo/capcookie/data/processed/X1_test.csv')
y1_train.to_csv('/home/vizziwo/capcookie/data/processed/y1_train.csv')
y1_test.to_csv('/home/vizziwo/capcookie/data/processed/y1_test.csv')
X2_train.to_csv('/home/vizziwo/capcookie/data/processed/X2_train.csv')
X2_test.to_csv('/home/vizziwo/capcookie/data/processed/X2_test.csv')
y2_train.to_csv('/home/vizziwo/capcookie/data/processed/y2_train.csv')
y2_test.to_csv('/home/vizziwo/capcookie/data/processed/y2_test.csv')