# Testing Project Data Uploads

# Clean the Data

We spent a lot of time trying to clean our data and make sure that it would be fit to run through a neural network. The majority of our features are categorical, with the exception of Age and DOB (which are in themselves related). The following code snippets are how we were able to relabel our data appropriately and handle missing data when it happened.

In [1]:
#Some Import Statements
import pandas as pd
import numpy as np

In [2]:
#Read in our project data
#Note: This is not the complete original data. The original data contained other information, 
#including the names of offenders and victims. However, we excluded this part of the data set for obvious privacy reasons.

df = pd.read_excel('Project Data Stripped.xlsx')

In [3]:
#Let's see what our data looks like!

df.head()

Unnamed: 0,M/F,Age,UnitNumber,ReportYear,ReportMonth,ReportDateFull,Gender,DOB,CodeSection,Badge,Supervisor,Medical,PC/CVC,OTHER
0,M,37.0,534,2013,1,2013-01-01,M,1975-07-30 00:00:00,T14/CCR 4326(a),1433,1245,,,T14/CCR 4326(a)
1,,,564,2013,1,2013-01-02,M,Unknown,Public Complaint,1464,1245,,,Public Complaint
2,,78.0,534,2013,1,2013-01-03,F,1934-06-03 00:00:00,Medical - Fall,1464,1245,Medical - Fall,,
3,F,,534,2013,1,2013-01-05,U,Unknown,Traffic Collision (Injury),1245,1075,,,Traffic Collision (Injury)
4,F,43.0,534,2013,1,2013-01-11,F,1969-11-10 00:00:00,Felony Warrant,1564,1245,,,Felony Warrant


In [4]:
#Create a new column called "MedicalBinary"
#MedicalBinary as true or false
#This feature simply indicates whether the report was a medical incident or not
#0 - Not a Medical Incident
#1 - Medical Incident


df['MedicalBinary'] = df['Medical']
df.head()

df_notnull = df.notnull()
s,f = np.shape(df)
for i in range(s):
    if df_notnull.loc[i, 'MedicalBinary'] == True:
        df.loc[i, 'MedicalBinary'] = 1
    else:
        df.loc[i, 'MedicalBinary'] = 0
df.head()

Unnamed: 0,M/F,Age,UnitNumber,ReportYear,ReportMonth,ReportDateFull,Gender,DOB,CodeSection,Badge,Supervisor,Medical,PC/CVC,OTHER,MedicalBinary
0,M,37.0,534,2013,1,2013-01-01,M,1975-07-30 00:00:00,T14/CCR 4326(a),1433,1245,,,T14/CCR 4326(a),0
1,,,564,2013,1,2013-01-02,M,Unknown,Public Complaint,1464,1245,,,Public Complaint,0
2,,78.0,534,2013,1,2013-01-03,F,1934-06-03 00:00:00,Medical - Fall,1464,1245,Medical - Fall,,,1
3,F,,534,2013,1,2013-01-05,U,Unknown,Traffic Collision (Injury),1245,1075,,,Traffic Collision (Injury),0
4,F,43.0,534,2013,1,2013-01-11,F,1969-11-10 00:00:00,Felony Warrant,1564,1245,,,Felony Warrant,0


In [5]:
#Relabel Gender
#0 - Male
#1 - Female
#-1 - Unknown

s,f = np.shape(df)
for i in range(s):
    if df.loc[i, 'Gender'] == 'M':
        df.loc[i, 'Gender'] = 0
    elif df.loc[i, 'Gender'] == 'F':
        df.loc[i, 'Gender'] = 1
    else:
        df.loc[i, 'Gender'] = -1
df.head()

Unnamed: 0,M/F,Age,UnitNumber,ReportYear,ReportMonth,ReportDateFull,Gender,DOB,CodeSection,Badge,Supervisor,Medical,PC/CVC,OTHER,MedicalBinary
0,M,37.0,534,2013,1,2013-01-01,0,1975-07-30 00:00:00,T14/CCR 4326(a),1433,1245,,,T14/CCR 4326(a),0
1,,,564,2013,1,2013-01-02,0,Unknown,Public Complaint,1464,1245,,,Public Complaint,0
2,,78.0,534,2013,1,2013-01-03,1,1934-06-03 00:00:00,Medical - Fall,1464,1245,Medical - Fall,,,1
3,F,,534,2013,1,2013-01-05,-1,Unknown,Traffic Collision (Injury),1245,1075,,,Traffic Collision (Injury),0
4,F,43.0,534,2013,1,2013-01-11,1,1969-11-10 00:00:00,Felony Warrant,1564,1245,,,Felony Warrant,0


In [6]:
#Relabel Misdeamenor/Felony
# 0 - Misdemeanor
# 1- Felony

for i in range(s):
    if df.loc[i, 'M/F'] == 'M':
        df.loc[i, 'M/F'] = 0
    elif df.loc[i, 'M/F'] == 'F':
        df.loc[i, 'M/F'] = 1
    else:
        df.loc[i, 'M/F'] = -1       
df.head()

Unnamed: 0,M/F,Age,UnitNumber,ReportYear,ReportMonth,ReportDateFull,Gender,DOB,CodeSection,Badge,Supervisor,Medical,PC/CVC,OTHER,MedicalBinary
0,0,37.0,534,2013,1,2013-01-01,0,1975-07-30 00:00:00,T14/CCR 4326(a),1433,1245,,,T14/CCR 4326(a),0
1,-1,,564,2013,1,2013-01-02,0,Unknown,Public Complaint,1464,1245,,,Public Complaint,0
2,-1,78.0,534,2013,1,2013-01-03,1,1934-06-03 00:00:00,Medical - Fall,1464,1245,Medical - Fall,,,1
3,1,,534,2013,1,2013-01-05,-1,Unknown,Traffic Collision (Injury),1245,1075,,,Traffic Collision (Injury),0
4,1,43.0,534,2013,1,2013-01-11,1,1969-11-10 00:00:00,Felony Warrant,1564,1245,,,Felony Warrant,0


In [7]:
#Change names of lifeguards under Badge Numbers to 1000
for i in range(s):
    if not isinstance(df.loc[i, 'Badge'], int):
        df.loc[i, 'Badge'] = 1000

# Predict Gender

In [8]:
import numpy as np
import sklearn.model_selection as ms

#Returns scores from KFold cross validation of n folds
#X_train - training data frame
#y_train - traininng labels data frame
#clf - Classifier
#n - Number of folds
def cross_val_scores(X_train, y_train, clf=None, n=5):
    scores = []
    kf = ms.KFold(n, shuffle=True)
    for train, validate in kf.split(X_train):
        clf.fit(X_train[train], y_train[train])
        scores.append(clf.score(X_train[validate], y_train[validate]))
    return scores

In [9]:
X = df.drop(["ReportDateFull", "DOB", "CodeSection","Medical", "Supervisor", "OTHER", "PC/CVC"], axis=1)

In [10]:
X_Gender = X[X.Gender != -1]
X = X.dropna()
X_Gender.head()

Unnamed: 0,M/F,Age,UnitNumber,ReportYear,ReportMonth,Gender,Badge,MedicalBinary
0,0,37.0,534,2013,1,0,1433,0
1,-1,,564,2013,1,0,1464,0
2,-1,78.0,534,2013,1,1,1464,1
4,1,43.0,534,2013,1,1,1564,0
5,1,43.0,534,2013,1,0,1564,0


In [11]:
X = X_Gender.drop(['Gender'], axis=1)
y = X_Gender['Gender']

In [12]:
X.head()

Unnamed: 0,M/F,Age,UnitNumber,ReportYear,ReportMonth,Badge,MedicalBinary
0,0,37.0,534,2013,1,1433,0
1,-1,,564,2013,1,1464,0
2,-1,78.0,534,2013,1,1464,1
4,1,43.0,534,2013,1,1564,0
5,1,43.0,534,2013,1,1564,0


In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)
X_train.shape, y_train.shape

((1203, 7), (1203,))

In [14]:
# specify the layer sizes: 
layer_sizes = [X_train.shape[1], 10, 15, np.unique(y_train).shape[0]]  # 1 hidden layers of size 10 and 5, respectively

In [15]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD 

def build_model():
    model = Sequential()
    
    # Now that we have the model, let's add some layers:
    # First a fully-connected (Dense) hidden layer with appropriate input
    # dimension, 10 outputs, and ReLU activation
    model.add(Dense(
        input_dim=layer_sizes[0], 
        units=layer_sizes[1],
        activation="relu"
    ))
    
    model.add(Dense(
        input_dim=layer_sizes[1], 
        units=layer_sizes[2],
        activation="relu"))
    

    # Finally, add a readout layer, mapping the 5 hidden units
    # to two output units using the softmax function
    model.add(Dense(units=layer_sizes[3], 
                    kernel_initializer='uniform',
                    activation="softmax"))
    
    
    sgd = SGD(lr=0.001, decay=1e-7, momentum=.9)  # Stochastic gradient descent
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=["accuracy"])  
    # we'll have the categorical crossentropy as the loss function
    # we also want the model to automatically calculate accuracy
    return model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [16]:
# Before we can fit the network, we have to one-hot vectorize our response.
# Fortunately, there is a keras method for that.
from keras.utils import to_categorical

y_train_vectorized = to_categorical(y_train)

In [17]:
model = build_model()

In [18]:
model.fit(X_train, y_train_vectorized, epochs=10, batch_size=2, verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1193db470>

In [19]:
#predict_proba or predict??
proba = model.predict(X_test, batch_size=32)
classes = np.argmax(proba, axis=1) # select the readout neuron with the highest probability
print(classes)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


# Regression on Number of Incidents per year