# Testing Project Data Uploads

# Clean the Data

We spent a lot of time trying to clean our data and make sure that it would be fit to run through a neural network. The majority of our features are categorical, with the exception of Age and DOB (which are in themselves related). The following code snippets are how we were able to relabel our data appropriately and handle missing data when it happened.

In [251]:
#Some Import Statements
import pandas as pd
import numpy as np

In [252]:
#Read in our project data
#Note: This is not the complete original data. The original data contained other information, 
#including the names of offenders and victims. However, we excluded this part of the data set for obvious privacy reasons.

df = pd.read_excel('Project Data Stripped.xlsx')

In [253]:
#Let's see what our data looks like!

df.head()

Unnamed: 0,M/F,Age,UnitNumber,ReportYear,ReportMonth,ReportDateFull,Gender,DOB,CodeSection,Badge,Supervisor,Medical,PC/CVC,OTHER
0,M,37.0,534,2013,1,2013-01-01,M,1975-07-30 00:00:00,T14/CCR 4326(a),1433,1245,,,T14/CCR 4326(a)
1,,,564,2013,1,2013-01-02,M,Unknown,Public Complaint,1464,1245,,,Public Complaint
2,,78.0,534,2013,1,2013-01-03,F,1934-06-03 00:00:00,Medical - Fall,1464,1245,Medical - Fall,,
3,F,,534,2013,1,2013-01-05,U,Unknown,Traffic Collision (Injury),1245,1075,,,Traffic Collision (Injury)
4,F,43.0,534,2013,1,2013-01-11,F,1969-11-10 00:00:00,Felony Warrant,1564,1245,,,Felony Warrant


In [281]:
#Create a new column called "MedicalBinary"
#MedicalBinary as true or false
#This feature simply indicates whether the report was a medical incident or not
#0 - Not a Medical Incident
#1 - Medical Incident


df['MedicalBinary'] = df['Medical']

df_notnull = df.notnull()
s,f = np.shape(df)
for i in range(s):
    if df_notnull.loc[i, 'MedicalBinary'] == True:
        df.loc[i, 'MedicalBinary'] = 1
    else:
        df.loc[i, 'MedicalBinary'] = 0
        
df['PC/CVCBinary'] = df['PC/CVC']
df.head()

df_notnull = df.notnull()
s,f = np.shape(df)
for i in range(s):
    if df_notnull.loc[i, 'PC/CVCBinary'] == True:
        df.loc[i, 'PC/CVCBinary'] = 1
    else:
        df.loc[i, 'PC/CVCBinary'] = 0        

        
df['OTHERBINARY'] = df['OTHER']

df_notnull = df.notnull()
s,f = np.shape(df)
for i in range(s):
    if df_notnull.loc[i, 'OTHERBINARY'] == True:
        df.loc[i, 'OTHERBINARY'] = 1
    else:
        df.loc[i, 'OTHERBINARY'] = 0          
df.head()

Unnamed: 0,M/F,Age,UnitNumber,ReportYear,ReportMonth,ReportDateFull,Gender,DOB,CodeSection,Badge,Supervisor,Medical,PC/CVC,OTHER,MedicalBinary,PC/CVCBinary,OTHERBINARY
0,M,37.0,534,2013,1,2013-01-01,M,1975-07-30 00:00:00,T14/CCR 4326(a),1433,1245,,,T14/CCR 4326(a),0,0,1
1,,,564,2013,1,2013-01-02,M,Unknown,Public Complaint,1464,1245,,,Public Complaint,0,0,1
2,,78.0,534,2013,1,2013-01-03,F,1934-06-03 00:00:00,Medical - Fall,1464,1245,Medical - Fall,,,1,0,0
3,F,,534,2013,1,2013-01-05,U,Unknown,Traffic Collision (Injury),1245,1075,,,Traffic Collision (Injury),0,0,1
4,F,43.0,534,2013,1,2013-01-11,F,1969-11-10 00:00:00,Felony Warrant,1564,1245,,,Felony Warrant,0,0,1


In [282]:
#Relabel Gender
#0 - Male
#1 - Female
#-1 - Unknown

s,f = np.shape(df)
for i in range(s):
    if df.loc[i, 'Gender'] == 'M':
        df.loc[i, 'Gender'] = 0
    elif df.loc[i, 'Gender'] == 'F':
        df.loc[i, 'Gender'] = 1
    else:
        df.loc[i, 'Gender'] = -1
df.head()

Unnamed: 0,M/F,Age,UnitNumber,ReportYear,ReportMonth,ReportDateFull,Gender,DOB,CodeSection,Badge,Supervisor,Medical,PC/CVC,OTHER,MedicalBinary,PC/CVCBinary,OTHERBINARY
0,M,37.0,534,2013,1,2013-01-01,0,1975-07-30 00:00:00,T14/CCR 4326(a),1433,1245,,,T14/CCR 4326(a),0,0,1
1,,,564,2013,1,2013-01-02,0,Unknown,Public Complaint,1464,1245,,,Public Complaint,0,0,1
2,,78.0,534,2013,1,2013-01-03,1,1934-06-03 00:00:00,Medical - Fall,1464,1245,Medical - Fall,,,1,0,0
3,F,,534,2013,1,2013-01-05,-1,Unknown,Traffic Collision (Injury),1245,1075,,,Traffic Collision (Injury),0,0,1
4,F,43.0,534,2013,1,2013-01-11,1,1969-11-10 00:00:00,Felony Warrant,1564,1245,,,Felony Warrant,0,0,1


In [290]:
#Relabel Misdeamenor/Felony
# 0 - Misdemeanor
# 1- Felony

for i in range(s):
    if df.loc[i, 'M/F'] == 'M':
        df.loc[i, 'M/F'] = 0
    elif df.loc[i, 'M/F'] == 'F':
        df.loc[i, 'M/F'] = 1
    else:
        df.loc[i, 'M/F'] = -1       


for i in range(s):
    if df.loc[i, 'DOB'] == 'Unknown':
        df.loc[i, 'DOB'] = -1
df.head()

Unnamed: 0,M/F,Age,UnitNumber,ReportYear,ReportMonth,ReportDateFull,Gender,DOB,CodeSection,Badge,Supervisor,Medical,PC/CVC,OTHER,MedicalBinary,PC/CVCBinary,OTHERBINARY
0,-1,37.0,534,2013,1,2013-01-01,0,1975-07-30 00:00:00,T14/CCR 4326(a),1433,1245,,,T14/CCR 4326(a),0,0,1
1,-1,,564,2013,1,2013-01-02,0,-1,Public Complaint,1464,1245,,,Public Complaint,0,0,1
2,-1,78.0,534,2013,1,2013-01-03,1,1934-06-03 00:00:00,Medical - Fall,1464,1245,Medical - Fall,,,1,0,0
3,-1,,534,2013,1,2013-01-05,-1,-1,Traffic Collision (Injury),1245,1075,,,Traffic Collision (Injury),0,0,1
4,-1,43.0,534,2013,1,2013-01-11,1,1969-11-10 00:00:00,Felony Warrant,1564,1245,,,Felony Warrant,0,0,1


In [7]:
#Change names of lifeguards under Badge Numbers to 1000
for i in range(s):
    if not isinstance(df.loc[i, 'Badge'], int):
        df.loc[i, 'Badge'] = 1000

# Predict Gender

In [115]:
import numpy as np
import sklearn.model_selection as ms

#Returns scores from KFold cross validation of n folds
#X_train - training data frame
#y_train - traininng labels data frame
#clf - Classifier
#n - Number of folds
def cross_val_scores(X_train, y_train, clf=None, n=5):
    scores = []
    kf = ms.KFold(n, shuffle=True)
    for train, validate in kf.split(X_train):
        clf.fit(X_train[train], y_train[train])
        scores.append(clf.score(X_train[validate], y_train[validate]))
    return scores

In [116]:
X = df.drop(["ReportDateFull", "DOB", "CodeSection","Medical", "Supervisor", "OTHER", "PC/CVC"], axis=1)

In [117]:
X_Gender = X[X.Gender != -1]
X = X.dropna()
X_Gender.head()

Unnamed: 0,M/F,Age,UnitNumber,ReportYear,ReportMonth,Gender,Badge,MedicalBinary
0,0,37.0,534,2013,1,0,1433,0
1,-1,,564,2013,1,0,1464,0
2,-1,78.0,534,2013,1,1,1464,1
4,1,43.0,534,2013,1,1,1564,0
5,1,43.0,534,2013,1,0,1564,0


In [118]:
X = X_Gender.drop(['Gender'], axis=1)
y = X_Gender['Gender']

In [119]:
X.head()

Unnamed: 0,M/F,Age,UnitNumber,ReportYear,ReportMonth,Badge,MedicalBinary
0,0,37.0,534,2013,1,1433,0
1,-1,,564,2013,1,1464,0
2,-1,78.0,534,2013,1,1464,1
4,1,43.0,534,2013,1,1564,0
5,1,43.0,534,2013,1,1564,0


In [120]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)
X_train.shape, y_train.shape

((1203, 7), (1203,))

In [121]:
# specify the layer sizes: 
layer_sizes = [X_train.shape[1], 10, 15, np.unique(y_train).shape[0]]  # 1 hidden layers of size 10 and 15, respectively

In [122]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD 

def build_model():
    model = Sequential()
    
    # Now that we have the model, let's add some layers:
    # First a fully-connected (Dense) hidden layer with appropriate input
    # dimension, 10 outputs, and ReLU activation
    model.add(Dense(
        input_dim=layer_sizes[0], 
        units=layer_sizes[1],
        activation="relu"
    ))
    
    model.add(Dense(
        input_dim=layer_sizes[1], 
        units=layer_sizes[2],
        activation="relu"))
    

    # Finally, add a readout layer, mapping the 5 hidden units
    # to two output units using the softmax function
    model.add(Dense(units=layer_sizes[3], 
                    kernel_initializer='uniform',
                    activation="softmax"))
    
    
    sgd = SGD(lr=0.001, decay=1e-7, momentum=.9)  # Stochastic gradient descent
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=["accuracy"])  
    # we'll have the categorical crossentropy as the loss function
    # we also want the model to automatically calculate accuracy
    return model

In [123]:
# Before we can fit the network, we have to one-hot vectorize our response.
# Fortunately, there is a keras method for that.
from keras.utils import to_categorical

y_train_vectorized = to_categorical(y_train)



In [124]:
model = build_model()

In [125]:
model.fit(X_train, y_train_vectorized, epochs=10, batch_size=2, verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x11b2b4a20>

In [126]:
#predict_proba or predict??
proba = model.predict(X_test, batch_size=32)
classes = np.argmax(proba, axis=1) # select the readout neuron with the highest probability
print(classes)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


## PREDICT AGE

In [127]:
def cross_val_scores(X_train, y_train, clf=None, n=5):
    scores = []
    kf = ms.KFold(n, shuffle=True)
    for train, validate in kf.split(X_train):
        clf.fit(X_train[train], y_train[train])
        scores.append(clf.score(X_train[validate], y_train[validate]))
    return scores

In [129]:
X = df.drop(["ReportDateFull", "DOB", "CodeSection","Medical", "Supervisor", "OTHER", "PC/CVC"], axis=1)
X.head()

Unnamed: 0,M/F,Age,UnitNumber,ReportYear,ReportMonth,Gender,Badge,MedicalBinary
0,0,37.0,534,2013,1,0,1433,0
1,-1,,564,2013,1,0,1464,0
2,-1,78.0,534,2013,1,1,1464,1
3,1,,534,2013,1,-1,1245,0
4,1,43.0,534,2013,1,1,1564,0


In [131]:
X_Age = X[X.Age != -1]
X = X.dropna(axis=1)
X_Age.head()

Unnamed: 0,M/F,Age,UnitNumber,ReportYear,ReportMonth,Gender,Badge,MedicalBinary
0,0,37.0,534,2013,1,0,1433,0
2,-1,78.0,534,2013,1,1,1464,1
4,1,43.0,534,2013,1,1,1564,0
5,1,43.0,534,2013,1,0,1564,0
6,1,46.0,534,2013,1,0,1433,0


In [132]:
X = X_Age.drop(['Age'], axis=1)
y = X_Gender['Age']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)
X_train.shape, y_train.shape

In [133]:
# specify the layer sizes: 
layer_sizes = [X_train.shape[1], 10, 15, np.unique(y_train).shape[0]]  # 1 hidden layers of size 10 and 15, respectively

In [134]:
# Before we can fit the network, we have to one-hot vectorize our response.
# Fortunately, there is a keras method for that.
from keras.utils import to_categorical

y_train_vectorized = to_categorical(y_train)



In [135]:
model = build_model()

In [136]:
model.fit(X_train, y_train_vectorized, epochs=10, batch_size=2, verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x11b3ddb00>

In [137]:
#predict_proba or predict??
proba = model.predict(X_test, batch_size=32)
classes = np.argmax(proba, axis=1) # select the readout neuron with the highest probability
print(classes)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


# Regression on Number of Incidents per year

In [254]:
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_classification, make_moons
from sklearn import preprocessing as preproc
from sklearn import pipeline as pipe
from sklearn import metrics
from sklearn import model_selection as ms
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [255]:
cmap = ListedColormap(["red", "blue"])

In [302]:
# Create a classification dataset
model = df.drop(['CodeSection','Badge','PC/CVC','OTHER','Medical','DOB','ReportDateFull','Age'],axis=1)
model = model[model['Supervisor'].notnull()]
model = model.dropna(axis=0)
X = model.drop(['Supervisor'],axis=1)
y = model["Supervisor"]
X.columns.to_series().groupby(df.dtypes).groups
X.head()
# We will run all of our fitting on X_train and y_train
# X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.2)
# We will save X_test and y_test for evaluation

Unnamed: 0,M/F,UnitNumber,ReportYear,ReportMonth,Gender,MedicalBinary,PC/CVCBinary,OTHERBINARY
0,-1,534,2013,1,0,0,0,1
1,-1,564,2013,1,0,0,0,1
2,-1,534,2013,1,1,1,0,0
3,-1,534,2013,1,-1,0,0,1
4,-1,534,2013,1,1,0,0,1


In [297]:
X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.1)

In [298]:
scaler = preproc.StandardScaler()
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [311]:
X_sc = scaler.fit_transform(X)  # DON'T DO THIS
X_train_sc = scaler.transform(X_train)  # scale X_train
X_test_sc = scaler.transform(X_test)  # scale X_test BASED ON X_train

In [322]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train_sc, y_train)
X_test_sc
# logreg.score(X_train_sc, X_train_sc)

array([[ 0.        ,  0.90874338, -0.47760796, ..., -0.74941921,
         1.2935975 , -0.60351654],
       [ 0.        ,  0.90874338,  0.3933242 , ..., -0.74941921,
        -0.77303798,  1.65695543],
       [ 0.        , -1.10042067, -1.34854011, ..., -0.74941921,
         1.2935975 , -0.60351654],
       ...,
       [ 0.        ,  0.90874338,  0.3933242 , ...,  1.33436665,
        -0.77303798, -0.60351654],
       [ 0.        ,  0.90874338, -0.47760796, ...,  1.33436665,
        -0.77303798, -0.60351654],
       [ 0.        , -1.10042067, -0.47760796, ..., -0.74941921,
        -0.77303798,  1.65695543]])