<a href="https://colab.research.google.com/github/uninstallit/ati580_final_project/blob/edvin-1/ati580_explore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Initial Setup**

-- Mount to google drive to load csv

In [1]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [2]:
# connect = Connect()
# mdb_client = connect.get_connection("POLICE_DATABASE")
# mdb_client.testdb
# mdb_database = mdb_client['POLICE_DATABASE'] 
# mdb_collection = mdb_database['POLICE_INTERVIEWS']
# mdb_client.testdb

# # delete all records
# mdb_collection.delete_many({})

In [3]:
# # import pandas as pd

# # Load csv dataset
# clean_data = pd.read_csv('/content/gdrive/My Drive/ati580_final_project/data/Stop_and_Search__Field_Interviews_Cleaned.csv')
# print(clean_data.head())

# # insert collection
# clean_data = clean_data.to_dict("records")
# # Insert collection
# mdb_collection.insert_many(clean_data)

In [4]:
# pip freeze --local > /content/gdrive/My\ Drive/requirements.txt

In [5]:
# pip install --upgrade --force-reinstall `cat/content/gdrive/My\ Drive/requirements.txt`

In [6]:
# pip install dnspython

In [7]:
# pip install tf-nightly

### **Framework**

-- Create classes

In [8]:
from pymongo import MongoClient

class Connect(object):

  @staticmethod    
  def get_connection(database):
    username = "mdbUser"
    password = "ati580"
    return MongoClient('mongodb+srv://{}:{}@ati580-cluster.s5t5z.gcp.mongodb.net/{}?retryWrites=true&w=majority'.format(username, password, database))

In [101]:
from IPython.display import clear_output
import tensorflow as tf
import pandas as pd
import numpy as np
import pymongo
import datetime

class PoliceInterviews(object):

  def __init__(self, selected, num_rows, output, eval_percent, batch):
    self._columns = selected
    self._rows = num_rows
    # data model attributes
    self._output = output
    self._eval_percent = eval_percent
    self._batch = batch
    self._query_dataframe = pd.DataFrame([])
    # connect to database
    self._connect = Connect()
    self._mdb_client = self._connect.get_connection("POLICE_DATABASE")
    self._mdb_database = self._mdb_client['POLICE_DATABASE'] 
    self._mdb_collection = self._mdb_database['POLICE_INTERVIEWS']
    self._is_retrieved = False 

  # convert queries to dataframe or numpy array
  def query_and_convert(self, filter, projection, to="dataframe", rows=None):
    if rows is not None:
      _cursor = self._mdb_collection.find(filter, projection).limit(rows)
    else:
      _cursor = self._mdb_collection.find(filter, projection)
    _dataframe = pd.DataFrame(list(_cursor))
    if to == "dataframe":
      return _dataframe
    elif to == "numpy":
      _np_array = np.transpose(np.squeeze(_dataframe.to_numpy()))
      return _np_array
  
  # convert time to seconds 
  @staticmethod
  def time_to_int_seconds(datetime_str):
    time = datetime.datetime.strptime(datetime_str, '%m/%d/%Y %H:%M:%S %p').time()
    [hours, minutes, seconds] = [int(t) for t in str(time).split(':')]
    return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds).seconds

  # convert license tag to binary
  @staticmethod
  def instate_or_outofstate(license):
    if license == "LA":
      return 1
    return 0

  # convert stop description to binary
  @staticmethod
  def cause_or_suspicion(description):
    _probable_cause_list = ["CRIMINAL VIOLATION", 
                            "JUVENILE VIOLATION", 
                            "TRAFFIC VIOLATION", 
                            # "CALL FOR SERVICE", 
                            "OTHER", 
                            "PRESENT AT CRIME SCENE", 
                            "CITIZEN CONTACT"]
    if description in _probable_cause_list:
      return 0
    return 1

  def query_database(self):
    print("Pulling data ...")
    _filter = []
    _projection = dict({'_id':0})
    # create filter
    for key, value in self._columns.items():
      if value == 1:
        _filter.append({key:{"$exists": True}})
        _filter.append({key:{"$ne": np.nan}})
        _filter.append({key:{"$ne": ""}})
        _filter.append({key:{"$ne": None}})
        _filter.append({key:{"$ne": 0}})
    # create projection
    for key, value in self._columns.items():
      if value == 1:
        _projection[key] = value
    # query database
    self._query_dataframe = self.query_and_convert({"$and":_filter}, _projection, rows=self._rows)
    clear_output()

  def load_dataframe(self, split=True):
    if self._is_retrieved is False:
      self.query_database()
      self._is_retrieved = True
    # convert specific columns
    _dataframe = self._query_dataframe.copy()
    for key, value in self._columns.items():
      if value == 1 and key == 'EventDate':
        _dataframe['EventDate'] = self._query_dataframe['EventDate'].apply(lambda x: self.time_to_int_seconds(x))
      elif value == 1 and key == 'SubjectDriverLicState':
        _dataframe['SubjectDriverLicState'] = self._query_dataframe['SubjectDriverLicState'].apply(lambda x: self.instate_or_outofstate(x))
      elif value == 1 and key == 'StopDescription':
        _dataframe['StopDescription'] = self._query_dataframe['StopDescription'].apply(lambda x: self.cause_or_suspicion(x))
      elif value == 1 and key == 'Zip':
        _dataframe['Zip'] = self._query_dataframe['Zip'].apply(lambda x: int(x))
      elif value == 1 and key == 'VehicleYear':
        _dataframe['VehicleYear'] = self._query_dataframe['VehicleYear'].apply(lambda x: int(x))
      elif value == 1 and key == 'SubjectAge':
        _dataframe['SubjectAge'] = self._query_dataframe['SubjectAge'].apply(lambda x: int(x))
      elif value == 1 and key == 'SubjectHeight':
        _dataframe['SubjectHeight'] = self._query_dataframe['SubjectHeight'].apply(lambda x: int(x))
      elif value == 1 and key == 'SubjectWeight':
        _dataframe['SubjectWeight'] = self._query_dataframe['SubjectWeight'].apply(lambda x: int(x))
    # split data into train and evaluate set
    if split is True:
      _eval_df = _dataframe.sample(frac=self._eval_percent, random_state=1234)
      _train_df = _dataframe.drop(_eval_df.index)
      return _train_df, _eval_df
    return _dataframe

  # convert dataframe_to_dataset 
  @staticmethod
  def dataframe_to_dataset(dataframe, output):
    _dataframe = dataframe.copy()
    _labels    = _dataframe.pop(output)
    _dataset   = tf.data.Dataset.from_tensor_slices((dict(_dataframe), _labels))
    _dataset   = _dataset.shuffle(buffer_size=len(_dataframe))
    return _dataset

  def load_dataset(self):
    _train_df, _eval_df = self.load_dataframe()
    _labels = _eval_df["StopDescription"].value_counts()
    print(_labels)
    _train_ds = self.dataframe_to_dataset(_train_df, self._output)
    _eval_ds  = self.dataframe_to_dataset(_eval_df, self._output)
    # batch dataset
    _train_ds = _train_ds.batch(self._batch)
    _eval_ds  = _eval_ds.batch(self._batch)
    return _train_ds, _eval_ds


In [10]:
import plotly.graph_objects as go

class Histogram(object):

  def __init__(self, title_text, xaxis_text):
    self._title_text = title_text
    self._xaxis_text = xaxis_text

    self._fig = go.Figure()
    self._trace_count = 0
    self._orange = '#FF8C00'
    self._blue   = '#0000FF'

  def _get_color(self):
    if self._trace_count % 2 == 1:
      return self._blue
    if self._trace_count % 2 == 0:
      return self._orange

  def add_trace(self, data=[], label=""):
    if len(data)!=0:
      self._trace_count = self._trace_count + 1
      self._fig.add_trace(go.Histogram(
          x=data,
          # histnorm='probability density',
          name=label,
          marker_color=self._get_color(),
          opacity=1
          ))
      self.update_layout()
        
  def update_layout(self):
    self._fig.update_layout(
        #barmode='stack',
        title=dict({
            'text': '<b>' + self._title_text + '</b>',
            'y':0.85,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font':dict({
                'color':"black",
                'size':14})}),
        xaxis_title_text=self._xaxis_text, 
        yaxis_title_text="Count",
        )
    
  def show_figure(self):
    self._fig.show()

In [11]:
import plotly.graph_objects as go

class BarChart(object):
  def __init__(self, data_df, pivot, title_text, xaxis_text):
    self._data_df = data_df
    self._pivot = pivot
    self._title_text = title_text
    self._xaxis_text = xaxis_text
    self._colors = ['#FF8C00','#0000FF']
    self._count = data_df[self._pivot].value_counts()
    
    self._fig = go.Figure(data=[go.Bar(
        x=self._count.index,
        y=self._count.values,
        marker_color=self._colors)])

    self._fig.update_layout(
        barmode='stack',
        title=dict({
            'text': '<b>' + self._title_text + '</b>',
            'y':0.85,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font':dict({
                'color':"black",
                'size':14})}),
        xaxis_title_text=self._xaxis_text, 
        yaxis_title_text="Count")
    self._fig.show()

In [12]:
import numpy as np
import pandas as pd

connect = Connect()
mdb_client = connect.get_connection("POLICE_DATABASE")
mdb_client.testdb
mdb_database = mdb_client['POLICE_DATABASE'] 
mdb_collection = mdb_database['POLICE_INTERVIEWS']

# convert queries to dataframe or numpy array
def query_and_convert(filter, projection, to="dataframe"):
  cursor    = mdb_collection.find(filter, projection)
  dataframe = pd.DataFrame(list(cursor))
  if to is "dataframe":
    return dataframe
  elif to is "numpy":
    np_array = np.transpose(np.squeeze(dataframe.to_numpy()))
    return np_array

In [13]:
from scipy.stats import ttest_ind_from_stats

class TTestProp(object):

  def __init__(self, white_pop, black_pop):
    self._white_pop = white_pop
    self._black_pop = black_pop
    self._obs_total  = self._white_pop + self._black_pop

    self._mean_white = self._white_pop / self._obs_total
    self._std_white  = self._mean_white * (1 - self._mean_white)
    self._mean_black = self._black_pop / self._obs_total
    self._std_black  = self._mean_black * (1 - self._mean_black)

    print("White -> Mean: {} Std.: {}".format(self._mean_white, self._std_white))
    print("Black -> Mean: {} Std.: {}".format(self._mean_black, self._std_black))

    # H0 and HA
    #	P1 - P2 = 0, 	P1 - P2 ≠ 0
    self._result = ttest_ind_from_stats(mean1=self._mean_white,
                                        std1=self._std_white, 
                                        nobs1=self._white_pop, 
                                        mean2=self._mean_black, 
                                        std2=self._std_black, 
                                        nobs2=self._black_pop)
    print(self._result)

    # ACS: Black: 59.74% White: 33.99%
    # conf. interval p +- 1.96(pq/n) ^ .5
    self._white_pq = self._mean_white * ( 1 - self._mean_white)
    self._white_lbound = self._mean_white - 1.96 * np.sqrt(self._white_pq/self._white_pop)
    self._white_ubound = self._mean_white + 1.96 * np.sqrt(self._white_pq/self._white_pop)

    self._black_pq = self._mean_black * ( 1 - self._mean_black)
    self._black_lbound = self._mean_black - 1.96 * np.sqrt(self._black_pq/self._black_pop)
    self._black_ubound = self._mean_black + 1.96 * np.sqrt(self._black_pq/self._black_pop)

    print("White 95% Conf -> Lower: {} Upper: {}, Sample: {}".format(self._white_lbound, self._white_ubound, self._white_pop))
    print("Black 95% Conf -> Lower: {} Upper: {}, Sample: {}".format(self._black_lbound, self._black_ubound, self._black_pop))


### **Basic Bar and Histogram Charts**

-- Plot location features
1.   *disctrict*
2.   *zone*
3.   *zip-code*

In [14]:
district = query_and_convert({
    "$and":[{"District":{"$exists": True}}, 
            {"District":{"$ne": ""}},
            {"District":{"$ne": None}},
            {"District":{"$ne": 0}} ]}, 
            {"_id":0, "District" : 1}, to="dataframe")

dist_chart = BarChart(district, "District", "District Feature", "Disctinct Values")

In [15]:
zone = query_and_convert({
    "$and":[{"Zone":{"$exists": True}}, 
            {"Zone":{"$ne": ""}},
            {"Zone":{"$ne": None}},
            {"Zone":{"$ne": 0}} ]}, 
            {"_id":0, "Zone" : 1}, to="dataframe")

zone_chart = BarChart(zone, "Zone", "Zone Feature", "Disctinct Values")

In [16]:
zipcode = query_and_convert({
    "$and":[{"Zip":{"$exists": True}}, 
            {"Zip":{"$ne": ""}},
            {"Zip":{"$ne": None}},
            {"Zip":{"$ne": 0}} ]}, 
            {"_id":0, "Zip" : 1}, to="dataframe")

zone_chart = BarChart(zipcode, "Zip", "Zip Code Feature", "Disctinct Values")

-- Plot vehicle features
1.   *make*
2.   *model*
3.   *style*
4.   *color*

In [17]:
make = query_and_convert({
    "$and":[{"VehicleMake":{"$exists": True}}, 
            {"VehicleMake":{"$ne": ""}},
            {"VehicleMake":{"$ne": None}},
            {"VehicleMake":{"$ne": 0}} ]}, 
            {"_id":0, "VehicleMake" : 1}, to="dataframe")

make_chart = BarChart(make, "VehicleMake", "Vehicle Make Feature", "Disctinct Values")

In [18]:
model = query_and_convert({
    "$and":[{"VehicleModel":{"$exists": True}}, 
            {"VehicleModel":{"$ne": ""}},
            {"VehicleModel":{"$ne": None}},
            {"VehicleModel":{"$ne": 0}},
            {"VehicleModel":{"$ne": "OTHER"}}]}, 
            {"_id":0, "VehicleModel" : 1}, to="dataframe")

model_chart = BarChart(model, "VehicleModel", "Vehicle Model Feature", "Disctinct Values")

In [19]:
style = query_and_convert({
    "$and":[{"VehicleStyle":{"$exists": True}}, 
            {"VehicleStyle":{"$ne": ""}},
            {"VehicleStyle":{"$ne": None}},
            {"VehicleStyle":{"$ne": 0}} ]}, 
            {"_id":0, "VehicleStyle" : 1}, to="dataframe")

style_chart = BarChart(style, "VehicleStyle", "Vehicle Style Feature", "Disctinct Values")

In [20]:
color = query_and_convert({
    "$and":[{"VehicleColor":{"$exists": True}}, 
            {"VehicleColor":{"$ne": ""}},
            {"VehicleColor":{"$ne": None}},
            {"VehicleColor":{"$ne": 0}} ]}, 
            {"_id":0, "VehicleColor" : 1}, to="dataframe")

color_chart = BarChart(color, "VehicleColor", "Vehicle Color Feature", "Disctinct Values")

-- Plot physical features
1.   *height*
2.   *weight*
3.   *hair color*

In [21]:
race = query_and_convert({
    "$and":[{"SubjectRace":{"$exists": True}}, 
            {"SubjectRace":{"$ne": ""}},
            {"SubjectRace":{"$ne": None}},
            {"SubjectRace":{"$ne": 0}} ]}, 
            {"_id":0, "SubjectRace" : 1}, to="dataframe")

race_chart = BarChart(race, "SubjectRace", "Subject Race Feature", "Disctinct Values")

In [22]:
age = query_and_convert({
    "$and":[{"SubjectAge":{"$exists": True}}, 
            {"SubjectAge":{"$ne": ""}},
            {"SubjectAge":{"$ne": None}},
            {"SubjectAge":{"$ne": 0}} ]}, 
            {"_id":0, "SubjectAge" : 1}, to="dataframe")

race_chart = BarChart(age, "SubjectAge", "Subject Age Feature", "Disctinct Values")

In [23]:
height = query_and_convert({
    "$and":[{"SubjectHeight":{"$exists": True}}, 
            {"SubjectHeight":{"$ne": ""}},
            {"SubjectHeight":{"$ne": None}},
            {"SubjectHeight":{"$ne": 0}},
            {"SubjectHeight":{"$lt": 100}},
            {"SubjectHeight":{"$gt": 40}} ]}, 
            {"_id":0, "SubjectHeight" : 1}, to="numpy")

weight = query_and_convert({
    "$and":[{"SubjectWeight":{"$exists": True}}, 
            {"SubjectWeight":{"$ne": ""}},
            {"SubjectWeight":{"$ne": None}},
            {"SubjectWeight":{"$ne": 0}},
            {"SubjectWeight":{"$lt": 250}},
            {"SubjectWeight":{"$gt": 50}} ]}, 
            {"_id":0, "SubjectWeight" : 1}, to="numpy")

hw_hist = Histogram("Subject Height and Weight Features", "Range")
hw_hist.add_trace(data=height, label="Height")
hw_hist.add_trace(data=weight, label="Weight")
hw_hist.show_figure()

In [24]:
hair = query_and_convert({
    "$and":[{"SubjectHairColor":{"$exists": True}}, 
            {"SubjectHairColor":{"$ne": ""}},
            {"SubjectHairColor":{"$ne": None}},
            {"SubjectHairColor":{"$ne": 0}} ]}, 
            {"_id":0, "SubjectHairColor" : 1}, to="dataframe")

hair_chart = BarChart(hair, "SubjectHairColor", "Subject Hair Color Feature", "Disctinct Values")

### **Correlation**

In [102]:
# all options - set value to 1 to include in query
columns = dict({
    'FieldInterviewID':0,     
    'NOPD_Item':0,            
    'EventDate':1,            
    'District':1,             
    'Zone':1,                 
    'OfficerAssignment':0, 
    'StopDescription':1, 
    'ActionsTaken':0, 
    'VehicleYear':1, 
    'VehicleMake':1, 
    'VehicleModel':1, 
    'VehicleStyle':0, 
    'VehicleColor':1, 
    'SubjectID':0, 
    'SubjectRace':1, 
    'SubjectGender':1, 
    'SubjectAge':1, 
    'SubjectHasPhotoID':0, 
    'SubjectHeight':1, 
    'SubjectWeight':1, 
    'SubjectEyeColor':0, 
    'SubjectHairColor':1, 
    'SubjectDriverLicState':0, 
    'CreatedDateTime':0, 
    'LastModifiedDateTime':0, 
    'Longitude':1, 
    'Latitude':1, 
    'Zip':0, 
    'BlockAddress':0})

# pull data from db and convert
police_interviews = PoliceInterviews(columns, num_rows=None, output="StopDescription", eval_percent=0.2, batch=25)

In [96]:
# visual check
explore_df = police_interviews.load_dataframe(split=False)
print('{:<25} {} \n'.format("Column Name", "Sample"))
for (column_name, column_data) in explore_df.iteritems():
  print('{:<25} {} - ({})'.format(column_name, column_data[0], type(column_data[0])))

Column Name               Sample 

EventDate                 6840 - (<class 'numpy.int64'>)
District                  6 - (<class 'numpy.int64'>)
Zone                      F - (<class 'str'>)
StopDescription           0 - (<class 'numpy.int64'>)
VehicleYear               2000 - (<class 'numpy.int64'>)
VehicleMake               NISSAN - (<class 'str'>)
VehicleModel              ALTIMA - (<class 'str'>)
VehicleColor              GRAY - (<class 'str'>)
SubjectRace               BLACK - (<class 'str'>)
SubjectGender             FEMALE - (<class 'str'>)
SubjectAge                26 - (<class 'numpy.int64'>)
SubjectHeight             66 - (<class 'numpy.int64'>)
SubjectWeight             140 - (<class 'numpy.int64'>)
SubjectHairColor          Black - (<class 'str'>)
Longitude                 -90.10805312379999 - (<class 'numpy.float64'>)
Latitude                  29.940391628 - (<class 'numpy.float64'>)


In [27]:
from sklearn.preprocessing import LabelEncoder

explore_df_encoded = explore_df.copy()

# convert categorical to numerical encoding
labeled = ["Zone", 
           "VehicleMake",
           "VehicleModel",
           "VehicleColor",            
           "SubjectRace",
           "SubjectGender",
           "SubjectHairColor"]

explore_df_encoded[labeled] = explore_df[labeled].apply(LabelEncoder().fit_transform)
explore_df_encoded.corr()

Unnamed: 0,EventDate,District,Zone,StopDescription,VehicleYear,VehicleMake,VehicleModel,VehicleColor,SubjectRace,SubjectGender,SubjectAge,SubjectHeight,SubjectWeight,SubjectHairColor,Longitude,Latitude
EventDate,1.0,0.019433,-0.009864,-0.004415,-0.00454,-0.001421,0.002236,0.007421,-0.007263,-0.008807,0.013118,-0.006307,-0.00152,0.001043,0.022843,-0.00603
District,0.019433,1.0,-0.342899,-0.005051,0.008006,-0.017194,0.009017,0.006087,-0.034745,0.024011,-0.003561,0.014069,0.017164,-0.024651,0.453636,0.107109
Zone,-0.009864,-0.342899,1.0,-0.003739,0.001937,-0.003761,-0.01519,0.000148,-0.112386,-0.014277,-0.031342,-0.007065,-0.013378,-0.034163,-0.127698,0.279855
StopDescription,-0.004415,-0.005051,-0.003739,1.0,-0.006403,-0.015644,-0.004723,-0.001401,-0.02314,0.039992,-0.031128,0.021263,-0.005247,-0.014653,0.013259,0.018302
VehicleYear,-0.00454,0.008006,0.001937,-0.006403,1.0,0.057896,0.006603,-0.024186,0.02256,-0.076621,-0.02064,-0.03355,-0.015702,-0.019689,-0.02697,-0.025471
VehicleMake,-0.001421,-0.017194,-0.003761,-0.015644,0.057896,1.0,-0.075092,-0.01434,0.071894,-0.092347,-0.043195,-0.077955,-0.085916,0.014707,-0.034132,-0.019466
VehicleModel,0.002236,0.009017,-0.01519,-0.004723,0.006603,-0.075092,1.0,0.009231,0.040193,0.030982,0.085019,0.029427,0.050912,0.035532,-0.001954,-0.009654
VehicleColor,0.007421,0.006087,0.000148,-0.001401,-0.024186,-0.01434,0.009231,1.0,0.018047,0.009718,0.024355,-0.003406,0.003476,0.020069,0.003726,0.004212
SubjectRace,-0.007263,-0.034745,-0.112386,-0.02314,0.02256,0.071894,0.040193,0.018047,1.0,0.022004,0.088557,0.041415,-0.057384,0.391526,-0.193526,-0.104915
SubjectGender,-0.008807,0.024011,-0.014277,0.039992,-0.076621,-0.092347,0.030982,0.009718,0.022004,1.0,0.061909,0.618811,0.31457,0.025589,-0.009147,0.004359


### **Feature Importance**

Linear Feature Importance

-- *Compute the ANOVA F-value for the provided sample.*

In [28]:
# prepare data
x_train = explore_df_encoded.copy()
y_train = x_train.pop("StopDescription")

In [29]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# source: https://machinelearningmastery.com/feature-selection-with-categorical-data/

selector = SelectKBest(score_func=f_classif, k='all')
selector.fit(x_train, y_train)
selector_df = pd.DataFrame(columns=["Name", "Score"])

for i, name in enumerate(x_train.columns):
  selector_df.loc[i] = name, selector.scores_[i]

selector_df = selector_df.sort_values(by=['Score'], ascending=True)
fig = go.Figure(data=[go.Bar(
    x=selector_df["Score"].values,
    y=selector_df["Name"].values,
    marker_color='#0000FF',
    orientation='h')])
fig.update_layout(
    barmode='stack',
    title=dict({
        'text': '<b>' + "Linear Feature Importance" + '</b>',
        'y':0.85,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font':dict({
            'color':"black",
            'size':14})}),
        xaxis_title_text="Feature", 
        yaxis_title_text="Score")
fig.show()

Non-Linear Feature Importance

-- *Use XGBoost Classifier*

In [30]:
from xgboost import XGBClassifier

# api: https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBRFClassifier

# Importance is calculated for a single decision tree by the amount that each attribute split 
# point improves the performance measure, weighted by the number of observations the node is responsible for.

model = XGBClassifier()
model.fit(x_train, y_train)
importance = model.feature_importances_
importance_df = pd.DataFrame(columns=["Name", "Score"])

for i, name in enumerate(x_train.columns):
  importance_df.loc[i] = name, importance[i]

importance_df = importance_df.sort_values(by=['Score'], ascending=True)
fig = go.Figure(data=[go.Bar(
    x=importance_df["Score"].values,
    y=importance_df["Name"].values,
    marker_color='#FF8C00',
    orientation='h')])
fig.update_layout(
    barmode='stack',
    title=dict({
        'text': '<b>' + "Non-Linear Feature Importance" + '</b>',
        'y':0.85,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font':dict({
            'color':"black",
            'size':14})}),
        xaxis_title_text="Feature", 
        yaxis_title_text="Score")
fig.show()

### **Hypothesis Testing By Location**

In [31]:
probable_cause_list = ["CRIMINAL VIOLATION", "JUVENILE VIOLATION", "TRAFFIC VIOLATION", "CALL FOR SERVICE", "OTHER", "PRESENT AT CRIME SCENE", "CITIZEN CONTACT"]

-- reasonable suspicion | gender and race

In [32]:
from scipy.stats import ttest_ind_from_stats

dist8_white_male = query_and_convert({
    "$and":[{"District":{"$exists": True}}, 
            {"District":{"$ne": ""}},
            {"District":{"$ne": None}},
            {"District":{"$ne": 0}},
            {"District":{"$eq": 8}},
            {"SubjectRace":{"$eq": "WHITE"}},
            {"SubjectGender":{"$eq": "MALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="numpy")

dist8_black_male = query_and_convert({
    "$and":[{"District":{"$exists": True}}, 
            {"District":{"$ne": ""}},
            {"District":{"$ne": None}},
            {"District":{"$ne": 0}},
            {"District":{"$eq": 8}},
            {"SubjectRace":{"$eq": "BLACK"}},
            {"SubjectGender":{"$eq": "MALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="numpy")

district_hist = Histogram("Reasonable Suspiscion | Race and Gender in District 8", "Stop Description")
district_hist.add_trace(data=dist8_white_male, label="White Male")
district_hist.add_trace(data=dist8_black_male, label="Black Male")
district_hist.show_figure()

In [33]:
dist8_white_male = query_and_convert({
    "$and":[{"District":{"$exists": True}}, 
            {"District":{"$ne": ""}},
            {"District":{"$ne": None}},
            {"District":{"$ne": 0}},
            {"District":{"$eq": 8}},
            {"SubjectRace":{"$eq": "WHITE"}},
            {"SubjectGender":{"$eq": "MALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="dataframe")

dist8_black_male = query_and_convert({
    "$and":[{"District":{"$exists": True}}, 
            {"District":{"$ne": ""}},
            {"District":{"$ne": None}},
            {"District":{"$ne": 0}},
            {"District":{"$eq": 8}},
            {"SubjectRace":{"$eq": "BLACK"}},
            {"SubjectGender":{"$eq": "MALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="dataframe")

# sum probable cause occurences 
dist8_white_male = dist8_white_male["StopDescription"].value_counts().sum()
dist8_black_male = dist8_black_male["StopDescription"].value_counts().sum()
# ttest proportions
ttest = TTestProp(dist8_white_male, dist8_black_male)

White -> Mean: 0.4207147814018043 Std.: 0.24371385411183633
Black -> Mean: 0.5792852185981957 Std.: 0.24371385411183633
Ttest_indResult(statistic=-42.23813443946431, pvalue=0.0)
White 95% Conf -> Lower: 0.4093704297215406 Upper: 0.432059133082068, Sample: 7275
Black 95% Conf -> Lower: 0.5696174255915757 Upper: 0.5889530116048157, Sample: 10017


In [34]:
from scipy.stats import ttest_ind_from_stats

dist8_white_female = query_and_convert({
    "$and":[{"District":{"$exists": True}}, 
            {"District":{"$ne": ""}},
            {"District":{"$ne": None}},
            {"District":{"$ne": 0}},
            {"District":{"$eq": 8}},
            {"SubjectRace":{"$eq": "WHITE"}},
            {"SubjectGender":{"$eq": "FEMALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="numpy")

dist8_black_female = query_and_convert({
    "$and":[{"District":{"$exists": True}}, 
            {"District":{"$ne": ""}},
            {"District":{"$ne": None}},
            {"District":{"$ne": 0}},
            {"District":{"$eq": 8}},
            {"SubjectRace":{"$eq": "BLACK"}},
            {"SubjectGender":{"$eq": "FEMALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="numpy")

district_hist = Histogram("Reasonable Suspiscion | Race and Gender in District 8", "Stop Description")
district_hist.add_trace(data=dist8_white_female, label="White Female")
district_hist.add_trace(data=dist8_black_female, label="Black Female")
district_hist.show_figure()

In [35]:
dist8_white_female = query_and_convert({
    "$and":[{"District":{"$exists": True}}, 
            {"District":{"$ne": ""}},
            {"District":{"$ne": None}},
            {"District":{"$ne": 0}},
            {"District":{"$eq": 8}},
            {"SubjectRace":{"$eq": "WHITE"}},
            {"SubjectGender":{"$eq": "FEMALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="dataframe")

dist8_black_female = query_and_convert({
    "$and":[{"District":{"$exists": True}}, 
            {"District":{"$ne": ""}},
            {"District":{"$ne": None}},
            {"District":{"$ne": 0}},
            {"District":{"$eq": 8}},
            {"SubjectRace":{"$eq": "BLACK"}},
            {"SubjectGender":{"$eq": "FEMALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="dataframe")

# sum probable cause occurences 
dist8_white_female = dist8_white_female["StopDescription"].value_counts().sum()
dist8_black_female = dist8_black_female["StopDescription"].value_counts().sum()
# ttest proportions
ttest = TTestProp(dist8_white_female, dist8_black_female)

White -> Mean: 0.4711507810009563 Std.: 0.2491677225631452
Black -> Mean: 0.5288492189990437 Std.: 0.2491677225631452
Ttest_indResult(statistic=-6.474040682794434, pvalue=1.1035193357204706e-10)
White 95% Conf -> Lower: 0.44570213118606566 Upper: 0.49659943081584695, Sample: 1478
Black 95% Conf -> Lower: 0.5048289000560966 Upper: 0.5528695379419908, Sample: 1659


### **Hypothesis Testing By Vehicle Attributes**

In [36]:
chevy_white_male = query_and_convert({
    "$and":[{"VehicleMake":{"$exists": True}}, 
            {"VehicleMake":{"$ne": ""}},
            {"VehicleMake":{"$ne": None}},
            {"VehicleMake":{"$ne": 0}},
            {"VehicleMake":{"$eq": "CHEVROLET"}},
            {"SubjectRace":{"$eq": "WHITE"}},
            {"SubjectGender":{"$eq": "MALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="numpy")

chevy_black_male = query_and_convert({
    "$and":[{"VehicleMake":{"$exists": True}}, 
            {"VehicleMake":{"$ne": ""}},
            {"VehicleMake":{"$ne": None}},
            {"VehicleMake":{"$ne": 0}},
            {"VehicleMake":{"$eq": "CHEVROLET"}},
            {"SubjectRace":{"$eq": "BLACK"}},
            {"SubjectGender":{"$eq": "MALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="numpy")

district_hist = Histogram("Reasonable Suspiscion | Race and Gender in Chevrolet", "Stop Description")
district_hist.add_trace(data=chevy_white_male, label="White Male")
district_hist.add_trace(data=chevy_black_male, label="Black Male")
district_hist.show_figure()

In [37]:
chevy_white_male = query_and_convert({
    "$and":[{"VehicleMake":{"$exists": True}}, 
            {"VehicleMake":{"$ne": ""}},
            {"VehicleMake":{"$ne": None}},
            {"VehicleMake":{"$ne": 0}},
            {"VehicleMake":{"$eq": "CHEVROLET"}},
            {"SubjectRace":{"$eq": "WHITE"}},
            {"SubjectGender":{"$eq": "MALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="dataframe")

chevy_black_male = query_and_convert({
    "$and":[{"VehicleMake":{"$exists": True}}, 
            {"VehicleMake":{"$ne": ""}},
            {"VehicleMake":{"$ne": None}},
            {"VehicleMake":{"$ne": 0}},
            {"VehicleMake":{"$eq": "CHEVROLET"}},
            {"SubjectRace":{"$eq": "BLACK"}},
            {"SubjectGender":{"$eq": "MALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="dataframe")

# sum probable cause occurences 
chevy_white_male = chevy_white_male["StopDescription"].value_counts().sum()
chevy_black_male = chevy_black_male["StopDescription"].value_counts().sum()
# ttest proportions
ttest = TTestProp(chevy_white_male, chevy_black_male)

White -> Mean: 0.18825781748564135 Std.: 0.15281681164118432
Black -> Mean: 0.8117421825143587 Std.: 0.15281681164118427
Ttest_indResult(statistic=-63.13562707320548, pvalue=0.0)
White 95% Conf -> Lower: 0.14364798108455784 Upper: 0.23286765388672487, Sample: 295
Black 95% Conf -> Lower: 0.7902590264753397 Upper: 0.8332253385533777, Sample: 1272


In [38]:
chevy_white_female = query_and_convert({
    "$and":[{"VehicleMake":{"$exists": True}}, 
            {"VehicleMake":{"$ne": ""}},
            {"VehicleMake":{"$ne": None}},
            {"VehicleMake":{"$ne": 0}},
            {"VehicleMake":{"$eq": "CHEVROLET"}},
            {"SubjectRace":{"$eq": "WHITE"}},
            {"SubjectGender":{"$eq": "FEMALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="numpy")

chevy_black_female = query_and_convert({
    "$and":[{"VehicleMake":{"$exists": True}}, 
            {"VehicleMake":{"$ne": ""}},
            {"VehicleMake":{"$ne": None}},
            {"VehicleMake":{"$ne": 0}},
            {"VehicleMake":{"$eq": "CHEVROLET"}},
            {"SubjectRace":{"$eq": "BLACK"}},
            {"SubjectGender":{"$eq": "FEMALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="numpy")

district_hist = Histogram("Reasonable Suspiscion | Race and Gender in Chevrolet", "Stop Description")
district_hist.add_trace(data=chevy_white_female, label="White Female")
district_hist.add_trace(data=chevy_black_female, label="Black Female")
district_hist.show_figure()

In [39]:
chevy_white_female = query_and_convert({
    "$and":[{"VehicleMake":{"$exists": True}}, 
            {"VehicleMake":{"$ne": ""}},
            {"VehicleMake":{"$ne": None}},
            {"VehicleMake":{"$ne": 0}},
            {"VehicleMake":{"$eq": "CHEVROLET"}},
            {"SubjectRace":{"$eq": "WHITE"}},
            {"SubjectGender":{"$eq": "FEMALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="dataframe")

chevy_black_female = query_and_convert({
    "$and":[{"VehicleMake":{"$exists": True}}, 
            {"VehicleMake":{"$ne": ""}},
            {"VehicleMake":{"$ne": None}},
            {"VehicleMake":{"$ne": 0}},
            {"VehicleMake":{"$eq": "CHEVROLET"}},
            {"SubjectRace":{"$eq": "BLACK"}},
            {"SubjectGender":{"$eq": "FEMALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="dataframe")

# sum probable cause occurences 
chevy_white_female = chevy_white_female["StopDescription"].value_counts().sum()
chevy_black_female = chevy_black_female["StopDescription"].value_counts().sum()
# ttest proportions
ttest = TTestProp(chevy_white_female, chevy_black_female)

White -> Mean: 0.20844327176781002 Std.: 0.16499467422254094
Black -> Mean: 0.7915567282321899 Std.: 0.16499467422254094
Ttest_indResult(statistic=-27.947161567186868, pvalue=6.708674351083374e-94)
White 95% Conf -> Lower: 0.11887019570356201 Upper: 0.29801634783205805, Sample: 79
Black 95% Conf -> Lower: 0.7455913956254919 Upper: 0.8375220608388879, Sample: 300


### **Hypothesis Testing by Physical Attributes**

In [40]:
age_white_male = query_and_convert({
    "$and":[{"SubjectAge":{"$exists": True}}, 
            {"SubjectAge":{"$ne": ""}},
            {"SubjectAge":{"$ne": None}},
            {"SubjectAge":{"$ne": 0}},
            {"SubjectAge":{"$gt": 20}},
            {"SubjectAge":{"$lt": 30}},
            {"SubjectRace":{"$eq": "WHITE"}},
            {"SubjectGender":{"$eq": "MALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="numpy")

age_black_male = query_and_convert({
    "$and":[{"SubjectAge":{"$exists": True}}, 
            {"SubjectAge":{"$ne": ""}},
            {"SubjectAge":{"$ne": None}},
            {"SubjectAge":{"$ne": 0}},
            {"SubjectAge":{"$gt": 20}},
            {"SubjectAge":{"$lt": 30}},
            {"SubjectRace":{"$eq": "BLACK"}},
            {"SubjectGender":{"$eq": "MALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="numpy")

district_hist = Histogram("Reasonable Suspiscion | Race and Gender between Age 20 to 25", "Stop Description")
district_hist.add_trace(data=age_white_male, label="White Male")
district_hist.add_trace(data=age_black_male, label="Black Male")
district_hist.show_figure()

In [41]:
age_white_male = query_and_convert({
    "$and":[{"SubjectAge":{"$exists": True}}, 
            {"SubjectAge":{"$ne": ""}},
            {"SubjectAge":{"$ne": None}},
            {"SubjectAge":{"$ne": 0}},
            {"SubjectAge":{"$gt": 20}},
            {"SubjectAge":{"$lt": 30}},
            {"SubjectRace":{"$eq": "WHITE"}},
            {"SubjectGender":{"$eq": "MALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="dataframe")

age_black_male = query_and_convert({
    "$and":[{"SubjectAge":{"$exists": True}}, 
            {"SubjectAge":{"$ne": ""}},
            {"SubjectAge":{"$ne": None}},
            {"SubjectAge":{"$ne": 0}},
            {"SubjectAge":{"$gt": 20}},
            {"SubjectAge":{"$lt": 30}},
            {"SubjectRace":{"$eq": "BLACK"}},
            {"SubjectGender":{"$eq": "MALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="dataframe")

# sum probable cause occurences 
age_white_male = age_white_male["StopDescription"].value_counts().sum()
age_black_male = age_black_male["StopDescription"].value_counts().sum()
# ttest proportions
ttest = TTestProp(age_white_male, age_black_male)

White -> Mean: 0.22937255803263618 Std.: 0.1767607876542011
Black -> Mean: 0.7706274419673639 Std.: 0.17676078765420108
Ttest_indResult(statistic=-169.83742687334077, pvalue=0.0)
White 95% Conf -> Lower: 0.2163302705419744 Upper: 0.24241484552329795, Sample: 3992
Black 95% Conf -> Lower: 0.7635119937398517 Upper: 0.7777428901948761, Sample: 13412


In [42]:
age_white_female = query_and_convert({
    "$and":[{"SubjectAge":{"$exists": True}}, 
            {"SubjectAge":{"$ne": ""}},
            {"SubjectAge":{"$ne": None}},
            {"SubjectAge":{"$ne": 0}},
            {"SubjectAge":{"$gt": 20}},
            {"SubjectAge":{"$lt": 30}},
            {"SubjectRace":{"$eq": "WHITE"}},
            {"SubjectGender":{"$eq": "FEMALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="numpy")

age_black_female = query_and_convert({
    "$and":[{"SubjectAge":{"$exists": True}}, 
            {"SubjectAge":{"$ne": ""}},
            {"SubjectAge":{"$ne": None}},
            {"SubjectAge":{"$ne": 0}},
            {"SubjectAge":{"$gt": 20}},
            {"SubjectAge":{"$lt": 30}},
            {"SubjectRace":{"$eq": "BLACK"}},
            {"SubjectGender":{"$eq": "FEMALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="numpy")

district_hist = Histogram("Reasonable Suspiscion | Race and Gender between Age 20 to 25", "Stop Description")
district_hist.add_trace(data=age_white_female, label="White Female")
district_hist.add_trace(data=age_black_female, label="Black Female")
district_hist.show_figure()

In [43]:
age_white_female = query_and_convert({
    "$and":[{"SubjectAge":{"$exists": True}}, 
            {"SubjectAge":{"$ne": ""}},
            {"SubjectAge":{"$ne": None}},
            {"SubjectAge":{"$ne": 0}},
            {"SubjectAge":{"$gt": 20}},
            {"SubjectAge":{"$lt": 30}},
            {"SubjectRace":{"$eq": "WHITE"}},
            {"SubjectGender":{"$eq": "FEMALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="dataframe")

age_black_female = query_and_convert({
    "$and":[{"SubjectAge":{"$exists": True}}, 
            {"SubjectAge":{"$ne": ""}},
            {"SubjectAge":{"$ne": None}},
            {"SubjectAge":{"$ne": 0}},
            {"SubjectAge":{"$gt": 20}},
            {"SubjectAge":{"$lt": 30}},
            {"SubjectRace":{"$eq": "BLACK"}},
            {"SubjectGender":{"$eq": "FEMALE"}},
            {"StopDescription":{"$nin": probable_cause_list}} ]}, 
            {"_id":0, "StopDescription" : 1}, to="dataframe")

# sum probable cause occurences 
age_white_female = age_white_female["StopDescription"].value_counts().sum()
age_black_female = age_black_female["StopDescription"].value_counts().sum()
# ttest proportions
ttest = TTestProp(age_white_female, age_black_female)

White -> Mean: 0.30650224215246635 Std.: 0.21255861770797724
Black -> Mean: 0.6934977578475336 Std.: 0.21255861770797724
Ttest_indResult(statistic=-56.05753586449956, pvalue=0.0)
White 95% Conf -> Lower: 0.28206168102196477 Upper: 0.33094280328296793, Sample: 1367
Black 95% Conf -> Lower: 0.6772495498507718 Upper: 0.7097459658442955, Sample: 3093


### **Inference Models**

In [None]:
train_ds, eval_ds = police_interviews.load_dataset()

In [109]:
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.layers.experimental.preprocessing import CategoryEncoding
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow.keras import layers
from tensorflow import keras

def encode_numerical_feature(feature, name, dataset):
    # Create a Normalization layer for our feature
    normalizer = Normalization()
    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))
    # Learn the statistics of the data
    normalizer.adapt(feature_ds)
    # Normalize the input feature
    encoded_feature = normalizer(feature)
    return encoded_feature

def encode_string_categorical_feature(feature, name, dataset):
    # Create a StringLookup layer which will turn strings into integer indices
    index = StringLookup()
    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))
    # Learn the set of possible string values and assign them a fixed integer index
    index.adapt(feature_ds)
    # Turn the string input into integer indices
    encoded_feature = index(feature)
    # Create a CategoryEncoding for our integer indices
    encoder = CategoryEncoding(output_mode="binary")
    # Prepare a dataset of indices
    feature_ds = feature_ds.map(index)
    # Learn the space of possible indices
    encoder.adapt(feature_ds)
    # Apply one-hot encoding to our indices
    encoded_feature = encoder(encoded_feature)
    return encoded_feature

def encode_integer_categorical_feature(feature, name, dataset):
    # Create a CategoryEncoding for our integer indices
    encoder = CategoryEncoding(output_mode="binary")
    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))
    # Learn the space of possible indices
    encoder.adapt(feature_ds)
    # Apply one-hot encoding to our indices
    encoded_feature = encoder(feature)
    return encoded_feature

# categorical features encoded as integers
# inout_state  = keras.Input(shape=(1,), name="SubjectDriverLicState", dtype="int64")
district     = keras.Input(shape=(1,), name="District", dtype="int64")
vehicle_year = keras.Input(shape=(1,), name="VehicleYear", dtype="int64")
# # zip          = keras.Input(shape=(1,), name="Zip", dtype="int64")
# # output
# # stop_description = keras.Input(shape=(1,), name="StopDescription", dtype="int64")

# # categorical feature encoded as string
zone             = keras.Input(shape=(1,), name="Zone", dtype="string")
vehicle_make     = keras.Input(shape=(1,), name="VehicleMake", dtype="string")
vehicle_model    = keras.Input(shape=(1,), name="VehicleModel", dtype="string")
# vehicle_style    = keras.Input(shape=(1,), name="VehicleStyle", dtype="string")
vehicle_color    = keras.Input(shape=(1,), name="VehicleColor", dtype="string")
subject_race     = keras.Input(shape=(1,), name="SubjectRace", dtype="string")
subject_gender   = keras.Input(shape=(1,), name="SubjectGender", dtype="string")
hair_color       = keras.Input(shape=(1,), name="SubjectHairColor", dtype="string")

# # numerical features
time             = keras.Input(shape=(1,), name="EventDate")
subject_age      = keras.Input(shape=(1,), name="SubjectAge")
subject_height   = keras.Input(shape=(1,), name="SubjectHeight")
subject_weight   = keras.Input(shape=(1,), name="SubjectWeight")
longitude        = keras.Input(shape=(1,),   name="Longitude")
latitude         = keras.Input(shape=(1,), name="Latitude")

all_inputs = [# inout_state,
              district, 
              vehicle_year, 
              # # zip, 
              zone, 
              vehicle_make, 
              vehicle_model, 
              # vehicle_style, 
              vehicle_color, 
              subject_race, 
              subject_gender, 
              hair_color, 
              time, 
              subject_age, 
              subject_height, 
              subject_weight,
              longitude,
              latitude]

In [110]:
# integer categorical features
# inout_state_encoded  = encode_integer_categorical_feature(inout_state, "SubjectDriverLicState", train_ds)
district_encoded     = encode_integer_categorical_feature(district, "District", train_ds)
vehicle_year_encoded = encode_integer_categorical_feature(vehicle_year, "VehicleYear", train_ds)
# zip_encoded          = encode_integer_categorical_feature(zip, "Zip", train_ds)

# string categorical features
zone_encoded             = encode_string_categorical_feature(zone, "Zone", train_ds)
vehicle_make_encoded     = encode_string_categorical_feature(vehicle_make, "VehicleMake", train_ds)
vehicle_model_encoded    = encode_string_categorical_feature(vehicle_model, "VehicleModel", train_ds)
# # vehicle_style_encoded    = encode_string_categorical_feature(vehicle_style, "VehicleStyle", train_ds)
vehicle_color_encoded    = encode_string_categorical_feature(vehicle_color, "VehicleColor", train_ds)
subject_race_encoded     = encode_string_categorical_feature(subject_race, "SubjectRace", train_ds)
subject_gender_encoded   = encode_string_categorical_feature(subject_gender, "SubjectGender", train_ds)
hair_color_encoded       = encode_string_categorical_feature(hair_color, "SubjectHairColor", train_ds)

# numerical features
time_encoded             = encode_numerical_feature(time, "EventDate", train_ds)
subject_age_encoded      = encode_numerical_feature(subject_age, "SubjectAge", train_ds)
subject_height_encoded   = encode_numerical_feature(subject_height, "SubjectHeight", train_ds)
subject_weight_encoded   = encode_numerical_feature(subject_weight, "SubjectWeight", train_ds)
longitude_encoded        = encode_numerical_feature(longitude, "Longitude", train_ds)
latitude_encoded         = encode_numerical_feature(latitude, "Latitude", train_ds)

all_features = layers.concatenate([
                                   # inout_state_encoded,
                                   district_encoded,
                                   vehicle_year_encoded,
                                  #  # zip_encoded,
                                   zone_encoded,
                                   vehicle_make_encoded,
                                   vehicle_model_encoded,
                                  #  # vehicle_style_encoded,
                                   vehicle_color_encoded,
                                   subject_race_encoded,
                                   subject_gender_encoded,
                                   hair_color_encoded,
                                   time_encoded,
                                   subject_age_encoded,
                                   subject_height_encoded,
                                   subject_weight_encoded,
                                   longitude_encoded,
                                   latitude_encoded])

In [111]:
# build model
x = layers.Dense(32, activation="relu")(all_features)
x = layers.Dropout(0.2)(x)
x = layers.Dense(16, activation="relu")(x)
x = layers.Dropout(0.2)(x)
output = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(all_inputs, output)
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])

In [112]:
# train model
model.fit(train_ds, epochs=20, validation_data=eval_ds)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f0b34de5f28>

### **Logistic Regression**
-- Binary Logistic Regression

In [120]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, normalize
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

def encode_cat_features(features, dataframe, encoder):
  _dataframe = dataframe.copy()
  _dataframe = dataframe[features]
  transform_array = encoder.transform(_dataframe).toarray()
  return np.transpose(transform_array)

def z_score_norm(x, mean, std):
  return ( float(x) - float(mean) ) / float(std)

def inv_z_score_norm(z, mean, std):
  return float(mean) + float(z) * float(std)

def encode_norm_features(features, dataframe, mean=None, std=None):
  _dataframe = dataframe.copy()
  _dataframe = _dataframe[features]
  for feature in features:
    _mean = mean[feature]
    _std  = std[feature]
    _dataframe[feature] = _dataframe[feature].apply(lambda x: z_score_norm(x, _mean, _std))
  return np.transpose(_dataframe.to_numpy())
  
# load data
police_df = police_interviews.load_dataframe(split=False)
police_label_df = police_df.pop("StopDescription")

# categorical features
categorical = ["Zone", 
               "VehicleMake", 
               "VehicleModel", 
               # "VehicleStyle",         
               "VehicleColor", 
               "SubjectRace", 
               "SubjectGender",
               "SubjectHairColor", 
               # "SubjectDriverLicState", 
               "District", 
               "VehicleYear", 
               # "Zip"
               ]      

numerical = ["EventDate", "SubjectAge", "SubjectHeight", "SubjectWeight", "Longitude", "Latitude"]

# one-hot encoder
# train on the whole set to avoid feature dim errors
onehot_encoder = OneHotEncoder(sparse=True)
onehot_encoder.fit(police_df[categorical])

# split the data into train and test
x_train, x_test, y_train, y_test = train_test_split(police_df, police_label_df, test_size=0.2, random_state=1234)

print(x_test)

# # categorical features encoding
# x_train_cat_encoded = encode_cat_features(categorical, x_train, onehot_encoder)
# x_test_cat_encoded  = encode_cat_features(categorical, x_test, onehot_encoder)

# # # numerical features encoding
# mean = x_train[numerical].mean(axis=0)
# std  = x_train[numerical].std(axis=0)
# x_train_num_encoded = encode_norm_features(numerical, x_train, mean, std)
# x_test_num_encoded  = encode_norm_features(numerical, x_test, mean, std)

# # # combine categorical and numerical
# x_train_combined = np.vstack([x_train_cat_encoded, x_train_num_encoded])
# x_train_combined = np.transpose(x_train_combined)
# x_test_combined = np.vstack([x_test_cat_encoded, x_test_num_encoded])
# x_test_combined = np.transpose(x_test_combined)

# # train model
# logistic_regression = LogisticRegression(random_state=0, max_iter=1000).fit(x_train_combined, y_train)

# # # accuracy on test set
# print(logistic_regression.score(x_train_combined, y_train))
# print(logistic_regression.score(x_test_combined, y_test))

        EventDate  District Zone  ...  SubjectHairColor  Longitude   Latitude
65573       32340         7    P  ...             Black -89.993705  30.029789
45998        8880         5    D  ...            Blonde -90.046003  29.967929
74468       46200         3    R  ...             Black -90.065501  30.016519
169056      18120         3    E  ...             Brown -90.109330  29.995127
246421      38820         8    D  ...            Blonde -90.060169  29.961411
...           ...       ...  ...  ...               ...        ...        ...
199719      40020         8    E  ...             Brown -90.057747  29.960877
65036       46440         4    D  ...             Black -90.007969  29.921966
69596       25560         6    H  ...             Black -90.083084  29.936994
77865       41940         4    D  ...             Black -90.007853  29.923775
215054      30180         6    F  ...             Black -90.085604  29.939926

[50480 rows x 15 columns]


### **Compare Models**

In [171]:
from sklearn.metrics import confusion_matrix

# predict classes
log_pred = logistic_regression.predict(x_test_combined[:5000, :])
expected = y_test[:5000].to_numpy()

# log_pred = logistic_regression.predict(x_test_combined)
# expected = y_test.to_numpy()

# confusion matrix
tn, fp, fn, tp = confusion_matrix(expected, log_pred, normalize=None).ravel()

print("True Negative:  ", tn) 
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Prositive: ", tp)

True Negative:   4868
False Positive:  0
False Negative:  132
True Prositive:  0


In [187]:
from sklearn.metrics import confusion_matrix

x_test_list = x_test[:5000].to_dict('r')

# x_dict = disct({})
# for row in x_test[:5000].iterrows():


nn_pred_2 = np.empty(5000, dtype=np.int)
for i, item in enumerate(x_test_list):
    input_dict = {name: tf.convert_to_tensor([value]) for name, value in item.items()}
    prediction = np.squeeze(model.predict(input_dict))
    if prediction >= 0.5:
      nn_pred_2[i] = int(1)
    else:
      nn_pred_2[i] = int(0)

print(nn_pred_2)
tn, fp, fn, tp = confusion_matrix(expected, nn_pred_2, normalize=None).ravel()
print("True Negative:  ", tn) 
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Prositive: ", tp)

[0 0 0 ... 0 0 0]
True Negative:   4867
False Positive:  1
False Negative:  130
True Prositive:  2
