<a href="https://colab.research.google.com/github/uninstallit/ati580_final_project/blob/edvin-1/ati580_explore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Initial Setup**

-- Mount Google Drive to load the CSV

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

-- Save libraries list in .txt 

In [None]:
# pip freeze --local > /content/gdrive/My\ Drive/colab_installed.txt

-- Install libraries from .txt list

In [None]:
# pip install --upgrade --force-reinstall `cat /content/gdrive/My\ Drive/colab_installed.txt`

-- Load the clean database

In [None]:
# db.drop_collection('addressbook')
# data.reset_index(inplace=True)
# data_dict = data.to_dict("records")
# collection.insert_many(data_dict)

### **Framework**

In [None]:
pip install dnspython

-- Create classes

In [1]:
from pymongo import MongoClient

class Connect(object):
  """Factory for MongoDB client connections to the ati580 Atlas cluster."""

  @staticmethod
  def get_connection(database):
    """Build a MongoClient for `database` on the ati580 cluster.

    Credentials are taken from the MONGODB_USERNAME / MONGODB_PASSWORD
    environment variables when they are set; otherwise the notebook's
    original values are used so existing runs keep working.

    Args:
      database: name of the database placed in the connection URI.

    Returns:
      The MongoClient created from the mongodb+srv connection URI.
    """
    import os
    from urllib.parse import quote_plus

    # NOTE(review): the fallback credentials are still committed with the
    # notebook; rotate them and drop the defaults once env vars are in place.
    username = os.environ.get("MONGODB_USERNAME", "mdbUser")
    password = os.environ.get("MONGODB_PASSWORD", "ati580")
    # Reserved characters (e.g. '@', ':', '/') must be percent-escaped in the URI.
    return MongoClient('mongodb+srv://{}:{}@ati580-cluster.s5t5z.gcp.mongodb.net/{}?retryWrites=true&w=majority'.format(
        quote_plus(username), quote_plus(password), database))

In [2]:
from IPython.display import clear_output
import tensorflow as tf
import pandas as pd
import numpy as np
import pymongo
import datetime

class PoliceInterviews(object):
  """Pulls selected columns of the POLICE_INTERVIEWS collection from MongoDB
  and converts them into pandas dataframes and batched tf.data datasets."""

  # columns whose raw values are cast with int() in load_dataframe
  _INTEGER_COLUMNS = ('Zip', 'VehicleYear', 'SubjectAge', 'SubjectHeight', 'SubjectWeight')

  def __init__(self, selected, num_rows, output, eval_percent, batch):
    """Store the data-model settings and open the database connection.

    Args:
      selected: dict of column name -> flag; columns flagged 1 are queried.
      num_rows: row limit applied to the query, or None for no limit.
      output: name of the column used as the label in the datasets.
      eval_percent: fraction of rows sampled into the evaluation set.
      batch: batch size applied to both datasets.
    """
    self._columns = selected
    self._rows = num_rows
    # data model attributes
    self._output = output
    self._eval_percent = eval_percent
    self._batch = batch
    self._query_dataframe = pd.DataFrame([])
    # connect to database
    self._connect = Connect()
    self._mdb_client = self._connect.get_connection("POLICE_DATABASE")
    self._mdb_database = self._mdb_client['POLICE_DATABASE']
    self._mdb_collection = self._mdb_database['POLICE_INTERVIEWS']
    self._is_retrieved = False

  # convert queries to dataframe or numpy array
  def query_and_convert(self, filter, projection, to="dataframe", rows=None):
    """Run a find() on the collection and return the result in the requested form.

    Args:
      filter: MongoDB query document.
      projection: MongoDB projection document.
      to: "dataframe" for a pandas DataFrame, "numpy" for a squeezed,
        transposed numpy array.
      rows: optional limit applied to the cursor.

    Raises:
      ValueError: if `to` is not one of the supported targets.
    """
    # reject unknown targets before querying instead of silently returning None
    if to not in ("dataframe", "numpy"):
      raise ValueError('to must be "dataframe" or "numpy", got {!r}'.format(to))
    _cursor = self._mdb_collection.find(filter, projection)
    if rows is not None:
      _cursor = _cursor.limit(rows)
    _dataframe = pd.DataFrame(list(_cursor))
    if to == "dataframe":
      return _dataframe
    return np.transpose(np.squeeze(_dataframe.to_numpy()))

  # convert time to seconds
  @staticmethod
  def time_to_int_seconds(datetime_str):
    """Return the time-of-day part of a 'MM/DD/YYYY HH:MM:SS AM|PM' string
    as seconds since midnight."""
    try:
      # %p only shifts the hour when it is paired with the 12-hour %I directive;
      # with %H the AM/PM marker is parsed but ignored, so PM times came out as AM.
      _parsed = datetime.datetime.strptime(datetime_str, '%m/%d/%Y %I:%M:%S %p')
    except ValueError:
      # hour outside 1-12: the value is already on a 24-hour clock
      _parsed = datetime.datetime.strptime(datetime_str, '%m/%d/%Y %H:%M:%S %p')
    return _parsed.hour * 3600 + _parsed.minute * 60 + _parsed.second

  # convert license tag to binary
  @staticmethod
  def instate_or_outofstate(license):
    """Return 1 when the license state is "LA", otherwise 0."""
    return 1 if license == "LA" else 0

  # convert stop description to binary
  @staticmethod
  def cause_or_suspicion(description):
    """Return 0 for the listed violation descriptions, otherwise 1."""
    _probable_cause_list = ("CRIMINAL VIOLATION", "JUVENILE VIOLATION", "TRAFFIC VIOLATION")
    return 0 if description in _probable_cause_list else 1

  def query_database(self):
    """Query every selected column, keeping only documents where each selected
    field exists and is not "", None or 0, and cache the result as a dataframe."""
    print("Pulling data ...")
    _filter = []
    _projection = {'_id': 0}
    for key, value in self._columns.items():
      if value != 1:
        continue
      # create filter
      _filter.extend([
          {key: {"$exists": True}},
          {key: {"$ne": ""}},
          {key: {"$ne": None}},
          {key: {"$ne": 0}},
      ])
      # create projection
      _projection[key] = value
    # MongoDB rejects an empty $and array, so fall back to a match-all query
    _query = {"$and": _filter} if _filter else {}
    # query database
    self._query_dataframe = self.query_and_convert(_query, _projection, rows=self._rows)
    clear_output()

  def load_dataframe(self, split=True):
    """Return the queried data with the convertible columns made numeric.

    The database is queried on the first call only; later calls reuse the
    cached frame, which is never modified.

    Args:
      split: when true, return (train_df, eval_df); otherwise the full frame.
    """
    if not self._is_retrieved:
      self.query_database()
      self._is_retrieved = True
    # convert specific columns
    _converters = {
        'EventDate': self.time_to_int_seconds,
        'SubjectDriverLicState': self.instate_or_outofstate,
        'StopDescription': self.cause_or_suspicion,
    }
    _converters.update({name: int for name in self._INTEGER_COLUMNS})
    _dataframe = self._query_dataframe.copy()
    for key, value in self._columns.items():
      # skip columns missing from the result (e.g. an empty query) instead of raising KeyError
      if value == 1 and key in _converters and key in _dataframe.columns:
        _dataframe[key] = self._query_dataframe[key].apply(_converters[key])
    # split data into train and evaluate set
    if split:
      # fixed random_state keeps the train/eval split reproducible
      _eval_df = _dataframe.sample(frac=self._eval_percent, random_state=1234)
      _train_df = _dataframe.drop(_eval_df.index)
      return _train_df, _eval_df
    return _dataframe

  # convert dataframe_to_dataset
  @staticmethod
  def dataframe_to_dataset(dataframe, output):
    """Build a shuffled tf.data.Dataset of (feature dict, label) pairs.

    Args:
      dataframe: source frame; it is copied, not modified.
      output: name of the column removed from the features and used as labels.
    """
    _dataframe = dataframe.copy()
    _labels    = _dataframe.pop(output)
    _dataset   = tf.data.Dataset.from_tensor_slices((dict(_dataframe), _labels))
    # the shuffle buffer must be positive, so guard against an empty frame
    _dataset   = _dataset.shuffle(buffer_size=max(len(_dataframe), 1))
    return _dataset

  def load_dataset(self):
    """Return (train_ds, eval_ds): shuffled datasets batched by the configured batch size."""
    _train_df, _eval_df = self.load_dataframe()
    _train_ds = self.dataframe_to_dataset(_train_df, self._output)
    _eval_ds  = self.dataframe_to_dataset(_eval_df, self._output)
    # batch dataset
    _train_ds = _train_ds.batch(self._batch)
    _eval_ds  = _eval_ds.batch(self._batch)
    return _train_ds, _eval_ds


In [6]:
import plotly.graph_objects as go

class Histogram(object):
  """Stacked plotly histogram whose traces alternate between orange and blue."""

  def __init__(self, title_text, xaxis_text):
    """Remember the title / x-axis labels and start with an empty figure."""
    self._title_text, self._xaxis_text = title_text, xaxis_text
    self._orange, self._blue = '#FF8C00', '#0000FF'
    self._trace_count = 0
    self._fig = go.Figure()

  def _get_color(self):
    """Orange for odd trace counts, blue for even ones."""
    return self._orange if self._trace_count % 2 == 1 else self._blue

  def add_trace(self, data=[], label=""):
    """Append a histogram trace for `data`; empty data is ignored."""
    if len(data) == 0:
      return
    # bump the counter first: the colour is chosen from the updated count
    self._trace_count += 1
    # histnorm='probability density' is intentionally left disabled
    trace = go.Histogram(
        x=data,
        name=label,
        marker_color=self._get_color(),
        opacity=1,
    )
    self._fig.add_trace(trace)
    self.update_layout()

  def update_layout(self):
    """Apply stacking, the centred bold title and the axis labels to the figure."""
    title_font = {'color': "black", 'size': 14}
    title = {
        'text': '<b>' + self._title_text + '</b>',
        'y': 0.85,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': title_font,
    }
    self._fig.update_layout(
        barmode='stack',
        title=title,
        xaxis_title_text=self._xaxis_text,
        yaxis_title_text="Count",
    )

  def show_figure(self):
    """Render the figure."""
    self._fig.show()

### **Data Exploration**

-- Plot vehicle attributes

In [9]:
import numpy as np
import pandas as pd

# convert queries to dataframe or numpy array
def query_and_convert(filter, projection, to="dataframe"):
  """Run a find() on mdb_collection and return the result in the requested form.

  Args:
    filter: MongoDB query document.
    projection: MongoDB projection document.
    to: "dataframe" for a pandas DataFrame, "numpy" for a squeezed,
      transposed numpy array.

  Raises:
    ValueError: if `to` is not one of the supported targets.
  """
  # reject unknown targets before querying instead of silently returning None
  if to not in ("dataframe", "numpy"):
    raise ValueError('to must be "dataframe" or "numpy", got {!r}'.format(to))
  cursor    = mdb_collection.find(filter, projection)
  dataframe = pd.DataFrame(list(cursor))
  # compare by value: `is` tests object identity, which is not guaranteed for equal strings
  if to == "dataframe":
    return dataframe
  return np.transpose(np.squeeze(dataframe.to_numpy()))

# open the connection and select the collection that query_and_convert reads from
connect = Connect()
mdb_client = connect.get_connection("POLICE_DATABASE")
# (the bare `mdb_client.testdb` expression was dropped: its value was discarded)
mdb_database = mdb_client['POLICE_DATABASE']
mdb_collection = mdb_database['POLICE_INTERVIEWS']

-- Plot vehicle features
1.   *district*
2.   *zone*
3.   *subject* 
4.   *zip-code*

In [None]:
# stop descriptions of every record whose SubjectRace is "BLACK"
black = query_and_convert({
    "$and":[{"SubjectRace":{"$exists": True}}, 
            {"SubjectRace":{"$ne": ""}},
            {"SubjectRace":{"$ne": None}},
            {"SubjectRace":{"$ne": 0}},
            {"SubjectRace":"BLACK"} ]}, 
            {"_id":0, "StopDescription" : 1}, to="numpy")

# plot categorical
# NOTE(review): the title mentions a white trace, but this cell only queries "BLACK" records.
hist = Histogram("white vs Black", "Stop Description")
# `white` is not defined anywhere above this cell; plot the array queried here
hist.add_trace(data=black, label="Black")
hist.show_figure()

-- Define the working features

In [3]:
# select columns: fields flagged 1 are the ones PoliceInterviews queries
columns = {
    'FieldInterviewID': 0,
    'NOPD_Item': 0,
    'EventDate': 1,
    'District': 1,
    'Zone': 1,
    'OfficerAssignment': 0,
    'StopDescription': 1,
    'ActionsTaken': 0,
    'VehicleYear': 1,
    'VehicleMake': 1,
    'VehicleModel': 1,
    'VehicleStyle': 1,
    'VehicleColor': 1,
    'SubjectID': 0,
    'SubjectRace': 1,
    'SubjectGender': 1,
    'SubjectAge': 1,
    'SubjectHasPhotoID': 0,
    'SubjectHeight': 1,
    'SubjectWeight': 1,
    'SubjectEyeColor': 0,
    'SubjectHairColor': 1,
    'SubjectDriverLicState': 1,
    'CreatedDateTime': 0,
    'LastModifiedDateTime': 0,
    'Longitude': 1,
    'Latitude': 1,
    'Zip': 1,
    'BlockAddress': 0,
}

# num_rows=None pulls every matching row; pass a small number for demos
police_interviews = PoliceInterviews(
    columns,
    num_rows=None,
    output="StopDescription",
    eval_percent=0.2,
    batch=25,
)

-- Load dataframe

In [4]:
# show one sample value per column of the training split
train_df, eval_df = police_interviews.load_dataframe()
print('{:<25} {} \n'.format("Column Name", "Sample"))
# DataFrame.iteritems() was removed in pandas 2.0; items() is the supported spelling
for (column_name, column_data) in train_df.items():
  # positional access: index label 0 is missing whenever row 0 was sampled into eval_df
  print('{:<25} {}'.format(column_name, column_data.iloc[0]))

Column Name               Sample 

EventDate                 34500
District                  7
Zone                      O
StopDescription           0
VehicleYear               1997
VehicleMake               CHEVROLET
VehicleModel              TAHOE
VehicleStyle              SPORTS UTILITY
VehicleColor              BLUE
SubjectRace               BLACK
SubjectGender             MALE
SubjectAge                21
SubjectHeight             72
SubjectWeight             169
SubjectHairColor          Black
SubjectDriverLicState     1
Longitude                 -89.9887508371
Latitude                  30.0355783772
Zip                       70127


-- Plot 