<a href="https://colab.research.google.com/github/uninstallit/ati580_final_project/blob/edvin-1/ati580_vis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1.0 Initial Setup**

In [None]:
pip install dnspython

In [1]:
from pymongo import MongoClient

class Connect(object):

  @staticmethod    
  def get_connection(database):
    username = "mdbUser"
    password = "ati580"
    return MongoClient('mongodb+srv://{}:{}@ati580-cluster.s5t5z.gcp.mongodb.net/{}?retryWrites=true&w=majority'.format(username, password, database))

In [22]:
import plotly.graph_objects as go

class Histogram(object):

  def __init__(self, title_text, xaxis_text):
    self._title_text = title_text
    self._xaxis_text = xaxis_text

    self._fig = go.Figure()
    self._trace_count = 0
    self._orange = '#FF8C00'
    self._blue   = '#0000FF'

  def _get_color(self):
    if self._trace_count % 2 == 1:
      return self._orange
    if self._trace_count % 2 == 0:
      return self._blue

  def add_trace(self, data=[], label=""):
    if len(data)!=0:
      self._trace_count = self._trace_count + 1
      self._fig.add_trace(go.Histogram(
          x=data,
          # histnorm='probability density',
          name=label,
          marker_color=self._get_color(),
          opacity=1
          ))
      self.update_layout()
        
  def update_layout(self):
    self._fig.update_layout(
        barmode='stack',
        title=dict({
            'text': '<b>' + self._title_text + '</b>',
            'y':0.85,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font':dict({
                'color':"black",
                'size':14})}),
        xaxis_title_text=self._xaxis_text, 
        yaxis_title_text="Count",
        )
    
  def show_figure(self):
    self._fig.show()
      

**Explore Data**

In [5]:
from IPython.display import clear_output
import plotly.express as px
from ipywidgets import widgets, HBox
import pandas as pd

connect = Connect()
mdb_client = connect.get_connection("POLICE_DATABASE")
mdb_database = mdb_client['POLICE_DATABASE'] 
mdb_collection = mdb_database['POLICE_INTERVIEWS']

# all options - set value to 1 to include in query
options = dict({'FieldInterviewID':0,
                'NOPD_Item':0, 
                'EventDate':0, 
                'District':0, 
                'Zone':0, 
                'OfficerAssignment':0, 
                'StopDescription':0, 
                'ActionsTaken':0, 
                'VehicleYear':1, 
                'VehicleMake':0, 
                'VehicleModel':0, 
                'VehicleStyle':0, 
                'VehicleColor':1, 
                'SubjectID':0, 
                'SubjectRace':1, 
                'SubjectGender':1, 
                'SubjectAge':0, 
                'SubjectHasPhotoID':0, 
                'SubjectHeight':1, 
                'SubjectWeight':0, 
                'SubjectEyeColor':0, 
                'SubjectHairColor':1, 
                'SubjectDriverLicState':0, 
                'CreatedDateTime':0, 
                'LastModifiedDateTime':0, 
                'Longitude':0, 
                'Latitude':0, 
                'Zip':1, 
                'BlockAddress':0})

features = { key: value for key, value in options.items() if value == 1 }
  
def on_change_selection(change):
  selection = change["new"].strip()
  # change query limit
  limit = 500
  clear_output()
  print("Loading data ...")
  query = features.pop("_id", None)
  data = pd.DataFrame(list(mdb_collection.find({}, query).limit(limit)))
  data = data.fillna(0)
  fig = px.scatter_matrix(data, dimensions=features.keys(), color=selection)
  clear_output()
  display(selection_dropdown)
  fig.show()

selection_dropdown = widgets.Dropdown(
    options=features.keys(),
    value=None,
    description='Color:',
    disabled=False,
)

selection_dropdown.observe(on_change_selection, names="value")
display(selection_dropdown)


Dropdown(description='Color:', index=2, options=('VehicleYear', 'VehicleColor', 'SubjectRace', 'SubjectGender'…

**Visualziation and Hypothesis Testing**

In [1]:
import pandas as pd
import numpy as np
import pymongo

# connect to database
connect = Connect()
mdb_client = connect.get_connection("POLICE_DATABASE")
mdb_database = mdb_client['POLICE_DATABASE'] 
mdb_collection = mdb_database['POLICE_INTERVIEWS']
mdb_client.testdb

# convert queries to dataframe or numpy array
def query_and_convert(filter, projection, to="dataframe"):
  cursor    = mdb_collection.find(filter, projection)
  dataframe = pd.DataFrame(list(cursor))
  if to is "dataframe":
    return dataframe
  elif to is "numpy":
    np_array = np.transpose(np.squeeze(dataframe.to_numpy()))
    return np_array

# create research queries
age_vs_chevy = query_to_numpy({
    "$and":[{"SubjectAge":{"$exists": True}}, 
            {"SubjectAge":{"$ne": ""}},
            {"SubjectAge":{"$ne": None}},
            {"SubjectAge":{"$ne": 0}},
            {"VehicleMake":"CHEVROLET"} ]}, 
            {"_id":0, "SubjectAge" : 1})

age_vs_ford = query_to_numpy({
    "$and":[{"SubjectAge":{"$exists": True}}, 
            {"SubjectAge":{"$ne": ""}},
            {"SubjectAge":{"$ne": None}},
            {"SubjectAge":{"$ne": 0}},
            {"VehicleMake":"FORD"} ]}, 
            {"_id":0, "SubjectAge" : 1})

# plot numerical 
hist = Histogram("age vs make", "Age")
hist.add_trace(data=age_vs_chevy, label="CHEVY")
hist.add_trace(data=age_vs_ford, label="FORD")
hist.show_figure()

vehicle_color = query_to_numpy({
    "$and":[{"VehicleColor":{"$exists": True}}, 
            {"VehicleColor":{"$ne": ""}},
            {"VehicleColor":{"$ne": None}},
            {"VehicleColor":{"$ne": 0}}, ]}, 
            {"_id":0, "VehicleColor" : 1})

# plot categorical
hist2 = Histogram("vehicle color", "Color")
hist2.add_trace(data=vehicle_color, label="Color")
hist2.show_figure()

white = query_and_convert({
    "$and":[{"SubjectRace":{"$exists": True}}, 
            {"SubjectRace":{"$ne": ""}},
            {"SubjectRace":{"$ne": None}},
            {"SubjectRace":{"$ne": 0}},
            {"SubjectRace":"WHITE"} ]}, 
            {"_id":0, "StopDescription" : 1}, to="numpy")

black = query_and_convert({
    "$and":[{"SubjectRace":{"$exists": True}}, 
            {"SubjectRace":{"$ne": ""}},
            {"SubjectRace":{"$ne": None}},
            {"SubjectRace":{"$ne": 0}},
            {"SubjectRace":"BLACK"} ]}, 
            {"_id":0, "StopDescription" : 1}, to="numpy")

# plot categorical
hist2 = Histogram("white vs Black", "Stop Description")
hist2.add_trace(data=white, label="Race")
hist2.add_trace(data=black, label="Race")
hist2.show_figure()


NameError: ignored

Good reference for hypothesis testing in Python:

https://towardsdatascience.com/hypothesis-testing-in-machine-learning-using-python-a0dc89e169ce



Notes:
TF-IDF with Tensorflow: https://towardsdatascience.com/another-twitter-sentiment-analysis-with-python-part-9-neural-networks-with-tfidf-vectors-using-d0b4af6be6d7

ipywidgets pdf: http://xph.necst.it/2017/software/lessons/Lesson_4_PYNQ.pdf

mongoDB queries: https://docs.mongodb.com/manual/tutorial/query-documents/


Handle missing values with Pandas: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html