### Imports

In [85]:
import pandas as pd
from pymongo import MongoClient
from bson.objectid import ObjectId
import datetime
from time import time
import pprint
import pymongo

### Connect to the DB

In [25]:
# connection to the database
client = MongoClient()

db_name = "PoliceShootings"

db = client[db_name]

# Get PoliceKillings collection
killings_col = db.PoliceKillings

## Simple Queries

#### Simple Query 1 - Unarmed Males under 18 killed by Police

In [26]:
simple_query_1 = { 
    'age': { '$lt': 18 }, 
    'gender': 'M', 
    'armed': 'unarmed' 
}
my_result = killings_col.find(simple_query_1)
for x in my_result:
    print(f"({x['name']}, {int(x['age'])})")

(Deven Guilford, 17)
(Jeremy Mardis, 6)
(David Joseph, 17)
(Jose Raul Cruz, 16)
(Jordan Edwards, 15)
(Armando Garcia-Muro, 17)


#### Simple Query 2 - Women killed by Police while not fleeing and not armed

In [27]:
simple_query_2 = {
    'flee': 'Not fleeing',
    'gender': 'F',
    'armed': 'unarmed'
}
my_result = killings_col.find(simple_query_2)
for x in my_result:
    print(f"({x['name']}, {int(x['age'])})")

(Autumn Steele, 34)
(India Kager, 28)
(Ciara Meyer, 12)
(Alteria Woods, 21)
(Justine Damond, 40)


## Complex queries


#### Complex Query 1 - People armed with a knife shot by police in the state of Texas

In [59]:
complex_query_1 = [
    { "$match": {"armed": "knife"} },
    {
        "$lookup":
        {
            "from": "City",       
            "localField": "city_id",  
            "foreignField": "_id", 
            "as": "c"
        },
    },
    {  
        "$unwind": 
        { "path": '$c'} 
    },
    {
        "$lookup":
        {
            "from": "State",      
            "localField": "c.state_id",   
            "foreignField": "_id",
            "as": "s"  
        }
    },
    {  
        "$unwind": 
        { "path": '$s'} 
    },
    { '$match': {'s.acronym': 'TX' } },
]

response = killings_col.aggregate(complex_query_1)

for x in response:
    print(f"({x['name']}, {x['age']}, {x['s']['acronym']}, {x['armed']})")   

(Richard McClendon, 43.0, TX, knife)
(Daniel Brumley, 27.0, TX, knife)
(Kristiana Coignard, 17.0, TX, knife)
(Jose Antonio Espinoza Ruiz, 56.0, TX, knife)
(Kamal Dajani, 26.0, TX, knife)
(Randall Waddel, 49.0, TX, knife)
(Randall Lance Hughes, 48.0, TX, knife)
(Roger Albrecht, nan, TX, knife)
(Henry Reyna, 49.0, TX, knife)
(Michael Clyde Lynch, 37.0, TX, knife)
(Tyler Hunkin, 29.0, TX, knife)
(Gregory Mathis, 36.0, TX, knife)
(Jose Angel Vallarta, 30.0, TX, knife)
(Martin Gomez, 46.0, TX, knife)
(Medger Blake, 41.0, TX, knife)
(Morgan London Rankins, 30.0, TX, knife)
(Epthen Lamont Johnson, 40.0, TX, knife)
(Ray Valdez, 55.0, TX, knife)
(Rodney Henderson, 48.0, TX, knife)
(Emmett Edward Hall, 60.0, TX, knife)


### Complex Query 2 - State with the most shootings

In [29]:
complex_query_2 = [
  {
      "$lookup":
      {
          "from": "City",       
          "localField": "city_id",  
          "foreignField": "_id", 
          "as": "c"
      },
  },
  {  
      "$unwind": 
      { "path": '$c'} 
  },
  {
      "$lookup":
      {
          "from": "State",      
          "localField": "c.state_id",   
          "foreignField": "_id",
          "as": "s"  
      }
  },
  {  
      "$unwind": 
      { "path": '$s'} 
  },
  { "$group": { "_id": "$s.acronym", "shootings": { "$sum": 1 } } },
  { "$sort": { "shootings": -1 } },
  { "$limit": 1}
]

response = killings_col.aggregate(complex_query_2)

for x in response:
    print(x) 

{'_id': 'CA', 'shootings': 390}


### Insert Quert - Insert new police killing

In [30]:
mydict = {
    "name": "John Smith", 
    "date": datetime.datetime(2020, 2, 25), 
    "manner_of_death": "shot", 
    "armed": "toy weapon", 
    "age": 34,
    "gender": "M", 
    "race_id": ObjectId("63726502634ca21d0f760540"), 
    "city_id": ObjectId("637264fd634ca21d0f75927c"), 
    "signs_of_mental_illness": True, 
    "threat_level": "attack", 
    "flee": None, 
    "body_camera": True 
}

insertion_result = killings_col.insert_one(mydict)

### Update Query - Update the date of the police killing

In [31]:
new_date = datetime.datetime(2022, 2, 25)

killings_col.update_one(
    {"_id": insertion_result.inserted_id},
    {"$set": {"date": new_date}}
)

<pymongo.results.UpdateResult at 0x7f309e83ae00>

## Index

In [36]:
# Number of iterations used in performance
iterations = 20

# Get average performance
def performance(collection, query, is_aggregate):
    result = 0
    for _ in range(iterations):
        time_i = time()
        if is_aggregate:
            collection.aggregate(query)
        else:
            collection.find(query)
        time_f = time()
        result += time_f-time_i
    return result / iterations

# Get performance for all queries
def performance_list():
    return {
        "simple_query_1" : performance(killings_col, simple_query_1, False),
        "simple_query_2" : performance(killings_col, simple_query_2, False),
        "complex_query_1" : performance(killings_col, complex_query_1, True),
        "complex_query_2" : performance(killings_col, complex_query_2, True)
    }

In [73]:
# Get examined doc num for all queries
def examined_list():
    return {
        "simple_query_1": killings_col.find(simple_query_1).explain()["executionStats"]["totalDocsExamined"],
        "simple_query_2": killings_col.find(simple_query_2).explain()["executionStats"]["totalDocsExamined"],
        "complex_query_1": None
    }

### Performance without indexes

In [75]:
# Simple query 1 - 2541 docs examined
# Simple query 2 - 2539 docs examined

performance_no_index = performance_list()

pprint.pprint(performance_no_index)

{'complex_query_1': 0.09725010395050049,
 'complex_query_2': 0.3797390580177307,
 'simple_query_1': 1.0251998901367188e-05,
 'simple_query_2': 4.458427429199219e-06}


In [76]:
examined_no_index = examined_list()

pprint.pprint(examined_no_index)

{'simple_query_1': 2539, 'simple_query_2': 2539}


### Add indexes

In [103]:
# Drop pre-existing indexes
killings_col.drop_indexes()

In [104]:
city_col = db["City"]
state_col = db["State"]

# For simple_query 1
killings_col.create_index(
    [
        ("gender", pymongo.ASCENDING),
        ("armed", pymongo.ASCENDING),
        ("age", pymongo.ASCENDING)
    ],
    name="simple_query_1_index"
)

# For simple_query 2
killings_col.create_index(
    [
        ("gender", pymongo.ASCENDING),
        ("armed", pymongo.ASCENDING),
        ("flee", pymongo.ASCENDING)
    ],
    name="simple_query_2_index"
)

# # For complex_query 1
# killings_col.create_index(
#     [
#         ("armed", pymongo.ASCENDING),
#         ("city_id", pymongo.ASCENDING),
#     ],
#     name="simple_query_3_index"
# )

# city_col.create_index(
#     [
#         ("_id", pymongo.ASCENDING),
#         ("city_id", pymongo.ASCENDING),
#     ],
#     name="simple_query_3_index"
# )

# state_col.create_index(
#     [
#         ("_id", pymongo.ASCENDING),
#         ("acronym", pymongo.ASCENDING),
#     ],
#     name="simple_query_3_index"
# )

'simple_query_3_index'

In [105]:
# Simple query 1 - 51 docs examined
# Simple query 2 - 107 docs examined

performance_index = performance_list()

pprint.pprint(performance_index)

{'complex_query_1': 0.09683300256729126,
 'complex_query_2': 0.40439910888671876,
 'simple_query_1': 7.2836875915527345e-06,
 'simple_query_2': 4.553794860839844e-06}


In [93]:
examined_index = examined_list()

pprint.pprint(examined_index)

{'simple_query_1': 6, 'simple_query_2': 5}
