### Imports

In [1]:
import pandas as pd
from pymongo import MongoClient
from bson.objectid import ObjectId
import numpy as np

### Connect to DB

In [2]:
# Open a client using PyMongo's default connection settings
client = MongoClient()
db_name = "PoliceShootings"

# Start from a clean slate: if a database with this name already exists it is
# dropped, discarding everything stored in it
existing_db_names = client.list_database_names()
if db_name in existing_db_names:
    client.drop_database(db_name)

# Handle to the database that the rest of the notebook populates
db = client.get_database(db_name)

### Read CSV files

In [3]:
# Folder (relative to the notebook) that holds the source CSV files
csv_folder = "archive"

# All three files are read with the same ISO-8859-1 encoding
csv_read_options = {"encoding": "ISO-8859-1"}

completed_highschool_df = pd.read_csv(
    f"{csv_folder}/PercentOver25CompletedHighSchool.csv", **csv_read_options
)
police_killings_df = pd.read_csv(
    f"{csv_folder}/PoliceKillingsUS.csv", **csv_read_options
)
race_by_city_df = pd.read_csv(
    f"{csv_folder}/ShareRaceByCity.csv", **csv_read_options
)

### Creating the database and collections

In [4]:
# Explicitly create every collection and keep the handle that
# create_collection returns for the insert cells below.

# Lookup collections
state_col = db.create_collection("State")
city_col = db.create_collection("City")
race_col = db.create_collection("Race")

# Fact collections that reference the lookups by id
percent_highschool_col = db.create_collection("PercentOver25CompletedHighSchool")
killings_col = db.create_collection("PoliceKillings")
share_race_city_col = db.create_collection("ShareRaceByCity")

### Clean data

In [5]:
# Removes the redundant place-type labels from the City column
def remove_redundency(dataframe):
    """Strip the ' town', ' city' and ' CDP' labels from ``dataframe['City']``.

    The column is overwritten on the frame that is passed in; nothing is
    returned.

    Only whole words are removed: the trailing word boundary keeps longer words
    that merely start with a label (e.g. ' township') intact, whereas a plain
    substring replace would turn 'Abington township' into 'Abingtonship'.
    Matching is case-sensitive, so a capitalised 'City' that is part of the
    name itself (e.g. 'Oklahoma City city') is preserved.
    """
    # regex=True is passed explicitly because the default of Series.str.replace
    # differs between pandas versions.
    dataframe['City'] = dataframe['City'].str.replace(
        r' (?:town|city|CDP)\b', '', regex=True
    )

# One-letter race code -> full race name
char_to_race = dict(
    W="white",
    B="black",
    N="native_american",
    A="asian",
    H="hispanic",
    O="other",
)

def convert_string_to_race(acronym):
    """Translate a one-letter race code into its full race name.

    Anything that is not a string (for example a missing value) yields None.
    A string that is not a key of ``char_to_race`` raises KeyError.
    """
    return char_to_race[acronym] if isinstance(acronym, str) else None

In [6]:
# Strip the place-type labels from the City column of both census tables so
# their city names are easier to match against the other data
for census_df in (completed_highschool_df, race_by_city_df):
    remove_redundency(census_df)

# Swap the one-letter race codes for full race names (non-string values become None).
# NOTE: this overwrites the column, so running the cell a second time raises
# KeyError because the full names are not keys of char_to_race.
police_killings_df["race"] = police_killings_df["race"].map(convert_string_to_race)

### Insert Data

---
##### State

In [7]:
# One {'acronym': ...} record per distinct value of the "Geographic Area" column,
# in order of first appearance, ready for mongo insertion
df_state = (
    completed_highschool_df[["Geographic Area"]]
    .drop_duplicates()
    .rename(columns={"Geographic Area": "acronym"})
    .to_dict("records")
)
# Keep the insertion result around for possible later use
result_state_insertion = state_col.insert_many(df_state)

---
##### City

In [8]:
# Map each state acronym to the _id of its document in the State collection
dict_state_to_id = {doc['acronym']: doc['_id'] for doc in state_col.find()}

# The same city name can appear in more than one state, so a city is identified
# by its (name, state) pair rather than by name alone. Because the place-type
# labels were stripped from the names earlier, several source rows can share the
# same pair; de-duplicate so each pair yields exactly one City document instead
# of extra documents that nothing can reference.
city_state_pairs = completed_highschool_df[["City", "Geographic Area"]].drop_duplicates()

# .values keeps both columns positional, independent of the source frame's index
df_city = pd.DataFrame({
    'name': city_state_pairs["City"].values,
    'state_id': city_state_pairs["Geographic Area"].map(dict_state_to_id).values,
})
# Set to Column:Value for mongo insertion
df_city = df_city.to_dict('records')
# Store in variable for possible use after
result_city_insertion = city_col.insert_many(df_city)

---
##### Race

In [9]:
# Race names stored in the Race collection
races = ["white", "black", "native_american", "asian", "hispanic", "other"]

# One {'name': ...} record per race, ready for mongo insertion
df_race = [{'name': race_name} for race_name in races]
# Keep the insertion result around for possible later use
result_race_insertion = race_col.insert_many(df_race)

##### Percent over 25 completed high school

In [10]:
# Map (city name, state id) to the id of the matching City document.
# When several documents share a key, the one read last wins.
dict_city_to_id = {
    (city_doc['name'], city_doc['state_id']): ObjectId(str(city_doc['_id']))
    for city_doc in city_col.find()
}

# Resolve the city id of every row of the high-school table, in row order
percent_city_ids = [
    dict_city_to_id[(city_name, dict_state_to_id[state_code])]
    for city_name, state_code in zip(
        completed_highschool_df["City"], completed_highschool_df["Geographic Area"]
    )
]

# One {'percent': ..., 'city_id': ...} record per source row, ready for mongo insertion
df_percent_highschool = pd.DataFrame({
    'percent': completed_highschool_df["percent_completed_hs"].values,
    'city_id': percent_city_ids,
}).to_dict('records')
# Keep the insertion result around for possible later use
result_percent_highschool_insertion = percent_highschool_col.insert_many(df_percent_highschool)

---
##### Police killings

In [11]:
# Given city and state, return city id
def convert_state_and_city_to_id(city, state):
    """Return the id of the City document for ``city`` in ``state``.

    ``state`` is looked up in ``dict_state_to_id`` and the resulting
    (city, state id) pair in ``dict_city_to_id``.

    Returns None when either the state or the (city, state) pair is unknown,
    so an unmatched row never raises.
    """
    state_id = dict_state_to_id.get(state)
    if state_id is None:
        return None
    # .get performs the membership test and the lookup in a single step
    return dict_city_to_id.get((city, state_id))

In [12]:
# Map each race name to the _id of its document in the Race collection
dict_race_to_id = {doc['name']: doc['_id'] for doc in race_col.find()}

# Parse the date strings day-first into datetime values
dates = pd.to_datetime(police_killings_df["date"], dayfirst=True)

# Replace the raw race / state / city columns with references to the Race and
# City collections. DataFrame.drop returns a new frame instead of modifying its
# receiver, so its result has to be kept for the columns to actually go away.
df_killings = (
    police_killings_df
    .drop(columns=["race", "state", "city"])
    .assign(
        # Races that have no entry in dict_race_to_id (e.g. None) map to NaN
        race_id=police_killings_df["race"].map(dict_race_to_id),
        # None when no matching City document exists
        city_id=police_killings_df.apply(
            lambda x: convert_state_and_city_to_id(x["city"], x["state"]), axis=1
        ),
        date=dates,
    )
)

# Set to Column:Value for mongo insertion
df_killings = df_killings.to_dict('records')
# Store in variable for possible use after
result_killing_insertion = killings_col.insert_many(df_killings)

##### Share race by city

In [13]:
# Reshape the five share_* columns of race_by_city_df into one document per
# (source row, race): every source row produces five documents.
share_column_by_race = {
    "white": "share_white",
    "black": "share_black",
    "native_american": "share_native_american",
    "asian": "share_asian",
    "hispanic": "share_hispanic",
}

# City id for every row of race_by_city_df (None where no matching city exists)
df_city_ids = race_by_city_df.apply(lambda x: convert_state_and_city_to_id(x["City"], x["Geographic area"]), axis=1)

# One frame per race, built the same way for all five races.
# NOTE(review): the keys here are capitalised ('Race_id', 'City_id') while the
# PoliceKillings and PercentOver25CompletedHighSchool documents use lowercase
# 'race_id' / 'city_id'. Left unchanged because queries may depend on the
# current field names - confirm before harmonising.
share_frames = [
    pd.DataFrame({'share': race_by_city_df[share_column].values}).assign(
        Race_id=dict_race_to_id[race_name],
        City_id=df_city_ids,
    )
    for race_name, share_column in share_column_by_race.items()
]

# Stack the per-race frames in the order white, black, native_american, asian, hispanic
df_share_race_city = pd.concat(share_frames, ignore_index=True)

# Set to Column:Value for mongo insertion
df_share_race_city = df_share_race_city.to_dict('records')
# Store in variable for possible use after
result_share_race_city_insertion = share_race_city_col.insert_many(df_share_race_city)