In [1]:
import streamlit as st
import pandas as pd
import numpy as np
import pydeck as pdk


In [2]:
def load_dataset(path="data.csv"):
    """Load the incident table and keep only rows that have coordinates.

    Parameters
    ----------
    path : str or path-like, default "data.csv"
        CSV file to read. The default keeps the original zero-argument call
        working unchanged.

    Returns
    -------
    pandas.DataFrame
        Every column of the CSV, restricted to rows whose ``longitude`` and
        ``latitude`` are both non-null. The row index from the CSV read is
        preserved (it is not reset after filtering).
    """
    raw_df = pd.read_csv(path)
    # A row is dropped when either coordinate is missing.
    return raw_df.dropna(subset=["longitude", "latitude"])

def load_geo_subset(df):
    """Return the coordinate columns of ``df`` with incomplete rows removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``longitude`` and ``latitude`` columns.

    Returns
    -------
    pandas.DataFrame
        A new two-column frame (``longitude``, ``latitude``) holding only the
        rows where both values are non-null. ``df`` itself is not modified.
    """
    # dropna() without inplace=True builds a new frame instead of mutating the
    # column selection; the in-place form on a selection can raise pandas'
    # SettingWithCopyWarning.
    return df[["longitude", "latitude"]].dropna()

def city_list(df):
    """Return the distinct ``city_or_county`` values of ``df`` as a sorted list."""
    names = list(df['city_or_county'].unique())
    names.sort()
    return names

def load_city_subset(df, city):
    """Return the map-relevant columns for one city, without incomplete rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Incident table containing ``city_or_county`` plus the five columns
        selected below.
    city : object
        Value compared for equality against ``city_or_county``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` for ``city``, limited to ``longitude``, ``latitude``,
        ``address``, ``n_killed`` and ``n_injured``, with any row containing a
        null in those columns removed. ``df`` is left unchanged.
    """
    wanted_columns = ["longitude", "latitude", "address", "n_killed", "n_injured"]
    in_city = df['city_or_county'] == city
    return df.loc[in_city, wanted_columns].dropna()

In [54]:
# Load the incident table (load_dataset already removes rows that lack
# longitude/latitude) and preview the first rows.
df = load_dataset()
df.head()

Unnamed: 0.1,Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url_fields_missing,congressional_district,...,notes,participant_age,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,state_house_district,state_senate_district
0,0,461105,2013-01-01,Pennsylvania,Mckeesport,1506 Versailles Avenue and Coursin Street,0,4,False,14.0,...,Julian Sims under investigation: Four Shot and...,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||3::Male||4::Female,0::Julian Sims,,0::Arrested||1::Injured||2::Injured||3::Injure...,0::Victim||1::Victim||2::Victim||3::Victim||4:...,,
1,1,460726,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,False,43.0,...,Four Shot; One Killed; Unidentified shooter in...,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male,0::Bernard Gillis,,0::Killed||1::Injured||2::Injured||3::Injured,0::Victim||1::Victim||2::Victim||3::Victim||4:...,62.0,35.0
2,2,478855,2013-01-01,Ohio,Lorain,1776 East 28th Street,1,3,False,9.0,...,,0::25||1::31||2::33||3::34||4::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||2::Male||3::Male||4::Male,0::Damien Bell||1::Desmen Noble||2::Herman Sea...,,"0::Injured, Unharmed, Arrested||1::Unharmed, A...",0::Subject-Suspect||1::Subject-Suspect||2::Vic...,56.0,13.0
3,3,478925,2013-01-05,Colorado,Aurora,16000 block of East Ithaca Place,4,0,False,6.0,...,,0::29||1::33||2::56||3::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Female||1::Male||2::Male||3::Male,0::Stacie Philbrook||1::Christopher Ratliffe||...,,0::Killed||1::Killed||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,40.0,28.0
4,4,478959,2013-01-07,North Carolina,Greensboro,307 Mourning Dove Terrace,2,2,False,6.0,...,Two firearms recovered. (Attempted) murder sui...,0::18||1::46||2::14||3::47,0::Adult 18+||1::Adult 18+||2::Teen 12-17||3::...,0::Female||1::Male||2::Male||3::Female,0::Danielle Imani Jameison||1::Maurice Eugene ...,3::Family,0::Injured||1::Injured||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,62.0,27.0


# Find the columns that need preprocessing

In [30]:
# NOTE(review): every line of this cell is commented out, yet the recorded
# output below (a True/False flag per column) appears to come from running it.
# Either re-enable the code or clear the stale output so a fresh
# Restart & Run All reproduces what the reader sees.
# columns = df.columns
# columns_to_process = df[columns].isnull().any()
# print(columns_to_process)
# df.dtypes.to_dict()

Unnamed: 0                     False
incident_id                    False
date                           False
state                          False
city_or_county                 False
address                         True
n_killed                       False
n_injured                      False
incident_url_fields_missing    False
congressional_district          True
gun_stolen                      True
gun_type                        True
incident_characteristics        True
latitude                       False
location_description            True
longitude                      False
n_guns_involved                 True
notes                           True
participant_age                 True
participant_age_group           True
participant_gender              True
participant_name                True
participant_relationship        True
participant_status              True
participant_type                True
state_house_district            True
state_senate_district           True
dtype: bool

# Preprocess columns: fill NaN, split each value into a key-to-value mapping, and save it to a new column

In [50]:
def get_user_mapping(txt):
    """Parse a ``"key::value||key::value"`` string into a dict.

    Parameters
    ----------
    txt : str
        Entries separated by ``"||"``, each of the form ``key::value``.
        The sentinel ``"NA"`` and any non-string input (for example ``None``
        or a float ``NaN`` from an unfilled cell) produce an empty mapping.

    Returns
    -------
    dict
        Maps each key to its value, both as strings. When a key occurs more
        than once the first value is kept. Entries without a ``"::"``
        separator are skipped. If an entry contains more than one ``"::"``,
        only the segment right after the first separator is used as the value.
    """
    # Non-string input has no .split(); treat it like the "NA" sentinel
    # instead of raising AttributeError.
    if not isinstance(txt, str) or txt == "NA":
        return {}
    mapping = {}
    for entry in txt.split("||"):
        parts = entry.split("::")
        if len(parts) < 2:
            # Malformed entry (no "::"): skip it explicitly rather than
            # relying on a bare except to swallow the IndexError.
            continue
        # setdefault never overwrites, so the first occurrence of a key wins.
        mapping.setdefault(parts[0], parts[1])
    return mapping

# For each participant column: fill missing cells with the "NA" sentinel, then
# parse the "key::value||key::value" text into a dict stored in "<column>_map".
for participant_column in (
    'participant_type',
    'participant_age',
    'participant_age_group',
    'participant_gender',
):
    df[participant_column] = df[participant_column].fillna("NA")
    df[participant_column + '_map'] = df[participant_column].apply(get_user_mapping)


In [51]:
# Preview the first parsed age-group mappings to check the "||" / "::" split.
df['participant_age_group_map'].head()

0    {'0': 'Adult 18+', '1': 'Adult 18+', '2': 'Adu...
1    {'0': 'Adult 18+', '1': 'Adult 18+', '2': 'Adu...
2    {'0': 'Adult 18+', '1': 'Adult 18+', '2': 'Adu...
3    {'0': 'Adult 18+', '1': 'Adult 18+', '2': 'Adu...
4    {'0': 'Adult 18+', '1': 'Adult 18+', '2': 'Tee...
Name: participant_age_group_map, dtype: object

# Find the unique values in each mapping column

In [52]:
def get_unique_name(df,column_name):
    """Print the distinct values found across the dicts stored in a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``column_name`` column holds one dict per row.
    column_name : str
        Name of the dict-valued column to scan.

    Returns
    -------
    None
        The column name and the set of distinct dict values are printed on
        one line; nothing is returned.
    """
    distinct_values = {
        value
        for mapping in df[column_name]
        for value in mapping.values()
    }
    print(column_name, distinct_values)
# Report the distinct values present in each parsed participant column.
for map_column in (
    'participant_type_map',
    'participant_age_group_map',
    'participant_gender_map',
):
    get_unique_name(df, map_column)


participant_type_map {'Subject-Suspect', 'Victim'}
participant_age_group_map {'Teen 12-17', 'Adult 18+', 'Child 0-11'}
participant_gender_map {'Male, female', 'Female', 'Male'}
