# Preprocessing of Gun Violence Dataset

In [62]:
import numpy as np
import os
import pandas as pd
from pathlib import Path
import re

In [8]:
# Load the data
directory = os.path.dirname(str(Path().resolve()) + '\\')
path = os.path.join(directory, 'data', 'gun-violence-data.csv')

raw = pd.read_csv(path)
print(raw.columns)
raw.head()

Index(['incident_id', 'date', 'state', 'city_or_county', 'address', 'n_killed',
       'n_injured', 'incident_url', 'source_url',
       'incident_url_fields_missing', 'congressional_district', 'gun_stolen',
       'gun_type', 'incident_characteristics', 'latitude',
       'location_description', 'longitude', 'n_guns_involved', 'notes',
       'participant_age', 'participant_age_group', 'participant_gender',
       'participant_name', 'participant_relationship', 'participant_status',
       'participant_type', 'sources', 'state_house_district',
       'state_senate_district'],
      dtype='object')


Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,incident_url_fields_missing,...,participant_age,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district
0,461105,2013-01-01,Pennsylvania,Mckeesport,1506 Versailles Avenue and Coursin Street,0,4,http://www.gunviolencearchive.org/incident/461105,http://www.post-gazette.com/local/south/2013/0...,False,...,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||3::Male||4::Female,0::Julian Sims,,0::Arrested||1::Injured||2::Injured||3::Injure...,0::Victim||1::Victim||2::Victim||3::Victim||4:...,http://pittsburgh.cbslocal.com/2013/01/01/4-pe...,,
1,460726,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,http://www.gunviolencearchive.org/incident/460726,http://www.dailybulletin.com/article/zz/201301...,False,...,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male,0::Bernard Gillis,,0::Killed||1::Injured||2::Injured||3::Injured,0::Victim||1::Victim||2::Victim||3::Victim||4:...,http://losangeles.cbslocal.com/2013/01/01/man-...,62.0,35.0
2,478855,2013-01-01,Ohio,Lorain,1776 East 28th Street,1,3,http://www.gunviolencearchive.org/incident/478855,http://chronicle.northcoastnow.com/2013/02/14/...,False,...,0::25||1::31||2::33||3::34||4::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||2::Male||3::Male||4::Male,0::Damien Bell||1::Desmen Noble||2::Herman Sea...,,"0::Injured, Unharmed, Arrested||1::Unharmed, A...",0::Subject-Suspect||1::Subject-Suspect||2::Vic...,http://www.morningjournal.com/general-news/201...,56.0,13.0
3,478925,2013-01-05,Colorado,Aurora,16000 block of East Ithaca Place,4,0,http://www.gunviolencearchive.org/incident/478925,http://www.dailydemocrat.com/20130106/aurora-s...,False,...,0::29||1::33||2::56||3::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Female||1::Male||2::Male||3::Male,0::Stacie Philbrook||1::Christopher Ratliffe||...,,0::Killed||1::Killed||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,http://denver.cbslocal.com/2013/01/06/officer-...,40.0,28.0
4,478959,2013-01-07,North Carolina,Greensboro,307 Mourning Dove Terrace,2,2,http://www.gunviolencearchive.org/incident/478959,http://www.journalnow.com/news/local/article_d...,False,...,0::18||1::46||2::14||3::47,0::Adult 18+||1::Adult 18+||2::Teen 12-17||3::...,0::Female||1::Male||2::Male||3::Female,0::Danielle Imani Jameison||1::Maurice Eugene ...,3::Family,0::Injured||1::Injured||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,http://myfox8.com/2013/01/08/update-mother-sho...,62.0,27.0


In [171]:
data = raw[['incident_id', 'date', 'state', 'city_or_county', 'address', 'n_killed', 'n_injured', 'congressional_district',
            'gun_stolen', 'gun_type', 'incident_characteristics', 'latitude', 'longitude', 'n_guns_involved', 'participant_age',
            'participant_gender', 'participant_name', 'participant_relationship', 'participant_status', 'participant_type',
            'state_house_district', 'state_senate_district']]

print('# of Incidents: ' + str(len(data)))
data.head()

# of Incidents: 239677


Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,congressional_district,gun_stolen,gun_type,...,longitude,n_guns_involved,participant_age,participant_gender,participant_name,participant_relationship,participant_status,participant_type,state_house_district,state_senate_district
0,461105,2013-01-01,Pennsylvania,Mckeesport,1506 Versailles Avenue and Coursin Street,0,4,14.0,,,...,-79.8559,,0::20,0::Male||1::Male||3::Male||4::Female,0::Julian Sims,,0::Arrested||1::Injured||2::Injured||3::Injure...,0::Victim||1::Victim||2::Victim||3::Victim||4:...,,
1,460726,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,43.0,,,...,-118.333,,0::20,0::Male,0::Bernard Gillis,,0::Killed||1::Injured||2::Injured||3::Injured,0::Victim||1::Victim||2::Victim||3::Victim||4:...,62.0,35.0
2,478855,2013-01-01,Ohio,Lorain,1776 East 28th Street,1,3,9.0,0::Unknown||1::Unknown,0::Unknown||1::Unknown,...,-82.1377,2.0,0::25||1::31||2::33||3::34||4::33,0::Male||1::Male||2::Male||3::Male||4::Male,0::Damien Bell||1::Desmen Noble||2::Herman Sea...,,"0::Injured, Unharmed, Arrested||1::Unharmed, A...",0::Subject-Suspect||1::Subject-Suspect||2::Vic...,56.0,13.0
3,478925,2013-01-05,Colorado,Aurora,16000 block of East Ithaca Place,4,0,6.0,,,...,-104.802,,0::29||1::33||2::56||3::33,0::Female||1::Male||2::Male||3::Male,0::Stacie Philbrook||1::Christopher Ratliffe||...,,0::Killed||1::Killed||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,40.0,28.0
4,478959,2013-01-07,North Carolina,Greensboro,307 Mourning Dove Terrace,2,2,6.0,0::Unknown||1::Unknown,0::Handgun||1::Handgun,...,-79.9569,2.0,0::18||1::46||2::14||3::47,0::Female||1::Male||2::Male||3::Female,0::Danielle Imani Jameison||1::Maurice Eugene ...,3::Family,0::Injured||1::Injured||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,62.0,27.0


In [165]:
# Define a helper function to get a df of the people involved in an event
def get_people(row_data):  # A row from the main dataframe
    # Initialize data
    ages, genders, names, relationships, statuses, types = [], [], [], [], [], []
    
    # Regex the data
    if type(row_data['participant_age']) == str:
        ages = re.findall('(\d+)::(\d+)', row_data['participant_age'])
        ages = dict(ages)
    if type(row_data['participant_gender']) == str:
        genders = re.findall('(\d+)::(\w+)', row_data['participant_gender'])
        genders = dict(genders)
    if type(row_data['participant_name']) == str:
        names = re.findall('(\d+)::(\w+\s*\w*)', row_data['participant_name'])
        names = dict(names)
    if type(row_data['participant_relationship']) == str:
        relationships = re.findall('(\d+)::(\w+)', row_data['participant_relationship'])
        relationships = dict(relationships)
    if type(row_data['participant_status']) == str:
        statuses = re.findall('(\d+)::(\w+)', row_data['participant_status'])
        statuses = dict(statuses)
    if type(row_data['participant_type']) == str:
        types = re.findall('(\d+)::(\w+)', row_data['participant_type'])
        types = dict(types)
    
    # Get the number of people
    n_people = max(len(l) for l in [ages, genders, names, relationships, statuses, types])
    people_data = {'age': [], 'gender': [], 'name': [], 'relationship': [], 'status': [], 'type': []}
    for person in range(n_people):
        person = str(person)
        
        # Append the data
        if person in ages:
            people_data['age'].append(ages[person])
        else:
            people_data['age'].append(np.nan)
        if person in genders:
            people_data['gender'].append(genders[person])
        else:
            people_data['gender'].append('')
        if person in names:
            people_data['name'].append(names[person])
        else:
            people_data['name'].append('')
        if person in relationships:
            people_data['relationship'].append(relationships[person])
        else:
            people_data['relationship'].append('')
        if person in statuses:
            people_data['status'].append(statuses[person])
        else:
            people_data['status'].append('')
        if person in types:
            people_data['type'].append(types[person])
        else:
            people_data['type'].append('')
    
    # Extract the peoples' data
    people = pd.DataFrame.from_dict(people_data)
    
    return people

In [173]:
# Get the people involved for a given incident; in this case, the one in the 0th index
people = get_people(data.iloc[6])
people

Unnamed: 0,age,gender,name,relationship,status,type
0,51,Male,Greg Griego,,Killed,Victim
1,40,Female,Sara Griego,,Killed,Victim
2,9,Male,Zephania Griego,,Killed,Victim
3,5,Female,Jael Griego,,Killed,Victim
4,2,Female,Angelina Griego,,Killed,Victim
5,15,Male,Nehemiah Griego,Family,Unharmed,Subject
