In [None]:
# setup
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Read the incident-level CSV into a DataFrame and preview its first five rows.
gun_data = pd.read_csv(filepath_or_buffer='stage3.csv')
gun_data.head(n=5)

# Separating Participants in Gun Violence Data

The final separated participant data can be found in `gun_violence_participants.csv`.

In [None]:
# Keep only the incident id and the packed per-participant columns; work on an
# independent (deep) copy so later edits never touch `gun_data`.
participants = gun_data.loc[
    :,
    [
        "incident_id",
        "participant_age",
        "participant_age_group",
        "participant_gender",
        "participant_name",
        "participant_status",
        "participant_type",
    ],
].copy()
participants.head()

In [None]:
# Convert a packed participant column into a list of dictionaries, one dictionary per row of the gun violence dataset.
# Each dictionary maps 'participant_<n>' to that participant's value for the column (e.g. their age).

def _parse_participant_entry(entry):
    """Turn one packed string such as ``"0::Adult||1::Teen"`` into a dict.

    Records are separated by ``||`` (some rows use a single ``|``) and each
    record is ``<participant number><sep><value>`` where ``<sep>`` is ``::``
    (some rows use a single ``:``). Records without any separator are skipped.
    """
    participants_dict = {}

    # Collapsing '||' to '|' lets a single split handle both record separators.
    for record in entry.replace("||", "|").split("|"):
        # Try the canonical '::' first so a value that itself contains ':' is
        # kept whole; fall back to ':' only for records that lack '::'.
        for separator in ("::", ":"):
            number, found, value = record.partition(separator)
            if found:
                participants_dict['participant_' + number] = value
                break

    return participants_dict


def get_participant_dictionaries(df_column, age_data = False):
    """Parse a column of packed participant strings into one dict per row.

    Every record is parsed on its own, so a row that mixes separators
    (e.g. ``"0::20||1:31"``) keeps all of its values instead of having the
    already-parsed ones overwritten by a whole-row retry.

    Parameters
    ----------
    df_column : iterable
        Column values. Each is either a packed string such as
        ``"0::Adult 18+||1::Teen 12-17"`` or a missing value (``None``, or
        anything whose ``str()`` is ``'nan'``).
    age_data : bool, optional
        Unused; retained so existing callers that pass it keep working.

    Returns
    -------
    list of dict
        One dictionary per input entry, in order, mapping
        ``'participant_<n>'`` to that participant's value (a string).
        Missing entries produce ``{'participant_0': nan}``.
    """
    participant_values_dicts = []

    for entry in df_column:
        if entry is None or str(entry) == 'nan':
            # Placeholder keeps the output aligned one-to-one with the input rows.
            # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
            participant_values_dicts.append({'participant_0': np.nan})
        else:
            participant_values_dicts.append(_parse_participant_entry(str(entry)))

    return participant_values_dicts


# Replace every packed participant column with its parsed form: one dictionary per
# incident, keyed by 'participant_<n>'. incident_id is left untouched so a participant
# can still be traced back to the gun incident it belongs to.
for packed_column in (
    "participant_age",
    "participant_age_group",
    "participant_gender",
    "participant_name",
    "participant_status",
    "participant_type",
):
    participants[packed_column] = get_participant_dictionaries(participants[packed_column])

participants

In [None]:
# Separate participants so that each participant gets their own row.
#
# Rows are collected in a plain list and turned into a DataFrame once at the end.
# Writing each row to `.loc[index]` with the incident's row label made every
# participant of an incident overwrite the previous one (only the last survived),
# and growing a DataFrame row by row is quadratic.
participant_records = []

for incident_id, ages, age_groups, genders, names, statuses, types in zip(
    participants["incident_id"],
    participants["participant_age"],
    participants["participant_age_group"],
    participants["participant_gender"],
    participants["participant_name"],
    participants["participant_status"],
    participants["participant_type"],
):
    # The keys of the participant_type dictionary decide which participants exist
    # for this incident; attributes missing from the other dictionaries become NaN.
    for participant_id in types:
        participant_records.append({
            'id': str(incident_id) + '_' + participant_id,
            'incident_id': incident_id,
            # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
            'age': ages.get(participant_id, np.nan),
            'age_group': age_groups.get(participant_id, np.nan),
            'gender': genders.get(participant_id, np.nan),
            'name': names.get(participant_id, np.nan),
            'status': statuses.get(participant_id, np.nan),
            'type': types.get(participant_id, np.nan),
        })

# Passing `columns` fixes the column order and keeps the schema even with zero rows.
participants_separated = pd.DataFrame(
    participant_records,
    columns=['id', 'incident_id', 'age', 'age_group', 'gender', 'name', 'status', 'type'],
)

#display dataframe of separated participants
participants_separated