In [1]:
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
from pathlib import Path
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load raw data
# The data folder is resolved relative to the notebook's working directory.
# The previous `os.path.dirname(cwd + '\\')` trick only yields the working
# directory on Windows; on POSIX the backslash is not a separator, so dirname()
# returned the parent directory instead.
directory = str(Path().resolve())
path = os.path.join(directory, 'data', 'gun-violence-data.csv')

gun_data = pd.read_csv(path)
print(gun_data.columns)

Index(['incident_id', 'date', 'state', 'city_or_county', 'address', 'n_killed',
       'n_injured', 'incident_url', 'source_url',
       'incident_url_fields_missing', 'congressional_district', 'gun_stolen',
       'gun_type', 'incident_characteristics', 'latitude',
       'location_description', 'longitude', 'n_guns_involved', 'notes',
       'participant_age', 'participant_age_group', 'participant_gender',
       'participant_name', 'participant_relationship', 'participant_status',
       'participant_type', 'sources', 'state_house_district',
       'state_senate_district'],
      dtype='object')


In [3]:
# Keep only the columns used by the rest of the notebook (URLs, free-text notes,
# location descriptions, sources and age groups are dropped).
kept_columns = [
    'incident_id', 'date', 'state', 'city_or_county', 'address',
    'n_killed', 'n_injured', 'congressional_district',
    'gun_stolen', 'gun_type', 'incident_characteristics',
    'latitude', 'longitude', 'n_guns_involved',
    'participant_age', 'participant_gender', 'participant_name',
    'participant_relationship', 'participant_status', 'participant_type',
    'state_house_district', 'state_senate_district',
]
gun_data = gun_data[kept_columns]

# Report how many incidents and which columns remain.
print(f'# of Incidents: {len(gun_data)}')
print(f'Columns: {gun_data.columns.values}')

# of Incidents: 239677
Columns: ['incident_id' 'date' 'state' 'city_or_county' 'address' 'n_killed'
 'n_injured' 'congressional_district' 'gun_stolen' 'gun_type'
 'incident_characteristics' 'latitude' 'longitude' 'n_guns_involved'
 'participant_age' 'participant_gender' 'participant_name'
 'participant_relationship' 'participant_status' 'participant_type'
 'state_house_district' 'state_senate_district']


In [4]:
# Replace date column with year, month, and day
# The date strings are parsed once; the three parts are assigned positionally.
parsed_dates = pd.DatetimeIndex(gun_data['date'])
gun_data = gun_data.assign(
    year=parsed_dates.year,
    month=parsed_dates.month,
    day=parsed_dates.day,
)

# Drop 'date' by name (not by position) and move year/month/day right after
# the first column.
cols = [col for col in gun_data.columns if col != 'date']
cols = cols[:1] + cols[-3:] + cols[1:-3]
gun_data = gun_data[cols]

# Remove all rows with NaN latitude or longitude
# Both coordinates are checked (previously only latitude was). The index is
# deliberately not reset: later cells align per-incident tables on these labels.
has_coordinates = np.isfinite(gun_data['latitude']) & np.isfinite(gun_data['longitude'])
gun_data = gun_data[has_coordinates]

In [5]:
# Load the other CSVs

# The data folder is resolved relative to the notebook's working directory.
# Appending '\\' and calling dirname() only worked on Windows; on POSIX it
# pointed at the parent directory.
directory = str(Path().resolve())
data_dir = os.path.join(directory, 'data')

mental_health = pd.read_csv(os.path.join(data_dir, 'EverytownNICSMentalHealth.csv'))
# Remove commas from number values
# NOTE: only the thousands separators are stripped; no dtype conversion is done here.
mental_health = mental_health.replace({',': ''}, regex=True)

gun_safety = pd.read_csv(os.path.join(data_dir, 'GiffordsData.csv'))
gun_ownership = pd.read_csv(os.path.join(data_dir, 'Gun ownership.csv'))
happiness = pd.read_csv(os.path.join(data_dir, 'happiness.csv'))
concealed_carry = pd.read_csv(os.path.join(data_dir, 'ConcealedCarryPrices.csv'))

In [6]:
# Giffords pre-processing: turn the letter grade into a numeric score and give
# the columns human-readable names.
# Grades are listed best to worst and paired with 12 down to 1.
letter_grades = ['A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D', 'D-', 'F']
grades = dict(zip(letter_grades, range(12, 0, -1)))

# Grades that are not in the mapping become NaN.
gun_safety['Giffords_Gun_Safety_Grade'] = gun_safety['Giffords_Gun_Safety_Grade'].map(grades)

readable_names = {
    'Giffords_Gun_Safety_Grade': 'Giffords Gun Safety Score',
    'Gun_Death_Rate_2018 (per 100k people)': 'Gun Deaths per 100k People (2018)',
    'Gun_Death_Rank': 'Gun Death Rate (Ranked High to Low)',
}
gun_safety = gun_safety.rename(columns=readable_names)
gun_safety.head()

Unnamed: 0,State,Giffords Gun Safety Score,Gun Deaths per 100k People (2018),Gun Death Rate (Ranked High to Low)
0,Alabama,1.0,22.9,2.0
1,Alaska,1.0,24.5,1.0
2,Arizona,1.0,15.7,18.0
3,Arkansas,1.0,20.2,7.0
4,California,12.0,7.8,44.0


In [7]:
# Happiness pre-processing: split the combined 'City' text into a state code
# and a bare city name.
# The last two characters become 'State' and the last four characters are cut
# from 'City' (presumably values look like "<city>, <ST>" - confirm against the CSV).
city_and_state = happiness['City']
happiness['State'] = city_and_state.str.slice(start=-2)
happiness['City'] = city_and_state.str.slice(stop=-4)

In [8]:
# Happiness pre-processing: lookup from two-letter abbreviation to full name.
# Covers the states, DC, the territories, and the special code 'NA' ('National').
# Entries are grouped by the first letter of the abbreviation.
states = {
    'AK': 'Alaska', 'AL': 'Alabama', 'AR': 'Arkansas', 'AS': 'American Samoa', 'AZ': 'Arizona',
    'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut',
    'DC': 'District of Columbia', 'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia', 'GU': 'Guam',
    'HI': 'Hawaii',
    'IA': 'Iowa', 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana',
    'KS': 'Kansas', 'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts', 'MD': 'Maryland', 'ME': 'Maine', 'MI': 'Michigan',
    'MN': 'Minnesota', 'MO': 'Missouri', 'MP': 'Northern Mariana Islands',
    'MS': 'Mississippi', 'MT': 'Montana',
    'NA': 'National', 'NC': 'North Carolina', 'ND': 'North Dakota', 'NE': 'Nebraska',
    'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NV': 'Nevada', 'NY': 'New York',
    'OH': 'Ohio', 'OK': 'Oklahoma', 'OR': 'Oregon',
    'PA': 'Pennsylvania', 'PR': 'Puerto Rico',
    'RI': 'Rhode Island',
    'SC': 'South Carolina', 'SD': 'South Dakota',
    'TN': 'Tennessee', 'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia', 'VI': 'Virgin Islands', 'VT': 'Vermont',
    'WA': 'Washington', 'WI': 'Wisconsin', 'WV': 'West Virginia', 'WY': 'Wyoming',
}
# Attach the full state name to every happiness row; abbreviations that are not
# in `states` map to NaN.
happiness['State_full'] = happiness['State'].map(states)

# Build a second table, indexed by full state name, holding the number of
# happiness rows (non-null 'City' values) per state. Note that the resulting
# 'Happiness Score' column is this count, not an average score.
ratings_per_state = happiness.groupby('State_full')['City'].count()
happiness2 = ratings_per_state.to_frame(name='Happiness Score').rename(index=str)

In [9]:
# Extract the number of people in each incident
# Every participant column is searched for entries of the form "<digits>::<value>".
# Only the number of matches per row is needed, so the matches are counted with
# str.count instead of being materialised with extractall.


def _last_match_index(column, pattern):
    """Return the zero-based position of the last `pattern` match in each row.

    Parameters
    ----------
    column : pd.Series of strings (missing values allowed)
    pattern : str, regular expression to search for

    Returns
    -------
    pd.Series of int named 'match', indexed like `column` but containing only
    the rows with at least one match (value = number of matches - 1).
    """
    match_counts = column.str.count(pattern)
    # Missing rows compare as False here, so they are dropped with the zero-match rows.
    return (match_counts[match_counts > 0] - 1).astype(int).rename('match')


# Raw strings keep '\d', '\w' and '\s' from being read as (invalid) string escapes.
num_ages = _last_match_index(gun_data['participant_age'], r'\d+::(\d+)')
num_genders = _last_match_index(gun_data['participant_gender'], r'\d+::(\w+)')
num_names = _last_match_index(gun_data['participant_name'], r'\d+::(\w+\s*\w*)')
num_relationships = _last_match_index(gun_data['participant_relationship'], r'\d+::(\w+)')
num_statuses = _last_match_index(gun_data['participant_status'], r'\d+::(\w+)')
num_types = _last_match_index(gun_data['participant_type'], r'\d+::(\w+)')

# Get the number of people in each incident by taking the max of the respective categories
# (incidents with no match in any category do not appear at all).
num_merged = pd.concat([num_ages, num_genders, num_names, num_relationships, num_statuses, num_types], axis=1)
all_nums = num_merged.max(axis=1, skipna=True).astype(int)

# Make it into a df; add 1 because the match positions are zero-based
all_nums = (all_nums + 1).to_frame(name='n_people')

print(len(all_nums))

204102


In [10]:
# Extract the ages
# One row per regex match; after reset_index, 'level_0' is the incident's row
# label in gun_data and 'match' is the position of the entry within that incident.
raw_ages = gun_data['participant_age'].str.extractall(r'\d+::(?P<age>\d+)')
raw_ages['age'] = raw_ages['age'].astype(int)
raw_ages = raw_ages.reset_index()

# Count the ages in each group
# pd.cut intervals are open on the left, so the first edge is -1 rather than 0;
# with an edge of 0, participants recorded with age 0 were silently dropped.
# Groups: 0-25 (n_young), 26-45 (n_mid), 46-999 (n_old).
age_labels = ['n_young', 'n_mid', 'n_old']
age_group = pd.cut(raw_ages['age'], bins=[-1, 25, 45, 999], labels=age_labels)
ages = raw_ages.groupby(['level_0', age_group], observed=True).size().unstack(fill_value=0)

# Use plain string column labels and make sure all three groups are present
# even if one of them never occurs in the data.
ages.columns = ages.columns.astype(str)
ages = ages.reindex(columns=age_labels, fill_value=0).rename_axis(index='incident', columns='age')

print(len(ages))
ages.head()

139297


age,n_young,n_mid,n_old
incident,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,0,0
1,1,0,0
2,1,4,0
3,0,3,1
4,2,0,2


In [11]:
# Extract the genders
# One row per regex match; after reset_index, 'level_0' is the incident's row
# label in gun_data. The raw string keeps '\d' and '\w' from being read as
# (invalid) string escapes.
raw_genders = gun_data['participant_gender'].str.extractall(r'\d+::(?P<gender>\w+)')
raw_genders = raw_genders.reset_index()

# Count the genders in each incident: one row per incident, one column per
# gender value, 0 where a gender does not occur.
genders = raw_genders.groupby(['level_0', 'gender']).size().unstack(fill_value=0)
genders = genders.rename_axis(index='incident', columns='gender')

# Rename columns
genders.columns = genders.columns.astype(str)
genders = genders.rename(columns={'Female': 'n_female', 'Male': 'n_male'})
genders['n_female'] = genders['n_female'].astype(int)
genders['n_male'] = genders['n_male'].astype(int)

print(len(genders))
genders.head()

193047


gender,n_female,n_male
incident,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,3
1,0,1
2,0,5
3,1,3
4,2,2


In [12]:
# Put the per-incident tables side by side; rows are aligned on the index labels.
per_incident_tables = [gun_data, all_nums, ages, genders]
joined_data = pd.concat(per_incident_tables, axis=1)

# Derive the "unknown" counts as total people minus the people whose gender/age
# is known. The result is NaN wherever any of the inputs is missing.
known_gender = joined_data['n_female'] + joined_data['n_male']
known_age = joined_data['n_young'] + joined_data['n_mid'] + joined_data['n_old']
joined_data['n_unknown_gender'] = joined_data['n_people'] - known_gender
joined_data['n_unknown_age'] = joined_data['n_people'] - known_age

print(len(joined_data))

231754


In [13]:
# Collect the incidents with no usable information about the people involved,
# i.e. rows where the unknown-gender or the unknown-age count could not be computed.
unknown_count_columns = ['n_unknown_gender', 'n_unknown_age']
lacks_people_info = joined_data[unknown_count_columns].isnull().any(axis=1)
missing_data = joined_data[lacks_people_info]
print(len(missing_data))

93617


In [14]:
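# CAMILLE'S OLD CODE; NOT USED ANYMORE. It is kept commented out for reference only: the per-incident gender and age counts are now computed by the vectorized cells above.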

# # For each incident, generate column values for #female, #male, and various age groups
# gun_data['n_female'] = gun_data['n_killed'].copy()
# gun_data['n_male'] = gun_data['n_killed'].copy()
# gun_data['n_unknown_gender'] = gun_data['n_killed'].copy()
# gun_data['n_young'] = gun_data['n_killed'].copy()
# gun_data['n_mid'] = gun_data['n_killed'].copy()
# gun_data['n_old'] = gun_data['n_killed'].copy()
# gun_data['n_unknown_age'] = gun_data['n_killed'].copy()
# for i in range(len(gun_data)):
#     if i % 100 == 0:
#         print("processed:", i, "so far")
#     people = get_people(gun_data.iloc[i])
#     values = {'age': 1000, 'gender': "Unknown"}
#     people = people.fillna(value=values)
#     gender_people = people.groupby('gender').count()['age'].to_frame().rename(index=str, columns={"age": "Count"})
#     age_people = people.groupby('age').count()['gender'].to_frame().rename(index=str, columns={"gender": "Count"})
#     age_people['Age'] = age_people.index
#     try: 
#         gun_data['n_female'][i] = gender_people.loc["Female"][0]
#     except:
#         gun_data['n_female'][i] = 0
#     try: 
#         gun_data['n_male'][i] = gender_people.loc["Male"][0]
#     except:
#         gun_data['n_male'][i] = 0
#     try: 
#         gun_data['n_unknown_gender'][i] = gender_people.loc["Unknown"][0]
#     except:
#         gun_data['n_unknown_gender'][i] = 0
#     count_young = 0
#     count_mid = 0
#     count_old = 0
#     count_unknown = 0
#     for j in range(len(age_people)):
#         age = float(age_people.iloc[j][1])
#         count = age_people.iloc[j][0]
        
#         if age < 25:
#             count_young += count
#         elif age < 45:
#             count_mid += count
#         elif age < 999:
#             count_old += count
#         else:
#             count_unknown += count
#     gun_data['n_young'][i] = count_young
#     gun_data['n_mid'][i] = count_mid
#     gun_data['n_old'][i] = count_old
#     gun_data['n_unknown_age'][i] = count_unknown

In [15]:
# Merge Everything
# Each per-state table is inner-joined onto the incidents by state name, so
# incidents in a state that is missing from any table are dropped.


def _merge_state_table(incidents, state_table):
    """Inner-join a per-state table onto the incident rows by state name.

    The table's 'State' key is renamed to 'state' first so both sides share one
    key column. Merging repeatedly with a separate right-hand 'State' column
    produced duplicate 'State_x'/'State_y' labels, which pandas >= 2.0 rejects
    with a MergeError.
    """
    return incidents.merge(state_table.rename(columns={'State': 'state'}), on='state')


merged1 = _merge_state_table(joined_data, mental_health)
merged2 = _merge_state_table(merged1, gun_safety)
merged3 = _merge_state_table(merged2, gun_ownership)
merged4 = _merge_state_table(merged3, concealed_carry)
# happiness2 is keyed by its index level 'State_full' rather than a 'State' column.
merged5 = merged4.merge(happiness2, left_on='state', right_on='State_full')


In [16]:
# Write the cleaned table next to the input CSVs.
cleanPath = os.path.join(directory, 'data', 'final_data.csv')

# Keep every incident except those where both n_killed and n_injured are 0.
no_casualties = (merged5['n_killed'] == 0) & (merged5['n_injured'] == 0)
final_data = merged5[~no_casualties]
final_data.to_csv(cleanPath)