# Additional Data Wrangling

## Table of Contents

#### 1. Importing Libraries
#### 2. Importing Data
#### 3. Data Wrangling
#### 4. Exporting Data

# 01. Importing Libraries

In [5]:
# Import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# 02. Importing Data

In [7]:
# Create path
# The project root can be overridden per machine through the
# GUN_VIOLENCE_PROJECT_DIR environment variable; when it is unset, the original
# local folder is used, so existing runs behave exactly as before.
path = os.environ.get(
    'GUN_VIOLENCE_PROJECT_DIR',
    r'C:\Users\16307\Desktop\Tasks - DA Immersion\Gun Violence Analysis',
)

In [8]:
# Load the cleaned gun violence data set from the project's '02 Data' folder
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which suggests
# the CSV was written with its index; consider index_col=0 if that column is not needed.
cleaned_csv_path = os.path.join(path, '02 Data', 'gun_violence_cleaned.csv')
df = pd.read_csv(cleaned_csv_path)

# 03. Data Wrangling

In [10]:
# Function to extract the ages, compute the average, and handle mixed formats
def calculate_average_age(age_string):
    """Return the mean participant age encoded in one 'participant_age' value.

    Two encodings are understood:
      * 'index::age' pairs separated by '||'  (e.g. '0::25||1::31')
      * 'index:age' pairs separated by '|'    (e.g. '0:25|1:31')

    Entries whose age part is missing or is not a whole number are skipped, so a
    single malformed entry (for example a trailing delimiter) does not discard
    the valid ages around it.

    Parameters
    ----------
    age_string : str or missing value
        Raw cell value from the 'participant_age' column.

    Returns
    -------
    float
        Average of the valid ages, or np.nan when the value is missing, blank,
        not a string, in neither format, or contains no valid ages.
    """
    # Non-string cells (NaN, None, stray numbers) carry no parseable ages.
    if not isinstance(age_string, str) or age_string == "":
        return np.nan

    # Pick the delimiter pair; '::' must be checked first because any string
    # containing '::' also contains ':'.
    if "::" in age_string:
        entry_sep, field_sep = "||", "::"
    elif ":" in age_string:
        entry_sep, field_sep = "|", ":"
    else:
        return np.nan  # Neither format matches

    ages = []
    for entry in age_string.split(entry_sep):
        fields = entry.split(field_sep)
        # Skip entries without an age field instead of failing the whole row.
        # isdecimal() (unlike isdigit()) only accepts text that int() can parse.
        if len(fields) > 1 and fields[1].isdecimal():
            ages.append(int(fields[1]))

    # An empty list means no valid ages were found.
    return sum(ages) / len(ages) if ages else np.nan

# Derive each row's mean participant age from the raw 'participant_age' strings
# and store it in a new 'average_age' column
participant_ages = df['participant_age']
df['average_age'] = participant_ages.apply(calculate_average_age)

In [11]:
# Preview the first five rows, including the new 'average_age' column
df.head(n=5)

Unnamed: 0,incident_id,date,state,city_or_county,n_killed,n_injured,congressional_district,gun_stolen,gun_type,latitude,...,longitude,n_guns_involved,participant_age,participant_age_group,participant_gender,participant_status,participant_type,state_house_district,state_senate_district,average_age
0,461105,1/1/2013,Pennsylvania,Mckeesport,0,4,14.0,,,40.3467,...,-79.8559,,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||3::Male||4::Female,0::Arrested||1::Injured||2::Injured||3::Injure...,0::Victim||1::Victim||2::Victim||3::Victim||4:...,,,20.0
1,460726,1/1/2013,California,Hawthorne,1,3,43.0,,,33.909,...,-118.333,,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male,0::Killed||1::Injured||2::Injured||3::Injured,0::Victim||1::Victim||2::Victim||3::Victim||4:...,62.0,35.0,20.0
2,478855,1/1/2013,Ohio,Lorain,1,3,9.0,0::Unknown||1::Unknown,0::Unknown||1::Unknown,41.4455,...,-82.1377,2.0,0::25||1::31||2::33||3::34||4::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||2::Male||3::Male||4::Male,"0::Injured, Unharmed, Arrested||1::Unharmed, A...",0::Subject-Suspect||1::Subject-Suspect||2::Vic...,56.0,13.0,31.2
3,478925,1/5/2013,Colorado,Aurora,4,0,6.0,,,39.6518,...,-104.802,,0::29||1::33||2::56||3::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Female||1::Male||2::Male||3::Male,0::Killed||1::Killed||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,40.0,28.0,37.75
4,478959,1/7/2013,North Carolina,Greensboro,2,2,6.0,0::Unknown||1::Unknown,0::Handgun||1::Handgun,36.114,...,-79.9569,2.0,0::18||1::46||2::14||3::47,0::Adult 18+||1::Adult 18+||2::Teen 12-17||3::...,0::Female||1::Male||2::Male||3::Female,0::Injured||1::Injured||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,62.0,27.0,31.25


# 04. Exporting Data

In [13]:
# Save the enriched dataframe to the project's '02 Data' folder as both CSV and pickle
# NOTE(review): to_csv writes the row index by default, which comes back as an
# 'Unnamed: 0' column on the next read_csv; consider index=False if no reader needs it.
output_dir = os.path.join(path, '02 Data')
df.to_csv(os.path.join(output_dir, 'gun_violence_cleaned2.csv'))
df.to_pickle(os.path.join(output_dir, 'gun_violence_cleaned2.pkl'))