In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the raw input CSV into a DataFrame (the file name suggests shooting
# records covering 1982-2023).
df = pd.read_csv(filepath_or_buffer='prep_csv/shooting-1982-2023.csv')

In [3]:
# Clean the raw records held in `df` and write the prepared table to
# 'prep_csv/new_shooting_data.csv'.

# Replace '-' with 'unknown' in the entire DataFrame
df.replace('-', 'unknown', inplace=True)

# Remove duplicate rows
df.drop_duplicates(inplace=True)

# Convert the 'date' column to datetime; unparseable values become NaT
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Split 'location' into city and state at the FIRST ', ' only.  Limiting the
# split (n=1) and forcing exactly two result columns keeps the assignment
# valid when a location contains extra commas or when no row has a comma.
location_parts = (
    df['location']
    .str.split(', ', n=1, expand=True)
    .reindex(columns=[0, 1])
)
df['city'] = location_parts[0]
df['state'] = location_parts[1]

# Pull name, age and incident description out of the free-text 'summary'.
summary = df['summary']

# Name: the text before the first comma, when that comma is followed by a number
df['name'] = summary.str.extract(r'^([^,]+), \d+', expand=False)

# Age: the first standalone run of digits in the summary, as a number
# NOTE(review): this takes the first number anywhere in the text, which is
# presumably the age -- confirm against the data.
df['age'] = pd.to_numeric(
    summary.str.extract(r'\b(\d+)\b', expand=False),
    errors='coerce',
)

# Incident description: whatever follows the second ', ' (or the last piece
# when there are fewer separators).  `n` must be passed by keyword: it is
# keyword-only in pandas >= 2.0.
df['incident_description'] = summary.str.split(', ', n=2).str[-1]

# Drop the source columns that have now been split up
df.drop(columns=['summary', 'location'], inplace=True)

# Convert age_of_shooter to numeric; non-numeric entries become NaN
shooter_age = pd.to_numeric(df['age_of_shooter'], errors='coerce')
df['age_of_shooter'] = shooter_age

# Add age_group based on age_of_shooter.  Rows with no usable age are labelled
# 'unknown' instead of silently falling into 'not 18+'.
# NOTE(review): an age of exactly 18 is labelled 'not 18+' (strict '>'), as in
# the original logic -- confirm this boundary is intended.
df['age_group'] = np.select(
    [shooter_age.isna(), shooter_age > 18],
    ['unknown', 'above 18'],
    default='not 18+',
)

# Add a 1-based sequential case_key
df['case_key'] = range(1, len(df) + 1)

# Normalise weapons_obtained_legally to 'yes' / 'no'.  Anything other than
# 'yes' (ignoring case and surrounding whitespace) becomes 'no'; going through
# astype(str) means missing values no longer raise on `.lower()`.
obtained_legally = (
    df['weapons_obtained_legally'].astype(str).str.strip().str.lower()
)
df['weapons_obtained_legally'] = np.where(obtained_legally == 'yes', 'yes', 'no')

# Make every value a lowercase string, but keep missing values missing so the
# CSV gets empty cells rather than the literal text 'nan' / 'nat' / 'none'.
df = df.apply(lambda col: col.astype(str).str.lower().where(col.notna()))

# Save the modified DataFrame to a new CSV file
df.to_csv('prep_csv/new_shooting_data.csv', index=False)
