# School Shootings Dataset, Cleaning

### Dependencies

In [1]:
import pandas as pd
import requests
import json

### Google Geocode API constants

In [2]:
api_key = 'AIzaSyAeSwgFHJnGt6wTx9rEXmp5yy0QtaIzXiY'
base_url = 'https://maps.googleapis.com/maps/api/geocode/json?'
example_url_new_york = 'https://maps.googleapis.com/maps/api/geocode/json?address=New%20York&region=New%20York&key=AIzaSyBqwyQMdmH_-LZRLxrnLgtlzfenQiV0uoI'

### Read in CSV and create DataFrame

In [3]:
filepath = '../../raw/school_shootings_1990_2018.csv'
csv = pd.read_csv(filepath)
df = pd.DataFrame(csv)

### View head and dtypes

In [4]:
df.head()

Unnamed: 0,Date,City,State,AreaType,School,Fatalities,Wounded,Dupe,Source,Desc
0,3/27/90,Brooklyn,New York,,C,0,1.0,,Wikp,A black youth was taunted with racial slurs by...
1,5/20/90,Centerville,Tennessee,suburban,HS,1,,,Pah,
2,8/26/90,Las Vegas,Nevada,urban,HS,1,,,Pah,
3,9/11/90,San Antonio,Texas,,HS,0,3.0,,Wikp,3 students were wounded when gunfire broke out...
4,1/8/91,Richardson,Texas,urban,HS,0,,,Pah,


In [5]:
df.Fatalities.max()

33

In [6]:
df.dtypes

Date           object
City           object
State          object
AreaType       object
School         object
Fatalities      int64
Wounded       float64
Dupe           object
Source         object
Desc           object
dtype: object

### Remove duplicates
- Dataset is aggregated from two sources, 'pah' and 'wikipedia'
    - if an incident from the 'pah' dataset is a duplicate of a record in the wikipedia dataset, it is marked True in the Dupe column.
    - these will have to be removed
    - will do it in-place

In [7]:
len(df)

652

In [8]:
df.drop(df[df['Dupe'] == True].index, inplace=True)
df = df.reset_index(drop=True)

In [9]:
len(df)

514

### Remove superfluous columns

In [10]:
skinny_df = df[['Date', 'City', 'State', 'Fatalities']]

In [11]:
skinny_df.head()

Unnamed: 0,Date,City,State,Fatalities
0,3/27/90,Brooklyn,New York,0
1,5/20/90,Centerville,Tennessee,1
2,8/26/90,Las Vegas,Nevada,1
3,9/11/90,San Antonio,Texas,0
4,1/8/91,Richardson,Texas,0


### Parse date and add year column for later manipulation

In [12]:
#skinny_df['Year'] = ''
skinny_df = skinny_df.assign(Year="")


In [13]:
skinny_df.head()

Unnamed: 0,Date,City,State,Fatalities,Year
0,3/27/90,Brooklyn,New York,0,
1,5/20/90,Centerville,Tennessee,1,
2,8/26/90,Las Vegas,Nevada,1,
3,9/11/90,San Antonio,Texas,0,
4,1/8/91,Richardson,Texas,0,


In [14]:
# iterate over rows
for index, row in skinny_df.iterrows():
    # grab last two digits of date field, cast as int, store
    date = int(row['Date'][-2:])
    # if date is under 20, make it 20xx, else 19xx and set that to the row we are on, under Year column
    if (date < 20):
        skinny_df.iat[index, 4] = int(date + 2000)
    else:
        skinny_df.iat[index, 4] = int(date + 1900)

In [15]:
skinny_df.head()

Unnamed: 0,Date,City,State,Fatalities,Year
0,3/27/90,Brooklyn,New York,0,1990
1,5/20/90,Centerville,Tennessee,1,1990
2,8/26/90,Las Vegas,Nevada,1,1990
3,9/11/90,San Antonio,Texas,0,1990
4,1/8/91,Richardson,Texas,0,1991


### Remove years after 2015?
- general violent crime dataset stops at 2015, but that is ok
    - if the city had a high average number of violent crimes from 1990 to 2015, we can consider that city still violent
- will keep all years of data for this set

### Add lat, long columns using google maps geocode API

In [18]:
# testing json traversal

city = skinny_df.loc[0,'City']
state = skinny_df.loc[0,'State']
url = base_url + 'address=' + city + '&region=' + state + '&key=' + api_key
json = requests.get(url).json()
print(json)
lat = json['results'][0]['geometry']['location']['lat']
lng = json['results'][0]['geometry']['location']['lng']
#print(city, lat, lng)
print(lng)

{'error_message': 'You have exceeded your daily request quota for this API.', 'results': [], 'status': 'OVER_QUERY_LIMIT'}


IndexError: list index out of range

In [None]:
len(skinny_df)

In [None]:
# extracts city, state from dataframe, calls API, traverses resulting json and sets new columns to returned values
# note: because I am setting on the same dataframe I am iterating over, code takes 5 minutes to run (will fix, sry)

# create empty lists to hold api responses
lat_list = []
lng_list = []

for index, row in skinny_df.iterrows():
    city = row['City']
    state = row['State']
    url = base_url + 'address=' + city + '&region=' + state + '&key=' + api_key
    json = requests.get(url).json()
    lat = json['results'][0]['geometry']['location']['lat']
    lng = json['results'][0]['geometry']['location']['lng']
    skinny_df.loc[index, 'Latitude'] = lat
    skinny_df.loc[index, 'Longitude'] = lng
    print('city: ', city, '\n--------\n', 'lat: ', lat, '\n', 'lng: ', lng, '\n--------\n--------\n')


In [None]:
skinny_df.head()

### Export cleaned CSV

In [None]:
cleaned_data = skinny_df

In [None]:
cleaned_data.to_csv('Output/cleaned_school_shootings_1990_2018.csv', index=False)