# Investigating correlation between economic factors and number of school shootings

### Dependencies

In [1]:
import pandas as pd
import numpy as np

### First task: get dataframe of number of school shootings per state per year

In [2]:
# import school shootings dataset and create dataframe
filepath1 = '../../output/cleaned_school_shootings_1990_2018.csv'

In [3]:
csv = pd.read_csv(filepath1)
df = pd.DataFrame(csv)

In [4]:
df.head()

Unnamed: 0,Date,City,State,Fatalities,Year,Latitude,Longitude
0,3/27/90,Brooklyn,New York,0,1990,40.678178,-73.944158
1,5/20/90,Centerville,Tennessee,1,1990,39.628393,-84.159382
2,8/26/90,Las Vegas,Nevada,1,1990,36.169941,-115.13983
3,9/11/90,San Antonio,Texas,0,1990,29.424122,-98.493628
4,1/8/91,Richardson,Texas,0,1991,32.948334,-96.729852


In [5]:
# take only relevant columns
smaller_df = df[['State', 'Fatalities', 'Year']]

In [6]:
# group by state in order to get school shooting counts for each year
grouped_by_state = smaller_df.groupby('State')

In [7]:
# get count of shootings each state, each year
grouped_by_state.Year.value_counts()

State          Year
Alabama        2010    4
               1992    1
               1996    1
               2006    1
               2008    1
               2011    1
               2012    1
               2013    1
               2016    1
Alaska         1997    1
Arizona        2013    2
               1992    1
               2000    1
               2002    1
               2008    1
               2010    1
               2015    1
               2016    1
Arkansas       2008    3
               1998    2
               1996    1
               1997    1
               2000    1
California     1994    5
               1996    5
               1998    5
               2009    5
               1993    4
               2001    4
               2011    4
                      ..
Virginia       2009    3
               2013    2
               1998    1
               2002    1
               2007    1
               2010    1
               2011    1
Washington     1994    3
     

In [8]:
sorted(smaller_df.State.unique().tolist())

['Alabama',
 'Alaska',
 'Arizona',
 'Arkansas',
 'California',
 'Colorado',
 'Connecticut',
 'D.C.',
 'Delaware',
 'District Of Columbia',
 'District of Columbia',
 'Florida',
 'Georgia',
 'Hawaii',
 'IA',
 'Idaho',
 'Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Mississippi',
 'Missouri',
 'Montana',
 'Nebraska',
 'Nevada',
 'New Hampshire',
 'New Jersey',
 'New Mexico',
 'New York',
 'North Carolina',
 'Ohio',
 'Oklahoma',
 'Oregon',
 'Pennsylvania',
 'South Carolina',
 'South Dakota',
 'Tennessee',
 'Texas',
 'Utah',
 'Vermont',
 'Virgin Islands',
 'Virginia',
 'Washington',
 'West Virginia',
 'Wisconsin',
 'Wyoming']

In [9]:
# list of items to replace:
# 'D.C', 'District Of Columbia', replaced with 'District of Columbia'
# 'IA', replaced with 'Iowa'
# Drop virgin islands

In [10]:
# perform above cleaning
smaller_df.replace('D.C.', 'District of Columbia', inplace=True)
smaller_df.replace('District Of Columbia', 'District of Columbia', inplace=True)
smaller_df.replace('IA', 'Iowa', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [11]:
cleaned_df = smaller_df[smaller_df['State'] != 'Virgin Islands']

In [12]:
statelist = sorted(cleaned_df.State.unique().tolist())

In [13]:
new_df = pd.DataFrame(columns=statelist, index=['1990',
                                '1991',
                                '1992',
                                '1993',
                                '1994',
                                '1995',
                                '1996',
                                '1997',
                                '1998',
                                '1999',
                                '2000',
                                '2001',
                                '2002',
                                '2003',
                                '2004',
                                '2005',
                                '2006',
                                '2007',
                                '2008',
                                '2009',
                                '2010',
                                '2011',
                                '2012',
                                '2013',
                                '2014',
                                '2015',
                                '2016',
                                '2017',
                                '2018',
                                ])

In [14]:
new_df.head()

Unnamed: 0,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Florida,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
1990,,,,,,,,,,,...,,,,,,,,,,
1991,,,,,,,,,,,...,,,,,,,,,,
1992,,,,,,,,,,,...,,,,,,,,,,
1993,,,,,,,,,,,...,,,,,,,,,,
1994,,,,,,,,,,,...,,,,,,,,,,


In [15]:
# iterate through cleaned shooting dataframe, and populate new dataframe
new_df.replace(np.NaN, 0, inplace=True)
for index, row in cleaned_df.iterrows():
    new_df.loc[str(row['Year']), row['State']] += 1

In [16]:
# cast all values as integers
newer_df = new_df.astype(int)

In [17]:
newer_df.head()

Unnamed: 0,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Florida,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
1990,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
1991,0,0,0,0,1,0,0,0,0,0,...,0,0,4,0,0,0,0,0,0,0
1992,1,0,1,0,2,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
1993,0,0,0,0,4,0,1,1,1,2,...,0,0,4,1,0,0,1,0,1,1
1994,0,0,0,0,5,0,0,0,0,2,...,0,1,1,0,0,0,3,0,0,0


### Export CSV

In [18]:
newer_df.to_csv('../../output/sparse_school_shootings_per_state_per_year.csv')

## Do same for fatalities

In [19]:
another_df = pd.DataFrame(columns=statelist, index=['1990',
                                '1991',
                                '1992',
                                '1993',
                                '1994',
                                '1995',
                                '1996',
                                '1997',
                                '1998',
                                '1999',
                                '2000',
                                '2001',
                                '2002',
                                '2003',
                                '2004',
                                '2005',
                                '2006',
                                '2007',
                                '2008',
                                '2009',
                                '2010',
                                '2011',
                                '2012',
                                '2013',
                                '2014',
                                '2015',
                                '2016',
                                '2017',
                                '2018',
                                ])

In [20]:
another_df.replace(np.NaN, 0, inplace=True)
for index, row in cleaned_df.iterrows():
    another_df.loc[str(row['Year']), row['State']] += row['Fatalities']

In [21]:
fatality_df = another_df.astype(int)

In [22]:
fatality_df.head()

Unnamed: 0,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Florida,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
1990,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1991,0,0,0,0,1,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
1992,1,0,1,0,3,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1993,0,0,0,0,4,0,1,1,1,2,...,0,0,4,1,0,0,1,0,1,1
1994,0,0,0,0,4,0,0,0,0,1,...,0,1,0,0,0,0,3,0,0,0


### Export as csv

In [23]:
fatality_df.to_csv('../../output/sparse_fatalities_by_state_by_year.csv')