In [1]:
#   Name: Teresa Ferrill
#   Date: 05/03/2020
# Course: DSC540-T302 Data Preparation
# Term Project: Milestone Three - Gun Violence

Cleaning/Formatting Website Data

Perform at least 5 data transformation and/or cleansing steps to your Website data.     
Note - these transformations/cleansing steps occur throughout the notebook


> 1. Format data into a more readable format      
>> --Converted HREF information for inclusion in table (link within table to details on incident)     
>> --Took Date column and broke it into three columns (Month, Day, Year) so that it can be used with other data sources sorted by Year     
> 2. Replace Headers  
>> --Added Index column header     
>> --Updated all column headers for data columns    
> 3. Find duplicates    
>> --Ran query to find duplicates (no duplicates found)
> 4. Identify outliers and bad data     
>> --Checked for NULL or NaN values within the data (no null values found)          
>> --Ran Histogram on Injured and Killed columns in order to determine if there were any negative values
> 5. Fix casing or inconsistent values     
>> --Deleted unnecessary column from table     
>> --Verified State, City, Month, Day, Year columns for inconsistent values    
>> --Updated State column to upper case state abbreviation

__Set up for Notebook activities, including import statements and establishment of custom functions__

In [2]:
# import libraries to support notebook processing
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# custom functions to support web scraping activities

# shared state used by the scraping helpers
# (the mass-shooting table covers 18 pages)

# a browser-like User-Agent header lets the request through;
# without it the website answers HTTP Error 403: Forbidden
hdr = {"User-Agent": "Mozilla/5.0"}

# address of the first page of the mass-shooting table
site = "https://www.gunviolencearchive.org/mass-shooting"

# three separate, initially empty lists:
#   data          - holds the table data
#   refsdata      - holds the href data that will become a field within the table
#   table_headers - holds the table headers from the web table
data, refsdata, table_headers = [], [], []

# function to support reading the web page
def readPage(url):
    """Download a web page and parse it with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    BeautifulSoup
        Parsed document for the page at ``url``.
    """
    # create request to open web page; hdr supplies the request headers
    req = Request(url, headers=hdr)

    # the context manager closes the HTTP response once it has been parsed
    with urlopen(req) as page:
        # name the parser explicitly: otherwise bs4 picks whichever parser is
        # installed (results can differ between machines) and emits a warning
        soup = BeautifulSoup(page, 'html.parser')

    # return the soup page to calling function
    return soup

# function to read the table in the web page
def readTable(soup):
    """Append every row of the page's first table to the shared lists.

    For each ``tr`` in the table body, the stripped text of its ``td`` cells
    is appended to the module-level ``data`` list, and the ``li`` elements
    with class "1 last" found in that row are appended to ``refsdata``, so
    the two lists always grow by one entry per row.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page to read the table from.

    Returns
    -------
    list
        The shared ``data`` list (it also contains rows from earlier calls).

    Raises
    ------
    ValueError
        If the page has no ``table`` element or the table has no ``tbody``.
    """
    # save table from soup page
    table = soup.find('table')
    if table is None:
        raise ValueError('no <table> element found on the page')

    # save body of table
    table_body = table.find('tbody')
    if table_body is None:
        raise ValueError('the table has no <tbody> element')

    # loop through the rows and collect columns and html reference to incident details
    for row in table_body.find_all('tr'):
        # strip off html from the column elements; blank cells are kept,
        # because dropping one would shift the remaining values of that row
        # to the left, under the wrong column headers
        data.append([cell.text.strip() for cell in row.find_all('td')])

        # capture the href elements for information on incidents
        refsdata.append(row.find_all('li', {'class': "1 last"}))

    # return data list to calling function
    return data

def runHist(df, col, xlabel):
    """Show a histogram of one DataFrame column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to plot.
    col : str
        Name of the column whose values are binned.
    xlabel : str
        Text used as the figure title (the parameter name is kept so that
        existing calls keep working).
    """
    plt.hist(df[col])
    # the binned values run along the x-axis; the counts are the bar heights
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.title(xlabel)
    plt.show()
    
# full state name -> capitalized two-letter abbreviation
# (all 50 states plus the District of Columbia)
_STATE_ABBREVIATIONS = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT',
    'Delaware': 'DE', 'District of Columbia': 'DC', 'Florida': 'FL',
    'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL',
    'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY',
    'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN',
    'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH',
    'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
    'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
    'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA',
    'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD',
    'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
    'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
    'Wisconsin': 'WI', 'Wyoming': 'WY',
}


# function to update State column to capitalized State abbreviation
def updateState(df):
    """Replace full state names in ``df['State']`` with their abbreviations.

    The DataFrame is modified in place and nothing is returned. Only values
    that exactly equal a full state name are replaced; anything else (for
    example a value that is already an abbreviation) is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``State`` column.
    """
    # one dictionary-driven replace instead of one call per state
    df['State'] = df.State.replace(_STATE_ABBREVIATIONS)

__Start of Cleaning and Formatting Activities__

In [4]:
# Start with reading web page and reviewing its contents

# empty the shared lists first, so that running this cell again does not
# append the same rows and headers a second time
data.clear()
refsdata.clear()
table_headers.clear()

# download the 1st web page once and use it for both the rows and the headers
soup = readPage(site)

# fill data and refsdata lists from the 1st page
readTable(soup)

# fill table_headers list using 'th' tag to be used as DataFrame headers
for tx in soup.find_all('th'):
    table_headers.append(tx['column'])

# read 'next' pages (page 1 through page 17) to obtain full table information
for counter in range(1, 18):

    # address of the next page of the table
    page = '{}?page={}'.format(site, counter)

    # print out the page that is being read
    print('page: {}'.format(page))

    # read page then table; readTable appends to the data and refsdata lists
    readTable(readPage(page))

page: https://www.gunviolencearchive.org/mass-shooting?page=1
page: https://www.gunviolencearchive.org/mass-shooting?page=2
page: https://www.gunviolencearchive.org/mass-shooting?page=3
page: https://www.gunviolencearchive.org/mass-shooting?page=4
page: https://www.gunviolencearchive.org/mass-shooting?page=5
page: https://www.gunviolencearchive.org/mass-shooting?page=6
page: https://www.gunviolencearchive.org/mass-shooting?page=7
page: https://www.gunviolencearchive.org/mass-shooting?page=8
page: https://www.gunviolencearchive.org/mass-shooting?page=9
page: https://www.gunviolencearchive.org/mass-shooting?page=10
page: https://www.gunviolencearchive.org/mass-shooting?page=11
page: https://www.gunviolencearchive.org/mass-shooting?page=12
page: https://www.gunviolencearchive.org/mass-shooting?page=13
page: https://www.gunviolencearchive.org/mass-shooting?page=14
page: https://www.gunviolencearchive.org/mass-shooting?page=15
page: https://www.gunviolencearchive.org/mass-shooting?page=16
p

In [5]:
# build the DataFrame from the collected rows,
# labelling its columns with the collected table headers
df = pd.DataFrame(data=data, columns=table_headers)

# show the DataFrame
df

Unnamed: 0,Base.IncidentID,Base.IncidentDate,Location.State,Location.CityOrCounty,Location.Address,Counts.NumberOfParticipantsKilled,Counts.NumberOfParticipantsInjured,Base.IncidentOperations
0,1671289,"May 3, 2020",Ohio,Columbus,300 block of S Ashburton Rd,0,4,View Incident\nView Source
1,1671624,"May 3, 2020",Florida,Jacksonville,2100 block of Brooklyn Rd,1,3,View Incident\nView Source
2,1671078,"May 3, 2020",Illinois,Chicago,3700 W 13th St,0,5,View Incident\nView Source
3,1670970,"May 2, 2020",Pennsylvania,Philadelphia,200 block of S Cecil St,0,4,View Incident\nView Source
4,1668470,"April 29, 2020",Indiana,Indianapolis,451 E Stop 11 Rd,0,4,View Incident\nView Source
...,...,...,...,...,...,...,...,...
445,1364540,"April 6, 2019",Illinois,Chicago,6300 block of S Seeley Ave,0,6,View Incident\nView Source
446,1363737,"April 4, 2019",Florida,Panama City,100 block of Allen Ave,1,3,View Incident\nView Source
447,1362715,"April 4, 2019",Georgia,Stockbridge,300 block of Eagle Ct,3,2,View Incident\nView Source
448,1362364,"April 2, 2019",Mississippi,Hermanville,13195 MS 18,0,4,View Incident\nView Source


> Resulting DataFrame is 450 rows by 8 columns

__Add Link to Incident Report to Data Table__

In [6]:
# show the raw href elements gathered by readTable (one entry per table row)
refsdata

[[<li class="1 last"><a href="https://www.nbc4i.com/news/local-news/4-injured-in-shooting-at-east-columbus-party/" target="_blank">View Source</a></li>],
 [<li class="1 last"><a href="https://www.jacksonville.com/news/20200504/1-of-4-sunday-victims-in-jacksonville-shooting-dies-he-was-ribault-football-player" target="_blank">View Source</a></li>],
 [<li class="1 last"><a href="https://chicago.suntimes.com/crime/2020/5/3/21245571/lawndale-5-teens-shot-gun-violence-drive-by-13th" target="_blank">View Source</a></li>],
 [<li class="1 last"><a href="https://6abc.com/shooting-leaves-26-year-old-man-injured-in-southwest-philadelphia-police/6145941/" target="_blank">View Source</a></li>],
 [<li class="1 last"><a href="https://www.indystar.com/story/news/crime/2020/04/29/4-injured-shooting-perry-park-south-side/3046327001/" target="_blank">View Source</a></li>],
 [<li class="1 last"><a href="https://www.cbs58.com/news/milwaukee-police-identifies-5-victims-in-shooting-near-12th-and-hadley" targ

In [7]:
# convert the captured href elements into plain URL strings
# for addition to the table DataFrame

def _source_url(refs):
    """Return the link target held in one refsdata entry.

    An entry that is already a string is returned untouched, so running this
    cell a second time leaves the URLs intact. An entry without a link
    yields an empty string.
    """
    if isinstance(refs, str):
        return refs
    for item in refs:
        anchor = item.find('a')
        if anchor is not None and anchor.get('href'):
            return anchor['href']
    return ''

# read the href attribute itself instead of cutting a fixed number of
# characters off the rendered html, which breaks as soon as the markup changes;
# slice assignment keeps the same list object
refsdata[:] = [_source_url(refs) for refs in refsdata]

# print href string list
refsdata

['https://www.nbc4i.com/news/local-news/4-injured-in-shooting-at-east-columbus-party/',
 'https://www.jacksonville.com/news/20200504/1-of-4-sunday-victims-in-jacksonville-shooting-dies-he-was-ribault-football-player',
 'https://chicago.suntimes.com/crime/2020/5/3/21245571/lawndale-5-teens-shot-gun-violence-drive-by-13th',
 'https://6abc.com/shooting-leaves-26-year-old-man-injured-in-southwest-philadelphia-police/6145941/',
 'https://www.indystar.com/story/news/crime/2020/04/29/4-injured-shooting-perry-park-south-side/3046327001/',
 'https://www.cbs58.com/news/milwaukee-police-identifies-5-victims-in-shooting-near-12th-and-hadley',
 'https://www.timesunion.com/news/article/Four-people-shot-in-Troy-early-Sunday-morning-15227010.php',
 'https://www.salisburypost.com/2020/04/27/salisbury-police-id-fatal-shooting-victim-others-injured/',
 'https://cbs12.com/news/local/high-school-student-in-florida-shot-and-killed-overnight',
 'https://www.fox13memphis.com/news/local/investigation-underway-

In [8]:
# add list of string href values to the table data set
# (refsdata must hold exactly one entry per DataFrame row)

# set href list to a series
se = pd.Series(refsdata)

# use href series to add values to table DataFrame;
# .values passes the bare values, so they are assigned by position
# rather than matched against the DataFrame's index labels
df['href'] = se.values

# print DataFrame
df

Unnamed: 0,Base.IncidentID,Base.IncidentDate,Location.State,Location.CityOrCounty,Location.Address,Counts.NumberOfParticipantsKilled,Counts.NumberOfParticipantsInjured,Base.IncidentOperations,href
0,1671289,"May 3, 2020",Ohio,Columbus,300 block of S Ashburton Rd,0,4,View Incident\nView Source,https://www.nbc4i.com/news/local-news/4-injure...
1,1671624,"May 3, 2020",Florida,Jacksonville,2100 block of Brooklyn Rd,1,3,View Incident\nView Source,https://www.jacksonville.com/news/20200504/1-o...
2,1671078,"May 3, 2020",Illinois,Chicago,3700 W 13th St,0,5,View Incident\nView Source,https://chicago.suntimes.com/crime/2020/5/3/21...
3,1670970,"May 2, 2020",Pennsylvania,Philadelphia,200 block of S Cecil St,0,4,View Incident\nView Source,https://6abc.com/shooting-leaves-26-year-old-m...
4,1668470,"April 29, 2020",Indiana,Indianapolis,451 E Stop 11 Rd,0,4,View Incident\nView Source,https://www.indystar.com/story/news/crime/2020...
...,...,...,...,...,...,...,...,...,...
445,1364540,"April 6, 2019",Illinois,Chicago,6300 block of S Seeley Ave,0,6,View Incident\nView Source,https://chicago.suntimes.com/news/8-shot-inclu...
446,1363737,"April 4, 2019",Florida,Panama City,100 block of Allen Ave,1,3,View Incident\nView Source,https://www.newsherald.com/news/20190510/hunt-...
447,1362715,"April 4, 2019",Georgia,Stockbridge,300 block of Eagle Ct,3,2,View Incident\nView Source,https://www.ajc.com/news/crime--law/breaking-p...
448,1362364,"April 2, 2019",Mississippi,Hermanville,13195 MS 18,0,4,View Incident\nView Source,https://www.wjtv.com/news/local-news/lydell-pa...


> Resulting DataFrame is 450 rows by 9 columns

__Replace Headers__

In [9]:
# give the row index column a header
df.index.name = 'Index'

# shorter column headers, applied in a single rename
df = df.rename(columns={
    'Base.IncidentID': 'ID',
    'Base.IncidentDate': 'Date',
    'Location.State': 'State',
    'Location.CityOrCounty': 'City',
    'Location.Address': 'Address',
    'Counts.NumberOfParticipantsKilled': 'Killed',
    'Counts.NumberOfParticipantsInjured': 'Injured',
    'href': 'HREF',
})

# show resulting DataFrame
df

Unnamed: 0_level_0,ID,Date,State,City,Address,Killed,Injured,Base.IncidentOperations,HREF
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1671289,"May 3, 2020",Ohio,Columbus,300 block of S Ashburton Rd,0,4,View Incident\nView Source,https://www.nbc4i.com/news/local-news/4-injure...
1,1671624,"May 3, 2020",Florida,Jacksonville,2100 block of Brooklyn Rd,1,3,View Incident\nView Source,https://www.jacksonville.com/news/20200504/1-o...
2,1671078,"May 3, 2020",Illinois,Chicago,3700 W 13th St,0,5,View Incident\nView Source,https://chicago.suntimes.com/crime/2020/5/3/21...
3,1670970,"May 2, 2020",Pennsylvania,Philadelphia,200 block of S Cecil St,0,4,View Incident\nView Source,https://6abc.com/shooting-leaves-26-year-old-m...
4,1668470,"April 29, 2020",Indiana,Indianapolis,451 E Stop 11 Rd,0,4,View Incident\nView Source,https://www.indystar.com/story/news/crime/2020...
...,...,...,...,...,...,...,...,...,...
445,1364540,"April 6, 2019",Illinois,Chicago,6300 block of S Seeley Ave,0,6,View Incident\nView Source,https://chicago.suntimes.com/news/8-shot-inclu...
446,1363737,"April 4, 2019",Florida,Panama City,100 block of Allen Ave,1,3,View Incident\nView Source,https://www.newsherald.com/news/20190510/hunt-...
447,1362715,"April 4, 2019",Georgia,Stockbridge,300 block of Eagle Ct,3,2,View Incident\nView Source,https://www.ajc.com/news/crime--law/breaking-p...
448,1362364,"April 2, 2019",Mississippi,Hermanville,13195 MS 18,0,4,View Incident\nView Source,https://www.wjtv.com/news/local-news/lydell-pa...


__Find Duplicates__

In [10]:
# rows that repeat an earlier row in every column
# (the first occurrence of each row is not flagged)
duplicateRowsDF = df.loc[df.duplicated(keep='first')]

# print results: heading line followed by the duplicate rows
print("Duplicate Rows except based on all columns are :", duplicateRowsDF, sep="\n")

Duplicate Rows except based on all columns are :
Empty DataFrame
Columns: [ID, Date, State, City, Address, Killed, Injured, Base.IncidentOperations, HREF]
Index: []


> The resulting empty DataFrame indicates there are no duplicates in the table DataFrame.

__Remove unnecessary column__

In [11]:
# remove Base.IncidentOperations column
# errors='ignore' makes the cell safe to run again: once the column is gone,
# `del df[...]` would raise a KeyError
df = df.drop(columns=['Base.IncidentOperations'], errors='ignore')

# print resulting DataFrame
df

Unnamed: 0_level_0,ID,Date,State,City,Address,Killed,Injured,HREF
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1671289,"May 3, 2020",Ohio,Columbus,300 block of S Ashburton Rd,0,4,https://www.nbc4i.com/news/local-news/4-injure...
1,1671624,"May 3, 2020",Florida,Jacksonville,2100 block of Brooklyn Rd,1,3,https://www.jacksonville.com/news/20200504/1-o...
2,1671078,"May 3, 2020",Illinois,Chicago,3700 W 13th St,0,5,https://chicago.suntimes.com/crime/2020/5/3/21...
3,1670970,"May 2, 2020",Pennsylvania,Philadelphia,200 block of S Cecil St,0,4,https://6abc.com/shooting-leaves-26-year-old-m...
4,1668470,"April 29, 2020",Indiana,Indianapolis,451 E Stop 11 Rd,0,4,https://www.indystar.com/story/news/crime/2020...
...,...,...,...,...,...,...,...,...
445,1364540,"April 6, 2019",Illinois,Chicago,6300 block of S Seeley Ave,0,6,https://chicago.suntimes.com/news/8-shot-inclu...
446,1363737,"April 4, 2019",Florida,Panama City,100 block of Allen Ave,1,3,https://www.newsherald.com/news/20190510/hunt-...
447,1362715,"April 4, 2019",Georgia,Stockbridge,300 block of Eagle Ct,3,2,https://www.ajc.com/news/crime--law/breaking-p...
448,1362364,"April 2, 2019",Mississippi,Hermanville,13195 MS 18,0,4,https://www.wjtv.com/news/local-news/lydell-pa...


> Resulting DataFrame is 450 rows by 8 columns

__Add Month, Day, and Year columns; populate from Date column     
Check Month, Day, and Year columns for inconsistent values__

In [12]:
# other data sources have information based on year separate from month and day
# need to separate the date into month, day, and year

# pull the three parts out of values such as "May 3, 2020" in one pass.
# Assigning to the rows yielded by iterrows() is not guaranteed to reach the
# DataFrame (a row may be a copy), so the columns are built from the Date
# column directly. A Date that does not match the pattern gives NaN in all
# three columns, which makes bad values easy to spot.
date_parts = df['Date'].str.extract(
    r'^\s*(?P<Month>[A-Za-z]+)\s+(?P<Day>\d{1,2}),\s*(?P<Year>\d{4})\s*$'
)

# add columns for month, day, and year (the values stay text, as in Date)
for part in ('Month', 'Day', 'Year'):
    df[part] = date_parts[part]

# verify resulting DataFrame
df

Unnamed: 0_level_0,ID,Date,State,City,Address,Killed,Injured,HREF,Month,Day,Year
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1671289,"May 3, 2020",Ohio,Columbus,300 block of S Ashburton Rd,0,4,https://www.nbc4i.com/news/local-news/4-injure...,May,3,2020
1,1671624,"May 3, 2020",Florida,Jacksonville,2100 block of Brooklyn Rd,1,3,https://www.jacksonville.com/news/20200504/1-o...,May,3,2020
2,1671078,"May 3, 2020",Illinois,Chicago,3700 W 13th St,0,5,https://chicago.suntimes.com/crime/2020/5/3/21...,May,3,2020
3,1670970,"May 2, 2020",Pennsylvania,Philadelphia,200 block of S Cecil St,0,4,https://6abc.com/shooting-leaves-26-year-old-m...,May,2,2020
4,1668470,"April 29, 2020",Indiana,Indianapolis,451 E Stop 11 Rd,0,4,https://www.indystar.com/story/news/crime/2020...,April,29,2020
...,...,...,...,...,...,...,...,...,...,...,...
445,1364540,"April 6, 2019",Illinois,Chicago,6300 block of S Seeley Ave,0,6,https://chicago.suntimes.com/news/8-shot-inclu...,April,6,2019
446,1363737,"April 4, 2019",Florida,Panama City,100 block of Allen Ave,1,3,https://www.newsherald.com/news/20190510/hunt-...,April,4,2019
447,1362715,"April 4, 2019",Georgia,Stockbridge,300 block of Eagle Ct,3,2,https://www.ajc.com/news/crime--law/breaking-p...,April,4,2019
448,1362364,"April 2, 2019",Mississippi,Hermanville,13195 MS 18,0,4,https://www.wjtv.com/news/local-news/lydell-pa...,April,2,2019


In [13]:
# list the data type of every column
df.dtypes

ID         object
Date       object
State      object
City       object
Address    object
Killed     object
Injured    object
HREF       object
Month      object
Day        object
Year       object
dtype: object

> Resulting DataFrame is 450 rows by 11 columns

In [None]:
# list the distinct values of the Month column to spot unexpected entries
df.Month.unique()

> No bad data found in Month column

In [None]:
# list the distinct values of the Day column to spot unexpected entries
df.Day.unique()

> No bad data found in Day column

In [None]:
# list the distinct City names to spot inconsistent spellings
df.City.unique()

> No bad data found in the City column

In [None]:
# determine where NaN values exist

# notnull().sum() counts the non-null values in each column; a column whose
# count is lower than the number of rows contains null or NaN entries
df.notnull().sum()

No null or NaN values found

__Identify Outliers      
Run histograms on Killed and Injured data columns, checking primarily for negative numbers__

In [None]:
# convert the Killed and Injured columns to numeric data type columns
for count_col in ("Killed", "Injured"):
    df[count_col] = pd.to_numeric(df[count_col])

In [None]:
# run histogram of Injured column; the last argument is used as the plot title
runHist(df, 'Injured', 'Number Injured in Mass Shooting Event')

In [None]:
# run histogram of Killed column; the last argument is used as the plot title
runHist(df, 'Killed', 'Number Killed in Mass Shooting Event')

> No negative numbers found     
> Because these are mass-shooting counts, large values are expected to occur far less often than small values     
> No outlier information determined

__Fix Casing     
Checked State value for consistency     
Replaced current values with two letter state abbreviation__

In [None]:
# identify unique State names (before they are replaced with abbreviations)
df.State.unique()

In [None]:
# update and print DataFrame
# updateState changes the State column of df in place and returns nothing
updateState(df)
df

In [None]:
# verify case of State abbreviations; a full state name remaining in this
# list is one that updateState did not replace
df.State.unique()

> State column case is consistent

> References:   

>> GVA. (2020). _Mass Shootings_. Retrieved from Gun Violence Archive: https://www.gunviolencearchive.org/mass-shooting