# Group Project

# Data Acquisition (revision)

In [2]:
import pandas as pd
import requests
import urllib.request
from bs4 import BeautifulSoup
import re
import numpy as np

In [74]:
def transform_tables(pd_dataframe):
    """Trim a raw yearly report and give its columns short names.

    Only the label-based column range "Incident ID" through "# Injured"
    (both ends inclusive) is kept; columns outside that range -- such as the
    hyperlink column that is read in as NA values -- are left out.

    Parameters
    ----------
    pd_dataframe : pandas.DataFrame
        Raw report containing the "Incident ID" ... "# Injured" columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the kept columns renamed to ID, Date, State,
        City/County, Address, Killed and Injured.
    """
    short_names = {
        "Incident ID": "ID",
        "Incident Date": "Date",
        "State": "State",
        "City Or County": "City/County",
        "Address": "Address",
        "# Killed": "Killed",
        "# Injured": "Injured",
    }
    kept_columns = pd_dataframe.loc[:, "Incident ID":"# Injured"]
    return kept_columns.rename(columns=short_names)

In [75]:
def save_html(url, path, timeout=30):
    """Download ``url`` and write the raw response body to ``path``.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    path : str
        File the response bytes are written to (opened in binary mode).
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status; no file is written.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly rather than saving an error page that would later be
    # parsed as if it were a report page.
    response.raise_for_status()
    with open(path, "wb") as file:
        file.write(response.content)

In [76]:
def get_webpages(soup, year):
    """List the site-relative paths of every report page for one year.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed first page of the year's report; its "Go to last page" link
        is used to learn how many further pages exist.
    year : int
        Report year. 2014 and 2015 use '/reports/mass-shootings/<year>';
        every other year uses '/reports/mass-shooting?year=<year>'.

    Returns
    -------
    list of str
        The first page's path followed by one path per numbered page
        (page=1 ... page=<last>), built from the last-page link's href.
        Only the first page is returned when there is no last-page link.

    Raises
    ------
    IndexError
        If a last-page link exists but its href has no ``page=<number>``.
    """
    if year in range(2014, 2016):
        first_page_path = '/reports/mass-shootings/' + str(year)
    else:
        first_page_path = '/reports/mass-shooting?year=' + str(year)
    webpage_paths = [first_page_path]  # initialize with the first page's path

    last_webpage_href = soup.find('a', attrs={'title': "Go to last page"})
    if last_webpage_href is None:
        # No pager on the page: the report fits on the first page alone.
        return webpage_paths

    last_webpage_path = last_webpage_href.get('href')
    number_of_other_pages = int(re.findall(r'page=(\d+)', last_webpage_path)[0])
    for page_number in range(1, number_of_other_pages + 1):
        # Rewrite only the page parameter. Substituting the bare digits would
        # also hit the same digits elsewhere in the href (e.g. in the year).
        path = re.sub(r'page=\d+', 'page=' + str(page_number),
                      last_webpage_path, count=1)
        webpage_paths.append(path)
    return webpage_paths

In [77]:
def get_news_sources(soup):
    """Collect the absolute http(s) links found on a parsed page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed report page.

    Returns
    -------
    list of str
        The ``href`` of every ``<a>`` tag whose href starts with
        ``http://`` or ``https://``, in document order.
    """
    # Accept https as well as http: matching only "http://" silently drops
    # every source that is served over TLS.
    # NOTE(review): this matches any absolute link on the page, not only news
    # articles -- confirm the page has no other absolute links.
    absolute_link = re.compile(r"^https?://")
    news_hrefs = soup.find_all('a', attrs={'href': absolute_link})
    return [tag.get('href') for tag in news_hrefs]  # get all sources listed on a page

In [17]:
# get the report tables

# Load each year's CSV ("<year>_mass_shootings.csv") and normalise its
# columns with transform_tables; annual_reports is ordered 2014 ... 2019.
annual_reports = []
for year in range(2014, 2020):
    csv_file = str(year) + "_mass_shootings.csv"
    this_year_report = pd.read_csv(csv_file)
    cleaned_report = transform_tables(this_year_report)
    annual_reports.append(cleaned_report)

# One name per year, unpacked positionally from the year-ordered list.
ms_2014, ms_2015, ms_2016, ms_2017, ms_2018, ms_2019 = annual_reports

ms_2019

Unnamed: 0,ID,Date,State,City/County,Address,Killed,Injured
0,1582816,"December 29, 2019",Illinois,Danville,1803 block of Westview Ave,0,5
1,1582197,"December 29, 2019",New York,Buffalo,50 block of Henrietta Ave,1,3
2,1583444,"December 29, 2019",California,Ceres,800 block of Allacante Dr,0,5
3,1581590,"December 27, 2019",California,Modesto,1900 block of Vernon Ave,1,3
4,1581476,"December 27, 2019",Georgia,Kennesaw,1575 Ridenour Pkwy NW,0,4
...,...,...,...,...,...,...,...
412,1293054,"January 4, 2019",California,Torrance,22501 Hawthorne Blvd,3,4
413,1292941,"January 3, 2019",Arizona,Yuma,3800 block of E County 18½ St,1,3
414,1291779,"January 2, 2019",Arkansas,Jonesboro,3516 Galaxy St,1,3
415,1289877,"January 1, 2019",South Carolina,Columbia,1709 Decker Blvd,0,5


In [28]:
# save first pages html

# 2014 and 2015 are addressed with a path-style URL; the remaining years use
# a query-string URL. Each first page is stored as "mass_shooting_html_<year>".
for year in range(2014, 2020):
    first_page_url = (
        "https://www.gunviolencearchive.org/reports/mass-shootings/" + str(year)
        if 2014 <= year < 2016
        else "https://www.gunviolencearchive.org/reports/mass-shooting?year=" + str(year)
    )
    path = "mass_shooting_html_"+ str(year)
    save_html(first_page_url, path)

In [71]:
# get all pages paths

# For every year, parse the saved first page and derive the paths of all of
# that year's report pages; web_pages_paths holds one list of paths per year.
web_pages_paths = []
for year in range(2014, 2020):
    path = "mass_shooting_html_"+ str(year)
    # Close the saved page as soon as it has been parsed instead of leaking
    # one open file handle per year.
    with open(path, 'r') as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')
    web_pages_paths.append(get_webpages(soup, year)) # including the first

web_pages_paths 

[['/reports/mass-shootings/2014',
  '/reports/mass-shootings/2014?page=1',
  '/reports/mass-shootings/2014?page=2',
  '/reports/mass-shootings/2014?page=3',
  '/reports/mass-shootings/2014?page=4',
  '/reports/mass-shootings/2014?page=5',
  '/reports/mass-shootings/2014?page=6',
  '/reports/mass-shootings/2014?page=7',
  '/reports/mass-shootings/2014?page=8',
  '/reports/mass-shootings/2014?page=9',
  '/reports/mass-shootings/2014?page=10'],
 ['/reports/mass-shootings/2015',
  '/reports/mass-shootings/2015?page=1',
  '/reports/mass-shootings/2015?page=2',
  '/reports/mass-shootings/2015?page=3',
  '/reports/mass-shootings/2015?page=4',
  '/reports/mass-shootings/2015?page=5',
  '/reports/mass-shootings/2015?page=6',
  '/reports/mass-shootings/2015?page=7',
  '/reports/mass-shootings/2015?page=8',
  '/reports/mass-shootings/2015?page=9',
  '/reports/mass-shootings/2015?page=10',
  '/reports/mass-shootings/2015?page=11',
  '/reports/mass-shootings/2015?page=12',
  '/reports/mass-shooting

In [81]:
# Scrape the source links from every report page, grouped by year:
# sources_container[k] holds one list of links per page of year 2014 + k.
sources_container = []
for year, year_page_paths in zip(range(2014, 2020), web_pages_paths):
    this_year_sources = []
    for page_index, path in enumerate(year_page_paths):
        link = "https://www.gunviolencearchive.org" + path
        # Name the saved file after the year and page actually being fetched,
        # so pages no longer overwrite one another under a stale year label.
        filename = "mass_shooting_html_" + str(year) + "_page_" + str(page_index)
        save_html(link, filename)
        # Close the saved page once parsed rather than leaking the handle.
        with open(filename, 'r') as html_file:
            soup = BeautifulSoup(html_file, 'html.parser')
        this_page_sources = get_news_sources(soup)
        this_year_sources.append(this_page_sources)
    sources_container.append(this_year_sources)

In [106]:
# Number of links collected for year index 3 (the list later flattened into
# news_2017), page index 7.
# NOTE(review): scraping looks incomplete -- the original note flags 2019 and
# says some pages are missing sources; confirm which years are affected.
len(sources_container[3][7])

21

In [104]:
def remove_nesting(nested_list):
    """Flatten one level of nesting.

    Parameters
    ----------
    nested_list : iterable of iterables
        For example, a list of per-page lists.

    Returns
    -------
    list
        Every element of every inner iterable, in their original order.
    """
    flattened = []
    for inner in nested_list:
        flattened.extend(inner)
    return flattened

# Flatten each year's per-page link lists into a single list per year;
# sources_container is indexed by the year's offset from 2014.
(news_2014, news_2015, news_2016,
 news_2017, news_2018, news_2019) = [
    remove_nesting(sources_container[offset]) for offset in range(6)
]

In [105]:
# Inspect the flattened list of source links collected for 2017.
news_2017

['http://kdvr.com/2017/12/31/douglas-county-sheriff-responds-to-officer-down-call/',
 'http://www.nj.com/monmouth/index.ssf/2018/01/third_person_escaped_home_in_killing_of_parents_sister_family_friend_prosecutor.html#incart_river_index',
 'http://wsvn.com/news/local/5-hospitalized-after-shooting-in-nw-miami-dade/',
 'http://deltadailynews.com/downtown-vicksburg-shooting-injures-five-people/',
 'http://www.wsfa.com/story/37136634/man-arrested-in-troy-shooting-with-5-victims-1-dead',
 'http://klfy.com/2017/12/29/local-rapper-charged-in-eunice-homicide/',
 'http://www.al.com/news/birmingham/index.ssf/2017/12/4_shot_in_drive-by_shooting_in.html',
 'http://www.katc.com/story/37123665/four-people-injured-in-shooting-on-n-university-avenue',
 'http://www.wtvm.com/story/37092610/update-arrest-made-in-attala-co-night-club-shooting',
 'http://losangeles.cbslocal.com/2017/12/15/4-shot-including-9-year-old-girl-in-pacoima/',
 'http://www.wlky.com/article/3-charged-in-west-louisville-drive-by-shoot