# Group Project

# Data Acquisition (revision)

In [1]:
import pandas as pd
import requests
import urllib.request
from bs4 import BeautifulSoup
import re
import numpy as np

In [2]:
def transform_tables(pd_dataframe):
    """Trim a raw report table to its data columns and shorten the headers.

    Only the columns from "Incident ID" through "# Injured" (inclusive) are
    kept, so anything outside that span (e.g. the hyperlink column that is
    read as NA values) is left out. The input frame is not modified.

    Parameters
    ----------
    pd_dataframe : pandas.DataFrame
        Report table containing the columns "Incident ID" ... "# Injured".

    Returns
    -------
    pandas.DataFrame
        New frame with the kept columns renamed to
        ID, Date, State, City/County, Address, Killed, Injured.
    """
    # original header -> short header
    short_names = {
        "Incident ID": "ID",
        "Incident Date": "Date",
        "State": "State",
        "City Or County": "City/County",
        "Address": "Address",
        "# Killed": "Killed",
        "# Injured": "Injured",
    }
    # label-based slice: both end points are included
    data_columns = pd_dataframe.loc[:, "Incident ID":"# Injured"]
    return data_columns.rename(columns=short_names)

In [3]:
def save_html(url, path, timeout=30):
    """Download ``url`` and write the raw response body to the file ``path``.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET request.
    path : str
        Destination file; it is opened in binary mode and overwritten.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly here rather than saving an error page that would only
    # surface later as a confusing parsing failure.
    response.raise_for_status()
    with open(path, "wb") as file:
        file.write(response.content)

In [4]:
def get_webpages(soup, year):
    """Return the relative paths of every result page of one year's report.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed first page of the report. Its "Go to last page" link is read
        to find out how many further pages exist.
    year : int
        Report year; selects which first-page path pattern is used.

    Returns
    -------
    list of str
        The first page's path, followed by one path per additional page
        (``page=1`` up to the page number found in the last-page link).
        When the page has no "Go to last page" link, only the first page's
        path is returned.

    Raises
    ------
    ValueError
        If the last-page link carries no ``page=<number>`` component.
    """
    # initialize with the first page's path
    if year in range(2014, 2016):
        webpage_paths = ['/reports/mass-shootings/' + str(year)]
    else:
        webpage_paths = ['/reports/mass-shooting?year=' + str(year)]

    last_webpage_href = soup.find('a', attrs={'title': "Go to last page"})
    if last_webpage_href is None:
        # No "Go to last page" link found: treat the report as a single page.
        return webpage_paths

    last_webpage_path = last_webpage_href.get('href')
    page_match = re.search(r'page=(\d+)', last_webpage_path)
    if page_match is None:
        raise ValueError(
            "last-page link has no 'page=<number>' component: %r" % last_webpage_path
        )
    number_of_other_pages = int(page_match.group(1))

    for page_number in range(1, number_of_other_pages + 1):
        # Rewrite only the page=<n> component. Substituting the bare number
        # would also hit the same digits elsewhere in the path (a last page
        # of 20 would corrupt the year "2014").
        path = re.sub(r'page=\d+', 'page=' + str(page_number), last_webpage_path)
        webpage_paths.append(path)
    return webpage_paths

In [68]:
def get_news_sources(soup):
    """Collect the "View Source" links listed on one parsed report page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed report page.

    Returns
    -------
    list of str
        The ``href`` of every anchor whose href starts with ``http://`` or
        ``https://`` and whose text is exactly "View Source", in page order.
    """
    absolute_href = re.compile("^https://|^http://")
    news_links = []
    for anchor in soup.findAll('a', attrs={'href': absolute_href}):
        # other absolute links on the page are not news sources
        if anchor.text == "View Source":
            news_links.append(anchor.get('href'))
    return news_links

In [117]:
# get the report tables

# Read each year's report from its local CSV file and keep only the cleaned
# columns; annual_reports[0] is 2014 and annual_reports[5] is 2019.
annual_reports = []
for year in range(2014, 2020):
    csv_file = str(year) + "_mass_shootings.csv"
    annual_reports.append(transform_tables(pd.read_csv(csv_file)))

# one named frame per year, in the same order as the loop above
ms_2014, ms_2015, ms_2016, ms_2017, ms_2018, ms_2019 = annual_reports

ms_2019[:10]

Unnamed: 0,ID,Date,State,City/County,Address,Killed,Injured
0,1582816,"December 29, 2019",Illinois,Danville,1803 block of Westview Ave,0,5
1,1582197,"December 29, 2019",New York,Buffalo,50 block of Henrietta Ave,1,3
2,1583444,"December 29, 2019",California,Ceres,800 block of Allacante Dr,0,5
3,1581590,"December 27, 2019",California,Modesto,1900 block of Vernon Ave,1,3
4,1581476,"December 27, 2019",Georgia,Kennesaw,1575 Ridenour Pkwy NW,0,4
5,1581103,"December 27, 2019",Texas,Houston,500 block of Smart St,2,7
6,1580308,"December 26, 2019",Florida,Saint Petersburg,4300 6th St S,0,5
7,1579889,"December 25, 2019",California,Oakland,2100 block of 34th Ave,0,4
8,1579970,"December 25, 2019",Iowa,Coralville,954 Boston Way,1,3
9,1579856,"December 25, 2019",Virginia,Richmond,1800 E Main St,1,3


In [28]:
# save first pages html

for year in range(2014, 2020):
    # 2014-2015 are requested with the year as a path segment,
    # the remaining years with the year as a query parameter
    first_page_url = (
        "https://www.gunviolencearchive.org/reports/mass-shootings/" + str(year)
        if 2014 <= year < 2016
        else "https://www.gunviolencearchive.org/reports/mass-shooting?year=" + str(year)
    )
    # one local file per year, named after the year
    path = "mass_shooting_html_"+ str(year)
    save_html(first_page_url, path)

In [116]:
# get all pages paths

# web_pages_paths[i] holds the page paths of year 2014 + i
web_pages_paths = []
for year in range(2014, 2020):
    path = "mass_shooting_html_"+ str(year)
    # the context manager closes the saved first page once it has been parsed
    with open(path, 'r') as first_page_file:
        soup = BeautifulSoup(first_page_file, 'html.parser')
    web_pages_paths.append(get_webpages(soup, year)) # including the first

web_pages_paths[:2]

[['/reports/mass-shootings/2014',
  '/reports/mass-shootings/2014?page=1',
  '/reports/mass-shootings/2014?page=2',
  '/reports/mass-shootings/2014?page=3',
  '/reports/mass-shootings/2014?page=4',
  '/reports/mass-shootings/2014?page=5',
  '/reports/mass-shootings/2014?page=6',
  '/reports/mass-shootings/2014?page=7',
  '/reports/mass-shootings/2014?page=8',
  '/reports/mass-shootings/2014?page=9',
  '/reports/mass-shootings/2014?page=10'],
 ['/reports/mass-shootings/2015',
  '/reports/mass-shootings/2015?page=1',
  '/reports/mass-shootings/2015?page=2',
  '/reports/mass-shootings/2015?page=3',
  '/reports/mass-shootings/2015?page=4',
  '/reports/mass-shootings/2015?page=5',
  '/reports/mass-shootings/2015?page=6',
  '/reports/mass-shootings/2015?page=7',
  '/reports/mass-shootings/2015?page=8',
  '/reports/mass-shootings/2015?page=9',
  '/reports/mass-shootings/2015?page=10',
  '/reports/mass-shootings/2015?page=11',
  '/reports/mass-shootings/2015?page=12',
  '/reports/mass-shooting

In [69]:
# Set to True to download every report page before parsing it. Left False so
# the cell parses the page files already present in the working directory.
DOWNLOAD_PAGES = False

# sources_container[year_index][page_index] -> list of source links on that page
sources_container = []
for year_index, year in enumerate(range(2014, 2020)):
    year_sources = []
    for page_index, path in enumerate(web_pages_paths[year_index]):
        link = "https://www.gunviolencearchive.org" + path
        filename = "mass_shooting_html_"+ str(year) + "_page_" + str(page_index)
        if DOWNLOAD_PAGES:
            save_html(link, filename)
        # the context manager closes each page file as soon as it is parsed
        with open(filename, 'r') as page_file:
            soup = BeautifulSoup(page_file, 'html.parser')
        year_sources.append(get_news_sources(soup))
    sources_container.append(year_sources)

In [95]:
# spot check: source links scraped from page index 8 of year index 5 (2019)
sources_container[5][8]

['https://www.kmov.com/news/downtown-st-louis-shooting-leaves-injured-overnight/article_5fe78650-a0ba-11e9-afa7-4fca28c07d37.html',
 'https://chicago.suntimes.com/crime/2019/7/7/20684947/4-wounded-englewood-shooting-south-paulina-street-gun-violence',
 'https://chicago.suntimes.com/crime/2019/7/7/20684923/4-wounded-1-critically-englewood-shooting-61st-place',
 'https://www.abqjournal.com/1337222/4-shot-outside-abq-night-club.html',
 'https://www.nbcbayarea.com/news/local/Quadruple-Shooting-San-Jose-Police-512320011.html',
 'https://www.wcvb.com/article/6-people-shot-outside-of-roxbury-party-police-say/28306883',
 'https://www.rgj.com/story/news/2021/03/24/reno-family-says-slayed-teen-hero-third-arrest-made-case/4761896001/',
 'https://chicago.suntimes.com/crime/2019/7/5/20682901/5-wounde-woodlawn-shooting-67th-street-gun-violence',
 'https://abc7ny.com/4-shot-in-brooklyn-after-4th-of-july-fireworks/5379759/',
 'https://www.wifr.com/content/news/Four-people-shot-after-argument-in-Rockfo

In [79]:
def remove_nesting(nested_list):
    """Flatten one level of nesting.

    Parameters
    ----------
    nested_list : iterable of iterables
        For example the per-page lists of links of one year.

    Returns
    -------
    list
        Every item of every inner iterable, in their original order.
    """
    flattened = []
    for inner in nested_list:
        flattened.extend(inner)
    return flattened

# one flat list of source links per year (2014 first), pages concatenated in order
(news_2014, news_2015, news_2016,
 news_2017, news_2018, news_2019) = [
    remove_nesting(year_pages) for year_pages in sources_container[:6]
]

In [118]:
# first five flattened 2019 source links
news_2019[:5]

['http://www.vermilioncountyfirst.com/2019/12/29/5-women-shot-at-danville-home/',
 'https://www.wkbw.com/news/local-news/4-shot-1-seriously-injured-in-riverside-early-sunday',
 'https://www.modbee.com/news/local/crime/article238828443.html',
 'https://www.modbee.com/news/local/crime/article238841643.html',
 'https://www.11alive.com/article/news/crime/kennesaw-area-shooting-apartment/85-e07c79a0-0881-466e-aa1b-7214148e85e6']

In [119]:
# attach the scraped source links as a new column
# (each list must have exactly one entry per row of its table)
for yearly_report, yearly_sources in (
    (ms_2014, news_2014),
    (ms_2015, news_2015),
    (ms_2016, news_2016),
    (ms_2017, news_2017),
    (ms_2018, news_2018),
):
    yearly_report['Source'] = yearly_sources

# Assigning news_2019 directly gives an error since one row does not have a
# source listed directly, which leaves the scraped list one entry short.
# Its source is spliced in by hand just before the entry that follows it.
missing_source = "https://fox2now.com/2019/07/07/north-county-residents-on-edge-after-5-adults-found-dead-in-apartment/"
following_source = "https://www.wcvb.com/article/6-people-shot-outside-of-roxbury-party-police-say/28306883"

# The membership guard makes re-running this cell a no-op. De-duplicating
# neighbouring entries instead would also drop two consecutive incidents that
# legitimately share one article.
if missing_source not in news_2019:
    index = news_2019.index(following_source) # index of where it is supposed to be
    news_2019.insert(index, missing_source)

ms_2019['Source'] = news_2019
ms_2019[:10]


Unnamed: 0,ID,Date,State,City/County,Address,Killed,Injured,Source
0,1582816,"December 29, 2019",Illinois,Danville,1803 block of Westview Ave,0,5,http://www.vermilioncountyfirst.com/2019/12/29...
1,1582197,"December 29, 2019",New York,Buffalo,50 block of Henrietta Ave,1,3,https://www.wkbw.com/news/local-news/4-shot-1-...
2,1583444,"December 29, 2019",California,Ceres,800 block of Allacante Dr,0,5,https://www.modbee.com/news/local/crime/articl...
3,1581590,"December 27, 2019",California,Modesto,1900 block of Vernon Ave,1,3,https://www.modbee.com/news/local/crime/articl...
4,1581476,"December 27, 2019",Georgia,Kennesaw,1575 Ridenour Pkwy NW,0,4,https://www.11alive.com/article/news/crime/ken...
5,1581103,"December 27, 2019",Texas,Houston,500 block of Smart St,2,7,https://www.chron.com/news/houston-texas/houst...
6,1580308,"December 26, 2019",Florida,Saint Petersburg,4300 6th St S,0,5,https://www.tampabay.com/news/st-petersburg/20...
7,1579889,"December 25, 2019",California,Oakland,2100 block of 34th Ave,0,4,https://abc7news.com/quadruple-shooting-at-hol...
8,1579970,"December 25, 2019",Iowa,Coralville,954 Boston Way,1,3,https://www.kcrg.com/2020/07/08/brothers-now-j...
9,1579856,"December 25, 2019",Virginia,Richmond,1800 E Main St,1,3,https://wtvr.com/2019/12/25/kenneth-lawson-obit/


In [115]:
# export as csv files: one file per yearly table
yearly_exports = [
    (ms_2014, "complete_2014_dataset"),
    (ms_2015, "complete_2015_dataset"),
    (ms_2016, "complete_2016_dataset"),
    (ms_2017, "complete_2017_dataset"),
    (ms_2018, "complete_2018_dataset"),
    (ms_2019, "complete_2019_dataset"),
]
for yearly_frame, export_name in yearly_exports:
    yearly_frame.to_csv(path_or_buf = export_name)

In [130]:
# stack the six yearly tables into one frame, 2014 first
# NOTE(review): ignore_index is not passed, so each table's own row labels
# are carried over. Confirm repeated labels are acceptable downstream.
yearly_frames = [ms_2014, ms_2015, ms_2016, ms_2017, ms_2018, ms_2019]
merged_data = pd.concat(yearly_frames)

In [134]:
# total number of rows across all six yearly tables
len(merged_data)

2085

In [135]:
# write the combined table to a single CSV-formatted file
merged_data.to_csv("complete_project_dataset")