Twitter Fingers == Trigger Fingers: A Look at Gun Violence

Data Acquisition and Cleaning Part 1: Data was gathered from one main source, gunviolencearchive.org. Each individual reporting of gun violence comes with sources to prove that it happened, as well as details on the number injured, killed, date, time, and place. The first data acquisition takes these variables, puts them into a dataframe then into csv, as well as finds the source and cleans the links so that they are easily accessible. 

In [None]:
import pandas as pd
import requests
import urllib.request
from bs4 import BeautifulSoup
import re
import numpy as np

In [None]:
def transform_tables(pd_dataframe):
    # omit the hyperlink column that will be read as NA values
    new_dataframe = pd_dataframe.loc[:,"Incident ID":"# Injured"]
    # rename columns
    new_dataframe = new_dataframe.rename(columns = {"Incident ID": "ID", "Incident Date": "Date", 
                                    "State": "State", "City Or County": "City/County", 
                                    "Address": "Address", "# Killed": "Killed", 
                                    "# Injured": "Injured"})
    return new_dataframe

In [None]:
def save_html(url, path):
    response = requests.get(url)
    with open(path, "wb") as file:
        file.write(response.content)

In [None]:
def get_webpages(soup, year):
    last_webpage_href = soup.find('a', attrs={'title': "Go to last page"})
    last_webpage_path = last_webpage_href.get('href')
    number_of_other_pages = int(re.findall(r'%s(\d+)'%"page=", last_webpage_path)[0])
    if year in range(2014, 2016):
        webpage_paths = ['/reports/mass-shootings/'+ str(year)] # initialize with the first page's path
    else:
        webpage_paths = ['/reports/mass-shooting?year='+ str(year)]
    for page_number in range(1, number_of_other_pages + 1):
        path = re.sub(str(number_of_other_pages), str(page_number), last_webpage_path)
        webpage_paths.append(path)
    return webpage_paths

In [None]:
def get_news_sources(soup):
    news_hrefs = soup.findAll('a', attrs={'href': re.compile("^https://|^http://")})
    news_links = [tag.get('href') for tag in news_hrefs if tag.text == "View Source"] # get all sources listed on a page
    return news_links

In [29]:
# get the report tables

annual_reports = []
for year in range(2014, 2020):
    first_page_url = "https://www.gunviolencearchive.org/reports/mass-shootings/" + str(year)
    csv_file = str(year) + "_mass_shootings.csv"
    this_year_report = pd.read_csv(csv_file)
    cleaned_report = transform_tables(this_year_report)
    annual_reports.append(cleaned_report)

FileNotFoundError: [Errno 2] No such file or directory: '2014_mass_shootings.csv'

In [None]:
ms_2014 = annual_reports[0]
ms_2015 = annual_reports[1]
ms_2016 = annual_reports[2]
ms_2017 = annual_reports[3]
ms_2018 = annual_reports[4]
ms_2019 = annual_reports[5]

ms_2019[:10]

In [27]:
# save first pages html

for year in range(2014, 2020):
    if year in range(2014, 2016):
        first_page_url = "https://www.gunviolencearchive.org/reports/mass-shootings/" + str(year)
    else:
        first_page_url = "https://www.gunviolencearchive.org/reports/mass-shooting?year=" + str(year)
    path = "mass_shooting_html_"+ str(year) 
    save_html(first_page_url, path)

In [28]:
# get all pages paths

web_pages_paths = []
for year in range(2014, 2020):
    path = "mass_shooting_html_"+ str(year)
    soup = BeautifulSoup(open(path,'r'), 'html.parser')
    web_pages_paths.append(get_webpages(soup, year)) # including the first

web_pages_paths[:2] 

[['/reports/mass-shootings/2014',
  '/reports/mass-shootings/2014?page=1',
  '/reports/mass-shootings/2014?page=2',
  '/reports/mass-shootings/2014?page=3',
  '/reports/mass-shootings/2014?page=4',
  '/reports/mass-shootings/2014?page=5',
  '/reports/mass-shootings/2014?page=6',
  '/reports/mass-shootings/2014?page=7',
  '/reports/mass-shootings/2014?page=8',
  '/reports/mass-shootings/2014?page=9',
  '/reports/mass-shootings/2014?page=10'],
 ['/reports/mass-shootings/2015',
  '/reports/mass-shootings/2015?page=1',
  '/reports/mass-shootings/2015?page=2',
  '/reports/mass-shootings/2015?page=3',
  '/reports/mass-shootings/2015?page=4',
  '/reports/mass-shootings/2015?page=5',
  '/reports/mass-shootings/2015?page=6',
  '/reports/mass-shootings/2015?page=7',
  '/reports/mass-shootings/2015?page=8',
  '/reports/mass-shootings/2015?page=9',
  '/reports/mass-shootings/2015?page=10',
  '/reports/mass-shootings/2015?page=11',
  '/reports/mass-shootings/2015?page=12',
  '/reports/mass-shooting

In [30]:
sources_container = np.arange(2014, 2020, 1).tolist()
for year_index in range(len(sources_container)):
    year = 2014 + year_index
    sources_container[year_index] = []
    page_index = -1
    for path in web_pages_paths[year_index]:
        page_index += 1
        link = "https://www.gunviolencearchive.org" + path
        filename = "mass_shooting_html_"+ str(year) + "_page_" + str(page_index)
#         save_html(link, filename)
        soup = BeautifulSoup(open(filename,'r'), 'html.parser')
        this_page_sources = get_news_sources(soup)
        sources_container[year_index].append(this_page_sources) 

FileNotFoundError: [Errno 2] No such file or directory: 'mass_shooting_html_2014_page_0'

In [None]:
sources_container[5][8]

In [31]:
def remove_nesting(nested_list):
    return [i for j in nested_list for i in j]

In [32]:
news_2014 = remove_nesting(sources_container[0])
news_2015 = remove_nesting(sources_container[1])
news_2016 = remove_nesting(sources_container[2])
news_2017 = remove_nesting(sources_container[3])
news_2018 = remove_nesting(sources_container[4])
news_2019 = remove_nesting(sources_container[5])
news_2019[:5]

TypeError: 'int' object is not iterable

In [None]:
ms_2014['Source'] = news_2014
ms_2015['Source'] = news_2015
ms_2016['Source'] = news_2016
ms_2017['Source'] = news_2017
ms_2018['Source'] = news_2018

# ms_2019['Source'] = news_2019 
# gives error since one row does not have a source listed directly

index = news_2019.index("https://www.wcvb.com/article/6-people-shot-outside-of-roxbury-party-police-say/28306883") # index of where it is supposed to be 
news_2019.insert(index, "https://fox2now.com/2019/07/07/north-county-residents-on-edge-after-5-adults-found-dead-in-apartment/")
news_2019 = [news_2019[i] for i in range(len(news_2019)) if news_2019[i] != news_2019[i-1]] 
# in case of re-running the insert code and duplicating

ms_2019['Source'] = news_2019 
ms_2019[:10]


In [None]:
ms_2014.to_csv(path_or_buf = "complete_2014_dataset")
ms_2015.to_csv(path_or_buf = "complete_2015_dataset")
ms_2016.to_csv(path_or_buf = "complete_2016_dataset")
ms_2017.to_csv(path_or_buf = "complete_2017_dataset")
ms_2018.to_csv(path_or_buf = "complete_2018_dataset")
ms_2019.to_csv(path_or_buf = "complete_2019_dataset") # export as csv files

In [None]:
merged_data = pd.concat([ms_2014, ms_2015, ms_2016, ms_2017, ms_2018, ms_2019])
print(len(merged_data))
merged_data.to_csv(path_or_buf = "complete_project_dataset1")

Data Acquisition Part 2: The second step of the data acquisition process was to access the sources, and find the article text from each source. This allows us to create a second set of data in a text file for analysis of all words from these articles, "articletext". 

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import socket
import urllib.request

#reads in complete project data
gunviolencedataset = pd.read_csv("complete_project_dataset")
sourceurl = gunviolencedataset["Source"] #creates a series of just the source urls

The function that takes the source url, goes to the url, reads in the text from that site. The function then writes the text from the "p" tag to a text file, so that it can be accessed afterwards. 

In [None]:
def getsourcetext(urlseries):
    #initialize beautifulsoup
    soup = BeautifulSoup('''<html>  </html>''', 'html.parser') 
    timeout = 20 #creates a timeout variable w an int
    socket.setdefaulttimeout(timeout) #uses timeout to set the socket timeout
    textfile = open("articletext", "w+",  encoding="utf-8") #creates a writing file
    for k in urlseries: #runs this loop for every entry in the series
        try:
            html = urllib.request.urlopen(url=k) #opens k website
            html = html.read() #reads in website info
            htmlfile = html.decode('utf-8') #decodes the info into a new file
        #need to make exceptions for 404/403/etc
        except Exception as e:
            continue
        else: #what to do after try block works or doesn't work
            singlesoup = BeautifulSoup(htmlfile, 'html.parser') #the variable that holds the data from the article
            for n in singlesoup("p"): #finds p tag (the main paragraph of the article)
                textfile.write(n.get_text(strip=True)) #we only want the parts of the article that are from the main paragraph
            soup.append(singlesoup) #adds the html file to the collective beautiful soup file
    textfile.close()
    return soup

The below block gives the "allhtmlfiletext" file and creates a beautifulsoup object of all the htmlfiles in the series. The purpose of creating the "allhtmlfiletext" is so that the getsourcetext() function defined above doesn't have to run everytime, it takes about 40-50 minutes to complete. 

In [None]:
sourcesoup = getsourcetext(sourceurl) #makes sourcesoup into the entire beautiful soup, as well as finds the links of the source url series
allhtmlfiles = sourcesoup.get_text(strip=True) #gets all of the written words out as a sting
allhtmltxt = open("allhtmlfiletext", "w", encoding = "utf-8") #creates a file
allhtmltxt.write(allhtmlfiles) #adds this string to the file
allhtmltxt.close() #closes file

Analaysis Method Outlines: NLP and Sentiment Analysis Classification

NLP Analysis: We wanted to use Natural Language Processing to inspect the relations of words in the articles we find, in the hopes that we can relate fequency of word use and the context that these fequently used words are in to the increase of gun violence in America. To start this the article text extracted must be tokenized (sorted into words), remove the words and punctuation that are too common to be useful, and assess the frequency of the cleaned words. Then ngrams are created of these words to assess the context that the frequent words come in. 

What remains to be done is connect these results to collective meaning about the articles, and how that relates to gun violence. 

In [None]:
import nltk
from nltk import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

articletextfile = open("articletext", encoding='utf-8')
articlestring = articletextfile.read()
articletextfile.close()

#need to tokenize the article text
tokenizer = RegexpTokenizer(r'\w+')
articletext = tokenizer.tokenize(articlestring)

#need to remove stopwords and punctuation
stopwords = nltk.corpus.stopwords.words('english')
capstopwords = [w.title() for w in stopwords]
numbers = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "zero"]
stopwords.extend(capstopwords)
stopwords.extend(numbers)
articletext = [word.lower() for word in articletext if word not in stopwords]

#finds most common words
frequency_dist = FreqDist(articletext)
keywordstuple = frequency_dist.most_common(10)
keywords = []
for wordtuple in keywordstuple:
    keywords.append(wordtuple[0])
print(keywordstuple)

In [None]:
#ngram analysis of text
bigrams = nltk.bigrams(articletext)
trigrams = nltk.trigrams(articletext)
bigramlist = list(bigrams)
trigramlist = list(trigrams)
#need to find the keywords in the ngrams
keywordbigrams = [word_tuple for word_tuple in bigramlist if ((word_tuple[0] in keywords) | (word_tuple[1] in keywords))]
keywordtrigrams = [word_tuple for word_tuple in trigramlist if ((word_tuple[0] in keywords) | (word_tuple[1] in keywords) | (word_tuple[2] in keywords))]
#