## SQLAlchemy Class definition

In [None]:
from sqlalchemy.orm import sessionmaker
# Open an ORM session bound to `engine`.
# NOTE(review): `engine`, `Article`, `title`, `article_text`, `url` and `most_common`
# are not defined in this cell; in the visible notebook they are only assigned in
# cells further down, so this cell depends on those having been run first.
Session = sessionmaker(bind=engine)
session = Session()

# Build one Article from the module-level sample values and stage it on the session.
article = Article(title, article_text, url, most_common)
session.add(article)

# Commit the staged Article.
session.commit()

In [None]:
# Print "<title> <url>" for every stored Article, ordered by Article.id.
articles_by_id = session.query(Article).order_by(Article.id)
for stored in articles_by_id:
    print(stored.title, stored.url)

## Scraper code

In [1]:
from libs.article import Article

import re
import requests

from bs4 import BeautifulSoup
from sqlalchemy.orm import sessionmaker

from libs.sqlcreator import create_alchemy_engine
from libs.multi_thread import multi_thread

In [2]:
import string

from collections import Counter
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

def get_most_common(text, top_n=15, stop_words=None):
    """Return the most frequent meaningful words in ``text``, most frequent first.

    The text is stripped of ASCII punctuation, lower-cased and split on
    whitespace. Stop words and single-character tokens are discarded before
    counting.

    Args:
        text: The raw text to analyse.
        top_n: Maximum number of words to return (default 15).
        stop_words: Collection of lower-case words to ignore. Defaults to
            ``ENGLISH_STOP_WORDS`` when omitted.

    Returns:
        A list of at most ``top_n`` words ordered by descending frequency.
    """
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS

    # Delete every ASCII punctuation character in a single pass.
    text_nopunct = text.translate(str.maketrans('', '', string.punctuation))

    # split() without an argument splits on any run of whitespace. Splitting on
    # a literal " " would leave words separated by newlines or tabs fused
    # together as a single token (e.g. "end\n\nStart").
    words = [
        word
        for word in text_nopunct.lower().split()
        if len(word) > 1 and word not in stop_words
    ]

    return [word for word, _count in Counter(words).most_common(top_n)]


# Read a locally saved text file (presumably an NPR article, judging by the
# file name) to use as sample input.
with open("npr_article_572945894.txt") as file_hdl:
    article_text = file_hdl.read()
    
# The first line of the file is taken as the title.
title = article_text.split("\n")[0]
# Hard-coded URL for the sample record.
url = "npr.org/testing"

# Most frequent words of the sample text.
most_common = get_most_common(article_text)


In [3]:
def get_text(soup):
    """Concatenate the text of the plain ``<p>`` elements found in ``soup``.

    A paragraph is kept only when it carries no ``class`` attribute and
    contains no ``<b>`` element. Each kept paragraph is followed by two
    newline characters.
    """
    def _is_plain_paragraph(tag):
        # Same order as a short-circuiting "and": class check first.
        if tag.has_attr('class'):
            return False
        return not tag.findChildren('b')

    return "".join(
        tag.get_text() + "\n\n"
        for tag in soup.find_all('p')
        if _is_plain_paragraph(tag)
    )

In [4]:
def get_npr_urls(soup):
    """Collect the dated npr.org links contained in ``soup``.

    An ``<a href=...>`` is returned when its href starts with
    ``https://www.npr.org/`` and contains a ``20YY/M/D/<digits>/`` segment.
    Links are returned in document order; duplicates are not removed.
    """
    dated_segment = re.compile(r'20[\d]{2}/[\d]{1,2}/[\d]{1,2}/[\d]+/')
    hrefs = (anchor['href'] for anchor in soup.find_all('a', href=True))

    return [
        href
        for href in hrefs
        if href.startswith('https://www.npr.org/') and dated_segment.search(href)
    ]

In [5]:
def get_title(soup):
    """Return the article title found in ``soup``, or ``""`` if there is none.

    The title is the text of the first ``<b>`` element inside an ``<a>`` whose
    first class name is ``"title"``. When several links qualify, the last one
    in document order wins. Qualifying links without any ``<b>`` are ignored
    rather than raising IndexError.
    """
    title = ""
    for link in soup.find_all('a'):
        if not link.has_attr('class'):
            continue

        classes = link['class']
        # Guard against an empty class list so the index below cannot fail.
        if not classes or classes[0] != "title":
            continue

        bold_children = link.findChildren('b')
        if bold_children:
            title = bold_children[0].get_text()

    return title

In [6]:
def get_details(url):
    """Extract the publication date and article id from a dated URL.

    The URL is searched for a ``20YY/M/D/<digits>/`` segment.

    Args:
        url: The URL to inspect.

    Returns:
        A ``(date, article_id)`` tuple of strings, where ``date`` is formatted
        as ``YYYY-MM-DD`` (month and day zero-padded). Returns ``(None, None)``
        when the URL contains no such segment, instead of failing on the
        missing match.
    """
    pattern = r'(20[\d]{2})/([\d]{1,2})/([\d]{1,2})/([\d]+)/'

    match = re.search(pattern, url)
    if match is None:
        # e.g. a homepage or section URL that carries no date/id segment.
        return None, None

    year, month, day, article_id = match.groups()
    # Zero-pad so dates have a fixed width and compare correctly as strings.
    date = "-".join((year, month.zfill(2), day.zfill(2)))

    return date, article_id

In [7]:
def scrape_url(url, timeout=30):
    """Download ``url`` and extract its outgoing article links and, if present, an Article.

    Args:
        url: The page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A dict with:
          * ``"urls"``: the links returned by ``get_npr_urls`` for the page;
          * ``"article"``: an ``Article`` built from the page, present only when
            the extracted text is longer than 100 characters.
    """
    # Without a timeout a stalled connection would block this call indefinitely.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the links once; they are returned whether or not an article is found.
    return_dict = {"urls": get_npr_urls(soup)}

    text = get_text(soup)
    # Pages with very little paragraph text are not treated as articles.
    if len(text) > 100:
        title = get_title(soup)
        # Only the date is stored on the Article; the id is not used here.
        date, _article_id = get_details(url)

        return_dict["article"] = Article(title, text, url, get_most_common(text), date)

    return return_dict

In [8]:
# Try the scraper on one article URL (this performs a live HTTP request).
article_dict = scrape_url("https://www.npr.org/2018/05/04/608323118/in-wake-of-school-shooting-trump-pence-to-address-nra")
# Display the scraped Article; raises KeyError if scrape_url added no "article" entry.
article_dict["article"]

<libs.article.Article at 0x5307a90>

In [9]:
##############
## Crawl npr.org starting from the homepage: scrape pages in batches,
## store every Article found and queue newly discovered links.

start_url = "https://www.npr.org/"
traversed_urls = set()

to_traverse = set()
to_traverse.add(start_url)

engine = create_alchemy_engine()


Session = sessionmaker(bind=engine)
session = Session()

## NOTE(review): `count` is incremented once per stored article AND once per
## batch, so the `count < 1000` limit mixes both -- confirm which was intended.
count = 0
while len(to_traverse) > 0 and count < 1000:

    ## Create a list of URLS to traverse (at most 20 per batch)
    curr_traverse = []
    while len(curr_traverse) < 20 and len(to_traverse) > 0:
        curr_traverse.append(to_traverse.pop())

    ## Mark the batch as visited before scraping. Without this, `traversed_urls`
    ## stays empty, the filter below removes nothing, and already-scraped pages
    ## are re-queued and scraped again.
    traversed_urls.update(curr_traverse)

    ## TODO: Confirm that the url's have NOT been traversed before (query against DB)

    ## Pass that list to multi-threading
    results = multi_thread(scrape_url, curr_traverse, 10)

    ## Multi-threading should return dictionaries mapping to results and to discovered URLs


    ## Create class for DB that maintains unique set of to_traverse and traverse
    for result in results:
        ## assumes each result is a sequence whose second item is scrape_url's dict -- TODO confirm
        result_dict = result[1]
        if "article" in result_dict:
            session.add(result_dict["article"])
            count += 1

        ## Queue only links that have not been visited yet
        to_traverse = to_traverse.union(set(result_dict["urls"]) - traversed_urls)

    ## Persist the articles collected in this batch
    session.commit()
    count += 1
              

In [15]:
from sqlalchemy import desc

# Print "<title> <url>" for ten articles ordered by descending date, skipping
# the first ten rows. Drop .limit()/.offset() to list every article instead.
recent_page = (
    session.query(Article)
    .order_by(desc(Article.date))
    .limit(10)
    .offset(10)
)
for instance in recent_page:
    print(instance.title, instance.url)

Three More Women Come Forward To Accuse R. Kelly Of Abuse https://www.npr.org/sections/therecord/2018/05/04/608608800/three-more-women-come-forward-to-accuse-r-kelly-of-abuse
Slice Of History: Pieces Of British Royal Wedding Cakes Up For Auction https://www.npr.org/sections/thetwo-way/2018/05/04/608493330/slice-of-history-pieces-of-british-royal-wedding-cakes-up-for-auction
NASA Is Heading Back To Mars To Peer Inside The Red Planet https://www.npr.org/sections/thetwo-way/2018/05/04/608448614/nasa-is-heading-back-to-mars-to-peer-inside-the-red-planet
Big Jump Seen In Number Of Inmates Prescribed Psychiatric Drugs In California https://www.npr.org/sections/health-shots/2018/05/04/608271211/big-jump-in-number-of-inmates-prescribed-psychiatric-drugs-in-california
India Reforms Its Anti-Rape Laws — To Mixed Reaction https://www.npr.org/sections/goatsandsoda/2018/05/04/608516694/india-reforms-its-anti-rape-laws-to-mixed-reaction
D.C. Un-United: Amazon's Second HQ Pits City Vs. Its Suburbs ht