In [31]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from dateutil.parser import parse

In [32]:
# Load the story index; the column labels are supplied via `names`
# rather than read from the file.
data = pd.read_csv(
    './ns-stories-full.csv',
    names=["source_name", "timestamp", "label", "url"],
)

In [33]:
# Keep only the NW Florida Daily stories. drop_duplicates() returns a new
# frame rather than modifying in place, so it has to be part of the
# assignment for the de-duplication to actually take effect.
nw_fl_daily = (
    data.loc[data['source_name'] == 'NW Florida Daily']
    .drop_duplicates()
    .reset_index(drop=True)
)
# Drop rows whose URL contains "www.news-journalonline". regex=False makes the
# match literal, so the dots are not interpreted as "any character" wildcards.
nw_fl_daily = nw_fl_daily[
    ~nw_fl_daily['url'].str.contains("www.news-journalonline", regex=False)
]

In [34]:
# Break the frame into 7 row-wise pieces so the scrape can be run (and
# re-run) one piece at a time.
nw_fl_daily_chunks = np.array_split(nw_fl_daily, indices_or_sections=7)

In [35]:
# Sanity check: (rows, columns) of the filtered frame before scraping.
nw_fl_daily.shape

(5545, 4)

In [36]:
# Preview the first rows of the filtered frame.
nw_fl_daily.head()

Unnamed: 0,source_name,timestamp,label,url
0,NW Florida Daily,02-JAN-19-02.33.51.659996,regular,https://www.nwfdailynews.com/news/20190102/air...
1,NW Florida Daily,02-JAN-19-02.35.45.576234,regular,https://www.nwfdailynews.com/entertainmentlife...
2,NW Florida Daily,02-JAN-19-04.28.17.893483,regular,https://www.nwfdailynews.com/news/20190102/cre...
3,NW Florida Daily,02-JAN-19-04.50.00.447978,regular,https://www.nwfdailynews.com/news/20190102/fwb...
4,NW Florida Daily,02-JAN-19-05.48.26.079910,guest,https://www.nwfdailynews.com/news/20190102/gue...


In [75]:
def nw_fl_daily_scraper(x, timeout=30):
    """Scrape one NW Florida Daily article and attach the extracted fields to its row.

    Intended to be used with ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    x : pandas.Series
        One row of the stories frame; ``x['url']`` is the page to fetch and
        ``x.name`` (the row label) is used for progress output.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up. Without a
        timeout ``requests.get`` can block indefinitely on a stalled
        connection.

    Returns
    -------
    pandas.Series
        The same row with ``heading``, ``created_date``, ``updated_date``,
        ``author``, ``article_text`` and ``article_links`` added. All six are
        NaN when the page is the site's "Page Not Found" page, when the
        request fails, or when an expected element is missing from the page.
    """
    URL = x['url']
    # Progress marker: print every row whose label ends in ...01.
    if x.name % 100 == 1:
        print("%dst URL: %s" % (x.name, URL))

    # Start from "nothing scraped" so every exit path yields the same columns.
    scraped = dict.fromkeys(
        ('heading', 'created_date', 'updated_date',
         'author', 'article_text', 'article_links'),
        np.nan,
    )

    try:
        r = requests.get(URL, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable URL must not abort the whole apply() and throw
        # away every row already scraped in this chunk.
        print("Request failed for %s: %s" % (URL, exc))
    else:
        soup = BeautifulSoup(r.content, 'lxml')
        if not check_404(soup):
            try:
                article_body = find_article_body(soup, URL)
                parsed = {
                    'heading': find_heading(soup, URL),
                    'created_date': find_created_date(soup, URL),
                    'updated_date': find_updated_date(soup, URL),
                    'author': find_author(soup, URL),
                    'article_text': find_article_text(article_body),
                    'article_links': find_hyperlinks(article_body),
                }
            except (AttributeError, KeyError, TypeError) as exc:
                # soup.find() returns None for a missing element, so the
                # helpers fail with these exception types on pages that do
                # not follow the expected layout. Record the row as NaN,
                # like a 404, instead of crashing the scrape.
                print("Could not parse %s: %r" % (URL, exc))
            else:
                scraped.update(parsed)

    for column, value in scraped.items():
        x[column] = value
    return x

In [55]:
def check_blogs_url(url):
    """Return True when ``url`` contains the blogs sub-domain, else False."""
    return "blogs.nwfdailynews.com" in url
def check_404(soup):
    """Return True when the parsed page is the site's "Page Not Found" page.

    Looks for ``<section class="section-general">`` containing
    ``<div class="inner">`` whose ``<h2>`` text is exactly "Page Not Found".
    If any of those elements is absent the page is not treated as a 404 and
    False is returned.
    """
    section = soup.find('section', attrs={'class': 'section-general'})
    if section is None:
        return False
    # attrs must be a dict; the previous {'class', 'inner'} was a set literal.
    inner = section.find('div', attrs={'class': 'inner'})
    if inner is None or inner.h2 is None:
        return False
    return inner.h2.text == "Page Not Found"

def find_heading(soup, url):
    """Return the article headline text.

    Blog pages keep it in the link inside ``<h1 class="entry_title">``;
    all other pages keep it directly in ``<h1 class="headline">``.
    """
    if not check_blogs_url(url):
        headline = soup.find('h1', attrs={'class': 'headline'})
        return headline.text
    entry_title = soup.find('h1', attrs={'class': 'entry_title'})
    return entry_title.a.text

def format_date(unformatted_date):
    """Parse a date such as "Jan 2, 2019 at 2:33 AM" into a ``datetime``.

    Returns the input unchanged when it cannot be parsed with that format
    (``ValueError``) or is not a string at all (``TypeError``). Only those
    two exceptions are caught, so interrupts and unrelated errors still
    propagate instead of being silently swallowed.
    """
    try:
        return datetime.strptime(unformatted_date, '%b %d, %Y at %I:%M %p')
    except (ValueError, TypeError):
        return unformatted_date

def find_created_date(soup, url):
    """Return the article's creation date.

    Non-blog pages: the text of ``<span class="article-meta-date">`` with
    non-breaking spaces replaced by ordinary spaces, passed through
    ``format_date``.
    Blog pages: the ``datetime`` attribute of the ``<time>`` element inside
    ``<div class="post_meta">``, parsed after dropping its last six
    characters (presumably a "+HH:MM" UTC offset -- TODO confirm).
    """
    if not check_blogs_url(url):
        raw_date = soup.find('span', attrs={'class': 'article-meta-date'}).text
        return format_date(raw_date.replace(u'\xa0', u' '))
    stamp = soup.find('div', attrs={'class': 'post_meta'}).time['datetime']
    return datetime.strptime(stamp[:-6], '%Y-%m-%dT%H:%M:%S')

def find_updated_date(soup, url):
    """Return the article's last-updated date, or NaN for blog pages.

    For non-blog pages the text of ``<span class="article-meta-updated">``
    has its non-breaking spaces replaced by ordinary spaces and is then
    passed through ``format_date``.
    """
    if not check_blogs_url(url):
        raw_date = soup.find('span', attrs={'class': 'article-meta-updated'}).text
        return format_date(raw_date.replace(u'\xa0', u' '))
    return np.nan

def find_author(soup, url):
    """Return the article's author / byline text.

    Blog pages: text of the ``<a rel="author">`` link inside
    ``<span class="by-author">``.
    Other pages: text of ``<span class="byline-item">`` with carriage
    returns, newlines and tabs replaced by spaces and the ends stripped.
    """
    if check_blogs_url(url):
        return soup.find('span', attrs = {'class': 'by-author'}).find('a', attrs = {'rel':'author'}).text
    author_unformatted = soup.find('span', attrs = {'class': 'byline-item'}).text
    # The original replaced '\r' twice (the second call was a no-op), which
    # left embedded newlines in the byline; replace '\n' as well.
    return (
        author_unformatted
        .replace(u'\r', u' ')
        .replace(u'\n', u' ')
        .replace(u'\t', u' ')
        .strip()
    )

def find_article_body(soup, url):
    """Return the element that wraps the article content.

    Blog pages use ``<div class="entry_content">``; all other pages use
    ``<div class="article-body">``. The result of ``soup.find`` is returned
    as-is.
    """
    body_class = 'entry_content' if check_blogs_url(url) else 'article-body'
    return soup.find('div', attrs={'class': body_class})

def find_article_text(article_body):
    """Return the text of every ``<p>`` in ``article_body``, joined by single
    spaces, with leading and trailing whitespace stripped."""
    paragraphs = article_body.findAll('p')
    return ' '.join(paragraph.text for paragraph in paragraphs).strip()

def find_hyperlinks(article_body):
    """Return the ``href`` of every ``<a>`` in ``article_body``, in document order.

    Anchors that carry no ``href`` attribute (for example named anchors) are
    skipped; indexing them with ``['href']`` would raise ``KeyError``.
    """
    article_links = []
    for anchor in article_body.findAll('a'):
        href = anchor.get('href')
        if href is not None:
            article_links.append(href)
    return article_links

In [69]:
# Scrape every chunk that has not been scraped yet. A chunk is replaced only
# after its whole apply() finishes, so a chunk that already carries the
# scraped columns is complete and can be skipped. This lets the cell resume
# after an interruption and still cover all chunks on a fresh kernel; the
# previous hard-coded start index of 3 silently relied on chunks 0-2 having
# been scraped earlier in the same session.
for i, chunk in enumerate(nw_fl_daily_chunks):
    if 'article_text' in chunk.columns:
        continue
    nw_fl_daily_chunks[i] = chunk.apply(nw_fl_daily_scraper, axis = 1)

2401st URL: https://www.nwfdailynews.com/news/20190421/crestview-residents-given-no-notice-of-waste-change
2501st URL: https://www.nwfdailynews.com/sports/20190425/baseball-periscope
2601st URL: https://www.nwfdailynews.com/news/20190430/private-school-voucher-program-approved-by-house
2701st URL: https://www.nwfdailynews.com/news/20190504/crestview-brings-back-military-appreciation-day
2801st URL: https://www.nwfdailynews.com/news/20190509/at-this-florida-state-park-you-can-now-buy-beer-and-wine
2901st URL: https://www.nwfdailynews.com/news/20190513/mothers-day-miracle-storm-nearly-drowns-man-sailing-through-destin-photos-map
3001st URL: https://www.nwfdailynews.com/news/20190518/three-of-four-motions-denied-in-dannys-doghouse-case
3101st URL: https://www.nwfdailynews.com/sports/20190523/rockys-driscoll-signs-with-florida-southern
3201st URL: https://www.nwfdailynews.com/news/20190529/blackwater-river-state-park-preps-for-summer
3301st URL: https://www.nwfdailynews.com/news/20190603/g

In [70]:
# Stitch the per-chunk results back together row-wise (pd.concat's default axis).
nw_fl_daily = pd.concat(nw_fl_daily_chunks)

In [71]:
# Write the combined frame to disk without the index column.
nw_fl_daily.to_csv(
    './nw_fl_daily_part_1.csv',
    index=False,
)

In [73]:
# Sanity check: (rows, columns) of the combined frame after concatenation.
nw_fl_daily.shape

(5545, 10)