# Web Crawler

The script crawls articles on Feedspot's blog to gather RSS feeds.

## Load Data

Loads the following data from the database and stores it in global variables:
* Feed urls
* Unvisited lists (articles) urls
* Visited lists (articles) urls

In [5]:
import requests
import sqlite3
from bs4 import BeautifulSoup

# url of the first list to be scraped; loadData() uses it as the
# seed when the db contains no unvisited lists
firstListUrl = 'https://blog.feedspot.com/uk_rss_feeds/'

# list of all feed urls (loaded from the db plus newly scraped ones)
feeds = []

# urls of the lists (articles) still to be scraped / already scraped
unvisitedListUrls = []
visitedListUrls = []

# initial lengths of lists (set by loadData()) to make sure
# only new data is added to the db by saveData()
initialNoFeeds = 0
initialNoVisitedLists = 0

# number n of lists visited in this run (incremented by crawl())
# NOTE(review): originally described as "will be used to remove top n
# urls from db", but nothing visible here reads it - confirm or remove
visitedLists = 0

# load data from the database
def loadData():
    """Load the crawler state from the ``feeds_dev.db`` sqlite3 database.

    Populates these module-level globals:

    * ``feeds`` / ``initialNoFeeds`` - known feed urls and how many were loaded
    * ``unvisitedListUrls`` / ``initialNoUnvisitedLists`` - lists still to scrape
    * ``visitedListUrls`` / ``initialNoVisitedLists`` - lists already scraped

    If the db holds no unvisited list, ``firstListUrl`` is queued as the
    seed - unless it has already been visited, in which case re-queueing it
    would only scrape it again and store a duplicate ``visited_lists`` row.

    Raises:
        sqlite3.Error: if the database cannot be read (e.g. missing tables).
            The connection is closed before the error propagates.
    """
    global feeds, initialNoFeeds
    global unvisitedListUrls, initialNoUnvisitedLists
    global visitedListUrls, initialNoVisitedLists

    print('Loading data from db...')

    # connect to the database; always release the connection, even on error
    conn = sqlite3.connect('feeds_dev.db')
    try:
        c = conn.cursor()

        # load feeds
        c.execute('SELECT url FROM feeds;')
        feeds = [row[0] for row in c.fetchall()]
        initialNoFeeds = len(feeds)
        print('Loaded ' + str(initialNoFeeds) + ' feeds')

        # load unvisited lists
        c.execute('SELECT url FROM unvisited_lists;')
        unvisitedListUrls = [row[0] for row in c.fetchall()]
        initialNoUnvisitedLists = len(unvisitedListUrls)
        print('Loaded ' + str(initialNoUnvisitedLists) + ' unvisited lists')

        # load visited lists
        c.execute('SELECT url FROM visited_lists;')
        visitedListUrls = [row[0] for row in c.fetchall()]
        initialNoVisitedLists = len(visitedListUrls)
        print('Loaded ' + str(initialNoVisitedLists) + ' visited lists')
        print('\n')
    finally:
        conn.close()

    # seed the crawl when nothing is queued, but never re-queue a list
    # that has already been scraped
    if not unvisitedListUrls and firstListUrl not in visitedListUrls:
        unvisitedListUrls.append(firstListUrl)

## Scrape List of Feeds

Define a function that scrapes a given Feedspot article. Each article contains a list of RSS feeds as well as links to similar articles; the function collects both.

In [2]:
# retrieves all rss feed links from the given url
# and adds them to the global variable
def scrapeUrl(url):
    """Scrape one Feedspot list page for feed urls and further list urls.

    New feed urls are appended to the global ``feeds`` list. Links to other
    rss feed lists that are in neither ``visitedListUrls`` nor
    ``unvisitedListUrls`` are appended to ``unvisitedListUrls``.

    Scraping is best-effort: any failure is reported on stdout and the
    function returns normally so that the crawl can continue.

    Args:
        url: url of the Feedspot article to scrape.
    """
    # local import: resolves relative and absolute hrefs alike
    from urllib.parse import urljoin

    # include user-agent to ensure the response is not 403 Forbidden
    headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Mobile Safari/537.36'}

    try:
        # retrieve the webpage content; the timeout keeps a stalled
        # server from hanging the whole crawl
        webpage_response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(webpage_response.content, 'html.parser')

        # get the rss feeds on that webpage
        for tag in soup.select('.trow .fa-rss + a'):

            # skip anchors with a missing or empty href
            href = tag.get('href')
            if href and href not in feeds:
                feeds.append(href)

        # get the links to the feed lists on the page
        for tag in soup.select('.et_pb_extra_column_sidebar a'):

            href = tag.get('href')
            if not href:
                continue

            # href usually only contains the path, construct complete url
            listUrl = urljoin('https://blog.feedspot.com', href)

            # there may be lists of other content types (blog, website etc.)
            # only add rss feed lists that have not been seen before
            if ('rss_feeds' in listUrl
                    and listUrl not in visitedListUrls
                    and listUrl not in unvisitedListUrls):
                unvisitedListUrls.append(listUrl)
    except Exception as e:
        # deliberate best-effort: report the reason and carry on
        print('Could not scrape url: ' + url)
        print(e)

## Crawl

Define a function that repeatedly takes the next unvisited list url (stored in a global list), marks it as visited and scrapes its contents.

The crawler stops when there are no more unvisited urls or when a predefined number of feeds (passed as an argument to the function) has been scraped.

In [3]:
# provide a maximum number of feeds to be collected
def crawl(maxNoOfFeeds):
    """Scrape queued lists until the queue is empty or the quota is met.

    Lists are taken from the front of the global ``unvisitedListUrls``,
    recorded in ``visitedListUrls`` and handed to ``scrapeUrl``. The global
    ``visitedLists`` counts how many lists were processed.

    Args:
        maxNoOfFeeds: stop once at least this many feeds have been
            collected on top of the ``initialNoFeeds`` loaded at start.
    """
    global visitedLists

    while True:

        # nothing left to scrape
        if not unvisitedListUrls:
            break

        # enough new feeds have been gathered in this run
        quotaLeft = len(feeds) - initialNoFeeds < maxNoOfFeeds
        if not quotaLeft:
            break

        # take the oldest queued list and mark it as visited
        nextListUrl = unvisitedListUrls.pop(0)
        visitedLists = visitedLists + 1
        visitedListUrls.append(nextListUrl)

        print('Scraping list: ', nextListUrl)
        scrapeUrl(nextListUrl)
        print('Number of feeds: ', len(feeds))
        print('\n')


## Reset Database

Delete all tables from the database and recreate them.

In [4]:
# reset the sqlite3 database of feeds
def resetDatabase():
    """Drop and re-create all tables of the ``feeds_dev.db`` database.

    Any data stored in ``feeds``, ``posts``, ``unvisited_lists`` and
    ``visited_lists`` is lost.

    Raises:
        sqlite3.Error: if a statement fails. The connection is closed
            before the error propagates.
    """
    # connect to the database; always release the connection, even on error
    conn = sqlite3.connect('feeds_dev.db')
    try:
        c = conn.cursor()

        # drop the tables
        print('Deleting tables...')
        c.execute('DROP TABLE IF EXISTS feeds;')
        c.execute('DROP TABLE IF EXISTS posts;')
        c.execute('DROP TABLE IF EXISTS unvisited_lists;')
        c.execute('DROP TABLE IF EXISTS visited_lists;')

        # re-create the tables
        print('Creating tables... \n')
        c.execute('CREATE TABLE feeds (_id INTEGER PRIMARY KEY, url TEXT, text TEXT, title TEXT, description TEXT);')
        # the closing parenthesis of the column list was missing here,
        # which made sqlite reject the statement with a syntax error
        c.execute('CREATE TABLE posts (_id INTEGER PRIMARY KEY, title TEXT, description TEXT, text TEXT, feed_title TEXT, FOREIGN KEY (feed_title) REFERENCES feeds (title));')
        c.execute('CREATE TABLE unvisited_lists (_id INTEGER PRIMARY KEY, url TEXT);')
        c.execute('CREATE TABLE visited_lists (_id INTEGER PRIMARY KEY, url TEXT);')

        # make the schema change durable regardless of transaction mode
        conn.commit()
    finally:
        conn.close()

## Save Data

Define a function that saves all data that has been gathered by the crawler in the database.

In [5]:
# saves all feeds from the global list into the sqlite3 database
def saveData():
    """Persist the crawler state to the ``feeds_dev.db`` sqlite3 database.

    * Feeds appended to ``feeds`` since the last load/save are inserted.
    * The ``unvisited_lists`` table is replaced by ``unvisitedListUrls``.
    * Lists appended to ``visitedListUrls`` since the last load/save are
      inserted.

    Everything is written in one transaction. After a successful commit
    ``initialNoFeeds`` and ``initialNoVisitedLists`` are advanced, so that
    calling ``saveData()`` again does not insert the same rows twice.

    Raises:
        sqlite3.Error: if a statement fails. Nothing is committed and the
            connection is closed before the error propagates.
    """
    global initialNoFeeds, initialNoVisitedLists

    print('Saving feeds...')

    # only rows gathered after the last load/save are new
    newFeeds = feeds[initialNoFeeds:]
    newVisitedLists = visitedListUrls[initialNoVisitedLists:]

    # connect to the database; always release the connection, even on error
    conn = sqlite3.connect('feeds_dev.db')
    try:
        c = conn.cursor()

        # save feeds
        c.executemany('INSERT INTO feeds (url) VALUES (?);',
                      [(feedUrl,) for feedUrl in newFeeds])

        # update the unvisited lists in the database
        c.execute('DELETE FROM unvisited_lists;')
        c.executemany('INSERT INTO unvisited_lists (url) VALUES (?);',
                      [(listUrl,) for listUrl in unvisitedListUrls])

        # save visited lists
        c.executemany('INSERT INTO visited_lists (url) VALUES (?);',
                      [(listUrl,) for listUrl in newVisitedLists])

        # Commit the changes
        conn.commit()
    finally:
        conn.close()

    # everything up to here is now stored; move the baselines forward
    initialNoFeeds = len(feeds)
    initialNoVisitedLists = len(visitedListUrls)

    print('Saved ' + str(len(newFeeds)) + ' feeds')

## Run the Crawler

Crawl only feed urls.

In [7]:
# load the saved crawler state, crawl until at least 100 new feeds have
# been collected (or no unvisited lists remain), then persist the results
loadData()
crawl(100)
saveData()

Loading data from db...
Loaded 13168 feeds
Loaded 430 unvisited lists
Loaded 311 visited lists


Scraping list:  https://blog.feedspot.com/australian_beauty_rss_feeds/
Number of feeds:  13241


Scraping list:  https://blog.feedspot.com/afl_rss_feeds/
Number of feeds:  13265


Scraping list:  https://blog.feedspot.com/australian_photography_rss_feeds/
Number of feeds:  13337


Saving feeds...
Saved 169 feeds


# Feed Info Scraper

Gather information about feed urls saved in the database.

Extract the following information:
* Title
* Description
* Text

*Text* is formed by concatenating the title and description of the feed with the title and description of each of its entries. *Text* must contain between 100 and 500 words and must not contain any HTML tags. All other preprocessing and feature extraction steps will be carried out in other parts of the system.

## Load Feed URLs

Load the urls of the feeds that have not been processed yet, then randomly sample a subset of them.

In [2]:
import feedparser
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
import requests
import sqlite3
from bs4 import BeautifulSoup

# number of feeds parseFeed() has been called on since the
# last call to loadFeedUrls()
count = 0

# list of feed urls loaded from the db
feed_url_list = []

# dict of successfully parsed feeds
# urls are keys, the values are dicts containing the
# 'title', 'description' and 'text' of that feed
feed_info = dict()

# load feeds urls of feeds that do not have features yet
# randomly sample the set of feeds
def loadFeedUrls(no_feeds):
    """Load a random sample of unprocessed feed urls from ``feeds.db``.

    A feed counts as unprocessed while its ``text``, ``title`` or
    ``description`` column is NULL. The sample is stored in the global
    ``feed_url_list``; the global ``feed_info`` dict is emptied and the
    global ``count`` is reset to 0.

    Args:
        no_feeds: maximum number of feed urls to sample. When fewer
            unprocessed feeds are available, all of them are used.

    Raises:
        sqlite3.Error: if the database cannot be read. The connection is
            closed before the error propagates.
    """
    global feed_url_list, count

    # clear feed urls list and feeds dict
    feed_url_list = []
    feed_info.clear()
    count = 0

    print('Loading feed urls...')

    # connect to the db; always release the connection, even on error
    conn = sqlite3.connect('feeds.db')
    try:
        c = conn.cursor()

        # select all urls that have not been processed
        c.execute('SELECT url FROM feeds WHERE text IS NULL OR title IS NULL OR description IS NULL;')
        pending_urls = [row[0] for row in c.fetchall()]
    finally:
        conn.close()

    print('Loaded ' + str(len(pending_urls)) + ' feeds')

    # random.sample raises ValueError when asked for more items than
    # exist (or for a negative number), so clamp the sample size
    sample_size = max(0, min(no_feeds, len(pending_urls)))
    feed_url_list = random.sample(pending_urls, sample_size)

    print('Sampled ' + str(len(feed_url_list)) + ' feeds\n')

## Parse a Feed

Define a function that parses a feed given its url as a parameter.

In [3]:
# analyze a feed and generate its initial body of text
def parseFeed(url, min_words=100, max_words=500):
    """Download a feed and store its title, description and body of text.

    The body of text is the feed's title and description followed by the
    title and description of each entry, with HTML tags removed and
    truncated to ``max_words`` words. On success the result is stored in
    the global ``feed_info`` dict under ``url``; the global ``count`` is
    incremented on every call.

    Nothing is stored when the feed cannot be downloaded or parsed, when it
    lacks a title, a description or entries, or when its text has fewer
    than ``min_words`` words.

    Args:
        url: url of the rss feed.
        min_words: minimum number of words the text must contain.
        max_words: maximum number of words kept in the text.
    """
    global count
    count += 1
    print('Generating features for ', url, ' ', str(count))

    headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Mobile Safari/537.36'}

    # return if the feed can not be parsed
    try:
        # get the rss feed content from the url
        webpage = requests.get(url, headers=headers, timeout=10)
        d = feedparser.parse(webpage.content)
    except Exception as e:
        print('Could not parse feed ', url)
        print(e)
        return

    # check that the feed has a title, description and at least one entry
    title = d['feed'].get('title')
    description = d['feed'].get('description')
    entries = d['entries']

    if not title or not description or not entries:
        print('Invalid feed')
        return

    # feed is valid: collect the title and description of the feed,
    # followed by the title and description of each entry
    parts = [title, description]
    for entry in entries:
        parts.append(entry.get('title') or '')
        parts.append(entry.get('description') or '')

    # remove html tags, then split into words once
    words = BeautifulSoup(' '.join(parts), 'html.parser').get_text().split()

    # the text must have at least min_words words; the bound is
    # inclusive, so a text of exactly min_words words is accepted
    if len(words) < min_words:
        return

    # keep at most max_words words and store the feed's information
    feed_info[url] = {
        'title': title,
        'description': description,
        'text': ' '.join(words[:max_words])
    }
        

## Collect Info

Collect the required information for a specified number of feeds.

In [5]:
# generate features for the feeds
# specify for how many feeds to generate the features
def generateFeatures(no_feeds):
    """Collect and save title, description and text for a batch of feeds.

    Samples up to ``no_feeds`` unprocessed feeds with ``loadFeedUrls``,
    parses each one with ``parseFeed`` and writes the information of the
    successfully parsed feeds (the global ``feed_info``) to ``feeds.db``.

    Args:
        no_feeds: number of feeds to process in this batch.

    Raises:
        sqlite3.Error: if the database cannot be read or updated. The
            connection is closed before the error propagates.
    """
    loadFeedUrls(no_feeds)

    # generate features for all sampled feeds
    for url in feed_url_list:
        parseFeed(url)

    # report the feeds that actually produced features, not the number
    # of attempts (failed and invalid feeds are not in feed_info)
    print('\nFeatures generated for ' + str(len(feed_info)) + ' urls\n')

    # save feed information
    print('Saving features...')

    # one row per feed: all three columns are written by a single UPDATE
    rows = [
        (info['text'], info['title'], info['description'], url)
        for url, info in feed_info.items()
    ]

    # connect to the db; always release the connection, even on error
    conn = sqlite3.connect('feeds.db')
    try:
        c = conn.cursor()
        c.executemany(
            'UPDATE feeds SET text = ?, title = ?, description = ? WHERE url = ?;',
            rows)
        conn.commit()
    finally:
        conn.close()

    print('Saved feed information\n')

# run two batches of 100 randomly sampled feeds each;
# generateFeatures() saves to the db at the end of every batch
for i in range(2):
    generateFeatures(100)


Loading feed urls...
Loaded 59916 feeds
Sampled 100 feeds

Generating features for  http://koalasplayground.com/feed/   1
Could not parse feed  http://koalasplayground.com/feed/
('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Generating features for  https://weddinginclude.com/category/wedding-cake/feed/   2
Invalid feed
Generating features for  https://data.fineartstudioonline.com/rssfeed.asp?id=24209   3
Invalid feed
Generating features for  https://www.personalfinanceplan.in/category/mutual-funds/feed/   4
Invalid feed
Generating features for  https://sqlundercover.com/feed/   5
Generating features for  https://mocbuilder.com/feed/   6
Generating features for  http://fysurf.com/feed/   7
Generating features for  https://www.sparklybelly.com/feed/   8
Generating features for  https://www.youtube.com/feeds/videos.xml?user=mngeocaching   9
Invalid feed
Generating features for  http://popdramatic.bl