# Feature Extraction

Script that extracts features from the feeds (given the urls in the database).

## Load Feed URLs

Define a function that loads the urls of the feeds that have not been processed.

In [1]:
import sqlite3
import feedparser
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import requests

# number of feed urls handed to parseFeed in the current batch
count = 0

# list of feed urls loaded from the db
feed_url_list = []

# dict of feeds
# urls are keys, the values are dicts containing
# the features, title and description for that feed
feeds = dict()


# load urls of feeds that do not have features yet
def loadFeedUrls(no_feeds):
    """Reset the batch state and load the next batch of feed urls.

    Clears ``feed_url_list`` and ``feeds`` in place, resets ``count`` to 0
    and fills ``feed_url_list`` with the urls whose ``_id`` is greater than
    the highest ``_id`` that already has text, title and description set.
    When no feed has been processed yet, loading starts at the first row.

    no_feeds: maximum number of urls to load (bound to the SQL LIMIT).
    """
    global count

    # clear in place: other code holds references to these objects
    feed_url_list.clear()
    feeds.clear()
    count = 0

    print('Loading feed urls...')

    # The inner SELECT yields NULL when no feed has been processed yet and
    # `_id > NULL` matches nothing, so fall back to a value just below the
    # smallest _id. ORDER BY keeps batches in ascending _id order, which the
    # "highest processed _id" bookkeeping relies on.
    query = (
        'SELECT url FROM feeds '
        'WHERE _id > COALESCE('
        '(SELECT _id FROM feeds '
        'WHERE text IS NOT NULL AND title IS NOT NULL AND description IS NOT NULL '
        'ORDER BY _id DESC LIMIT 1), '
        '(SELECT MIN(_id) - 1 FROM feeds)) '
        'ORDER BY _id LIMIT ?;'
    )

    # connect to the db and always release the connection afterwards
    conn = sqlite3.connect('feeds_dev.db')
    try:
        rows = conn.execute(query, (no_feeds,)).fetchall()
    finally:
        conn.close()

    feed_url_list.extend(row[0] for row in rows)

    print('Loaded ' + str(len(feed_url_list)) + ' feeds\n')

# load an initial batch of at most 100 feed urls into feed_url_list
loadFeedUrls(100)

Loading feed urls...
Loaded 100 feeds



## Define Normalization Steps

Define a function that normalizes a string through the following pipeline:
* Remove HTML tags
* Convert to lower case
* Remove numbers
* Remove symbols
* Remove stop words

In [2]:
# normalizes a string
def normalize(text):
    """Normalize raw feed text into a space-separated string of words.

    Pipeline: strip HTML tags, convert to lower case, drop numbers and
    symbols, collapse whitespace and remove English stop words.

    text: string that may contain HTML markup.
    Returns the normalized string (empty when nothing survives).
    """
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # convert to lower case
    text = text.lower()

    # turn every whitespace character (newlines, tabs, non-breaking spaces)
    # into a plain space first, so that neighbouring words are not glued
    # together when the symbols are stripped below
    text = re.sub(r'\s+', ' ', text)

    # remove numbers and symbols: keep only ASCII letters and spaces
    text = re.sub(r'[^A-Za-z ]+', '', text)
    text = re.sub(r' +', ' ', text)

    # remove stop words; the stop word list is loaded once per call and kept
    # in a set instead of being re-read for every token
    stop_words = set(stopwords.words('english'))
    tokenized = word_tokenize(text)
    return ' '.join(word for word in tokenized if word not in stop_words)

## Parse a Feed

Define a function that parses a feed given its url as a parameter.

### Phase 1

Create a body of text from the following information:
* Title of the feed
* Description of the feed
* Title of each entry
* Description of each entry

### Phase 2

Normalize the generated body of text using the function defined above. The result is the body of text that will later be used by the CorEx Topic Model to infer the feed's topics.

### Phase 3

Save the following information in the *feeds* dict:
* Text (normalized body of text)
* Title of the feed
* Description of the feed



In [3]:
# download a feed, build its body of text and store it in the feeds dict
def parseFeed(url):
    """Fetch the feed at *url* and record its title, description and features.

    Increments the global ``count`` and prints a progress line for every call.
    Nothing is stored (and None is returned) when the download or parse
    raises, or when the feed lacks a title, a description or entries.
    Otherwise ``feeds[url]`` receives the title, the description and the
    normalized text built from the feed and entry titles/descriptions.

    url: address of the rss feed.
    """
    global count
    count += 1
    print('Generating features for ', url, ' ', str(count))

    user_agent = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Mobile Safari/537.36'

    # give up on this feed if it can not be downloaded or parsed
    try:
        response = requests.get(url, headers={'User-Agent': user_agent}, timeout=10)
        parsed = feedparser.parse(response.content)
    except Exception as e:
        print('Could not parse feed ', url)
        print(e)
        return

    # a usable feed has a title, a description and at least one entry
    title = parsed['feed'].get('title')
    description = parsed['feed'].get('description')
    entries = parsed['entries']
    if not (title and description and len(entries) > 0):
        return

    # collect the feed level text followed by the text of every entry;
    # missing entry fields contribute an empty string
    pieces = [title, description]
    for item in entries:
        for field in ('title', 'description'):
            value = item.get(field)
            pieces.append('' if value is None else value)

    # normalize the space-joined body of text and store the result
    feeds[url] = {
        'title': title,
        'description': description,
        'features': normalize(' '.join(pieces))
    }


## Save Feed Info
 
Define a function that saves the information in the *feeds* dict into the database.

In [4]:
# save feed information from the dict to the database
def saveFeedInfo():
    """Write the text, title and description of every entry in ``feeds`` to the db.

    Each feed row is matched by its url and updated with a single statement.
    All updates are committed together, and the connection is closed even
    when an update or the commit raises.
    """
    print('Saving features...')

    # one parameter tuple per feed, in the order the UPDATE expects
    rows = [
        (info['features'], info['title'], info['description'], url)
        for url, info in feeds.items()
    ]

    # connect to the db
    conn = sqlite3.connect('feeds_dev.db')
    try:
        conn.executemany(
            'UPDATE feeds SET text = ?, title = ?, description = ? WHERE url = ?;',
            rows,
        )

        # commit all updates at once
        conn.commit()
    finally:
        # release the connection whether or not the updates succeeded
        conn.close()

    print('Saved feed information\n')

## Generate Features

Load and generate features for a predefined number of feeds.

In [5]:
# run the whole pipeline for one batch of feeds
# no_feeds sets how many feed urls are loaded for the batch
def generateFeatures(no_feeds):
    """Load up to *no_feeds* feed urls, parse each one and save the results.

    no_feeds: number of urls passed on to loadFeedUrls.
    """
    # refill feed_url_list with the next batch of urls
    loadFeedUrls(no_feeds)

    # parse every loaded feed; parseFeed counts each url it is given
    for feed_url in feed_url_list:
        parseFeed(feed_url)

    print(f'\nFeatures generated for {count} urls\n')

    # write whatever parseFeed collected to the database
    saveFeedInfo()

# for i in range(5):
#     generateFeatures(10)
