In [8]:
import sqlite3
import feedparser
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from corextopic import corextopic as ct

count = 0

# list of feed urls loaded from the db
feed_url_list = []

# dict of feeds
# urls are keys and features (body of text) are values
feeds = dict()

# normalizes a string 
# - remove HTML tags (using BeautifulSoup)
# - convert to lower case
# - remove numbers
# - remove symbols
# - remove stop words (using NLTK)
def normalize(text):

    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # convert to lower case
    text = text.lower()

    # remove numbers
    text = re.sub(r'\d+', '', text)

    # remove symbols
    text = text.replace('\n', ' ')
    text = re.sub(r'[^A-Za-z ]+', '', text)
    text = re.sub(r' +', ' ', text)

    # remove stop words
    tokenized = word_tokenize(text)
    text = ' '.join([word for word in tokenized if not word in stopwords.words('english')])

    return text

# load feeds from the db
# specify how many feeds to load from the database
def loadFeeds(no_feeds):

    # clear feed urls list and feeds dict
    feed_url_list.clear()
    feeds.clear()

    global count
    count = 0

    print('Loading feed urls...')

    # connect to the db
    conn = sqlite3.connect('feeds_dev.db')
    c = conn.cursor()

    # select urls 
    # add limit 100 during development
    c.execute('SELECT url FROM feeds LIMIT ?;', (no_feeds,))
    for entry in c.fetchall():
        feed_url_list.append(entry[0])

    print('Loaded feeds\n')

# analyze a feed and generate its initial body of text
def parseFeed(url):

    global count
    count += 1
    print('Generating features for ', url, ' ', str(count))

    # return if the feed can not be parsed
    try:
        d = feedparser.parse(url)
    except Exception:
        print('Could not parse feed ', url)
        return

    # body of text representing the features
    features = ''

    # check that the feed has a title, description and at least one entry
    title = d['feed'].get('title')
    description = d['feed'].get('description')
    entries = d['entries']

    if title == None or description == None or len(entries) == 0:
        
        # feed is invalid
        return

    # feed is valid, continue feature extraction
    # add title and description to body of text
    features = title + ' ' + description

    # add the title and description of each entry to the body of text
    for entry in entries:
        
        entry_title = entry.get('title')
        entry_title = entry_title if entry_title is not None else ''

        entry_description = entry.get('description')
        entry_description = entry_description if entry_description is not None else ''

        features = features + ' ' + entry_title + ' ' + entry_description

    # normalize the body of text
    features = normalize(features)

    # add the features to the feed's dict entry
    feeds[url] = features


# Generate Features
Load the feeds from the database and generate a body of text for each feed. 
The body of text contains the feed's (processed) data.

In [10]:
# save the features in the dict to the database
def saveFeatures():

    print('Saving features...')

    # connect to the db
    conn = sqlite3.connect('feeds_dev.db')
    c = conn.cursor()

    for url, features in feeds.items():
        c.execute('UPDATE feeds SET text = ? WHERE url = ?;', (features, url))

    # Commit and close connection
    conn.commit()
    conn.close()

    print('Saved features')

# generate features for the feeds
# specify for how many feeds to generate the features
def generateFeatures(no_feeds):

    loadFeeds(no_feeds)

    # generate features for all feeds
    for url in feed_url_list:   
        parseFeed(url)

    print('\nFeatures generated for ' + str(count) + ' urls\n')

    # save the features
    saveFeatures()

generateFeatures(1000)




Loading feed urls...
Loaded feeds

Generating features for  https://www.marieclaire.co.uk/feed   1
Generating features for  https://www.inthefrow.com/feed   2
Generating features for  https://www.eventbrite.co.uk/blog/feed/   3
Generating features for  http://www.thelondoner.me/feed   4
Generating features for  https://www.aluxurytravelblog.com/feed/   5
Generating features for  https://www.theannaedit.com/feed/   6
Generating features for  https://www.lovemydress.net/feed   7
Generating features for  https://www.newstatesman.com/feeds/site_feed.rss   8
Generating features for  http://scienceblog.cancerresearchuk.org/feed/   9
Generating features for  http://wearesocial.com/uk/feed   10
Generating features for  http://blog.nationalarchives.gov.uk/feed/   11
Generating features for  http://fadedspring.co.uk/feed/   12
Generating features for  http://www.lrb.co.uk/blog/feed/   13
Generating features for  https://order-order.com/feed   14
Generating features for  https://www.lifeinabreakd

IncompleteRead: IncompleteRead(69539 bytes read)

# Load Features
Load the previously generated bodies of text for the feeds and load them into the feeds dict

In [2]:
def loadFeatures():

    print('Loading features...')

    # connect to the db
    conn = sqlite3.connect('feeds_dev.db')
    c = conn.cursor()
    
    # select only the feeds for which the body of text 
    # has already been generated
    c.execute('SELECT url, text FROM feeds WHERE text IS NOT NULL;')
    for entry in c.fetchall():
        feeds[entry[0]] = entry[1]

    print('Loaded features for ' + str(len(feeds)) + ' feeds')

loadFeatures()

Loading features...
Loaded features for 92 feeds


# Vectorize Documents
Create a doc-word matrix from the previously-generated bodies of text.

In [3]:
# returns the feeds dict's values as a doc-word matrix
def vectorizeDocuments():
    corpus = list(feeds.values())
    vectorizer = CountVectorizer(max_features=20000, binary=True)
    doc_word = vectorizer.fit_transform(corpus)

    # get the words (column labels)
    words = vectorizer.get_feature_names()

    return doc_word, words

# create a doc-word matrix from the dicts
doc_word, words = vectorizeDocuments()
print(doc_word.shape)
print(len(words))

(92, 14334)
14334


# CorEx Topic Modelling
Infer topics for the documents in the doc-word matrix


In [18]:
# train the CorEx topic model with 50 topics
topic_model = ct.Corex(n_hidden=50, words=words, max_iter=200, verbose=False, seed=1)
topic_model.fit(doc_word, words=words)

<corextopic.corextopic.Corex at 0x17d9eea4640>

In [21]:
topics = topic_model.get_topics()
for n,topic in enumerate(topics):
    topic_words,_,_ = zip(*topic)
    print('{}: '.format(n) + ', '.join(topic_words))

0: na, intensely, ensures, array, hon, method, rarely, obvious, korean, tweaks
1: tom, drink, music, sleep, clear, balance, crop, prepared, stretch, strength
2: night, said, attention, religious, country, army, jewish, islands, unlike, brain
3: apply, areas, established, wars, water, keeping, customs, initially, topics, clothing
4: full, pack, bid, construction, square, equipment, fascinating, fourth, inspiring, adult
5: pair, ahead, longer, article, weight, bank, became, bell, productivity, sought
6: personal, deals, craft, tang, biggest, blankets, sam, switch, likes, hosted
7: ingredients, sauce, easy, cheese, cream, sweet, rich, simple, filling, vegan
8: higher, former, report, communities, retirement, pay, importance, earth, sector, celebration
9: wireless, twice, ancient, picture, park, names, west, youd, japan, keen
10: delicious, recipe, recipes, meal, creamy, homemade, flavour, tasty, soup, food
11: gains, groups, tv, april, squeezed, repair, looked, bed, heat, listen
12: cocon