In [508]:
import urllib.request
from bs4 import BeautifulSoup
import feedparser
import re
import os
import json

## Parsing links

In [470]:
# Build `feed`: a flat list of entry dicts (title, summary, author, link,
# published) collected from the RSS feed of every category in `categories`.
base_url = 'https://www.buzzfeed.com/{}.xml?page={}'
categories = ['health', 'tech', 'science', 'politics', 'reader']
pages_per_category = 5    # RSS pages requested per category (pages 1..5)
max_per_category = 200    # upper bound on entries kept per category

# Raw string: the previous non-raw literal contained '\/', which is an invalid
# escape sequence (warning on current Python versions). '/' needs no escaping
# in a regex, so this pattern matches exactly the same text.
summary_pattern = re.compile(r'<h1>(.*)</h1>')
feed = []

for category in categories:
    category_feed = []
    for page_num in range(1, pages_per_category + 1):
        entries = feedparser.parse(base_url.format(category, page_num)).entries
        for entry in entries:
            # The summary is the text of the first <h1>...</h1> found in the
            # entry's summary HTML; 'NaN' marks "no headline found". getattr
            # keeps an entry with no summary attribute from aborting the crawl
            # (previously only IndexError was handled at this point).
            match = summary_pattern.search(getattr(entry, 'summary', ''))
            summary = match.group(1) if match else 'NaN'

            try:
                record = {
                    'title': entry.title,
                    'summary': summary,
                    'author': entry.author,
                    'link': entry.link,
                    'published': entry.published,
                }
            except AttributeError:
                # Deliberate best-effort: an entry lacking any of the fields
                # above is left out of the feed rather than stored incomplete.
                continue
            category_feed.append(record)

    feed.extend(category_feed[:max_per_category])

In [471]:
# Sanity check: total number of entries collected across all categories.
len(feed)

1000

In [553]:
# Inspect the first collected entry to confirm the record layout.
feed[0]

{'title': 'Watching My Cousin’s Death Go Viral',
 'summary': 'My cousin Salahuddin was a victim of unforgivable police brutality. But before that, he was a young man let down by a society that still treats mental illness as a kind of crime in itself.',
 'author': 'Bilal Anwar',
 'link': 'https://www.buzzfeednews.com/article/bilalanwar/pakistan-salahuddin-ayubi-death-police-mental-health',
 'published': 'Mon, 06 Jan 2020 18:56:35 -0500'}

## Parsing texts

In [472]:
def load_text(url, max_chars=2000, timeout=30):
    """Download an article page and return its text content as one string.

    Text is gathered in three passes, in this order:
      1. every <p> inside <div class="subbuzz-text">,
      2. every <p> inside <div class="subbuzz__description">,
      3. every <span class="js-subbuzz__title-text">.
    The pieces are joined with single spaces and the result is truncated.

    Args:
        url: Address of the article page to download.
        max_chars: Maximum length of the returned string (default 2000).
        timeout: Seconds to wait on the network before giving up, so that a
            stalled server cannot hang the notebook indefinitely.

    Returns:
        The joined text, at most ``max_chars`` characters long; an empty
        string when none of the targeted elements are present.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        html = response.read().decode('utf-8')

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so results can differ between machines.
    soup = BeautifulSoup(html, 'html.parser')
    texts = []

    # Paragraphs from the text areas first, then from the description areas.
    for css_class in ('subbuzz-text', 'subbuzz__description'):
        for div in soup.find_all('div', attrs={'class': css_class}):
            texts.extend(p.get_text() for p in div.find_all('p'))

    # Sub titles come last.
    for span in soup.find_all('span', attrs={'class': 'js-subbuzz__title-text'}):
        texts.append(span.get_text())

    return ' '.join(texts)[:max_chars]

In [547]:
# Smoke-test load_text on a single article URL.
url = 'https://www.buzzfeednews.com/article/bilalanwar/pakistan-salahuddin-ayubi-death-police-mental-health'
texts = load_text(url)

# Print the length of the returned text, then display its first 500 characters.
print('Length:', len(texts), '\nText:')
texts[:500]

Length: 2000 
Text:


'The date in the grainy video footage says “July 17, 2019, 9:19 p.m.” A man in a muddy brown shalwar kameez enters an ATM booth. He pauses briefly to examine the machine before fiddling with it. As he sticks his finger into the cash slot, he notices the blinking red light of a camera observing him. Defiant, he sticks out his tongue and makes a face, puffing out his cheeks. The man proceeds to pry off the front panel of the ATM and notices a second camera embedded in the machine. He pulls more fac'