In [508]:
import urllib.request
from bs4 import BeautifulSoup
import feedparser
import re
import os
import json

## Parsing links

In [470]:
base_url = 'https://www.buzzfeed.com/{}.xml?page={}'
categories = ['health', 'tech', 'science', 'politics', 'reader']
feed = []

for category in categories:
    category_feed = []
    for page_num in range(1, 6):
        entries = feedparser.parse(base_url.format(category, page_num)).entries
        for entry in entries:
            try:
                summary = re.findall('<h1>(.*)<\/h1>', entry.summary)[0]
            except IndexError:
                summary = 'NaN'
            
            try:
                category_feed.append({
                    'title': entry.title,
                    'summary': summary,
                    'author': entry.author,
                    'link': entry.link,
                    'published': entry.published,
                })
            except AttributeError:
                pass
            
    feed.extend(category_feed[:200])

In [471]:
len(feed)

1000

## Parsing texts

In [472]:
def load_text(url):
    html = urllib.request.urlopen(url).read().decode('utf-8')
    soup = BeautifulSoup(html)
    texts = []
    
    # Parsing text areas
    for div in soup.findAll('div', attrs={'class': 'subbuzz-text'}):
        for p in div.findAll('p'):
            texts.append(p.get_text())
    
    # Parsing description areas
    for div in soup.findAll('div', attrs={'class': 'subbuzz__description'}):
        for p in div.findAll('p'):
            texts.append(p.get_text())
    
    # Parsing sub titles
    for span in soup.findAll('span', attrs={'class': 'js-subbuzz__title-text'}):
        texts.append(span.get_text())
        
    return ' '.join(texts)[:2000]

In [547]:
url = 'https://www.buzzfeednews.com/article/bilalanwar/pakistan-salahuddin-ayubi-death-police-mental-health'
texts = load_text(url)

print('Length:', len(texts), '\nText:')
texts[:500]

Length: 2000 
Text:


'The date in the grainy video footage says “July 17, 2019, 9:19 p.m.” A man in a muddy brown shalwar kameez enters an ATM booth. He pauses briefly to examine the machine before fiddling with it. As he sticks his finger into the cash slot, he notices the blinking red light of a camera observing him. Defiant, he sticks out his tongue and makes a face, puffing out his cheeks. The man proceeds to pry off the front panel of the ATM and notices a second camera embedded in the machine. He pulls more fac'

In [537]:
with open('buzzfeed/output.json', 'r') as f:
    foo = json.load(f)

In [535]:
with open('buzzfeed/links_articles.txt', 'r') as f:
    bar = dict()
    for line in f.readlines():
        bar[line.split(' ')[0]] = line.split(' ')[1]

In [536]:
bar

{'0': 'https://www.buzzfeednews.com/article/bilalanwar/pakistan-salahuddin-ayubi-death-police-mental-health\n',
 '1': 'https://www.buzzfeed.com/spenceralthouse/weight-loss-tips-from-people-who-lost-over-40-lbs\n',
 '2': 'https://www.buzzfeed.com/kristatorres/i-tested-out-three-household-blenders-to-see-which-one\n',
 '3': 'https://www.buzzfeed.com/crystalro/birth-control-doctor\n',
 '4': 'https://www.buzzfeed.com/kristatorres/what-secret-questions-do-you-have-about-vaginal-ch\n',
 '5': 'https://www.buzzfeed.com/kristatorres/this-schools-moldy-bread-experiment-went-viral-after-it\n',
 '6': 'https://www.buzzfeed.com/marissamuller/kylie-jenner-wants-a-vibrator-from-kourtney\n',
 '7': 'https://www.buzzfeed.com/kristatorres/i-got-a-game-of-thrones-anti-aging-facial-and-th\n',
 '8': 'https://www.buzzfeed.com/spenceralthouse/how-sexually-pure-were-you-in-2019\n',
 '9': 'https://www.buzzfeed.com/sarahaspler/random-sex-knowledge-quiz\n',
 '10': 'https://www.buzzfeed.com/christopherhudspeth/diet

In [544]:
next(item for item in foo if item['id'] == 666)

{'id': 666,
 'title': 'I Chose Not To Have Kids Because I’m Afraid For The Planet',
 'summary': 'Growing up Mormon, I was taught that having babies is part of God’s plan. Today, I believe that humans owe it to each other (and the world) not to.',
 'author': 'Ash Sanders',
 'link': 'https://www.buzzfeednews.com/article/ashsanders/birth-strike-no-kids-climate-change-population',
 'published': 'Fri, 26 Jul 2019 13:25:52 -0400',
 'category': 'reader'}

In [546]:
bar['0']

'https://www.buzzfeednews.com/article/bilalanwar/pakistan-salahuddin-ayubi-death-police-mental-health\n'