# Import Statements

In [1]:
import requests
import pandas as pd
from requests_html import HTML
from requests_html import HTMLSession
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import xml.etree.ElementTree as ET
from time import sleep
from bs4 import BeautifulSoup

# Global Variables
- `URL`: The URL we are scraping. This is an RSS feed URL, so we will be parsing XML
- `NUM_OF_ARTICLES`: set this to `0` in order to capture all articles the RSS feed currently provides

In [2]:
URL = 'https://cointelegraph.com/rss'
NUM_OF_ARTICLES = 0

# Function Definitions

In [3]:
def get_source(url):
    try:
        session = HTMLSession()
        response = session.get(url)
        return response
    except requests.exceptions.RequestException as e:
        print("Error ->", e)
        return None
    
def fixup_xml(string):
    """returns string without CDATA tags"""
    fix = '<root>{}</root>'.format(string)
    content = ET.fromstring(fix).text
    return repr(content)

def remove_newline(string):
    """returns string without newline chars"""
    return string.replace(r'\n', '')

def get_article_content(guid):
    """guid is the URL. returns article content, unformatted (no html tags)"""
    driver.get(guid)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    article_content = soup.find('div', {'class':'post-content'})
    try:
        return article_content.get_text()
    except AttributeError as e:
        print("Warning: blank string used for article content ->", e)
        return ''
    
def get_feed(url):
    """return a df of articles from url"""
    response = get_source(url)
    df = pd.DataFrame(columns = ['Title', 'Publication Date', 'Article URL', 'Description', 'Article Content'])
    with response as r:
        items = r.html.find("item", first=False)
        nth_article = 0
        for item in items:        
            nth_article = nth_article + 1
            title = fixup_xml(item.find('title', first=True).text)
            pubDate = item.find('pubDate', first=True).text
            guid = item.find('guid', first=True).text
            description = remove_newline(fixup_xml(item.find('description', first=True).text))
            article_content = get_article_content(guid)
            row = {'Title': title, 'Publication Date': pubDate, 'Article URL': guid, 'Description': description, 'Article Content':article_content}
            df = df.append(row, ignore_index=True)
            if nth_article == NUM_OF_ARTICLES:
                break
    return df 

# Script

In [4]:
driver = webdriver.Chrome(ChromeDriverManager().install())
df = get_feed(URL)
df.to_csv('cointelegraph_rss_scrape_output.csv', index=False)




[WDM] - Current google-chrome version is 104.0.5112
[WDM] - Get LATEST chromedriver version for 104.0.5112 google-chrome
[WDM] - Driver [/home/ff/.wdm/drivers/chromedriver/linux64/104.0.5112.79/chromedriver] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(ro