# Import Statements

In [1]:
import requests
import pandas as pd
from requests_html import HTML
from requests_html import HTMLSession
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import xml.etree.ElementTree as ET
from time import sleep
from bs4 import BeautifulSoup



# Global Variables
- `URL`: The URL we are scraping. This is an RSS feed URL, so we will be parsing XML
- `NUM_OF_ARTICLES`: The number of articles the script will capture. `0` means all articles.

In [2]:
URL = 'https://cointelegraph.com/rss'
NUM_OF_ARTICLES = 0

# Function Definitions

In [3]:
def get_source(url):
    try:
        session = HTMLSession()
        response = session.get(url)
        return response
    except requests.exceptions.RequestException as e:
        print("Error ->", e)
        return None
    
def fixup_xml(string):
    """returns string without CDATA tags"""
    fix = '<root>{}</root>'.format(string)
    content = ET.fromstring(fix).text
    return repr(content)

def remove_newline(string):
    """returns string without newline chars"""
    return string.replace(r'\n', '')

def get_article_content(guid):
    """guid is the URL. returns article content, unformatted (no html tags)"""
    driver.get(guid)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    article_content = soup.find('div', {'class':'post-content'})
    if article_content != None:
        return article_content
    article_content = soup.find('div', {'class':'explained-post-content__item'})
    if article_content != None:
        return article_content
    
def get_feed(url):
    """return a df of articles from url"""
    response = get_source(url)
    df = pd.DataFrame(columns = ['Title', 'Publication Date', 'Article URL', 'Description', 'Article Content'])
    with response as r:
        items = r.html.find("item", first=False)
        nth_article = 0
        for item in items:        
            nth_article = nth_article + 1
            title = fixup_xml(item.find('title', first=True).text)
            pubDate = item.find('pubDate', first=True).text
            guid = item.find('guid', first=True).text
            description = remove_newline(fixup_xml(item.find('description', first=True).text))
            article_content = get_article_content(guid)
            row = {'Title': title, 'Publication Date': pubDate, 'Article URL': guid, 'Description': description, 'Article Content':article_content}
            df = df.append(row, ignore_index=True)
            if nth_article == NUM_OF_ARTICLES:
                break
    return df 

# Script

In [4]:
driver = webdriver.Chrome(ChromeDriverManager().install())
df = get_feed(URL)
df.to_csv('cointelegraph_rss_scrape_output.csv', index=False)



Current google-chrome version is 104.0.5112
Get LATEST chromedriver version for 104.0.5112 google-chrome
Driver [/home/ff/.wdm/drivers/chromedriver/linux64/104.0.5112.79/chromedriver] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)


In [5]:
df

Unnamed: 0,Title,Publication Date,Article URL,Description,Article Content
0,'NFT watchdog Rug Pull Finder gets its own NFT...,"Mon, 05 Sep 2022 03:33:55 +0100",https://cointelegraph.com/news/nft-watchdog-ru...,"""Rug Pull Finder's NFT contract was abused to ...","[[In an ironic twist, Rug Pull Finder (RPF), ,..."
1,"'Network outages have been Solana’s ‘curse,’ s...","Mon, 05 Sep 2022 03:19:24 +0100",https://cointelegraph.com/news/network-outages...,'The high-speed smart contract platform has su...,[[Network outages continue to be the Solana ne...
2,'A range-break from Bitcoin could trigger buyi...,"Sun, 04 Sep 2022 21:30:38 +0100",https://cointelegraph.com/news/a-range-break-f...,'If BTC bounces from its key underlying suppor...,[[The decline in the United States equities ma...
3,'Can the government track Bitcoin?',"Sun, 04 Sep 2022 15:00:00 +0100",https://cointelegraph.com/explained/can-the-go...,'The law enforcers like the IRS and FBI track ...,"[[], , [[\n 1.\n ], , [\n ..."
4,"'PwC Venezuela Twitter account hacked, attacke...","Sun, 04 Sep 2022 12:13:34 +0100",https://cointelegraph.com/news/pwc-venezuela-t...,'An attacker gained access to PwC Venezuela’s ...,[[An attacker gained access to PwC Venezuela’s...
5,"""Elon Musk-crypto video played on S. Korean go...","Sun, 04 Sep 2022 10:00:00 +0100",https://cointelegraph.com/news/elon-musk-crypt...,'The hacked YouTube account was restored withi...,[[A YouTube channel owned by the government of...
6,'Repurposing Bitcoin mining heat can solve glo...,"Sun, 04 Sep 2022 07:03:08 +0100",https://cointelegraph.com/news/repurposing-bit...,'While innovations in chipset manufacturing ha...,"[[The flexibility behind running Bitcoin (, [B..."
7,"'Saylor gets sued, FBI warns about DeFi exploi...","Sat, 03 Sep 2022 21:30:13 +0100",https://cointelegraph.com/magazine/2022/09/03/...,'',
8,'What is decentralized identity in blockchain?',"Sat, 03 Sep 2022 21:00:00 +0100",https://cointelegraph.com/explained/what-is-de...,"'What is a decentralized identity, and why doe...","[[], , [[\n 1.\n ], , [\n ..."
9,'The Bitcoin bottom — Are we there yet? Analys...,"Sat, 03 Sep 2022 17:13:09 +0100",https://cointelegraph.com/news/the-bitcoin-bot...,"'$20,000 is becoming the “new” resistance for ...","[[When Bitcoin was trading above $60,000, the ..."
