# Import Statements

In [1]:
import warnings
import xml.etree.ElementTree as ET
from time import sleep

import gspread
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree
from oauth2client.service_account import ServiceAccountCredentials
from requests_html import HTML
from requests_html import HTMLSession
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Global Variables
- `URL`: The URL we are scraping. This is an RSS feed URL, so we will be parsing XML.
- `NUM_OF_ARTICLES`: The number of articles the script will capture. `0` means all articles.
- `GOOGLE_API_KEY_LOCATION`: Path to the JSON key file for the Google service account.

In [2]:
# RSS feed to scrape; its body is parsed as XML.
URL = "https://cointelegraph.com/rss"
# Number of articles to capture; 0 captures every article in the feed.
NUM_OF_ARTICLES = 10
# Path to the Google key file (JSON).
GOOGLE_API_KEY_LOCATION = "./keys/key.json"

# Function Definitions

In [3]:
def get_source(url, timeout=30):
    """Fetch ``url`` with a GET request and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        The response object, or ``None`` when the request fails with a
        ``requests.exceptions.RequestException`` (the error is printed).
    """
    try:
        # The context manager closes the session once the response has been
        # received, instead of leaving it open for the life of the kernel.
        with HTMLSession() as session:
            return session.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print("Error ->", e)
        return None

def remove_html_tags(string):
    """Return the text content of ``string`` with HTML markup removed.

    Args:
        string: HTML fragment or document.

    Returns:
        The text obtained from ``BeautifulSoup.get_text()``.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning about the guess.
    soup = BeautifulSoup(string, 'html.parser')
    return soup.get_text()

def get_article_content(soup):
    """Return the article body element from a parsed page, HTML tags included.

    The page is searched for a ``div`` with class ``post-content`` first and,
    if that is absent, for one with class ``explained-post-content__item``.

    Args:
        soup: Parsed page exposing ``find(name, attrs)``.

    Returns:
        The first matching element, or ``None`` when neither is present.
    """
    for css_class in ('post-content', 'explained-post-content__item'):
        article_content = soup.find('div', {'class': css_class})
        # Identity check: "!= None" would defer to the element's own
        # comparison method rather than testing for a missing result.
        if article_content is not None:
            return article_content
    return None
    
def get_image(image_url):
    """Build a spreadsheet ``=IMAGE("...")`` formula for ``image_url``.

    Args:
        image_url: Address of the image to display.

    Returns:
        The formula string, with any double quote in the URL doubled so it
        cannot terminate the string literal inside the formula.
    """
    # The URL comes from an external feed; escape quotes before embedding it.
    escaped_url = str(image_url).replace('"', '""')
    return f'=IMAGE("{escaped_url}")'
    
def _first(nodes, default=''):
    """Return the first XPath match in ``nodes``, or ``default`` if there is none."""
    return nodes[0] if nodes else default


def get_feed(url):
    """Return a DataFrame of articles from the RSS feed at ``url``.

    Each ``/rss/channel/item`` element becomes one row. The article page named
    by the item's ``guid`` is loaded through the module-level ``driver`` and
    its body is extracted with ``get_article_content``. Iteration stops after
    ``NUM_OF_ARTICLES`` items; a value of ``0`` never matches the counter, so
    every item is processed.

    Args:
        url: Address of the RSS feed.

    Returns:
        A DataFrame with the columns ``Title``, ``Publication Date``,
        ``Article URL``, ``Description``, ``Article Content``, ``Creator``,
        ``image_url`` and ``image``. Fields missing from an item are stored
        as empty strings.

    Raises:
        RuntimeError: If the feed could not be downloaded.
    """
    response = get_source(url)
    if response is None:
        # get_source has already printed the underlying error. Fail loudly
        # here rather than on an attribute lookup against None below.
        raise RuntimeError(f'Could not download RSS feed from {url}')

    # Same column order the row-by-row append used to produce.
    columns = ['Title', 'Publication Date', 'Article URL', 'Description',
               'Article Content', 'Creator', 'image_url', 'image']

    feed_root = etree.fromstring(response.content)
    rows = []
    for nth_article, item in enumerate(feed_root.xpath('/rss/channel/item'), start=1):
        title = _first(item.xpath('./title/text()'))
        guid = _first(item.xpath('./guid/text()'))
        pubDate = _first(item.xpath('./pubDate/text()'))
        description = remove_html_tags(_first(item.xpath('./description/text()')))
        image_url = _first(item.xpath("./*[local-name()='content']/@url"))
        image = get_image(image_url) if image_url else ''
        creator = _first(item.xpath("./*[local-name()='creator']/text()"))

        # get html content using chrome webdriver:
        if guid:
            driver.get(guid)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            article_content = get_article_content(soup)
        else:
            # Without an address there is no page to load.
            article_content = None

        rows.append({
            'Title': title,
            'Publication Date': pubDate,
            'Creator': creator,
            'Article URL': guid,
            'Description': description,
            'image_url': image_url,
            'image': image,
            'Article Content': article_content,
        })
        if nth_article == NUM_OF_ARTICLES:
            break

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(rows, columns=columns)

# Script

## Initialize Webdriver

In [4]:
# Suppress every DeprecationWarning and FutureWarning for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Create the options before the driver so they are actually handed to Chrome;
# settings added to chrome_options here take effect when the browser starts.
chrome_options = webdriver.ChromeOptions()

# Selenium 4 takes the chromedriver path through a Service object; passing the
# path positionally is the deprecated form.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=chrome_options,
)




[WDM] - Current google-chrome version is 104.0.5112
[WDM] - Get LATEST chromedriver version for 104.0.5112 google-chrome
[WDM] - About to download new driver from https://chromedriver.storage.googleapis.com/104.0.5112.79/chromedriver_linux64.zip
[WDM] - Driver has been saved in cache [/home/ff/.wdm/drivers/chromedriver/linux64/104.0.5112.79]


## Create/Update Spreadsheet

In [5]:
# Scrape the configured feed into a DataFrame.
# NOTE(review): nothing visible here closes the Chrome session held by `driver`;
# consider calling driver.quit() once scraping is finished.
df = get_feed(URL)

# Persist the result as CSV without the index column.
df.to_csv(path_or_buf='cointelegraph_rss_scrape_output.csv', index=False)

## Send CSV File to Google Drive

In [6]:
# Scopes requested for the service-account credentials.
scope = [
    "https://spreadsheets.google.com/feeds",
    'https://www.googleapis.com/auth/spreadsheets',
    "https://www.googleapis.com/auth/drive.file",
    "https://www.googleapis.com/auth/drive",
]

# Use the configured key location instead of repeating the path literal.
credentials = ServiceAccountCredentials.from_json_keyfile_name(GOOGLE_API_KEY_LOCATION, scope)
client = gspread.authorize(credentials)
spreadsheet = client.open('web_scrape')

# pandas writes the CSV as UTF-8 by default; read it back with the same
# encoding rather than the platform's locale default.
with open('cointelegraph_rss_scrape_output.csv', 'r', encoding='utf-8') as file_obj:
    content = file_obj.read()

# The upload only needs the text, so it runs after the file has been closed.
client.import_csv(spreadsheet.id, data=content)