In [1]:
from requests_html import HTML # for parsing an html source or code

In [2]:
# Read the sample page from disk. The document declares <meta charset="utf-8">,
# so decode it as UTF-8 explicitly instead of relying on the platform default.
with open('simple.html', encoding='utf-8') as html_file:
    src = html_file.read()

# Parsing only needs the string, so it happens after the file has been closed.
html = HTML(html=src)

In [3]:
# html.html is the raw markup of the parsed document, held as a plain string.
print(type(html.html), html.html, sep='\n')

<class 'str'>
<!doctype html>
<html class="no-js" lang="">
    <head>
        <title>Test - A Sample Website</title>
        <meta charset="utf-8">
        <link rel="stylesheet" href="css/normalize.css">
        <link rel="stylesheet" href="css/main.css">
    </head>
    <body>
        <h1 id='site_title'>Test Website</h1>
        <hr></hr>
        <div class="article article-box">
            <h2><a href="article_1.html">Article 1 Headline</a></h2>
            <p>This is a summary of article 1</p>
        </div>
        <hr></hr>
        <div class="article article-box number">
            <h2><a href="article_2.html">Article 2 Headline</a></h2>
            <p>This is a summary of article 2</p>
        </div>
        <hr></hr>
        <div id='footer'>
            <p>Footer Information</p>
        </div>
        <script>
        var para = document.createElement("p");
        var node = document.createTextNode("This is text generated by JavaScript.");
        para.appendChild(node);


In [4]:
# Text content of the whole document with the tags removed. As the output
# shows, the body of the <script> element is included as text as well.
print(html.text)

Test - A Sample Website
Test Website
Article 1 Headline
This is a summary of article 1
Article 2 Headline
This is a summary of article 2
Footer Information
var para = document.createElement("p"); var node = document.createTextNode("This is text generated by JavaScript."); para.appendChild(node); var element = document.getElementById("footer"); element.appendChild(para);


In [5]:
# find() takes a CSS selector; without first=True it returns a list of matches,
# so the element itself has to be picked out by index.
match = html.find('title')
print(match, match[0], match[0].html, match[0].text, sep='\n')

[<Element 'title' >]
<Element 'title' >
<title>Test - A Sample Website</title>
Test - A Sample Website


In [6]:
# With first=True, find() hands back the first matching element directly
# instead of a list.
match = html.find('title', first=True)
print(match, match.html, match.text, sep='\n')

<Element 'title' >
<title>Test - A Sample Website</title>
Test - A Sample Website


In [7]:
# '#footer' is a CSS id selector: it matches the element whose id is "footer".
match = html.find('#footer', first=True)
print(f"{match}\n{match.html}\n{match.text}")

<Element 'div' id='footer'>
<div id="footer">
<p>Footer Information</p>
</div>
Footer Information


In [8]:
# 'div.article' matches <div> elements that carry the "article" class;
# first=True keeps only the first one.
article = html.find('div.article', first=True)
print(article, article.html, article.text, sep='\n')

<Element 'div' class=('article', 'article-box')>
<div class="article article-box">
<h2><a href="article_1.html">Article 1 Headline</a></h2>
<p>This is a summary of article 1</p>
</div>
Article 1 Headline
This is a summary of article 1


In [9]:
# Search inside the first article only: its <h2> is the headline and its
# first <p> is the summary.
article = html.find('div.article', first=True)
headline, summary = (article.find(tag, first=True).text for tag in ('h2', 'p'))
print(headline, summary, sep='\n')

Article 1 Headline
This is a summary of article 1


In [10]:
# Same extraction as for a single article, repeated for every div.article.
articles = html.find('div.article')
for article in articles:
    headline = article.find('h2', first=True).text
    summary = article.find('p', first=True).text
    # The trailing empty string produces the blank line between articles.
    print(headline, summary, '', sep='\n')

Article 1 Headline
This is a summary of article 1

Article 2 Headline
This is a summary of article 2



# Scraping coreyms.com

In [11]:
from requests_html import HTMLSession

In [12]:
# One session object performs the HTTP request; the response is kept in `res`
# for the cells that follow.
session = HTMLSession()
homepage_url = 'https://coreyms.com'
res = session.get(homepage_url)

In [13]:
# res.html is an HTML object for the fetched page - the same kind of object
# that was built by hand earlier with HTML(html=src).
print(res.html)

<HTML url='https://coreyms.com/'>


In [14]:
# Take the first <article> element on the page and inspect its markup to see
# which selectors identify the headline, summary and video.
article = res.html.find('article', first=True)
print(article.html)

<article class="post-1670 post type-post status-publish format-standard has-post-thumbnail category-development category-python tag-gzip tag-shutil tag-zip tag-zipfile entry" itemscope="" itemtype="https://schema.org/CreativeWork"><header class="entry-header"><h2 class="entry-title" itemprop="headline"><a class="entry-title-link" rel="bookmark" href="https://coreyms.com/development/python/python-tutorial-zip-files-creating-and-extracting-zip-archives">Python Tutorial: Zip Files – Creating and Extracting Zip Archives</a></h2>
<p class="entry-meta"><time class="entry-time" itemprop="datePublished" datetime="2019-11-19T13:02:37-05:00">November 19, 2019</time> by <span class="entry-author" itemprop="author" itemscope="" itemtype="https://schema.org/Person"><a href="https://coreyms.com/author/coreymschafer" class="entry-author-link" itemprop="url" rel="author"><span class="entry-author-name" itemprop="name">Corey Schafer</span></a></span> <span class="entry-comments-link"><a href="https://c

In [15]:
# The headline is the text of the link that carries the "entry-title-link" class.
title_link = article.find('.entry-title-link', first=True)
headline = title_link.text
headline

'Python Tutorial: Zip Files – Creating and Extracting Zip Archives'

In [16]:
# Descendant selector: the first <p> nested inside the element with the
# "entry-content" class holds the summary text.
first_paragraph = article.find('.entry-content p', first=True)
summary = first_paragraph.text
summary

'In this video, we will be learning how to create and extract zip archives. We will start by using the zipfile module, and then we will see how to do this using the shutil module. We will learn how to do this with single files and directories, as well as learning how to use gzip as well. Let’s get started…'

In [17]:
# The embedded video lives in an <iframe>; show its markup to locate the URL.
iframe = article.find('iframe', first=True)
iframe.html

'<iframe class="youtube-player" width="640" height="360" src="https://www.youtube.com/embed/z0gguhEmWiY?version=3&amp;rel=1&amp;showsearch=0&amp;showinfo=1&amp;iv_load_policy=1&amp;fs=1&amp;hl=en-US&amp;autohide=2&amp;wmode=transparent" allowfullscreen="true" style="border:0;" sandbox="allow-scripts allow-same-origin allow-popups allow-presentation"/>'

In [18]:
# attrs exposes the element's HTML attributes as a dictionary.
iframe.attrs

{'class': ('youtube-player',),
 'width': '640',
 'height': '360',
 'src': 'https://www.youtube.com/embed/z0gguhEmWiY?version=3&rel=1&showsearch=0&showinfo=1&iv_load_policy=1&fs=1&hl=en-US&autohide=2&wmode=transparent',
 'allowfullscreen': 'true',
 'style': 'border:0;',
 'sandbox': 'allow-scripts allow-same-origin allow-popups allow-presentation'}

In [19]:
# The src attribute is the YouTube embed URL, including its query string.
iframe.attrs['src']

'https://www.youtube.com/embed/z0gguhEmWiY?version=3&rel=1&showsearch=0&showinfo=1&iv_load_policy=1&fs=1&hl=en-US&autohide=2&wmode=transparent'

In [20]:
# Drop the query string, then keep the last path segment of the embed URL:
# that segment is the video id.
embed_url = iframe.attrs['src'].partition('?')[0]
video_id = embed_url.rsplit('/', 1)[-1]

In [21]:
# Display the extracted id to confirm the string slicing worked.
video_id

'z0gguhEmWiY'

In [22]:
# Rebuild a regular watch-page link from the id taken out of the embed URL.
video_link = f"https://youtube.com/watch?v={video_id}"
video_link

'https://youtube.com/watch?v=z0gguhEmWiY'

In [25]:
import csv

# newline='' is what the csv module requires for files handed to csv.writer,
# so the writer controls the line endings itself (otherwise blank rows can
# appear on Windows). UTF-8 is set explicitly because the scraped text
# contains non-ASCII punctuation such as "–" and "’".
# The handle is intentionally left open here: the scraping loop in the next
# cell writes the data rows and closes it.
csv_file = open('cms_scrape.csv', 'w', newline='', encoding='utf-8')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['headline', 'summary', 'video'])

24

In [26]:
articles = res.html.find('article')

# Write one CSV row per article. try/finally guarantees that the file handle
# opened in the previous cell is closed even if an article fails part-way.
try:
    for article in articles:
        headline = article.find('.entry-title-link', first=True).text
        # First <p> inside the element with the "entry-content" class.
        summary = article.find('.entry-content p', first=True).text

        # An article may have no embedded video. Check for the <iframe> and
        # its src explicitly rather than wrapping the lookup in a bare
        # `except:`, which would also hide unrelated errors (including
        # KeyboardInterrupt).
        iframe = article.find('iframe', first=True)
        if iframe is not None and 'src' in iframe.attrs:
            # The video id is the last path segment of the embed URL once
            # the query string has been removed.
            v_id = iframe.attrs['src'].split('?')[0].split('/')[-1]
            v_link = f"https://youtube.com/watch?v={v_id}"
        else:
            v_link = None

        csv_writer.writerow([headline, summary, v_link])
finally:
    csv_file.close()

In [32]:
# .links collects the links found on the page, as written in the markup.
links = res.html.links
print(f"{type(links)} {len(links)}")

<class 'set'> 93


In [31]:
# .absolute_links is the companion collection of the page's links in absolute form.
abs_links = res.html.absolute_links
print(f"{type(abs_links)} {len(abs_links)}")

<class 'set'> 93


# Rendering JavaScript-generated HTML

In [35]:
# html.render()
# NOTE: with render() left commented out, the page's <script> is not run here,
# so the footer printed below is the original markup - it does not contain the
# paragraph that the script would append to the #footer element.
match = html.find('#footer', first=True)
print(match.html)

<div id="footer">
<p>Footer Information</p>
</div>


# Synchronous vs. asynchronous requests

In [37]:
import time
# from requests_html import HTMLSession

In [38]:
# Fresh synchronous session for the timing comparison; this rebinds the
# `session` name used earlier for the coreyms.com scrape.
session = HTMLSession()

In [40]:
t1 = time.perf_counter()

# Issue the three delayed requests strictly one after another: each get()
# blocks until its response has arrived before the next one starts.
for url in (
    'https://httpbin.org/delay/1',
    'https://httpbin.org/delay/2',
    'https://httpbin.org/delay/3',
):
    r = session.get(url)
    response = r.html.url
    print(response)

t2 = time.perf_counter()

print(f"Synchronous : {t2 - t1} seconds")

https://httpbin.org/delay/1
https://httpbin.org/delay/2
https://httpbin.org/delay/3
Synchronous : 9.493530099999589 seconds


In [41]:
from requests_html import AsyncHTMLSession
# Shared asynchronous session; the get_delay* coroutines defined below all
# send their requests through it.
async_session = AsyncHTMLSession()

In [42]:
async def get_delay1():
    """Request https://httpbin.org/delay/1 via the shared async session and return the response."""
    return await async_session.get('https://httpbin.org/delay/1')


async def get_delay2():
    """Request https://httpbin.org/delay/2 via the shared async session and return the response."""
    return await async_session.get('https://httpbin.org/delay/2')


async def get_delay3():
    """Request https://httpbin.org/delay/3 via the shared async session and return the response."""
    return await async_session.get('https://httpbin.org/delay/3')


In [44]:
t1 = time.perf_counter()

results = async_session.run(get_delay1, get_delay2, get_delay3)

# Each item in the results list is a response object and can be interacted with as such
# basically makes request for all of them around the same time and then manage results as they comes in
#
for result in results:
    response = result.html.url
    print(response)

t2 = time.perf_counter()
print(f"Synchronous : {t2 - t1} seconds")

RuntimeError: This event loop is already running