# Downloading a web page using requests


In [1]:
!pip install requests --upgrade --quiet

In [43]:
import requests

In [3]:
topic_url = 'https://github.com/topics/machine-learning'

In [4]:
response = requests.get(topic_url)

In [5]:
type(response)

requests.models.Response

In [6]:
response.status_code

200

In [7]:
page_contents = response.text

In [8]:
len(page_contents)

432249

In [9]:
page_contents[:1000]

'\n\n<!DOCTYPE html>\n<html lang="en" data-color-mode="auto" data-light-theme="light" data-dark-theme="dark" data-a11y-animated-images="system">\n  <head>\n    <meta charset="utf-8">\n  <link rel="dns-prefetch" href="https://github.githubassets.com">\n  <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">\n  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">\n  <link rel="preconnect" href="https://github.githubassets.com" crossorigin>\n  <link rel="preconnect" href="https://avatars.githubusercontent.com">\n\n\n\n  <link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/light-719f1193e0c0.css" /><link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/dark-0c343b529849.css" /><link data-color-theme="dark_dimmed" crossorigin="anonymous" media="all" rel="stylesheet" data-href="htt

In [10]:
with open('machine-learning-topics.html', 'w', encoding="utf-8") as file:
    file.write(page_contents)

In [11]:
!pip install jovian --upgrade --quiet


In [12]:
import jovian

<IPython.core.display.Javascript object>

In [None]:
jovian.commit(project='python-web-scraping-and-rest-api')


<IPython.core.display.Javascript object>

[jovian] Please enter your API key ( from https://jovian.ai/ ):
API KEY: ········


[jovian] Error: The current API key is invalid or expired.


[jovian] Please enter your API key ( from https://jovian.ai/ ):
API KEY: 

# Extracting information from HTML using Beautiful Soup


In [1]:
!pip install beautifulsoup4 --upgrade --quiet


In [2]:
from bs4 import BeautifulSoup


In [3]:
?BeautifulSoup

In [4]:
with open('machine-learning-topics.html', 'r') as f:
    html_source = f.read()

In [5]:
html_source[:1000]


'\n\n<!DOCTYPE html>\n<html lang="en" data-color-mode="auto" data-light-theme="light" data-dark-theme="dark" data-a11y-animated-images="system">\n  <head>\n    <meta charset="utf-8">\n  <link rel="dns-prefetch" href="https://github.githubassets.com">\n  <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">\n  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">\n  <link rel="preconnect" href="https://github.githubassets.com" crossorigin>\n  <link rel="preconnect" href="https://avatars.githubusercontent.com">\n\n\n\n  <link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/light-719f1193e0c0.css" /><link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/dark-0c343b529849.css" /><link data-color-theme="dark_dimmed" crossorigin="anonymous" media="all" rel="stylesheet" data-href="htt

In [6]:
doc = BeautifulSoup(html_source, 'html.parser')


In [7]:
type(doc)


bs4.BeautifulSoup

In [9]:
title_tag = doc.title
title_tag

<title>machine-learning · GitHub Topics · GitHub</title>

In [10]:
type(title_tag)

bs4.element.Tag

In [11]:
title_tag.name

'title'

In [12]:
title_tag.text

'machine-learning · GitHub Topics · GitHub'

In [13]:
first_link = doc.a

In [14]:
first_link

<a class="px-2 py-4 color-bg-accent-emphasis color-fg-on-emphasis show-on-focus js-skip-to-content" href="#start-of-content">Skip to content</a>

In [15]:
first_link.text

'Skip to content'

In [17]:
all_link_tags = doc.find_all('a')


In [18]:
len(all_link_tags)


408

In [19]:
all_link_tags[:3]


[<a class="px-2 py-4 color-bg-accent-emphasis color-fg-on-emphasis show-on-focus js-skip-to-content" href="#start-of-content">Skip to content</a>,
 <a aria-label="Homepage" class="mr-lg-3 color-fg-inherit flex-order-2" data-ga-click="(Logged out) Header, go to homepage, icon:logo-wordmark" href="https://github.com/">
 <svg aria-hidden="true" class="octicon octicon-mark-github" data-view-component="true" height="32" version="1.1" viewbox="0 0 16 16" width="32">
 <path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55

In [20]:
first_link


<a class="px-2 py-4 color-bg-accent-emphasis color-fg-on-emphasis show-on-focus js-skip-to-content" href="#start-of-content">Skip to content</a>

In [21]:
first_link['href']


'#start-of-content'

In [22]:
first_link['class']


['px-2',
 'py-4',
 'color-bg-accent-emphasis',
 'color-fg-on-emphasis',
 'show-on-focus',
 'js-skip-to-content']

In [23]:
first_link.attrs


{'href': '#start-of-content',
 'class': ['px-2',
  'py-4',
  'color-bg-accent-emphasis',
  'color-fg-on-emphasis',
  'show-on-focus',
  'js-skip-to-content']}

# Searching by Attribute Value


In [24]:
doc.find_all('img', { 'alt': 'transformers'})

[<img alt="transformers" class="d-block width-full" loading="lazy" src="https://repository-images.githubusercontent.com/155220641/a16c4880-a501-11ea-9e8f-646cf611702e"/>]

In [25]:
doc.find('img', { 'alt': 'transformers'})

<img alt="transformers" class="d-block width-full" loading="lazy" src="https://repository-images.githubusercontent.com/155220641/a16c4880-a501-11ea-9e8f-646cf611702e"/>

# Searching by Class


In [26]:
matching_tags = doc.find_all(class_='HeaderMenu-link')

In [27]:
matching_tags

[<button aria-expanded="false" class="HeaderMenu-link border-0 width-full width-lg-auto px-0 px-lg-2 py-3 py-lg-2 no-wrap d-flex flex-items-center flex-justify-between js-details-target" type="button">
       Product
       <svg aria-hidden="true" class="octicon octicon-chevron-down HeaderMenu-icon ml-1" data-view-component="true" height="16" opacity="0.5" version="1.1" viewbox="0 0 16 16" width="16">
 <path d="M12.78 6.22a.75.75 0 010 1.06l-4.25 4.25a.75.75 0 01-1.06 0L3.22 7.28a.75.75 0 011.06-1.06L8 9.94l3.72-3.72a.75.75 0 011.06 0z" fill-rule="evenodd"></path>
 </svg>
 </button>,
 <button aria-expanded="false" class="HeaderMenu-link border-0 width-full width-lg-auto px-0 px-lg-2 py-3 py-lg-2 no-wrap d-flex flex-items-center flex-justify-between js-details-target" type="button">
       Solutions
       <svg aria-hidden="true" class="octicon octicon-chevron-down HeaderMenu-icon ml-1" data-view-component="true" height="16" opacity="0.5" version="1.1" viewbox="0 0 16 16" width="16">
 <

In [28]:
header_link_tags = doc.find_all('a', class_='HeaderMenu-link')

In [29]:
header_link_tags

[<a class="HeaderMenu-link no-underline px-0 px-lg-2 py-3 py-lg-2 d-block d-lg-inline-block" data-analytics-event='{"category":"Header menu top item (logged out)","action":"click to go to Pricing","label":"ref_page:/topics/machine-learning;ref_cta:Pricing;"}' href="/pricing">Pricing</a>,
 <a class="HeaderMenu-link HeaderMenu-link--sign-in flex-shrink-0 no-underline d-block d-lg-inline-block border border-lg-0 rounded rounded-lg-0 p-2 p-lg-0" data-ga-click="(Logged out) Header, clicked Sign in, text:sign-in" data-hydro-click='{"event_type":"authentication.click","payload":{"location_in_page":"site header menu","repository_id":null,"auth_type":"SIGN_UP","originating_url":"https://github.com/topics/machine-learning","user_id":null}}' data-hydro-click-hmac="77fa0805c2ee5e083e5dbe5571a2e37d8661eca3a115806e97d54914b8332ea9" href="/login?return_to=https%3A%2F%2Fgithub.com%2Ftopics%2Fmachine-learning">
               Sign in
             </a>,
 <a class="HeaderMenu-link HeaderMenu-link--sign-u

In [30]:
header_link_tags[0]['href']

'/pricing'

In [31]:
header_links = []
base_url = 'https://github.com'

for tag in header_link_tags:
    header_links.append({ 'title': tag.text.strip(), 'url': base_url + tag['href']})
    
header_links

[{'title': 'Pricing', 'url': 'https://github.com/pricing'},
 {'title': 'Sign in',
  'url': 'https://github.com/login?return_to=https%3A%2F%2Fgithub.com%2Ftopics%2Fmachine-learning'},
 {'title': 'Sign up',
  'url': 'https://github.com/signup?ref_cta=Sign+up&ref_loc=header+logged+out&ref_page=%2Ftopics%2Fmachine-learning&source=header'}]

# Elements inside a tag


In [32]:
sample_html = """
<html>
    <body>
        <ul class="top-list">
            <li>Item 1</li>
            <li>Item 2</li>
            <li>
                <ul>
                    <li>Item 3.1</li>
                    <li>Item 3.2</li>
                    <li>Item 3.3</li>
                </ul> 
            </li>
        </ul>
    </body>
</html>"""

In [33]:
sample_doc = BeautifulSoup(sample_html)


In [34]:
list_tag = sample_doc.find('ul', class_='top-list')

In [35]:
list_item_tags = list_tag.find_all('li', recursive=False)

In [36]:
list_item_tags


[<li>Item 1</li>,
 <li>Item 2</li>,
 <li>
 <ul>
 <li>Item 3.1</li>
 <li>Item 3.2</li>
 <li>Item 3.3</li>
 </ul>
 </li>]

In [37]:
list_tag.find_all('li')

[<li>Item 1</li>,
 <li>Item 2</li>,
 <li>
 <ul>
 <li>Item 3.1</li>
 <li>Item 3.2</li>
 <li>Item 3.3</li>
 </ul>
 </li>,
 <li>Item 3.1</li>,
 <li>Item 3.2</li>,
 <li>Item 3.3</li>]

In [38]:
jovian.commit()


NameError: name 'jovian' is not defined

# Top Repositories for a Topic


In [44]:
def get_topic_page(topic):
    # Construct the URL
    topic_repos_url = 'https://github.com/topics/' + topic
    
    # Get the HTML page content using requests
    response = requests.get(topic_repos_url)
    
    # Ensure that the reponse is valid
    if response.status_code != 200:
        print('Status code:', response.status_code)
        raise Exception('Failed to fetch web page ' + topic_repos_url)
    
    # Construct a beautiful soup document
    doc = BeautifulSoup(response.text)
    
    return doc

In [45]:
doc = get_topic_page('machine-learning')

In [46]:
doc.title.text

'machine-learning · GitHub Topics · GitHub'

In [47]:
doc2 = get_topic_page('data-analysis')

In [48]:
doc2.title.text

'data-analysis · GitHub Topics · GitHub'

In [49]:
article_tags = doc.find_all('article', class_='border rounded color-shadow-small color-bg-subtle my-4')

In [50]:
len(article_tags)

20

In [51]:
article_tag = article_tags[4]

In [52]:
article_tag

<article class="border rounded color-shadow-small color-bg-subtle my-4">
<div class="px-3">
<div class="d-flex flex-justify-between flex-items-start flex-wrap gap-2 my-3">
<div class="d-flex flex-1">
<span style="margin-top:2px">
<svg aria-hidden="true" class="octicon octicon-repo color-fg-muted mr-2" data-view-component="true" height="16" version="1.1" viewbox="0 0 16 16" width="16">
<path d="M2 2.5A2.5 2.5 0 014.5 0h8.75a.75.75 0 01.75.75v12.5a.75.75 0 01-.75.75h-2.5a.75.75 0 110-1.5h1.75v-2h-8a1 1 0 00-.714 1.7.75.75 0 01-1.072 1.05A2.495 2.495 0 012 11.5v-9zm10.5-1V9h-8c-.356 0-.694.074-1 .208V2.5a1 1 0 011-1h8zM5 12.25v3.25a.25.25 0 00.4.2l1.45-1.087a.25.25 0 01.3 0L8.6 15.7a.25.25 0 00.4-.2v-3.25a.25.25 0 00-.25-.25h-3.5a.25.25 0 00-.25.25z" fill-rule="evenodd"></path>
</svg>
</span>
<h3 class="f3 color-fg-muted text-normal lh-condensed">
<a data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representa

In [53]:
h3_tag = article_tag.find('h3')
h3_tag

<h3 class="f3 color-fg-muted text-normal lh-condensed">
<a data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":365630,"originating_url":"https://github.com/topics/machine-learning","user_id":null}}' data-hydro-click-hmac="ca9a4bb6121eb598cf4c316a9a406000c33e8c7a0735062810684ad1e368a549" data-turbo="false" data-view-component="true" href="/scikit-learn">
            scikit-learn
</a>          /
          <a class="text-bold wb-break-word" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"REPOSITORY","click_visual_representation":"REPOSITORY_NAME_HEADING","actor_id":null,"record_id":843222,"originating_url":"https://github.com/topics/machine-learning","user_id":null}}' data-hydro-click-hmac="51db3b0c4ab72cf76883ec0167919a1f07be32b9f65e8f40c4ca9d7ed9f6e332" data-turbo="false" data-vie

In [54]:
a_tags = h3_tag.find_all('a', recursive=False)

In [55]:
username = a_tags[0].text
username

'\n            scikit-learn\n'

In [56]:
username = a_tags[0].text.strip()
username

'scikit-learn'

In [57]:
repo_name = a_tags[1].text.strip()
repo_name

'scikit-learn'

In [58]:
repo_path = a_tags[1]['href'].strip()
repo_path

'/scikit-learn/scikit-learn'

In [59]:
base_url = 'https://github.com'
repo_url = base_url + repo_path 
repo_url

'https://github.com/scikit-learn/scikit-learn'

In [60]:
article_tags[4]


<article class="border rounded color-shadow-small color-bg-subtle my-4">
<div class="px-3">
<div class="d-flex flex-justify-between flex-items-start flex-wrap gap-2 my-3">
<div class="d-flex flex-1">
<span style="margin-top:2px">
<svg aria-hidden="true" class="octicon octicon-repo color-fg-muted mr-2" data-view-component="true" height="16" version="1.1" viewbox="0 0 16 16" width="16">
<path d="M2 2.5A2.5 2.5 0 014.5 0h8.75a.75.75 0 01.75.75v12.5a.75.75 0 01-.75.75h-2.5a.75.75 0 110-1.5h1.75v-2h-8a1 1 0 00-.714 1.7.75.75 0 01-1.072 1.05A2.495 2.495 0 012 11.5v-9zm10.5-1V9h-8c-.356 0-.694.074-1 .208V2.5a1 1 0 011-1h8zM5 12.25v3.25a.25.25 0 00.4.2l1.45-1.087a.25.25 0 01.3 0L8.6 15.7a.25.25 0 00.4-.2v-3.25a.25.25 0 00-.25-.25h-3.5a.25.25 0 00-.25.25z" fill-rule="evenodd"></path>
</svg>
</span>
<h3 class="f3 color-fg-muted text-normal lh-condensed">
<a data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representa

In [61]:
a_star_tag = article_tags[4].find('span', class_='Counter js-social-count')


In [62]:
a_star_tag

<span aria-label="52550 users starred this repository" class="Counter js-social-count" data-plural-suffix="users starred this repository" data-singular-suffix="user starred this repository" data-turbo-replace="true" data-view-component="true" id="repo-stars-counter-star" title="52,550">52.6k</span>

In [63]:
a_star_tag.text.strip()

'52.6k'

In [64]:
def parse_star_count(stars_str):
    stars_str = stars_str.strip()
    if stars_str[-1] == 'k':
        return int(float(stars_str[:-1]) * 1000)
    else:
        return int(stars_str)

In [65]:
parse_star_count('40.3k')

40300

In [66]:
parse_star_count('991')

991

In [67]:
star_count = parse_star_count(a_star_tag.text.strip())

In [68]:
star_count

52600

In [69]:
print('Repository name:', repo_name)
print("Owner's username:", username)
print('Stars:', star_count)
print('Repository URL:', repo_url)

Repository name: scikit-learn
Owner's username: scikit-learn
Stars: 52600
Repository URL: https://github.com/scikit-learn/scikit-learn


In [70]:
def parse_repository(article_tag):
    # <a> tags containing username, repository name and URL
    a_tags = article_tag.h3.find_all('a')
    # Owner's username
    username = a_tags[0].text.strip()
    # Repository name
    repo_name = a_tags[1].text.strip()
    # Repository URL
    repo_url = base_url + a_tags[1]['href'].strip()
    # Star count
    stars_tag = article_tag.find('span', class_='Counter js-social-count')
    star_count = parse_star_count(stars_tag.text.strip())
    # Return a dictionary
    return {
        'repository_name': repo_name,
        'owner_username': username,        
        'stars': star_count,
        'repository_url': repo_url
    }

In [71]:
parse_repository(article_tags[0])

{'repository_name': 'tensorflow',
 'owner_username': 'tensorflow',
 'stars': 170000,
 'repository_url': 'https://github.com/tensorflow/tensorflow'}

In [72]:
parse_repository(article_tags[10])


{'repository_name': 'awesome-scalability',
 'owner_username': 'binhnguyennus',
 'stars': 43000,
 'repository_url': 'https://github.com/binhnguyennus/awesome-scalability'}

In [73]:
top_repositories = [parse_repository(tag) for tag in article_tags]

In [74]:
len(top_repositories)

20

In [75]:
top_repositories[:5]

[{'repository_name': 'tensorflow',
  'owner_username': 'tensorflow',
  'stars': 170000,
  'repository_url': 'https://github.com/tensorflow/tensorflow'},
 {'repository_name': 'transformers',
  'owner_username': 'huggingface',
  'stars': 77800,
  'repository_url': 'https://github.com/huggingface/transformers'},
 {'repository_name': 'pytorch',
  'owner_username': 'pytorch',
  'stars': 61500,
  'repository_url': 'https://github.com/pytorch/pytorch'},
 {'repository_name': 'keras',
  'owner_username': 'keras-team',
  'stars': 57100,
  'repository_url': 'https://github.com/keras-team/keras'},
 {'repository_name': 'scikit-learn',
  'owner_username': 'scikit-learn',
  'stars': 52600,
  'repository_url': 'https://github.com/scikit-learn/scikit-learn'}]

In [76]:
def get_top_repositories(doc):
    article_tags = doc.find_all('article', class_='border rounded color-shadow-small color-bg-subtle my-4')
    topic_repos = [parse_repository(tag) for tag in article_tags]
    return topic_repos

In [77]:
topic_page_ml = get_topic_page('machine-learning')
top_repos_ml = get_top_repositories(topic_page_ml)
top_repos_ml[:5]

[{'repository_name': 'tensorflow',
  'owner_username': 'tensorflow',
  'stars': 170000,
  'repository_url': 'https://github.com/tensorflow/tensorflow'},
 {'repository_name': 'transformers',
  'owner_username': 'huggingface',
  'stars': 77800,
  'repository_url': 'https://github.com/huggingface/transformers'},
 {'repository_name': 'pytorch',
  'owner_username': 'pytorch',
  'stars': 61500,
  'repository_url': 'https://github.com/pytorch/pytorch'},
 {'repository_name': 'keras',
  'owner_username': 'keras-team',
  'stars': 57100,
  'repository_url': 'https://github.com/keras-team/keras'},
 {'repository_name': 'scikit-learn',
  'owner_username': 'scikit-learn',
  'stars': 52600,
  'repository_url': 'https://github.com/scikit-learn/scikit-learn'}]

In [78]:
topic_page_da = get_topic_page('data-analysis')
top_repos_da = get_top_repositories(topic_page_da)
top_repos_da[:5]

[{'repository_name': 'scikit-learn',
  'owner_username': 'scikit-learn',
  'stars': 52600,
  'repository_url': 'https://github.com/scikit-learn/scikit-learn'},
 {'repository_name': 'superset',
  'owner_username': 'apache',
  'stars': 50100,
  'repository_url': 'https://github.com/apache/superset'},
 {'repository_name': 'pandas',
  'owner_username': 'pandas-dev',
  'stars': 36500,
  'repository_url': 'https://github.com/pandas-dev/pandas'},
 {'repository_name': 'metabase',
  'owner_username': 'metabase',
  'stars': 31100,
  'repository_url': 'https://github.com/metabase/metabase'},
 {'repository_name': 'AI-Expert-Roadmap',
  'owner_username': 'AMAI-GmbH',
  'stars': 23400,
  'repository_url': 'https://github.com/AMAI-GmbH/AI-Expert-Roadmap'}]

In [79]:
get_top_repositories(get_topic_page('python'))[:5]


[{'repository_name': 'system-design-primer',
  'owner_username': 'donnemartin',
  'stars': 208000,
  'repository_url': 'https://github.com/donnemartin/system-design-primer'},
 {'repository_name': 'tensorflow',
  'owner_username': 'tensorflow',
  'stars': 170000,
  'repository_url': 'https://github.com/tensorflow/tensorflow'},
 {'repository_name': 'CS-Notes',
  'owner_username': 'CyC2018',
  'stars': 161000,
  'repository_url': 'https://github.com/CyC2018/CS-Notes'},
 {'repository_name': 'awesome-python',
  'owner_username': 'vinta',
  'stars': 153000,
  'repository_url': 'https://github.com/vinta/awesome-python'},
 {'repository_name': 'Python',
  'owner_username': 'TheAlgorithms',
  'stars': 151000,
  'repository_url': 'https://github.com/TheAlgorithms/Python'}]

# Writing information to CSV files


In [80]:
def write_csv(items, path):
    # Open the file in write mode
    with open(path, 'w') as f:
        # Return if there's nothing to write
        if len(items) == 0:
            return
        
        # Write the headers in the first line
        headers = list(items[0].keys())
        f.write(','.join(headers) + '\n')
        
        # Write one item per line
        for item in items:
            values = []
            for header in headers:
                values.append(str(item.get(header, "")))
            f.write(','.join(values) + "\n")

In [81]:
len(top_repos_ml)


20

In [82]:
top_repos_ml[:3]

[{'repository_name': 'tensorflow',
  'owner_username': 'tensorflow',
  'stars': 170000,
  'repository_url': 'https://github.com/tensorflow/tensorflow'},
 {'repository_name': 'transformers',
  'owner_username': 'huggingface',
  'stars': 77800,
  'repository_url': 'https://github.com/huggingface/transformers'},
 {'repository_name': 'pytorch',
  'owner_username': 'pytorch',
  'stars': 61500,
  'repository_url': 'https://github.com/pytorch/pytorch'}]

In [83]:
write_csv(top_repositories, 'machine-learning.csv')


In [84]:
with open('machine-learning.csv', 'r') as f:
    print(f.read())

repository_name,owner_username,stars,repository_url
tensorflow,tensorflow,170000,https://github.com/tensorflow/tensorflow
transformers,huggingface,77800,https://github.com/huggingface/transformers
pytorch,pytorch,61500,https://github.com/pytorch/pytorch
keras,keras-team,57100,https://github.com/keras-team/keras
scikit-learn,scikit-learn,52600,https://github.com/scikit-learn/scikit-learn
cs-video-courses,Developer-Y,49700,https://github.com/Developer-Y/cs-video-courses
tesseract,tesseract-ocr,48400,https://github.com/tesseract-ocr/tesseract
face_recognition,ageitgey,47000,https://github.com/ageitgey/face_recognition
ML-For-Beginners,microsoft,43500,https://github.com/microsoft/ML-For-Beginners
faceswap,deepfakes,43200,https://github.com/deepfakes/faceswap
awesome-scalability,binhnguyennus,43000,https://github.com/binhnguyennus/awesome-scalability
TensorFlow-Examples,aymericdamien,42500,https://github.com/aymericdamien/TensorFlow-Examples
julia,JuliaLang,41300,https://github.com/JuliaLan

QUESTION: Write a Python function that creates a CSV file (comma-separated values) containing details about the 25 top GitHub repositories for any given topic. The top repositories for the topic machine-learning can be found on this page: https://github.com/topics/machine-learning. The output CSV should contain these details: repository name, owner's username, no. of stars, repository URL.

In [85]:
import requests
from bs4 import BeautifulSoup
base_url = 'https://github.com'

def scrape_topic_repositories(topic, path=None):
    """Get the top repositories for a topic and write them to a CSV file"""
    if path is None:
        path = topic + '.csv'
    topic_page_doc = get_topic_page(topic)
    topic_repositories = get_top_repositories(topic_page_doc)
    write_csv(topic_repositories, path)
    print('Top repositories for topic "{}" written to file "{}"'.format(topic, path))
    return path

def get_top_repositories(doc):
    """Parse the top repositories for a topic given a Beautiful Soup document"""
    article_tags = doc.find_all('article', class_='border rounded color-shadow-small color-bg-subtle my-4')
    topic_repos = [parse_repository(tag) for tag in article_tags]
    return topic_repos

def get_topic_page(topic):
    """Get the web page containing the top repositories for a topic as a Beautiful Soup document"""
    topic_repos_url = 'https://github.com/topics/' + topic
    response = requests.get(topic_repos_url)
    if response.status_code != 200:
        print('Status code:', response.status_code)
        raise Exception('Failed to fetch web page ' + topic_repos_url)
    return BeautifulSoup(response.text)    

def parse_repository(article_tag):
    """Parse information about a repository from an <article> tag"""
    a_tags = article_tag.h3.find_all('a')
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip()
    repo_url = base_url + a_tags[1]['href'].strip()
    stars_tag = article_tag.find('span', class_='Counter js-social-count')
    star_count = parse_star_count(stars_tag.text.strip())
    return {'repository_name': repo_name, 'owner_username': username, 'stars': star_count, 'repository_url': repo_url}

def parse_star_count(stars_str):
    """Parse strings like 40.3k and get the no. of stars as a number"""
    stars_str = stars_str.strip()
    return int(float(stars_str[:-1]) * 1000) if stars_str[-1] == 'k' else int(stars_str)

def write_csv(items, path):
    """Write a list of dictionaries to a CSV file"""
    with open(path, 'w') as f:
        if len(items) == 0:
            return
        headers = list(items[0].keys())
        f.write(','.join(headers) + '\n')
        for item in items:
            values = []
            for header in headers:
                values.append(str(item.get(header, "")))
            f.write(','.join(values) + "\n")

In [86]:
scrape_topic_repositories('machine-learning')

Top repositories for topic "machine-learning" written to file "machine-learning.csv"


'machine-learning.csv'

In [87]:
import pandas as pd


In [88]:
pd.read_csv('machine-learning.csv')


Unnamed: 0,repository_name,owner_username,stars,repository_url
0,tensorflow,tensorflow,170000,https://github.com/tensorflow/tensorflow
1,transformers,huggingface,77800,https://github.com/huggingface/transformers
2,pytorch,pytorch,61500,https://github.com/pytorch/pytorch
3,keras,keras-team,57100,https://github.com/keras-team/keras
4,scikit-learn,scikit-learn,52600,https://github.com/scikit-learn/scikit-learn
5,cs-video-courses,Developer-Y,49700,https://github.com/Developer-Y/cs-video-courses
6,tesseract,tesseract-ocr,48400,https://github.com/tesseract-ocr/tesseract
7,face_recognition,ageitgey,47000,https://github.com/ageitgey/face_recognition
8,ML-For-Beginners,microsoft,43500,https://github.com/microsoft/ML-For-Beginners
9,faceswap,deepfakes,43200,https://github.com/deepfakes/faceswap


In [89]:
scrape_topic_repositories('data-analysis')

Top repositories for topic "data-analysis" written to file "data-analysis.csv"


'data-analysis.csv'

In [90]:
pd.read_csv('data-analysis.csv')

Unnamed: 0,repository_name,owner_username,stars,repository_url
0,scikit-learn,scikit-learn,52600,https://github.com/scikit-learn/scikit-learn
1,superset,apache,50100,https://github.com/apache/superset
2,pandas,pandas-dev,36500,https://github.com/pandas-dev/pandas
3,metabase,metabase,31100,https://github.com/metabase/metabase
4,AI-Expert-Roadmap,AMAI-GmbH,23400,https://github.com/AMAI-GmbH/AI-Expert-Roadmap
5,streamlit,streamlit,22100,https://github.com/streamlit/streamlit
6,CyberChef,gchq,19900,https://github.com/gchq/CyberChef
7,Data-Science-For-Beginners,microsoft,17000,https://github.com/microsoft/Data-Science-For-...
8,goaccess,allinurl,15600,https://github.com/allinurl/goaccess
9,best-of-ml-python,ml-tooling,12400,https://github.com/ml-tooling/best-of-ml-python


In [91]:
scrape_topic_repositories('python')

Top repositories for topic "python" written to file "python.csv"


'python.csv'

In [92]:
pd.read_csv('python.csv')


Unnamed: 0,repository_name,owner_username,stars,repository_url
0,system-design-primer,donnemartin,208000,https://github.com/donnemartin/system-design-p...
1,tensorflow,tensorflow,170000,https://github.com/tensorflow/tensorflow
2,CS-Notes,CyC2018,161000,https://github.com/CyC2018/CS-Notes
3,awesome-python,vinta,153000,https://github.com/vinta/awesome-python
4,Python,TheAlgorithms,151000,https://github.com/TheAlgorithms/Python
5,free-programming-books-zh_CN,justjavac,98600,https://github.com/justjavac/free-programming-...
6,project-based-learning,practical-tutorials,86100,https://github.com/practical-tutorials/project...
7,transformers,huggingface,77800,https://github.com/huggingface/transformers
8,thefuck,nvbn,75100,https://github.com/nvbn/thefuck
9,django,django,68100,https://github.com/django/django


# Using a REST API to retrieve data as JSON


Not all URLs point to an HTML page. Consider this URL for example: https://api.github.com/repos/octocat/hello-world . It points to a JSON document, which has a structure like this:

{
  "name": "Hello-World",
  "full_name": "octocat/Hello-World",
  "private": false,
  "owner": {
    "login": "octocat",
    "id": 583231,
  },
  "html_url": "https://github.com/octocat/Hello-World",
}
It's quite similar to a Python dictionary. In fact, you can use the json module from python to convert a JSON document into a Python dictionary.

In [93]:
response = requests.get('https://api.github.com/repos/octocat/hello-world')

In [94]:
import json

data_dict = json.loads(response.text)

In [95]:
data_dict

{'id': 1296269,
 'node_id': 'MDEwOlJlcG9zaXRvcnkxMjk2MjY5',
 'name': 'Hello-World',
 'full_name': 'octocat/Hello-World',
 'private': False,
 'owner': {'login': 'octocat',
  'id': 583231,
  'node_id': 'MDQ6VXNlcjU4MzIzMQ==',
  'avatar_url': 'https://avatars.githubusercontent.com/u/583231?v=4',
  'gravatar_id': '',
  'url': 'https://api.github.com/users/octocat',
  'html_url': 'https://github.com/octocat',
  'followers_url': 'https://api.github.com/users/octocat/followers',
  'following_url': 'https://api.github.com/users/octocat/following{/other_user}',
  'gists_url': 'https://api.github.com/users/octocat/gists{/gist_id}',
  'starred_url': 'https://api.github.com/users/octocat/starred{/owner}{/repo}',
  'subscriptions_url': 'https://api.github.com/users/octocat/subscriptions',
  'organizations_url': 'https://api.github.com/users/octocat/orgs',
  'repos_url': 'https://api.github.com/users/octocat/repos',
  'events_url': 'https://api.github.com/users/octocat/events{/privacy}',
  'received

QUESTION: Write a function get_repo_details to find the following information about a repository: description, watcher count, fork count, open issues count, created at time and updated at time.

In [96]:
def get_repo_details(username,repo_name):
    print("Fetching information for {}/{}".format(username,repo_name))
    repo_details_url = "https://api.github.com/repos/" + username + "/" + repo_name
    response = requests.get(repo_details_url)
    if not response.ok:
        print("Failed to featch!")
        return {}
    repo_data = json.loads(response.text)
    return {
        'description':repo_data['description'],
        'watchers': repo_data['watchers_count'],
        'open_issues': repo_data['open_issues_count'],
        'created_at': repo_data['created_at'],
        'updated_at': repo_data['updated_at']
    }

In [97]:
get_repo_details('octocat','hello-world')

Fetching information for octocat/hello-world


{'description': 'My first repository on GitHub!',
 'watchers': 2105,
 'open_issues': 991,
 'created_at': '2011-01-26T19:01:12Z',
 'updated_at': '2023-01-11T05:56:39Z'}

In [98]:
get_repo_details('tensorflow', 'tensorflow')


Fetching information for tensorflow/tensorflow


{'description': 'An Open Source Machine Learning Framework for Everyone',
 'watchers': 170292,
 'open_issues': 2332,
 'created_at': '2015-11-07T01:19:20Z',
 'updated_at': '2023-01-11T10:07:10Z'}

QUESTION: Augment the list of top repositories for a topic with the repository description, watcher count, fork count, open issues count, created at time and updated at time.

In [99]:
def add_repo_details(repos):
    return [dict(**get_repo_details(repo['owner_username'], repo['repository_name']), **repo) for repo in repos]

In [100]:
add_repo_details(top_repositories[:5])


Fetching information for tensorflow/tensorflow
Fetching information for huggingface/transformers
Fetching information for pytorch/pytorch
Fetching information for keras-team/keras
Fetching information for scikit-learn/scikit-learn


[{'description': 'An Open Source Machine Learning Framework for Everyone',
  'watchers': 170292,
  'open_issues': 2332,
  'created_at': '2015-11-07T01:19:20Z',
  'updated_at': '2023-01-11T10:07:10Z',
  'repository_name': 'tensorflow',
  'owner_username': 'tensorflow',
  'stars': 170000,
  'repository_url': 'https://github.com/tensorflow/tensorflow'},
 {'description': '🤗 Transformers: State-of-the-art Machine Learning for Pytorch, TensorFlow, and JAX.',
  'watchers': 77797,
  'open_issues': 574,
  'created_at': '2018-10-29T13:56:00Z',
  'updated_at': '2023-01-11T10:48:07Z',
  'repository_name': 'transformers',
  'owner_username': 'huggingface',
  'stars': 77800,
  'repository_url': 'https://github.com/huggingface/transformers'},
 {'description': 'Tensors and Dynamic neural networks in Python with strong GPU acceleration',
  'watchers': 61541,
  'open_issues': 10544,
  'created_at': '2016-08-13T05:26:41Z',
  'updated_at': '2023-01-11T10:53:20Z',
  'repository_name': 'pytorch',
  'owner_u

In [101]:
from getpass import getpass

token = getpass()

········


# Crawling Websites by Parsing Links on a Page


EXERCISE: Find the first occurrence of each of these tags in doc: div, img, span, p, etc.



In [103]:
first_div=doc.find_all('div')
#first=doc.find('div')
#first=doc.div
first_div[0]

<div class="logged-out env-production page-responsive" data-turbo-body="" style="word-wrap: break-word;">
<div class="position-relative js-header-wrapper">
<a class="px-2 py-4 color-bg-accent-emphasis color-fg-on-emphasis show-on-focus js-skip-to-content" href="#start-of-content">Skip to content</a>
<span class="progress-pjax-loader Progress position-fixed width-full" data-view-component="true">
<span class="Progress-item progress-pjax-loader-bar left-0 top-0 color-bg-accent-emphasis" data-view-component="true" style="width: 0%;"></span>
</span>
<script crossorigin="anonymous" defer="defer" src="https://github.githubassets.com/assets/sessions-4ec6e61a96fe.js" type="application/javascript"></script>
<header class="Header-old header-logged-out js-details-container Details position-relative f4 py-3" role="banner">
<button aria-label="Toggle navigation" class="Header-backdrop d-lg-none border-0 position-fixed top-0 left-0 width-full height-full js-details-target" type="button">
<span class

In [104]:
first_img=doc.find('img')
#first_img=doc.find_all('img')
#first_img=doc.img
first_img


<img alt="" aria-label="Team" class="avatar mr-2 flex-shrink-0 js-jump-to-suggestion-avatar d-none" height="28" src="" width="28"/>

In [105]:
first_span=doc.span
#first_span=doc.find_all['span']
#first_span=doc.find['span']
first_span


<span class="progress-pjax-loader Progress position-fixed width-full" data-view-component="true">
<span class="Progress-item progress-pjax-loader-bar left-0 top-0 color-bg-accent-emphasis" data-view-component="true" style="width: 0%;"></span>
</span>

In [106]:
first_p=doc.find_all('p')
first_p[0]

<p>Machine learning is the practice of teaching a computer to learn. The concept uses pattern recognition, as well as other forms of predictive algorithms, to make judgments on incoming data. This field is closely related to artificial intelligence and computational statistics.</p>

EXERCISE: Get a list of all the img tags on the page. How many images does the page contain?



In [107]:
all_images=doc.find_all('img')
len(all_images)

10

EXERCISE: Find the 5th image tag on the page (counting from 0). Which attributes does the tag contain? Find the values of the src and alt attributes of the tag.

In [108]:
fifth_image=all_images[5]
fifth_image

<img alt="ML-For-Beginners" class="d-block width-full" loading="lazy" src="https://repository-images.githubusercontent.com/343965132/549b1a80-c897-11eb-9436-918072d2e0f8"/>

In [109]:
fifth_image['src']


'https://repository-images.githubusercontent.com/343965132/549b1a80-c897-11eb-9436-918072d2e0f8'

In [110]:
fifth_image['alt']


'ML-For-Beginners'

EXERCISE: Find the src attribute of the first img tag with the alt attribute set to julia. Visit the link and check what the image represents.

In [111]:
doc.find('img',{'alt':'julia'})['src']

'https://repository-images.githubusercontent.com/1644196/ddfc1e00-6638-11e9-9b80-0fe7b9aedd72'

EXERCISE: Find the list of all the images matching the class d-block width-full. Each list element should be a dictionary containing two keys, "username" and "url". You can obtain the username using the alt attribute of a tag and the URL using the src attribute.

In [112]:
image_link_tag = doc.find_all('img',class_='d-block width-full')
avatar_users = []
for tag in image_link_tag:
    avatar_users.append({
        'username' : tag['alt'],
        'url' : tag['src']
    })
    
avatar_users

[{'username': 'transformers',
  'url': 'https://repository-images.githubusercontent.com/155220641/a16c4880-a501-11ea-9e8f-646cf611702e'},
 {'username': 'ML-For-Beginners',
  'url': 'https://repository-images.githubusercontent.com/343965132/549b1a80-c897-11eb-9436-918072d2e0f8'},
 {'username': 'awesome-scalability',
  'url': 'https://repository-images.githubusercontent.com/115478820/109a8e00-283a-11ea-8891-ad7215b06a4c'},
 {'username': 'julia',
  'url': 'https://repository-images.githubusercontent.com/1644196/ddfc1e00-6638-11e9-9b80-0fe7b9aedd72'},
 {'username': 'yolov5',
  'url': 'https://repository-images.githubusercontent.com/264818686/40f8c2c3-7919-4652-b278-ec6a7fb06a53'},
 {'username': 'Made-With-ML',
  'url': 'https://repository-images.githubusercontent.com/156157055/680e625b-498a-4af6-b8c9-ff19672929b8'}]

EXERCISE: Write a function scrape_topics which takes a list of topics and creates CSV files containing top repositories for a list of topics. Test it out using the empty cells below.

In [113]:
topics=['data-analysis','python','deep-learning']

In [114]:
def scrape_topics(topics):
    for topic in topics:
        scrape_topic_repositories(topic)

In [115]:
scrape_topics(topics)


Top repositories for topic "data-analysis" written to file "data-analysis.csv"
Top repositories for topic "python" written to file "python.csv"
Top repositories for topic "deep-learning" written to file "deep-learning.csv"


In [116]:
pd.read_csv('data-analysis.csv')


Unnamed: 0,repository_name,owner_username,stars,repository_url
0,scikit-learn,scikit-learn,52600,https://github.com/scikit-learn/scikit-learn
1,superset,apache,50100,https://github.com/apache/superset
2,pandas,pandas-dev,36500,https://github.com/pandas-dev/pandas
3,metabase,metabase,31100,https://github.com/metabase/metabase
4,AI-Expert-Roadmap,AMAI-GmbH,23400,https://github.com/AMAI-GmbH/AI-Expert-Roadmap
5,streamlit,streamlit,22100,https://github.com/streamlit/streamlit
6,CyberChef,gchq,19900,https://github.com/gchq/CyberChef
7,Data-Science-For-Beginners,microsoft,17000,https://github.com/microsoft/Data-Science-For-...
8,goaccess,allinurl,15600,https://github.com/allinurl/goaccess
9,best-of-ml-python,ml-tooling,12400,https://github.com/ml-tooling/best-of-ml-python


In [117]:
pd.read_csv('python.csv')

Unnamed: 0,repository_name,owner_username,stars,repository_url
0,system-design-primer,donnemartin,208000,https://github.com/donnemartin/system-design-p...
1,tensorflow,tensorflow,170000,https://github.com/tensorflow/tensorflow
2,CS-Notes,CyC2018,161000,https://github.com/CyC2018/CS-Notes
3,awesome-python,vinta,153000,https://github.com/vinta/awesome-python
4,Python,TheAlgorithms,151000,https://github.com/TheAlgorithms/Python
5,free-programming-books-zh_CN,justjavac,98600,https://github.com/justjavac/free-programming-...
6,project-based-learning,practical-tutorials,86100,https://github.com/practical-tutorials/project...
7,transformers,huggingface,77800,https://github.com/huggingface/transformers
8,thefuck,nvbn,75100,https://github.com/nvbn/thefuck
9,django,django,68100,https://github.com/django/django


In [118]:
pd.read_csv('deep-learning.csv')


Unnamed: 0,repository_name,owner_username,stars,repository_url
0,tensorflow,tensorflow,170000,https://github.com/tensorflow/tensorflow
1,transformers,huggingface,77800,https://github.com/huggingface/transformers
2,opencv,opencv,65900,https://github.com/opencv/opencv
3,pytorch,pytorch,61500,https://github.com/pytorch/pytorch
4,keras,keras-team,57100,https://github.com/keras-team/keras
5,faceswap,deepfakes,43200,https://github.com/deepfakes/faceswap
6,TensorFlow-Examples,aymericdamien,42500,https://github.com/aymericdamien/TensorFlow-Ex...
7,100-Days-Of-ML-Code,Avik-Jain,39400,https://github.com/Avik-Jain/100-Days-Of-ML-Code
8,Real-Time-Voice-Cloning,CorentinJ,38700,https://github.com/CorentinJ/Real-Time-Voice-C...
9,d2l-zh,d2l-ai,37200,https://github.com/d2l-ai/d2l-zh


EXERCISE: Get the top 100 repositories for the all the featured topics on GitHub. You might find these URLs useful:

Eighth page of featured topics: https://github.com/topics/?page=8
Second page of top repositories for a topic: https://github.com/topics/machine-learning?page=2

In [119]:
def get_feature_page(n):
    docs = []
    for i in range(1,n):
        topic_url = 'https://github.com/topics/?page='+str(i)
        response = requests.get(topic_url)
        if response.status_code != 200:
            print("Status Code :",response.status_code)
            raise Exception("Failed to fetch web page :",topic_url)
        doc = BeautifulSoup(response.text)
        docs.append(doc)
    return docs

In [131]:
def get_featured_topics(docs):
    hrefs = []
    for doc in docs:
        href = doc.find_all('a',class_='no-underline d-flex flex-column flex-justify-center')
        for a in href:
            link = a['href']
            hrefs.append(link)
            
    return hrefs

In [132]:
def scrape_featured_repositories(topic, path=None):
    """Get the top repositories for a topic and write them to a CSV file"""
    for i in range(1,6):
        path= topic.strip('/topics')+str(i)+".csv"
        topic_repos_url = "https://github.com/"+topic+"?page=" + str(i)
        response = requests.get(topic_repos_url)
        if response.status_code != 200:
            print("Status code :" ,response.status_code)
            raise Exception("Failed to featch webpage ",topic_repos_url)
        topic_page_doc = BeautifulSoup(response.text)
        topic_repositories = get_top_repositories(topic_page_doc)
        write_csv(topic_repositories, path)
        print("Top repositories for topic '{}' written to file '{}' ".format(topic,path))

    return path

In [133]:
def get_top_featured_repositories(n):
    docs=get_feature_page(n)
    topics=get_featured_topics(docs)
    dfs=[]
    for topic in topics:
        scrape_featured_repositories(topic)


In [134]:
get_top_featured_repositories(2)

Top repositories for topic '/topics/go' written to file 'g1.csv' 
Top repositories for topic '/topics/go' written to file 'g2.csv' 
Top repositories for topic '/topics/go' written to file 'g3.csv' 
Top repositories for topic '/topics/go' written to file 'g4.csv' 
Top repositories for topic '/topics/go' written to file 'g5.csv' 
Top repositories for topic '/topics/vue' written to file 'vue1.csv' 
Top repositories for topic '/topics/vue' written to file 'vue2.csv' 
Top repositories for topic '/topics/vue' written to file 'vue3.csv' 
Top repositories for topic '/topics/vue' written to file 'vue4.csv' 
Top repositories for topic '/topics/vue' written to file 'vue5.csv' 
Top repositories for topic '/topics/typescript' written to file 'ypescr1.csv' 
Top repositories for topic '/topics/typescript' written to file 'ypescr2.csv' 
Top repositories for topic '/topics/typescript' written to file 'ypescr3.csv' 
Top repositories for topic '/topics/typescript' written to file 'ypescr4.csv' 
Top repos

In [136]:
pd.read_csv('matlab1.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'matlab1.csv'