In [1]:
import requests
import json
import os
import pandas as pd
from bs4 import BeautifulSoup

In [34]:
# Make sure the output directory exists (exist_ok makes re-running the cell safe).
os.makedirs('data', exist_ok=True)

# Download the ranking of the most-downloaded PyPI packages (30-day window).
url = 'https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to decode an error page as JSON.
response.raise_for_status()
data = response.json()

# Cache the payload on disk so later cells can reload it without the network.
with open('data/top-pypi-packages-30-days.json', 'w') as outfile:
    json.dump(data, outfile)

In [93]:
# Reload the cached payload written by the download cell.
with open('data/top-pypi-packages-30-days.json') as f:
    data = json.load(f)

# Build the frame directly from the already-parsed rows. Serialising them back
# to a JSON string just to feed pd.read_json is redundant, and passing literal
# JSON strings to read_json is deprecated in recent pandas releases.
# NOTE(review): assumes data['rows'] is a list of flat records — confirm.
df = pd.DataFrame(data['rows'])

In [94]:
# Derive each package's PyPI project-page URL from its name.
df['link'] = df['project'].map(lambda name: f"https://pypi.org/project/{name}/")

In [95]:
# Keep the first 110 rows; a later cell trims to 100 after dropping rows with
# missing values, so this presumably leaves headroom for failed lookups.
# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger SettingWithCopyWarning.
df = df.iloc[:110].copy()

In [96]:
# Display the frame to sanity-check the generated `link` column.
df

Unnamed: 0,download_count,project,link
0,719072639,boto3,https://pypi.org/project/boto3/
1,329215565,urllib3,https://pypi.org/project/urllib3/
2,320415214,botocore,https://pypi.org/project/botocore/
3,284144318,requests,https://pypi.org/project/requests/
4,246176073,setuptools,https://pypi.org/project/setuptools/
...,...,...,...
105,35844052,scikit-learn,https://pypi.org/project/scikit-learn/
106,35410517,proto-plus,https://pypi.org/project/proto-plus/
107,35383569,gitpython,https://pypi.org/project/gitpython/
108,35368910,msal,https://pypi.org/project/msal/


In [97]:
# Exploratory step: fetch and parse the project page of the first package.
link = df['link'].iloc[0]
# The timeout avoids an indefinite hang; raise_for_status surfaces HTTP errors
# instead of silently parsing an error page.
response = requests.get(link, timeout=30)
response.raise_for_status()
html_content = response.content

soup = BeautifulSoup(html_content, 'html.parser')

In [98]:
# Locate the GitHub icon element on the parsed page. A CSS selector matches any
# element carrying both classes, whereas class_='fab fa-github' only matches
# when the class attribute is exactly that string (same order, no extras).
# The previously assigned `desired_class` string was never used and is dropped.
elements_with_class = soup.select_one('.fab.fa-github')

In [99]:
# Walk back from the icon to the nearest preceding <a> tag and show its target.
github_anchor = elements_with_class.find_previous('a')
github_anchor.get('href')

'https://github.com/boto/boto3'

In [100]:
def get_github_link(link, timeout=10):
    """Scrape a project page for its GitHub (or, failing that, homepage) URL.

    The page is searched for an element with the ``fab fa-github`` icon
    classes; if none exists, the ``fas fa-home`` icon is used as a fallback.
    The ``href`` of the nearest ``<a>`` tag preceding that icon is returned,
    so the result is not guaranteed to be a GitHub URL.

    Args:
        link: URL of the project page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``href`` string, or ``None`` when the request returns an HTTP
        error status, no matching icon is found, or no anchor precedes it.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    # The timeout stops one stalled request from blocking a whole DataFrame.apply.
    response = requests.get(link, timeout=timeout)
    if not response.ok:
        # Do not scrape error pages (404, 429, ...); report "no link" instead.
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # CSS selectors match both classes regardless of order or extra classes.
    icon = soup.select_one('.fab.fa-github')
    if icon is None:
        icon = soup.select_one('.fas.fa-home')
    if icon is None:
        return None

    # find_previous returns None when no <a> precedes the icon; guard it so a
    # malformed page yields None rather than an AttributeError.
    anchor = icon.find_previous('a')
    if anchor is None:
        return None
    return anchor.get('href')

In [101]:
# Scrape every project page; this issues one HTTP request per row.
df['github_link'] = df['link'].apply(get_github_link)

In [102]:
# Display the frame to inspect the scraped `github_link` values.
df

Unnamed: 0,download_count,project,link,github_link
0,719072639,boto3,https://pypi.org/project/boto3/,https://github.com/boto/boto3
1,329215565,urllib3,https://pypi.org/project/urllib3/,https://github.com/urllib3/urllib3
2,320415214,botocore,https://pypi.org/project/botocore/,https://github.com/boto/botocore
3,284144318,requests,https://pypi.org/project/requests/,https://github.com/psf/requests
4,246176073,setuptools,https://pypi.org/project/setuptools/,https://github.com/pypa/setuptools
...,...,...,...,...
105,35844052,scikit-learn,https://pypi.org/project/scikit-learn/,https://github.com/scikit-learn/scikit-learn
106,35410517,proto-plus,https://pypi.org/project/proto-plus/,https://github.com/googleapis/proto-plus-pytho...
107,35383569,gitpython,https://pypi.org/project/gitpython/,https://github.com/gitpython-developers/GitPython
108,35368910,msal,https://pypi.org/project/msal/,https://github.com/AzureAD/microsoft-authentic...


In [104]:
# Summarise dtypes and non-null counts; a `github_link` count below the row
# count means some lookups returned None.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   download_count  110 non-null    int64 
 1   project         110 non-null    object
 2   link            110 non-null    object
 3   github_link     109 non-null    object
dtypes: int64(1), object(3)
memory usage: 3.6+ KB


In [105]:
# Discard rows with any missing value (e.g. no scraped link), then keep the
# first 100 remaining rows. Chaining instead of inplace=True keeps the cell
# idempotent and avoids mutating a frame that may itself be a slice; .copy()
# makes the result an independent frame.
df = df.dropna().iloc[:100].copy()

In [106]:
# Display the cleaned frame that is about to be exported.
df

Unnamed: 0,download_count,project,link,github_link
0,719072639,boto3,https://pypi.org/project/boto3/,https://github.com/boto/boto3
1,329215565,urllib3,https://pypi.org/project/urllib3/,https://github.com/urllib3/urllib3
2,320415214,botocore,https://pypi.org/project/botocore/,https://github.com/boto/botocore
3,284144318,requests,https://pypi.org/project/requests/,https://github.com/psf/requests
4,246176073,setuptools,https://pypi.org/project/setuptools/,https://github.com/pypa/setuptools
...,...,...,...,...
96,40256059,itsdangerous,https://pypi.org/project/itsdangerous/,https://github.com/pallets/itsdangerous/
97,40215189,pynacl,https://pypi.org/project/pynacl/,https://github.com/pyca/pynacl/
98,38980469,paramiko,https://pypi.org/project/paramiko/,https://github.com/paramiko/paramiko
99,38850042,deprecated,https://pypi.org/project/deprecated/,https://github.com/tantale/deprecated


In [108]:
# Persist the final table as a JSON array with one object per package.
df.to_json(
    path_or_buf='data/link-data.json',
    orient='records',
)