In [1]:
import json
import pickle
import os

import requests
import numpy
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
def get_json(path, *, timeout=30, **kwargs):
    """Fetch a URL built from a template and decode the response body as JSON.

    Parameters
    ----------
    path : str
        URL template; its placeholders are filled with ``path.format(**kwargs)``.
    timeout : float or tuple, optional
        Keyword-only. Forwarded to ``requests.get`` so that a stalled
        connection raises instead of blocking the whole crawl indefinitely.
    **kwargs
        Values substituted into ``path``.

    Returns
    -------
    The decoded JSON document.

    Raises
    ------
    json.JSONDecodeError
        If the response body is not valid JSON. The HTTP status is
        deliberately not inspected here: callers in this notebook catch this
        exception to log the request and move on.
    requests.exceptions.RequestException
        On connection failures or when ``timeout`` expires.
    """
    res = requests.get(path.format(**kwargs), timeout=timeout)
    return json.loads(res.text)

In [3]:
# URL templates for PyPI's JSON endpoints. The placeholders are filled by
# get_json() via str.format: {project} receives a package name and {release}
# one of that package's release identifiers.
project_path = 'https://pypi.org/pypi/{project}/json'
release_path = 'https://pypi.org/pypi/{project}/{release}/json'

In [4]:
# Download the PyPI "simple" index page and collect every <a> element on it.
# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops an HTTP error page from being parsed as if it were
# the package index (which would silently yield a wrong package list).
all_package_name = requests.get('https://pypi.org/simple', timeout=60)
all_package_name.raise_for_status()
soup = BeautifulSoup(all_package_name.text, 'html.parser')
all_packages = soup.find_all('a')

In [5]:
# Keep only the text of each anchor collected above; the crawl functions
# slice this list by position and use each entry as a package name.
all_package_text = [anchor.text for anchor in all_packages]

In [6]:
def from_api_to_json(log, j, start, end):
    """Crawl a slice of packages and dump their per-release metadata as JSON.

    For every name in ``all_package_text[start:end]`` the project document is
    fetched to list its releases, then each release document is fetched,
    trimmed, and collected. The result is written to ``j`` in a single
    ``json.dump`` call as a list of
    ``{'_id': name, 'releases': [...], 'info': [...]}`` records, where
    ``releases`` holds only the releases whose document could be decoded.

    Parameters
    ----------
    log : writable text file
        Receives one line per failed request: ``<package>`` when the project
        document is not JSON, ``<package> <release>`` when a release document
        is not JSON.
    j : writable text file
        Destination of the JSON dump.
    start, end : int
        Slice bounds into ``all_package_text``.
    """
    infos = []
    for package_name in all_package_text[start:end]:
        try:
            project = get_json(project_path, project=package_name)
        except json.JSONDecodeError:
            print(package_name, file=log)
            continue

        true_releases = []
        package_info = []
        for release in list(project['releases']):
            try:
                info = get_json(release_path, project=package_name, release=release)
            except json.JSONDecodeError:
                print(package_name, release, file=log)
                continue
            true_releases.append(release)
            # Drop bulky fields that are not needed downstream. pop() with a
            # default is used instead of del because a release document does
            # not necessarily carry these keys, and a missing one must not
            # abort the whole chunk with a KeyError.
            info.pop('releases', None)
            info.pop('last_serial', None)
            info['info'].pop('description', None)
            package_info.append(info)

        infos.append({'_id': package_name, 'releases': true_releases, 'info': package_info})

    json.dump(infos, j)

In [None]:
# Crawl packages 25000..29999 in chunks of `step`, writing one log file and
# one JSON file per chunk so that an interrupted run only loses the chunk
# that was in progress.
step = 100
# open(..., 'w') does not create parent directories; make sure they exist so
# a fresh checkout does not fail on the very first chunk.
os.makedirs('./log', exist_ok=True)
os.makedirs('./json_file', exist_ok=True)
for i in range(50):
    start = 25000 + i * step
    end = start + step
    with open(f'./log/json_{start}-{end}.log', 'w') as log, \
         open(f'./json_file/json_{start}-{end}.json', 'w') as j:
        from_api_to_json(log, j, start, end)
    print(start, end, 'done')

25000 25100 done
25100 25200 done
25200 25300 done
25300 25400 done
25400 25500 done
25500 25600 done
25600 25700 done
25700 25800 done
25800 25900 done
25900 26000 done
26000 26100 done
26100 26200 done
26200 26300 done
26300 26400 done
26400 26500 done
26500 26600 done
26600 26700 done
26700 26800 done
26800 26900 done
26900 27000 done
27000 27100 done
27100 27200 done
27200 27300 done
27300 27400 done
27400 27500 done
27500 27600 done
27600 27700 done
27700 27800 done
27800 27900 done
27900 28000 done
28000 28100 done
28100 28200 done


In [6]:
def from_api(log, start, end):
    """Crawl a slice of packages and store every release document on disk.

    For each name in ``all_package_text[start:end]`` the project document is
    fetched to list its releases; each release document is then written,
    unmodified, to ``./json/<package_name>/<release>``.

    Parameters
    ----------
    log : writable text file
        Receives one line per failed request: ``<package>`` when the project
        document is not JSON, ``<package> <release>`` when a release document
        is not JSON.
    start, end : int
        Slice bounds into ``all_package_text``.
    """
    for package_name in all_package_text[start:end]:
        try:
            project = get_json(project_path, project=package_name)
        except json.JSONDecodeError:
            print(package_name, file=log)
            continue
        releases = list(project['releases'])

        # makedirs also creates the ./json parent (plain mkdir raised
        # FileNotFoundError when it was missing) and exist_ok makes re-runs
        # over an already crawled package harmless.
        package_dir = f'./json/{package_name}'
        os.makedirs(package_dir, exist_ok=True)

        for release in releases:
            try:
                info = get_json(release_path, project=package_name, release=release)
            except json.JSONDecodeError:
                print(package_name, release, file=log)
                continue
            with open(f'{package_dir}/{release}', 'w') as f:
                json.dump(info, f)

In [7]:
# Store the raw release documents for packages 5000..9999.
start, end = 5000, 10000
# open(..., 'w') does not create parent directories; make sure ./log exists.
os.makedirs('./log', exist_ok=True)
with open(f'./log/json_{start}-{end}.log', 'w') as log:
    from_api(log, start, end)

ConnectionError: HTTPSConnectionPool(host='pypi.org', port=443): Max retries exceeded with url: /pypi/allennlp-pvt-nightly/0.9.0.dev201908251300/json (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f560492ee20>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [33]:
def _parse_classifiers(classifiers):
    """Return (python2, python3, versions, topics) extracted from trove classifiers."""
    python2 = python3 = False
    versions = []
    topics = set()
    for classifier in classifiers:
        if 'Python :: ' in classifier:
            if '.' in classifier:
                # Take the whole last segment: a fixed three-character slice
                # turned a two-digit minor version such as "3.10" into ".10".
                versions.append(classifier.split(' :: ')[-1])
            elif '2' in classifier:
                python2 = True
            elif '3' in classifier:
                python3 = True
        elif 'Topic :: ' in classifier:
            # Only the top-level topic (second segment) is kept.
            topics.add(classifier.split(' :: ')[1])
    return python2, python3, versions, topics


def _split_requirement(requirement):
    """Split one requirement (marker already removed) into (name, specifier or None)."""
    requirement = requirement.strip()
    cut = len(requirement)
    for position, char in enumerate(requirement):
        # The name ends at the first whitespace, parenthesis or comparison
        # character, which covers "name (>=1.0)", "name >=1.0" and "name>=1.0".
        if char in ' (<>=!~':
            cut = position
            break
    name = requirement[:cut]
    specifier = requirement[cut:].strip()
    if specifier.startswith('(') and specifier.endswith(')'):
        specifier = specifier[1:-1].strip()
    return name, (specifier or None)


def _parse_requires_dist(requires_dist):
    """Return parallel lists (names, specifiers) of the non-extra requirements."""
    names = []
    specifiers = []
    for dist in requires_dist or []:
        requirement, _, marker = dist.partition(';')
        # Skip requirements that only apply to an optional extra. The check is
        # limited to the marker so a package whose *name* contains "extra" is
        # kept, and `continue` (not `break`) keeps later requirements.
        if 'extra' in marker:
            continue
        name, specifier = _split_requirement(requirement)
        names.append(name)
        specifiers.append(specifier)
    return names, specifiers


def main(log, start, end):
    """Build a per-release summary DataFrame for a slice of packages.

    For each name in ``all_package_text[start:end]`` the project document is
    fetched to list its releases, then every release document is fetched and
    summarised into one row.

    Parameters
    ----------
    log : writable text file
        Receives one line per failed request: ``<package>`` when the project
        document is not JSON, ``<package> <release>`` when a release document
        is not JSON. Such packages/releases produce no row.
    start, end : int
        Slice bounds into ``all_package_text``.

    Returns
    -------
    pandas.DataFrame
        Indexed by (package name, release) with the columns ``python2s``,
        ``python3s``, ``pythons``, ``topics``, ``dependencies``,
        ``dependency_details``, ``code_urls``, ``homepages`` and ``times``
        (upload date of the first file of the release, or a missing value
        when the release has no files).
    """
    index = [[], []]
    columns = {
        'python2s': [],
        'python3s': [],
        'pythons': [],
        'topics': [],
        'dependencies': [],
        'dependency_details': [],
        'code_urls': [],
        'homepages': [],
        'times': [],
    }

    for package_name in all_package_text[start:end]:
        try:
            project = get_json(project_path, project=package_name)
        except json.JSONDecodeError:
            print(package_name, file=log)
            continue

        for release in list(project['releases']):
            try:
                info = get_json(release_path, project=package_name, release=release)
            except json.JSONDecodeError:
                print(package_name, release, file=log)
                continue

            metadata = info['info']
            python2, python3, versions, topics = _parse_classifiers(metadata['classifiers'])
            names, specifiers = _parse_requires_dist(metadata['requires_dist'])
            project_urls = metadata['project_urls'] or {}
            # A release may have no uploaded files at all; indexing [0]
            # unconditionally raised IndexError for those.
            files = info['urls']
            upload_date = files[0]['upload_time'].split('T')[0] if files else None

            # The index entry is recorded together with the row, so a skipped
            # release can no longer leave the index longer than the columns.
            index[0].append(package_name)
            index[1].append(release)
            columns['python2s'].append(python2)
            columns['python3s'].append(python3)
            columns['pythons'].append(versions)
            columns['topics'].append(topics)
            columns['dependencies'].append(names)
            columns['dependency_details'].append(specifiers)
            columns['code_urls'].append(project_urls.get('Code'))
            columns['homepages'].append(project_urls.get('Homepage'))
            columns['times'].append(upload_date)

    return pd.DataFrame(data=columns, index=index)

In [34]:
# Summarise packages 1000..1999 into a DataFrame; requests whose response is
# not JSON are listed in a log file next to the notebook.
start = 1000
end = 2000
with open(f'pandas_{start}-{end}.log', 'w') as log:
    df = main(log, start, end)

In [32]:
# Preview up to 10 random rows; capping at len(df) avoids the ValueError that
# sample() raises when asked for more rows than the frame contains.
df.sample(min(10, len(df)))

Unnamed: 0,Unnamed: 1,python2s,python3s,pythons,topics,dependencies,dependency_details,code_urls,homepages
101703301-Project1-TOPSIS,0.0.2,False,True,[],{},[],[],,https://github.com/
12factor-vault,0.1.18,False,True,[],{},"[django-dbconn-retry, hvac]","[>=0.1.4, >=0.3.0]",,https://github.com/jdelic/12factor-vault/
17MonIP,0.2.1,False,False,[],{},[],[],,http://lxyu.github.io/17monip/
0x-order-utils,4.0.0.dev8,False,True,"[3.6, 3.7]","{Security, Software Development, Other/Nonlist...","[0x-contract-addresses, 0x-contract-artifacts,...","[==3.0.0.dev3, ==3.0.0.dev2, ==2.1.0.dev2, Non...",,https://github.com/0xProject/0x-monorepo/tree/...
0lever-so,1.1.1,False,False,[],{},[],[],,https://github.com/0lever/so
01changer,1.0.0,False,False,[],{},[],[],,404 NOT FOUND
0x-json-schemas,2.0.0,False,True,"[3.6, 3.7]","{Security, Software Development, Other/Nonlist...","[jsonschema, mypy-extensions, stringcase]","[None, None, None]",,https://github.com/0xProject/0x-monorepo/tree/...
17MonIP,0.2.6,False,False,"[2.6, 2.7, 3.2, 3.3, 3.4]",{},[],[],,http://lxyu.github.io/17monip/
0-core-client,1.1.0a8,False,False,[],{},[redis],[>=2.10.5],,https://github.com/zero-os/0-core
101703088-topsis,2.0.2,False,True,"[3.4, 3.5, 3.6]",{Software Development},[],[],,https://github.com/user/101703088-topsis
