In [4]:
import json
import os

import pandas as pd

In [5]:
def _classify_release(classifiers):
    """Summarise the trove classifiers of one release.

    Returns a tuple ``(python2, python3, python, topic)``:

    * ``python2`` / ``python3`` -- True when a dot-less ``Python :: `` classifier
      containing "2" / "3" is present (e.g. ``... :: Python :: 3``).
    * ``python`` -- list of dotted versions taken from ``Python :: `` classifiers
      (e.g. ``'2.7'``, ``'3.10'``), in classifier order.
    * ``topic`` -- set of the second ``' :: '`` segment of every ``Topic :: ``
      classifier.
    """
    python = []
    topic = set()
    python2 = python3 = False
    for classifier in classifiers:
        if 'Python :: ' in classifier:
            if '.' in classifier:
                # Take the whole last segment; slicing the last three
                # characters would turn "3.10" into ".10".
                python.append(classifier.split(' :: ')[-1])
            elif '2' in classifier:
                python2 = True
            elif '3' in classifier:
                python3 = True
        elif 'Topic :: ' in classifier:
            topic.add(classifier.split(' :: ')[1])
    return python2, python3, python, topic


def _parse_requirement(dist):
    """Split one ``requires_dist`` entry into ``(name, version_spec)``.

    Returns None when the entry's environment marker (the part after ``;``)
    mentions ``extra``, i.e. the requirement only applies to an optional extra.
    ``version_spec`` is None when the entry carries no version constraint.
    Surrounding parentheses of the constraint and any ``[extras]`` suffix on
    the name are stripped.
    """
    requirement, _, marker = dist.partition(';')
    if 'extra' in marker:
        return None

    requirement = requirement.strip()
    # The distribution name runs up to the first character that is not a
    # letter, digit, '.', '_' or '-'.
    name_end = 0
    while name_end < len(requirement) and (
        requirement[name_end].isalnum() or requirement[name_end] in '._-'
    ):
        name_end += 1
    if name_end == 0:
        # Not recognisable as "name + constraint"; keep the raw text as name.
        return requirement, None

    name = requirement[:name_end]
    rest = requirement[name_end:].strip()
    if rest.startswith('['):
        # Drop the requested extras, e.g. "requests[security] (>=2.0)".
        rest = rest.partition(']')[2].strip()
    if rest.startswith('(') and rest.endswith(')'):
        # Strip the enclosing parentheses of "(>=2.0)".
        rest = rest[1:-1].strip()
    return name, (rest or None)


def main(package_infos):
    """Flatten crawled package records into one DataFrame row per release.

    Parameters
    ----------
    package_infos : iterable of dict
        Each item is read through the keys ``'_id'``, ``'releases'`` and
        ``'info'``.  ``'info'`` is iterated as a sequence of per-release dicts
        with an ``'info'`` dict (``classifiers``, ``requires_python``, ``name``,
        ``requires_dist``, ``project_urls``) and a ``'urls'`` list.
        NOTE(review): ``'releases'`` and ``'info'`` are assumed to be parallel
        (same length, same order) -- the index is built from the former and the
        rows from the latter; confirm against the crawler.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(package id, release)`` with the columns ``name``,
        ``python2``, ``python3``, ``python``, ``topic``, ``dependency``,
        ``dependency_detail``, ``requires_python``, ``project_urls`` and
        ``upload_time``.
    """
    index = [[], []]
    names = []
    python2s = []
    python3s = []
    pythons = []
    topics = []
    dependencies = []
    dependency_details = []
    requires_pythons = []
    urls = []
    release_times = []

    for package_info in package_infos:
        releases = package_info['releases']
        index[0].extend([package_info['_id']] * len(releases))
        index[1].extend(releases)

        for release in package_info['info']:
            release_info = release['info']

            python2, python3, python, topic = _classify_release(
                release_info['classifiers']
            )
            python2s.append(python2)
            python3s.append(python3)
            pythons.append(python)
            topics.append(topic)

            requires_pythons.append(release_info['requires_python'])
            names.append(release_info['name'])

            dependency = []
            dependency_detail = []
            for dist in release_info['requires_dist'] or ():
                parsed = _parse_requirement(dist)
                if parsed is None:
                    # Requirement of an optional extra: skip this entry only,
                    # later entries may still be unconditional dependencies.
                    continue
                dependency.append(parsed[0])
                dependency_detail.append(parsed[1])
            dependencies.append(dependency)
            dependency_details.append(dependency_detail)

            urls.append(release_info['project_urls'])

            # Date part of the first uploaded file's timestamp; None when the
            # release has no files.
            release_files = release['urls']
            if release_files:
                release_times.append(release_files[0]['upload_time'].split('T')[0])
            else:
                release_times.append(None)

    data = {
        'name': names,
        'python2': python2s,
        'python3': python3s,
        'python': pythons,
        'topic': topics,
        'dependency': dependencies,
        'dependency_detail': dependency_details,
        'requires_python': requires_pythons,
        'project_urls': urls,
        'upload_time': release_times
    }

    df = pd.DataFrame(data=data, index=index)
    return df

In [6]:
# Parse every crawl dump in ./BIG_JSON and stack the per-file frames.
package_paths = [i for i in os.listdir('./BIG_JSON') if i != '.ipynb_checkpoints']

frames = []
for path in package_paths:
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(f'./BIG_JSON/{path}', encoding='utf-8') as f:
        package_infos = json.load(f)
    frames.append(main(package_infos))

# Concatenate once: calling pd.concat inside the loop re-copies all rows
# accumulated so far on every iteration.  Raises ValueError if the directory
# contained no files.
df = pd.concat(frames)

#### 一共抓到多少包

In [7]:
# Number of rows (releases) and number of distinct first-level index values
# (package ids).
release_count = len(df)
package_count = len(set(df.index.get_level_values(0)))
print(release_count, package_count)

1139283 143124


#### 有多少在classifiers中明确标出python3的

In [8]:
def filt(x):
    """Return True when any string in ``x`` contains the substring ``'3.'``."""
    for version in x:
        if '3.' in version:
            return True
    return False

# Rows that either carry the dot-less Python 3 flag or list a version
# containing "3." in their `python` column.
python3_flag = df['python3'] == True
python3_minor = df['python'].apply(filt)
sum(python3_flag | python3_minor)

631519

#### 有多少给出github网址的

In [9]:
def filt(x):
    """Return True when ``x`` is a mapping with a value containing ``'github'``.

    ``None`` yields False.
    """
    if x is None:
        return False
    for url in x.values():
        if 'github' in url:
            return True
    return False

# Count the rows whose project_urls mention "github" in at least one value.
has_github_url = df['project_urls'].apply(filt)
sum(has_github_url)

794555

In [10]:
# Every distinct label used as a key in any non-null project_urls mapping.
url_keys = {
    label
    for project_urls in df['project_urls']
    if project_urls is not None
    for label in project_urls.keys()
}

url_keys

{'"Source Code"',
 '.git',
 'API',
 'API Docs Reference',
 'API Documentation',
 'API Product Data Dictionary',
 'API Project',
 'API Reference',
 'API Source',
 'API registration',
 'About',
 'About Us',
 'Addgene Guide',
 'Administration',
 'Algorithms Source',
 'Alpino',
 'Ancillary Tools',
 'App',
 'Apprenticeship Program',
 'ArXiv',
 'Archive',
 'Archives',
 'Argentina Chat Spanish',
 'Ask a Question',
 'Author',
 'Author LinkedIn',
 'Author WebSite',
 'Author website',
 "Author's Website",
 "Author's website",
 'Authors',
 'Azure Pipelines',
 'BIN Checker API',
 'Background',
 'Based on',
 'Blog',
 'Blog Post',
 'Blog post',
 'BoZeng',
 'Browse Source',
 'Bug Report',
 'Bug Reporting',
 'Bug Reports',
 'Bug Reports/Issues',
 'Bug Tracker',
 'Bug Tracking',
 'Bug reports',
 'Bug tracker',
 'Bug-Tracker',
 'Bugs',
 'Bugtracker',
 'Build status',
 'Builds',
 'But Tracker',
 'Buy Hardware',
 'Buy me a coffee',
 'Buy me a soda!',
 'C++ code repository',
 'CERN Mattermost/JAliEn',
 'CI

In [11]:
def url2github(x):
    """Extract the GitHub ``(owner, repo)`` pair from a ``project_urls`` mapping.

    Parameters
    ----------
    x : dict or None
        Mapping of link label -> URL.  Values that are not strings are ignored.

    Returns
    -------
    None
        ``x`` is None or empty.
    True
        ``x`` has entries, but none of them yields a usable GitHub
        ``owner/repo`` (no GitHub URL at all, an unknown URL form, or a URL
        without both path segments).
    False
        The URLs point at more than one distinct ``(owner, repo)`` pair.
    tuple of (str, str)
        The single ``(owner, repo)`` pair all usable GitHub URLs agree on.
    """
    if not x:
        return None

    prefixs = (
        'https://github.com/',
        'http://github.com/',
        'https://www.github.com/',
        'http://www.github.com/',
        'git@github.com:'
    )
    results = None
    githubs = {i for i in x.values() if isinstance(i, str) and 'github.com' in i}
    for url in githubs:
        for prefix in prefixs:
            if url.startswith(prefix):
                path = url[len(prefix):]
                break
        else:
            # A github.com URL of an unrecognised form (e.g. a subdomain).
            continue

        # Drop "#fragment" / "?query" so that ".../repo#readme" and ".../repo"
        # are recognised as the same repository.
        for separator in ('#', '?'):
            path = path.split(separator, 1)[0]

        parts = path.split('/')
        if len(parts) < 2:
            # Only an owner (or nothing) after the prefix.
            continue
        owner, repo = parts[:2]

        if repo.endswith('.git'):
            repo = repo[:-4]

        if not owner or not repo:
            # e.g. "https://github.com/owner/" -- no repository name.
            continue

        if results is not None and (owner, repo) != results:
            # Ambiguous: the URLs name different repositories.
            return False
        results = owner, repo

    return results if results is not None else True
# Add a `github` column holding url2github's result for each row's
# project_urls, then display it.
github_repos = df['project_urls'].apply(url2github)
df['github'] = github_repos
df['github']

hdfdict      0.1.1alpha           (SiggiGue, hdfdict)
             0.1alpha                            True
             0.3.1                (SiggiGue, hdfdict)
hdfe         0.0.3                (esantorella, hdfe)
             0.0.4                (esantorella, hdfe)
                                      ...            
auto_tagify  1.0           (ednapiranha, auto-tagify)
             1.1           (ednapiranha, auto-tagify)
             1.2           (ednapiranha, auto-tagify)
             1.3           (ednapiranha, auto-tagify)
             1.4           (ednapiranha, auto-tagify)
Name: github, Length: 1139283, dtype: object

In [12]:
# Persist the assembled frame next to the notebook.
pickle_path = './pypi_json.pkl'
df.to_pickle(pickle_path)

In [21]:
import requests

In [41]:
# Fetch one release-level and one project-level JSON document from PyPI.
# A timeout is passed because requests otherwise waits indefinitely on an
# unresponsive server; raise_for_status() surfaces HTTP errors here instead of
# as a confusing JSON decode failure in a later cell.
path_r = 'https://pypi.org/pypi/{}/{}/json'
res_r = requests.get(path_r.format('hdidx', '0.2.1'), timeout=30)
res_r.raise_for_status()

path_p = 'https://pypi.org/pypi/{}/json'
res_p = requests.get(path_p.format('hdfs'), timeout=30)
res_p.raise_for_status()

In [42]:
# Decode the release-level response (hdidx 0.2.1) and show its top-level keys.
info_r = json.loads(res_r.text)
info_r.keys()

dict_keys(['info', 'last_serial', 'releases', 'urls'])

In [43]:
# Decode the project-level response (hdfs) and show its trove classifiers.
info_p = json.loads(res_p.text)
info_p['info']['classifiers']

['Development Status :: 5 - Production/Stable',
 'Intended Audience :: Developers',
 'License :: OSI Approved :: MIT License',
 'Programming Language :: Python',
 'Programming Language :: Python :: 2.6',
 'Programming Language :: Python :: 2.7',
 'Programming Language :: Python :: 3.3',
 'Programming Language :: Python :: 3.4',
 'Programming Language :: Python :: 3.5',
 'Programming Language :: Python :: 3.6']