# Gathered PyPI package info

In [147]:
import yaml
import json
import os
import datetime
import dateutil.parser
import datetime
import requests

from tqdm import tqdm
from dateutil.relativedelta import relativedelta
from thoth.python import PackageVersion
from packaging.version import parse
from itertools import chain

This notebook uses data produced by [fridex/pypi-gather-package-info](https://github.com/fridex/pypi-gather-package-info). The dataset can be downloaded from Thoth's Ceph, hosted on an OCP4 cluster.

In [3]:
# Aggregate release information per calendar month across all gathered YAML files:
#   release_dates:     (year, month) -> number of releases uploaded in that month
#   released_packages: (year, month) -> list of (normalized package name, version) tuples
DATA_DIR = "../data/"

release_dates = {}
released_packages = {}

for f_yaml in tqdm(os.listdir(DATA_DIR)):
    if not f_yaml.endswith("_gathered_pypi_package_info.yaml"):
        print(f"Skipping file {f_yaml!r}...")
        continue

    with open(os.path.join(DATA_DIR, f_yaml)) as f:
        content = yaml.safe_load(f)

    for package_info in content["packages_info"]:
        # The name is the same for every release of a package, so normalize it once
        # per package instead of once per release.
        package_name = PackageVersion.normalize_python_package_name(package_info["info"]["name"])

        for version, release in package_info["releases"].items():
            # We take info just from the first artifact of a release; a release
            # without any artifact has no upload time and is skipped.
            if not release:
                continue

            d = dateutil.parser.parse(release[0]["upload_time_iso_8601"])
            key = (d.year, d.month)

            release_dates[key] = release_dates.get(key, 0) + 1
            released_packages.setdefault(key, []).append((package_name, version))

    # Drop the parsed document before the next file is loaded to keep memory usage down.
    del content

100%|██████████| 1000/1000 [4:59:49<00:00, 17.99s/it]  


In [4]:
# Keys are (year, month) tuples, so their natural tuple ordering is chronological.
sorted_keys = sorted(release_dates.keys())

oldest, newest = sorted_keys[0], sorted_keys[-1]
(oldest, newest)

((2005, 3), (2021, 6))

In [6]:
# First day of the oldest and of the newest month that have at least one release.
start_date = datetime.date(year=oldest[0], month=oldest[1], day=1)
end_date = datetime.date(year=newest[0], month=newest[1], day=1)

In [7]:
# Display the first day of the oldest month found in the data.
start_date

datetime.date(2005, 3, 1)

In [8]:
# Display the first day of the newest month found in the data.
end_date

datetime.date(2021, 6, 1)

Now prepare data for plots.

In [9]:
# Walk month by month from start_date to end_date (both inclusive), so that
# months without any release still get an entry with the value 0.
plot_keys = []
plot_values = []
increment = relativedelta(months=1)

position = start_date

while True:
    plot_keys.append(f"{position.year}-{position.month}")
    plot_values.append(release_dates.get((position.year, position.month), 0))

    # end_date is the last month to include; stop once it has been recorded.
    if position == end_date:
        break

    position = position + increment

## Number of packages released on PyPI from March 2005 to June 2021

In [103]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

# One bar per month over the whole observed period.
p = figure(
    x_range=plot_keys,
    title=f"Releases published on PyPI per month ({plot_keys[0]} to {plot_keys[-1]})",
    x_axis_label="Month",
    y_axis_label="Number of releases",
)
p.vbar(x=plot_keys, top=plot_values, width=0.7)
# The categorical month labels are too dense to be readable horizontally.
p.xaxis.major_label_orientation = "vertical"

output_notebook()
show(p)

## Number of packages released on PyPI from December 2016 to June 2021

Let's look at packages that were published since December 2016, the month in which [Python 3.6 was released](https://docs.python.org/3/whatsnew/3.6.html).

In [123]:
# Look up which month sits at index 141 of the plot keys.
plot_keys[141]

'2016-12'

In [134]:
# Index of December 2016 in plot_keys; everything before it is treated as "old".
# Looked up rather than hard-coded so it stays correct if the covered period changes
# (raises ValueError if that month is not part of the data at all).
MARKER = plot_keys.index("2016-12")

In [135]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

# Only the months from MARKER onwards.
recent_keys = plot_keys[MARKER:]
recent_values = plot_values[MARKER:]

p = figure(
    x_range=recent_keys,
    title=f"Releases published on PyPI per month ({recent_keys[0]} to {recent_keys[-1]})",
    x_axis_label="Month",
    y_axis_label="Number of releases",
)
p.vbar(x=recent_keys, top=recent_values, width=0.7)
# The categorical month labels are too dense to be readable horizontally.
p.xaxis.major_label_orientation = "vertical"

output_notebook()
show(p)

In [129]:
# Total number of releases published from the marker month onwards.
# Uses MARKER (not a literal index) so it always matches the plotted slice.
sum(plot_values[MARKER:])

2075271

## Number of packages released on PyPI from March 2005 to December 2016

In [136]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

# Only the months before MARKER (the marker month itself is excluded).
old_keys = plot_keys[:MARKER]
old_values = plot_values[:MARKER]

p = figure(
    x_range=old_keys,
    title=f"Releases published on PyPI per month ({old_keys[0]} to {old_keys[-1]})",
    x_axis_label="Month",
    y_axis_label="Number of releases",
)
p.vbar(x=old_keys, top=old_values, width=0.7)
# The categorical month labels are too dense to be readable horizontally.
p.xaxis.major_label_orientation = "vertical"

output_notebook()
show(p)

In [138]:
# Total number of releases published in the months before the marker month.
sum(plot_values[:MARKER])

539026

## Create Thoth solver rules for packages released before December 2016

In [140]:
# Collect every (package name, version) pair released in a month before MARKER.
# The month keys are put into a set once, so the slice is not rebuilt and
# scanned linearly for every entry of released_packages.
months_to_block = set(plot_keys[:MARKER])

count = 0
packages_to_block = []

for k, v in released_packages.items():
    # k is a (year, month) tuple; plot keys use the "<year>-<month>" form.
    if f"{k[0]}-{k[1]}" in months_to_block:
        count += len(v)
        packages_to_block.extend(v)

In [141]:
# For every package keep only the highest version among the releases selected
# for blocking; versions are ordered by comparing packaging's parsed versions.
package_versions = {}

for name, version in packages_to_block:
    if name not in package_versions:
        # First version seen for this package.
        package_versions[name] = version
    elif parse(package_versions[name]) < parse(version):
        package_versions[name] = version

In [142]:
# One rule per package, covering every version up to and including the
# highest one recorded in package_versions.
solver_rules = [
    {
        "description": "Package released before December 2016",
        "index_url": "https://pypi.org/simple",
        "package_name": package_name,
        "version_specifier": f"<={package_version}",
    }
    for package_name, package_version in package_versions.items()
]


In [143]:
# Number of generated rules (one per package).
len(solver_rules)

80678

In [146]:
# Persist the generated rules so they can be inspected or submitted later.
with open("solver_rules.json", "w") as rules_file:
    json.dump(solver_rules, rules_file)

## Configure Thoth deployment to ignore packages released before December 2016

In [None]:
_MANAGEMENT_API = "https://management.test.thoth-station.ninja/api/v1/solver/python/rule"
# The secret is read from the environment so that it never ends up in the notebook.
_SECRET = os.getenv("THOTH_SECRET")

# Fail early with a clear message instead of sending requests without a secret.
if not _SECRET:
    raise RuntimeError("THOTH_SECRET environment variable must be set to register solver rules")

# A single session reuses the HTTP connection across all the rule submissions.
with requests.Session() as session:
    for solver_rule in solver_rules:
        # Pass the actual secret; previously an empty string was sent and _SECRET was unused.
        response = session.post(_MANAGEMENT_API, json=solver_rule, params={"secret": _SECRET})
        # Stop at the first rejected rule rather than silently continuing.
        response.raise_for_status()

### Dump pre-computed data to have them handy

In [97]:
import pickle

# Cache the per-month release counts so the slow YAML pass need not be repeated.
with open("release_dates.pickle", "wb") as dates_file:
    pickle.dump(release_dates, dates_file)

In [98]:
# Cache the per-month (package name, version) lists alongside the counts.
with open("released_packages.pickle", "wb") as packages_file:
    pickle.dump(released_packages, packages_file)

In [None]:
import pickle

# Restore the cached aggregates instead of re-reading all the YAML files.
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# come from a trusted source, such as a previous run of this notebook.
with open("release_dates.pickle", "rb") as dates_file:
    release_dates = pickle.load(dates_file)

with open("released_packages.pickle", "rb") as packages_file:
    released_packages = pickle.load(packages_file)