In [1]:
# Old method
from pkg_resources import working_set

unknown_license = "UNKNOWN"
all_packages_old = []


def parse_license(license_str: str) -> str:
    # Get license
    if not license_str:
        return ""
    license_str = license_str.strip()

    if "copyright (c) " in license_str:
        license_str = license_str.lower().split("copyright (c) ")[0]

    if len(license_str) > 300:
        license_str = license_str[:300]

    license_str = license_str if license_str else unknown_license
    return license_str


for key in working_set.normalized_to_canonical_keys:
    """
    The metadata of a package is stored in "metadata" or "PKG-INFO"
    """
    try:
        package_name = working_set.normalized_to_canonical_keys[key]
        generic_name = package_name.replace("-", "_").lower()
        package = working_set.by_key[package_name]
        metadata_lines = package.get_metadata("METADATA").split("\n")
    except:
        try:
            metadata_lines = package.get_metadata("PKG-INFO").split("\n")
        except:
            all_packages_old.append([package_name, unknown_license, generic_name])
            continue

    license_arg_raw = ""
    license_classifier_raw = ""
    for line in metadata_lines:
        line: str = line
        if line.startswith("License-Expression: "):
            license_arg_raw = line.replace("License-Expression: ", "")
        if line.startswith("License: "):
            license_arg_raw = line.replace("License: ", "")
        if line.startswith("Classifier: License ::"):
            license_classifier_raw = line.split(" :: ")[-1]
            if license_classifier_raw.lower() in ["osi approved"]:
                license_classifier_raw = ""

    general_license = "NOT FOUND"
    license_arg = parse_license(license_arg_raw)
    license_classifier = parse_license(license_classifier_raw)

    # You can get the license from license argument or classifier
    if license_arg is not "" and license_arg != unknown_license:
        general_license = license_arg
    elif license_classifier:
        general_license = license_classifier
    else:
        general_license = unknown_license

    all_packages_old.append([package_name, general_license, generic_name])

all_packages_old.sort()
print("Number of packages found (old method):", len(all_packages_old))
all_packages_old

Number of packages found (old method): 223


  from pkg_resources import working_set
  if license_arg is not "" and license_arg != unknown_license:


[['annotated-types', 'MIT License', 'annotated_types'],
 ['anyio', 'MIT', 'anyio'],
 ['argon2-cffi', 'MIT', 'argon2_cffi'],
 ['argon2-cffi-bindings', 'MIT', 'argon2_cffi_bindings'],
 ['arrow', 'Apache Software License', 'arrow'],
 ['asttokens', 'Apache 2.0', 'asttokens'],
 ['async-lru', 'MIT License', 'async_lru'],
 ['athena-clients', 'UNKNOWN', 'athena_clients'],
 ['attrs', 'MIT', 'attrs'],
 ['authlib', 'BSD-3-Clause', 'authlib'],
 ['autocommand', 'LGPLv3', 'autocommand'],
 ['babel', 'BSD-3-Clause', 'babel'],
 ['backports.tarfile', 'MIT License', 'backports.tarfile'],
 ['beartype', 'MIT License', 'beartype'],
 ['beautifulsoup4', 'MIT License', 'beautifulsoup4'],
 ['black', 'MIT', 'black'],
 ['bleach', 'Apache Software License', 'bleach'],
 ['build', 'MIT License', 'build'],
 ['cachecontrol', 'Apache-2.0', 'cachecontrol'],
 ['cachetools', 'MIT', 'cachetools'],
 ['catalogue', 'MIT', 'catalogue'],
 ['certifi', 'MPL-2.0', 'certifi'],
 ['cffi', 'MIT', 'cffi'],
 ['charset-normalizer', 'MIT'

In [2]:
from copy_pkg_resources_2 import get_all_package_names

all_packages = get_all_package_names()
print("Number of packages found (new method):", len(all_packages))

Number of packages found (new method): 237


In [3]:
import importlib_metadata as im


def get_license(name: str) -> str:
    try:
        metas = im.metadata(name)
    except im.PackageNotFoundError:
        return unknown_license
    all_metas = str(metas).split("\n")
    for m in all_metas:
        if m.startswith("License-Expression: "):
            license = m.replace("License-Expression: ", "")
            if license and license != "UNKNOWN":
                return license
        if m.startswith("License: "):
            license = m.replace("License: ", "")
            if license and license != "UNKNOWN":
                return license
        if m.startswith("Classifier: License ::"):
            license = m.split("::")[-1]
            if license and license != "UNKNOWN":
                return license

    return unknown_license

In [4]:
all_packages_new = []
for name in get_all_package_names():
    try:
        generic_name = name.replace("-", "_").lower()
        all_packages_new.append([name, get_license(name), generic_name])
    except:
        print(f"Failed to get license for {name}")
        continue

all_packages_new.sort()
all_packages_new

[['MarkupSafe', 'Copyright 2010 Pallets', 'markupsafe'],
 ['PyYAML', 'MIT', 'pyyaml'],
 ['Send2Trash', 'BSD License', 'send2trash'],
 ['annotated-types', ' MIT License', 'annotated_types'],
 ['anyio', 'MIT', 'anyio'],
 ['argon2-cffi', 'MIT', 'argon2_cffi'],
 ['argon2-cffi-bindings', 'MIT', 'argon2_cffi_bindings'],
 ['arrow', ' Apache Software License', 'arrow'],
 ['asttokens', 'Apache 2.0', 'asttokens'],
 ['async-lru', 'MIT License', 'async_lru'],
 ['athena-clients', 'UNKNOWN', 'athena_clients'],
 ['attrs', 'MIT', 'attrs'],
 ['authlib', 'BSD-3-Clause', 'authlib'],
 ['autocommand', 'LGPLv3', 'autocommand'],
 ['autocommand', 'LGPLv3', 'autocommand'],
 ['babel', 'BSD-3-Clause', 'babel'],
 ['backports.tarfile', ' MIT License', 'backports.tarfile'],
 ['beartype', 'MIT License', 'beartype'],
 ['beautifulsoup4', 'MIT License', 'beautifulsoup4'],
 ['black', 'MIT', 'black'],
 ['bleach', 'Apache Software License', 'bleach'],
 ['build', ' MIT License', 'build'],
 ['cachecontrol', 'Apache-2.0', 'c

In [5]:
import pandas as pd

all_packages = pd.DataFrame(
    columns=[
        "package_name_old",
        "package_name_new",
        "package_name_general",
        "license_new",
        "license_old",
    ]
)

all_packages_new_2 = all_packages_new.copy()

# All old packages
for i in all_packages_old:
    new_package = []
    # Match with new packages
    for j in all_packages_new_2:
        if j[2] == i[2]:
            new_package = j
            all_packages_new_2.remove(j)

    new_row = pd.Series(
        {
            "package_name_old": i[0].strip(),
            "package_name_new": new_package[0].strip() if new_package else "",
            "package_name_general": i[2].strip(),
            "license_new": new_package[1].strip() if new_package else "",
            "license_old": i[1].strip(),
        }
    )
    all_packages = pd.concat([all_packages, new_row.to_frame().T])

# Add new packages NOT matches an old one
for i in all_packages_new_2:

    new_row = pd.Series(
        {
            "package_name_new": i[0].strip(),
            "package_name_general": i[2].strip(),
            "license_new": i[1].strip(),
        }
    )
    all_packages = pd.concat([all_packages, new_row.to_frame().T])

all_packages = all_packages.fillna("")
all_packages.head()

Unnamed: 0,package_name_old,package_name_new,package_name_general,license_new,license_old
0,annotated-types,annotated-types,annotated_types,MIT License,MIT License
0,anyio,anyio,anyio,MIT,MIT
0,argon2-cffi,argon2-cffi,argon2_cffi,MIT,MIT
0,argon2-cffi-bindings,argon2-cffi-bindings,argon2_cffi_bindings,MIT,MIT
0,arrow,arrow,arrow,Apache Software License,Apache Software License


In [6]:
all_packages.to_csv("test_1.csv", index=False)

In [7]:
print(
    f"Number of mismatching linceses {(all_packages.license_new != all_packages.license_old).sum()}"
)

Number of mismatching linceses 16


In [8]:
all_packages[all_packages.license_new != all_packages.license_old].head()

Unnamed: 0,package_name_old,package_name_new,package_name_general,license_new,license_old
0,packaging,packaging,packaging,Apache Software License,BSD License
0,pydantic-core,pydantic-core,pydantic_core,MIT License,MIT
0,sympy,sympy,sympy,BSD,New BSD License (see the [LICENSE](LICENSE) fi...
0,,autocommand,autocommand,LGPLv3,
0,,importlib-metadata,importlib_metadata,Apache Software License,


In [9]:
print(
    f"Number of missing from new packages {len(all_packages[all_packages.package_name_new == ""])}"
)

Number of missing from new packages 0


In [10]:
print(
    f"Number of missing from old packages {len(all_packages[all_packages.package_name_old == ""])}"
)

Number of missing from old packages 13


In [11]:
mismatches = all_packages.loc[
    all_packages.license_new != all_packages.license_old
].sort_values(by=["package_name_general"])

mismatches.to_csv("mismatches.csv", index=False)

mismatches

Unnamed: 0,package_name_old,package_name_new,package_name_general,license_new,license_old
0,,autocommand,autocommand,LGPLv3,
0,,importlib-metadata,importlib_metadata,Apache Software License,
0,,jaraco.context,jaraco.context,MIT License,
0,,jaraco.functools,jaraco.functools,MIT License,
0,,jaraco.text,jaraco.text,MIT License,
0,,license-scanner,license_scanner,MIT License,
0,,more-itertools,more_itertools,MIT License,
0,packaging,packaging,packaging,Apache Software License,BSD License
0,,packaging,packaging,Apache Software License,
0,,platformdirs,platformdirs,MIT,


In [13]:
str(im.metadata("packaging")).split("\n")

['Metadata-Version: 2.3',
 'Name: packaging',
 'Version: 25.0',
 'Summary: Core utilities for Python packages',
 'Author-email: Donald Stufft <donald@stufft.io>',
 'Requires-Python: >=3.8',
 'Description-Content-Type: text/x-rst',
 'Classifier: Development Status :: 5 - Production/Stable',
 'Classifier: Intended Audience :: Developers',
 'Classifier: License :: OSI Approved :: Apache Software License',
 'Classifier: License :: OSI Approved :: BSD License',
 'Classifier: Programming Language :: Python',
 'Classifier: Programming Language :: Python :: 3',
 'Classifier: Programming Language :: Python :: 3 :: Only',
 'Classifier: Programming Language :: Python :: 3.8',
 'Classifier: Programming Language :: Python :: 3.9',
 'Classifier: Programming Language :: Python :: 3.10',
 'Classifier: Programming Language :: Python :: 3.11',
 'Classifier: Programming Language :: Python :: 3.12',
 'Classifier: Programming Language :: Python :: 3.13',
 'Classifier: Programming Language :: Python :: Impl