In [1]:
# Old method
from pkg_resources import working_set

unknown_license = "UNKNOWN"
all_packages_old = []


def parse_license(license_str: str) -> str:
    # Get license
    if not license_str:
        return ""
    license_str = license_str.strip()

    if "copyright (c) " in license_str:
        license_str = license_str.lower().split("copyright (c) ")[0]

    if len(license_str) > 300:
        license_str = license_str[:300]

    license_str = license_str if license_str else unknown_license
    return license_str


for key in working_set.normalized_to_canonical_keys:
    """
    The metadata of a package is stored in "metadata" or "PKG-INFO"
    """
    try:
        package_name = working_set.normalized_to_canonical_keys[key]
        generic_name = package_name.replace("-", "_").lower()
        package = working_set.by_key[package_name]
        metadata_lines = package.get_metadata("METADATA").split("\n")
    except:
        try:
            metadata_lines = package.get_metadata("PKG-INFO").split("\n")
        except:
            all_packages_old.append([package_name, unknown_license, generic_name])
            continue

    license_arg_raw = ""
    license_classifier_raw = ""
    for line in metadata_lines:
        line: str = line
        if line.startswith("License-Expression: "):
            license_arg_raw = line.replace("License-Expression: ", "")
        if line.startswith("License: "):
            license_arg_raw = line.replace("License: ", "")
        if line.startswith("Classifier: License ::"):
            license_classifier_raw = line.split(" :: ")[-1]
            if license_classifier_raw.lower() in ["osi approved"]:
                license_classifier_raw = ""

    general_license = "NOT FOUND"
    license_arg = parse_license(license_arg_raw)
    license_classifier = parse_license(license_classifier_raw)

    # You can get the license from license argument or classifier
    if license_arg is not "" and license_arg != unknown_license:
        general_license = license_arg
    elif license_classifier:
        general_license = license_classifier
    else:
        general_license = unknown_license

    all_packages_old.append([package_name, general_license, generic_name])

all_packages_old.sort()
all_packages_old

  from pkg_resources import working_set
  if license_arg is not "" and license_arg != unknown_license:


[['absl-py', 'Apache-2.0', 'absl_py'],
 ['accelerate', 'Apache', 'accelerate'],
 ['acme', 'Apache-2.0', 'acme'],
 ['adal', 'MIT', 'adal'],
 ['adlfs', 'BSD', 'adlfs'],
 ['aenum', 'BSD License', 'aenum'],
 ['agate', 'MIT', 'agate'],
 ['aioboto3', 'Apache-2.0', 'aioboto3'],
 ['aiobotocore', 'Apache-2.0', 'aiobotocore'],
 ['aiodns', 'MIT', 'aiodns'],
 ['aiofiles', 'Apache-2.0', 'aiofiles'],
 ['aiohappyeyeballs', 'PSF-2.0', 'aiohappyeyeballs'],
 ['aiohttp', 'Apache-2.0 AND MIT', 'aiohttp'],
 ['aiohttp-cors', 'Apache License, Version 2.0', 'aiohttp_cors'],
 ['aiohttp-retry', 'MIT', 'aiohttp_retry'],
 ['aioitertools', 'MIT License', 'aioitertools'],
 ['aioresponses', 'MIT License', 'aioresponses'],
 ['aiosignal', 'Apache 2.0', 'aiosignal'],
 ['aiosqlite', 'MIT License', 'aiosqlite'],
 ['alabaster', 'BSD License', 'alabaster'],
 ['alembic', 'MIT', 'alembic'],
 ['aliyun-python-sdk-core', 'Apache License 2.0', 'aliyun_python_sdk_core'],
 ['altair', 'Copyright (c) 2015-2023, Vega-Altair Developer

In [None]:
import importlib_metadata as im


def get_license(name: str) -> str:
    try:
        metas = im.metadata(name)
    except im.PackageNotFoundError:
        return ""
    all_metas = str(metas).split("\n")
    for m in all_metas:
        if m.startswith("License-Expression: "):
            return m.replace("License-Expression: ", "")
        if m.startswith("License: "):
            return m.replace("License: ", "")
        if m.startswith("Classifier: License ::"):
            return m.split("::")[-1]

    return ""

: 

In [None]:
import pkgutil

all_packages_new = []
for i in pkgutil.iter_modules():
    try:
        name = i.name
        generic_name = name.replace("-", "_").lower()
        all_packages_new.append([name, get_license(name), generic_name])
    except:
        continue

all_packages_new.sort()
all_packages_new

: 

In [None]:
import pandas as pd

all_packages = pd.DataFrame(
    columns=[
        "package_name_old",
        "package_name_new",
        "package_name_general",
        "license_new",
        "license_old",
    ]
)

all_packages_new_2 = all_packages_new.copy()

# All old packages
for i in all_packages_old:
    new_package = []
    # Match with new packages
    for j in all_packages_new_2:
        if j[2] == i[2]:
            new_package = j
            all_packages_new_2.remove(j)

    new_row = pd.Series(
        {
            "package_name_old": i[0].strip(),
            "package_name_new": new_package[0].strip() if new_package else "",
            "package_name_general": i[2].strip(),
            "license_new": new_package[1].strip() if new_package else "",
            "license_old": i[1].strip(),
        }
    )
    all_packages = pd.concat([all_packages, new_row.to_frame().T])

# Add new packages NOT matches an old one
for i in all_packages_new_2:

    new_row = pd.Series(
        {
            "package_name_new": i[0].strip(),
            "package_name_general": i[2].strip(),
            "license_new": i[1].strip(),
        }
    )
    all_packages = pd.concat([all_packages, new_row.to_frame().T])

all_packages = all_packages.fillna("")
all_packages.head()

: 

In [None]:
all_packages.to_csv("test_1.csv", index=False)

: 

In [None]:
print(
    f"Number of mismatching linceses {(all_packages.license_new != all_packages.license_old).sum()}"
)

: 

In [None]:
all_packages[all_packages.license_new != all_packages.license_old].head()

: 

In [None]:
print(
    f"Number of missing for new packages {len(all_packages[all_packages.package_name_new == ""])}"
)

: 

In [None]:
print(
    f"Number of missing for old packages {len(all_packages[all_packages.package_name_old == ""])}"
)

: 