In [12]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import defaultdict
from pathlib import Path
from typing import Dict, Any, Set
from fd_analysis.prepare import filter_corrupt_files, get_depsfiles, reduce_directory_levels


def exctract_code_directories(codedir: Dict[str, Dict[str, Any]], project_name: str) -> Dict[tuple[str, str], int]:
    """Extract code directories and sum up the number of files in each directory.

    A directory whose name equals ``project_name`` (compared case-insensitively)
    is recorded under the fixed label ``"PROJECT_NAME"`` instead of its own name.

    Args:
        codedir: Mapping of directory name to a dict of file counts per source
            type. Only the ``"py"`` and ``"ipynb"`` entries are read.
        project_name: Name of the project the directories belong to.

    Returns:
        A ``defaultdict(int)`` mapping ``(directory_label, source_type)`` to the
        total number of files of that type, where ``source_type`` is ``"py"``
        or ``"ipynb"``.

    Raises:
        KeyError: If a directory entry has no ``"py"`` or ``"ipynb"`` count.
    """
    code_dirs = defaultdict(int)
    for folder, source_type_dict in codedir.items():
        label = "PROJECT_NAME" if folder.lower() == project_name.lower() else folder
        for source_type in ("py", "ipynb"):
            # Accumulate rather than overwrite: several folders can share a label
            # (e.g. "Foo" and "foo" both match the project name), and their
            # counts must be added together.
            code_dirs[(label, source_type)] += source_type_dict[source_type]
    return code_dirs


def get_python_projects(data: Dict[str, Dict[str, Any]]) -> Set[str]:
    """Get all projects that have Python code. Create a set of Python project names.

    A project is accepted when both its ``"code_dirs"`` and its ``"imports"``
    entries are truthy. Every rejected project is reported on stdout together
    with the values that caused the rejection.

    Args:
        data: Mapping of an arbitrary key to a per-project result dict. Each
            result dict must provide ``["metadata"]["project_name"]``,
            ``["code_dirs"]`` and ``["imports"]``.

    Returns:
        The set of project names that passed the check.
    """
    python_projects: Set[str] = set()
    for result in data.values():
        project_name = result["metadata"]["project_name"]
        # There should be .py or .ipynb files in the code_dirs.
        # If there are only .ipynb files and no imports, it is most likely
        # an R project.
        # Some projects are written in Python 2.X
        # (example: https://github.com/mattloose/RUFigs); FawltyDeps does not
        # work for those and the results are not reliable.
        # We assume that all Python projects have 3rd-party imports.
        if result["code_dirs"] and result["imports"]:
            # Only the name is needed here, so no per-directory counts or
            # intermediate DataFrame are built.
            python_projects.add(project_name)
        else:
            print(f"Project {project_name} is not considered a Python project.")
            print("Imports: ", result["imports"])
            print("Code directories: ", result["code_dirs"])

    return python_projects


In [13]:
# Import PyPI data: every entry of the results directory is handed to
# filter_corrupt_files.

# paths = "../data/pypi_0.13.1.1/"
paths = "../data/results_pypi_20240423/"

# os.listdir returns entries in arbitrary order; sort them so the files are
# processed in the same order on every run and machine.
data_pypi, corrupt_pypi = filter_corrupt_files([Path(paths) / file for file in sorted(os.listdir(paths))])

print("Corrupt: ", corrupt_pypi)
len(data_pypi)

Corrupt:  []


1160

In [14]:
# Keep only the projects accepted by get_python_projects (rejected ones are
# printed by that function), then display how many remain.
python_projects = get_python_projects(data_pypi)
len(python_projects)

Project azure-sdk-for-python is not considered a Python project.
Imports:  []
Code directories:  None
Project python-workflows is not considered a Python project.
Imports:  []
Code directories:  None
Project backcall is not considered a Python project.
Imports:  []
Code directories:  {'backcall': {'py': 3, 'ipynb': 0, 'total': 3}, 'tests': {'py': 2, 'ipynb': 0, 'total': 2}, 'docs': {'py': 1, 'ipynb': 0, 'total': 1}, '.': {'py': 0, 'ipynb': 1, 'total': 1}}
Project pyjson5 is not considered a Python project.
Imports:  []
Code directories:  {'json5': {'py': 8, 'ipynb': 0, 'total': 8}, 'tests': {'py': 5, 'ipynb': 0, 'total': 5}, 'benchmarks': {'py': 1, 'ipynb': 0, 'total': 1}}
Project configparser is not considered a Python project.
Imports:  []
Code directories:  {'tests': {'py': 3, 'ipynb': 0, 'total': 3}, 'backports/configparser': {'py': 2, 'ipynb': 0, 'total': 2}, 'docs': {'py': 1, 'ipynb': 0, 'total': 1}}
Project python-secret-manager is not considered a Python project.
Imports:  []
C

1118