In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import defaultdict
from pathlib import Path
from typing import Dict, Any, Set
from fd_analysis.prepare import filter_corrupt_files, get_depsfiles, reduce_directory_levels


def exctract_code_directories(codedir: Dict[str, Dict[str, Any]], project_name: str) -> Dict[tuple[str, str], int]:
    """Extract code directories and sum up the number of files in each directory.

    Args:
        codedir: Mapping of folder name to a dict of per-type file counts.
            Only the "py" and "ipynb" entries are read.
        project_name: Name of the project. A folder whose name equals it
            (case-insensitively) is reported under the fixed label
            "PROJECT_NAME" instead of its own name.

    Returns:
        A ``defaultdict(int)`` keyed by ``(folder_label, source_type)`` with the
        number of files of that type. Folders that end up with the same label
        (e.g. "Foo" and "foo" for project "foo") have their counts added
        together rather than the later one overwriting the earlier one.

    Raises:
        KeyError: If a folder entry lacks the "py" or "ipynb" count.
    """
    code_dirs = defaultdict(int)
    normalized_project_name = project_name.lower()
    for folder, source_type_dict in codedir.items():
        # Collapse the project's own package directory onto a common label so
        # that it can be compared across projects.
        label = "PROJECT_NAME" if folder.lower() == normalized_project_name else folder
        for source_type in ("py", "ipynb"):
            code_dirs[(label, source_type)] += source_type_dict[source_type]
    return code_dirs


def get_python_projects(data: Dict[str, Dict[str, Any]]) -> tuple[Set[str], list[Dict[str, Dict[str, Any]]]]:
    """Split projects into those that have Python code and those that do not.

    Args:
        data: Mapping whose values are per-project result dicts. Each value is
            read at ``["metadata"]["project_name"]``, ``["code_dirs"]`` and
            ``["imports"]``; the mapping's keys are not used.

    Returns:
        A pair ``(python_projects, non_python_projects)``:

        * ``python_projects`` is the set of names of projects that have both a
          non-empty ``code_dirs`` and a non-empty ``imports``.
        * ``non_python_projects`` is a list with one single-key dict per
          remaining project, ``{project_name: {"imports": ..., "code_dirs": ...}}``,
          in the iteration order of ``data``.
    """
    python_projects = set()
    non_python_projects = []
    for d in data.values():
        project_name = d["metadata"]["project_name"]
        # There should be .py or .ipynb files in the code_dirs
        # If there are only .ipynb files and no imports, then
        # it is most likely an R project
        # There are some projects written in Python 2.X,
        # example: https://github.com/mattloose/RUFigs
        # For those, FawltyDeps does not work and the results are not reliable.
        # We assume that all Python projects have 3-rd party imports.
        if d["code_dirs"] and d["imports"]:
            # Only the project name is needed here, so it is collected directly
            # instead of being read back from an intermediate DataFrame index.
            python_projects.add(project_name)
        else:
            non_python_projects.append({project_name: {"imports": d["imports"], "code_dirs": d["code_dirs"]}})

    return python_projects, non_python_projects


## For PyPI data

In [2]:
# Load the PyPI data: every file in the results directory is handed to
# filter_corrupt_files, which gives back the loaded data and the corrupt files.

# paths = "../data/pypi_0.13.1.1/"
paths = "../data/results_pypi_20240423/"

pypi_result_files = [Path(paths, file_name) for file_name in os.listdir(paths)]
data_pypi, corrupt_pypi = filter_corrupt_files(pypi_result_files)

print("Corrupt: ", corrupt_pypi)
len(data_pypi)

Corrupt:  []


1160

In [3]:
# Split the PyPI projects into Python projects and the remaining ones,
# then print the size of each group.
python_projects, non_python_projects = get_python_projects(data_pypi)
print(len(python_projects), len(non_python_projects))

1118 42


In [4]:
# Sanity check: a project without any code directory should not have imports.
# Every entry of non_python_projects is a single-key {project_name: info} dict,
# so iterating over its values visits exactly that one info dict.
no_code_dirs_only = [
    project
    for project in non_python_projects
    for info in project.values()
    if info["code_dirs"] is None and len(info["imports"])
]
# The label now matches the filter above (code_dirs missing, imports present).
print(f"[Sanity check] Number of projects without code_dirs but with some imports: {len(no_code_dirs_only)}")
no_code_dirs_only

[Sanity check] Number of projects with code_dirs and some imports: 0


[]

In [5]:
# Projects that have code directories but an empty list of imports.
# Every entry of non_python_projects is a single-key {project_name: info} dict,
# so iterating over its values visits exactly that one info dict.
no_imports_only = [
    project
    for project in non_python_projects
    for info in project.values()
    if not info["imports"] and info["code_dirs"] is not None
]
print("Number of projects with no imports and code_dirs: ", len(no_imports_only))
no_imports_only

Number of projects with no imports and code_dirs:  15


[{'backcall': {'imports': [],
   'code_dirs': {'backcall': {'py': 3, 'ipynb': 0, 'total': 3},
    'tests': {'py': 2, 'ipynb': 0, 'total': 2},
    'docs': {'py': 1, 'ipynb': 0, 'total': 1},
    '.': {'py': 0, 'ipynb': 1, 'total': 1}}}},
 {'pyjson5': {'imports': [],
   'code_dirs': {'json5': {'py': 8, 'ipynb': 0, 'total': 8},
    'tests': {'py': 5, 'ipynb': 0, 'total': 5},
    'benchmarks': {'py': 1, 'ipynb': 0, 'total': 1}}}},
 {'configparser': {'imports': [],
   'code_dirs': {'tests': {'py': 3, 'ipynb': 0, 'total': 3},
    'backports/configparser': {'py': 2, 'ipynb': 0, 'total': 2},
    'docs': {'py': 1, 'ipynb': 0, 'total': 1}}}},
 {'CommonMark-py': {'imports': [],
   'code_dirs': {'CommonMark': {'py': 3, 'ipynb': 0, 'total': 3},
    '.': {'py': 1, 'ipynb': 0, 'total': 1},
    'bin': {'py': 1, 'ipynb': 0, 'total': 1},
    'CommonMark/test': {'py': 1, 'ipynb': 0, 'total': 1}}}},
 {'ptyprocess': {'imports': [],
   'code_dirs': {'tests': {'py': 6, 'ipynb': 0, 'total': 6},
    'ptyprocess

In [6]:
# Projects for which neither imports nor code directories were recorded.
# Every entry of non_python_projects is a single-key {project_name: info} dict,
# so iterating over its values visits exactly that one info dict.
project_with_missing_both = [
    project
    for project in non_python_projects
    for info in project.values()
    if not info["imports"] and info["code_dirs"] is None
]
print("Number of projects with no imports and no code_dirs: ",len(project_with_missing_both))
project_with_missing_both

Number of projects with no imports and no code_dirs:  27


[{'azure-sdk-for-python': {'imports': [], 'code_dirs': None}},
 {'python-workflows': {'imports': [], 'code_dirs': None}},
 {'python-secret-manager': {'imports': [], 'code_dirs': None}},
 {'googleapis': {'imports': [], 'code_dirs': None}},
 {'cdk-nag': {'imports': [], 'code_dirs': None}},
 {'pybcj': {'imports': [], 'code_dirs': None}},
 {'python-appengine-logging': {'imports': [], 'code_dirs': None}},
 {'core': {'imports': [], 'code_dirs': None}},
 {'python-cloudbuild': {'imports': [], 'code_dirs': None}},
 {'python-kms': {'imports': [], 'code_dirs': None}},
 {'keep-alive': {'imports': [], 'code_dirs': None}},
 {'python-dataplex': {'imports': [], 'code_dirs': None}},
 {'python-videointelligence': {'imports': [], 'code_dirs': None}},
 {'python-recommendations-ai': {'imports': [], 'code_dirs': None}},
 {'python-redis': {'imports': [], 'code_dirs': None}},
 {'python-orchestration-airflow': {'imports': [], 'code_dirs': None}},
 {'python-dataproc-metastore': {'imports': [], 'code_dirs': None

## For Biomed data

In [7]:
# Load the biomedical data: every file in the results directory is handed to
# filter_corrupt_files, which gives back the loaded data and the corrupt files.

paths = "../data/biomedical_0.13.1.1/"

biomed_result_files = [Path(paths, file_name) for file_name in os.listdir(paths)]
data_biomed, corrupt_biomed = filter_corrupt_files(biomed_result_files)

print("Corrupt: ", corrupt_biomed)
len(data_biomed)

Corrupt:  []


1863

In [8]:
# Split the biomedical projects into Python projects and the remaining ones,
# then print the size of each group.
biomed_python_projects, biomed_non_python_projects = get_python_projects(data_biomed)
print(len(biomed_python_projects), len(biomed_non_python_projects))

1356 507


In [9]:
# Break down the biomedical non-Python projects by what is missing.
# Every entry of biomed_non_python_projects is a single-key
# {project_name: info} dict, so iterating over its values visits exactly that
# one info dict.

# Sanity check: a project without any code directory should not have imports.
biomed_no_code_dirs_only = [
    project
    for project in biomed_non_python_projects
    for info in project.values()
    if info["code_dirs"] is None and len(info["imports"])
]
# The label now matches the filter above (code_dirs missing, imports present).
print(f"[Sanity check] Number of projects without code_dirs but with some imports: {len(biomed_no_code_dirs_only)}")

# Code directories are present, but no imports were found.
biomed_no_imports_only = [
    project
    for project in biomed_non_python_projects
    for info in project.values()
    if not info["imports"] and info["code_dirs"] is not None
]
print("Number of projects with no imports and code_dirs: ", len(biomed_no_imports_only))

# Neither imports nor code directories were recorded.
biomed_project_with_missing_both = [
    project
    for project in biomed_non_python_projects
    for info in project.values()
    if not info["imports"] and info["code_dirs"] is None
]
print("Number of projects with no imports and no code_dirs: ", len(biomed_project_with_missing_both))

[Sanity check] Number of projects with code_dirs and some imports: 0
Number of projects with no imports and code_dirs:  117
Number of projects with no imports and no code_dirs:  390
