# exploring many `pyproject.toml` configs

In [1]:
    import pandas, requests, tomli
    # star-import of toolz's curried namespace; it is the only visible source of the bare
    # names used further down (pipe, map, compose_left, get, partial, operator)
    from toolz.curried import *
    # give this namespace a `__path__` if it does not have one yet — presumably so the
    # relative import on the next line can resolve from a notebook; TODO confirm
    locals().setdefault("__path__", [""])    
    # `header` is unpacked as keyword arguments into `requests.post(...)` below;
    # presumably it carries the GitHub authorization headers — verify in `info`
    from .info import header
    # NOTE(review): presumably routes `requests` calls through requests_cache so that
    # re-running the notebook does not re-hit the API — confirm
    __import__("requests_cache").install_cache()

compose a graphql query to retrieve the `pyproject.toml` from a bunch of python projects. i'd love suggestions on a better query. currently, there are a lot of empty pyproject.toml requests.

In [2]:
    # GraphQL search for python repositories with more than 500 stars, 100 per page.
    # the single `%s` slot after `first:100` is filled in by the pagination cell: with ""
    # for the first page and with `, after: "<endCursor>"` for every later page.
    # `object(expression:"HEAD:pyproject.toml")` asks for the text of each repository's
    # pyproject.toml at HEAD.
    query = """
    {
      search(type: REPOSITORY, query: "language:python stars:>500", first:100 %s) {
        pageInfo {
          hasNextPage endCursor
        }
        edges {  
            node {
            ... on Repository {
              url 
              stargazerCount
              object(expression:"HEAD:pyproject.toml") {
                ... on Blob {
                  text
                  
                }
              }
            }
          }
        }
      }
    }"""

paginate through the query to get multiple results

In [3]:
    responses = []

    # at most 15 pages of 100 repositories each
    for i in range(15):
        q = query % ""
        if responses:
            # read the previous page's paging state once instead of parsing it twice
            page = responses[-1].json()["data"]["search"]["pageInfo"]
            if not page["hasNextPage"]:
                break
            q = query % """, after: "%s" """ % page["endCursor"]
        response = requests.post("https://api.github.com/graphql", json=dict(query=q), **header)
        # a failed request is dropped rather than recorded: the cells below index
        # straight into the ["data"]["search"] payload of every stored response.
        # GraphQL-level failures presumably can still arrive with HTTP 200 but no
        # "data" (TODO confirm), so those are treated the same way.
        if response.status_code != 200 or not response.json().get("data"):
            break
        responses.append(response)

transform the responses into a big `pandas` dataframe of `configs`

In [4]:
    # one frame of search edges per response
    pages = [
        pandas.DataFrame(response.json()["data"]["search"]["edges"])
        for response in responses
    ]
    # lay the pages side by side, then stack them into one long series of repository nodes
    df = pandas.concat(pages, axis=1).stack()

    # expand the node dicts into columns and keep only the repositories whose
    # `pyproject.toml` lookup returned something
    repos = df.apply(pandas.Series).dropna(subset="object").set_index("url")
    # pull the blob's `text` out, parse it as TOML, and spread the top-level
    # keys of every config over columns
    blobs = repos["object"].apply(pandas.Series)
    configs = blobs["text"].apply(tomli.loads).apply(pandas.Series)

In [5]:
    # report how many requests were made, how many rows `df` holds and how many
    # configs were parsed
    summary = F"""we made {len(responses)} requests returning information about a {len(df)} repositories.
    we retrieved {len(configs)} from this scrape.
    """
    print(summary)

we made 10 requests returning information about a 1000 repositories.
we retrieved 214 from this scrape.



## inspecting the build backend

In [6]:
    # spread each project's `build-system` entry over columns, skipping projects without one
    build_systems = configs["build-system"].dropna()
    builds = build_systems.apply(pandas.Series)

In [7]:
    # `builds` holds one row per project with a `build-system` table; the backend
    # tally in the next cell adds up to fewer rows, so not every table names a
    # build backend. report the count as what it is.
    F"""{len(builds)} projects define a build system."""

'140 projects define a build backends.'

the different build backends frequencies

In [8]:
    # tally how often each build backend is named, laid out as one wide row
    backend_counts = builds["build-backend"].dropna().value_counts()
    backend_counts.to_frame("build-backend").T

Unnamed: 0,setuptools.build_meta,poetry.core.masonry.api,hatchling.build,flit_core.buildapi,poetry.masonry.api,mesonpy,build_backend,maturin,pdm.pep517.api
build-backend,66,24,15,8,4,2,1,1,1


## inspecting the tools

the different tool frequencies

In [9]:
    # keep only the projects that have a `tool` entry
    tools = configs["tool"].dropna()
    # `list(...)` of each entry keeps just its keys — presumably the tool names,
    # assuming every entry is a TOML table
    tool_names = tools.apply(list)
    # one column per position, stacked into a single series, then counted per name
    ranks = tool_names.apply(pandas.Series).stack().value_counts()

In [10]:
    # tools that show up in more than four projects, shown as a single wide row
    popular = ranks[ranks.gt(4)]
    popular.to_frame("top").T

Unnamed: 0,black,isort,pytest,mypy,coverage,poetry,towncrier,pylint,hatch,setuptools,setuptools_scm,cibuildwheel,pyright,flit
top,120,84,59,39,33,30,19,16,15,14,14,10,10,6


In [11]:
    # lift the column-width limit so the long lists in the next table are not truncated
    pandas.set_option("display.max_colwidth", None)

In [12]:
    # group the tools used by four projects or fewer by how often they appear,
    # highest count first
    rare = ranks[ranks <= 4].to_frame("bottom").reset_index()
    rare.groupby("bottom").agg(list).iloc[::-1]

Unnamed: 0_level_0,index
bottom,Unnamed: 1_level_1
4,[flake8]
3,[tox]
2,"[versioningit, interrogate, tbump, nbqa, zimports, slotscheck, codespell, portray, devpy, vendoring, maturin, poe, distutils, blue, ruff, check-wheel-contents, bench, importlinter, poetry-version-plugin]"
1,"[bandit, flakeheaven, yapf, usort, doit, pdm, mutmut, vulture, aliases, pydocstyle, versioneer, check-manifest, hooky, pydantic-mypy, pyodide, pycln, jupyter-releaser, poetry-dynamic-versioning, autoflake]"
