# exploring many `pyproject.toml` configs

In [1]:
    import pandas, requests, tomli
    # NOTE(review): later cells use `pipe`, `compose_left`, `get`, `partial`, a curried
    # `map` and `operator` without importing them, so they presumably come from this
    # star import.
    from toolz.curried import *
    # Give this namespace a `__path__` entry if it does not already have one.
    # Presumably this is what lets the relative import on the next line resolve when
    # the notebook runs as a top-level script -- TODO confirm.
    locals().setdefault("__path__", [""])    
    # `header` is unpacked as keyword arguments to `requests.post` in the pagination cell.
    from .info import header
    # Import requests_cache and call install_cache() with its default arguments.
    __import__("requests_cache").install_cache()

compose a graphql query to retrieve the `pyproject.toml` from a bunch of python projects. i'd love suggestions on a better query. currently, a lot of the requests come back with an empty `pyproject.toml`.

In [2]:
    # GraphQL search query template, filled in with `%` string formatting.
    # The single `%s` placeholder sits inside the `search(...)` argument list: the
    # pagination cell substitutes "" for the first request and an `after: "<cursor>"`
    # argument for the following ones. Because the template goes through `%`
    # formatting, any literal percent sign added to it would have to be written `%%`.
    # For every matching repository the query asks for `url`, `stargazerCount` and the
    # `text` of the blob at `HEAD:pyproject.toml`.
    query = """
    {
      search(type: REPOSITORY, query: "language:python stars:>500", first:100 %s) {
        pageInfo {
          hasNextPage endCursor
        }
        edges {  
            node {
            ... on Repository {
              url 
              stargazerCount
              object(expression:"HEAD:pyproject.toml") {
                ... on Blob {
                  text
                  
                }
              }
            }
          }
        }
      }
    }"""

paginate through the query to get multiple results

In [None]:
    # Page through the search results: each page's `endCursor` is fed back as the
    # `after` argument of the next request until `hasNextPage` is false or the page
    # budget is used up.
    MAX_PAGES = 15  # upper bound on the number of requests issued by this cell
    responses = []
    after_clause = ""  # empty for the first page

    for _ in range(MAX_PAGES):
        # assumes `header` is a dict of keyword arguments for requests.post
        # (it is imported from `.info`) -- TODO confirm
        response = requests.post(
            "https://api.github.com/graphql", json=dict(query=query % after_clause), **header
        )
        # Stop *before* recording a failed request: later cells call
        # .json()["data"]["search"] on every entry of `responses`, so a non-200
        # response must not end up in the list.
        if response.status_code != 200:
            break
        # A 200 response can still lack usable data (GraphQL reports query errors in
        # the response body), so check the payload once and reuse it.
        search = (response.json().get("data") or {}).get("search")
        if not search:
            break
        responses.append(response)
        if not search["pageInfo"]["hasNextPage"]:
            break
        # Format the cursor into its own clause first so the query template itself is
        # only ever formatted once.
        after_clause = ', after: "%s" ' % search["pageInfo"]["endCursor"]

transform the responses into a big `pandas` dataframe of `configs`

In [None]:
    def _parse_toml(text):
        """Parse one pyproject.toml body.

        Returns the parsed dict, or None when `text` is not a string (no blob text
        was returned) or is not valid TOML, so that one bad file does not abort the
        whole scrape.
        """
        if not isinstance(text, str):
            return None
        try:
            return tomli.loads(text)
        except tomli.TOMLDecodeError:
            return None

    # `df`: one entry per repository returned by the search, each the raw `node` dict
    # (url, stargazerCount, object). The pages are flattened row-wise rather than
    # concatenated side by side and stacked, which avoids a frame whose columns all
    # share the label "node".
    df = pandas.Series(
        [
            edge["node"]
            for response in responses
            for edge in response.json()["data"]["search"]["edges"]
        ],
        dtype=object,
    )

    # `configs`: one row per repository url that has a parseable pyproject.toml, one
    # column per top-level TOML key.
    configs = (
        df.apply(pandas.Series)
        .dropna(subset=["object"])  # repositories without the requested object
        .set_index("url")["object"]
        .apply(pandas.Series)["text"]
        .apply(_parse_toml)
        .dropna()  # missing text or invalid TOML
        .apply(pandas.Series)
    )

In [None]:
    # Report how many requests were made (`responses`), how many repository entries
    # they returned (`df`) and how many parsed configs were obtained (`configs`).
    print(F"""we made {len(responses)} requests returning information about a {len(df)} repositories.
    we retrieved {len(configs)} from this scrape.
    """)

## inspecting the build backend

In [None]:
    # Keep only the projects that have a `build-system` entry and spread each entry's
    # keys out into columns.
    build_tables = configs["build-system"].dropna()
    builds = build_tables.apply(pandas.Series)

In [None]:
    # Displayed as the cell result: the number of rows in `builds`.
    F"""{len(builds)} projects define a build backends."""

the different build backends frequencies

In [None]:
    # Count how often each `build-backend` value occurs and show the counts as a
    # single wide row.
    backend_counts = builds["build-backend"].dropna().value_counts()
    backend_counts.to_frame("build-backend").T

## inspecting the tools

the different tool frequencies

In [None]:
    # `tools`: the `tool` entry of every project that has one.
    tools = configs["tool"].dropna()
    # list(...) of each entry yields its keys (assuming each entry is a mapping --
    # TODO confirm); spread them into columns, stack them back into one long Series
    # and count how often each name occurs.
    tool_names = tools.apply(list)
    ranks = tool_names.apply(pandas.Series).stack().value_counts()

In [None]:
    # Names that occur more than four times, shown as a single wide row.
    popular = ranks.loc[ranks > 4]
    popular.to_frame("top").T

In [None]:
    # Remove the column-width limit so the long name lists in the next cell are not
    # truncated when displayed.
    pandas.set_option("display.max_colwidth", None)

In [None]:
    # Group the names that occur four times or fewer by their count, collect the names
    # of each group into a list, and show the groups from highest count to lowest.
    rare = ranks[ranks <= 4]
    (
        rare.to_frame("bottom")
        .reset_index()
        .groupby("bottom")
        .agg(list)
        .iloc[::-1]
    )