In [1]:
# setup
from IPython.display import display, HTML
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

# Relative path, so it resolves against the kernel's current working directory.
# NOTE(review): presumably the folder holding input files; none of the cells
# visible here read from it -- confirm it is still needed.
data_path = Path("data")

In [2]:
def count_apis(library):
    """
    Summarise the public names exposed by an imported library.

    A name is treated as public when it is listed by ``dir(library)`` and does
    not begin with an underscore.

    Parameters:
        library: The imported library module to inspect.

    Returns:
        dict: Three entries --
            "library": the module's ``__name__``,
            "num_public_apis": how many public names were found,
            "list_public_api": those names, sorted case-insensitively.
    """
    exposed_names = []
    for name in dir(library):
        # A leading underscore marks private and dunder attributes; skip them.
        if name.startswith("_"):
            continue
        exposed_names.append(name)

    # Case-insensitive ordering, e.g. "ArrowDtype" sorts next to "arrays".
    exposed_names.sort(key=str.lower)

    return {
        "library": library.__name__,
        "num_public_apis": len(exposed_names),
        "list_public_api": exposed_names,
    }


# Example usage. `pd` and `pl` are the aliases bound by the imports in the
# setup cell at the top of the notebook, so they are not re-imported here.
print(f"Number of public APIs in Pandas: {count_apis(pd)['num_public_apis']}")
print(f"Number of public APIs in Polars: {count_apis(pl)['num_public_apis']}")

Number of public APIs in Pandas: 119
Number of public APIs in Polars: 192


In [3]:
def count_apis_dataframe(libraries):
    """
    Build a side-by-side table of the public APIs of several libraries.

    Parameters:
        libraries: List of imported library modules to inspect.

    Returns:
        DataFrame: One column per library, named after the library. Row 0 holds
                   the library name, row 1 the number of public APIs, and the
                   following rows the case-insensitively sorted API names.
                   Shorter columns are padded with empty strings.
    """
    columns_by_library = {}
    for module in libraries:
        info = count_apis(module)
        name = info["library"]
        # Header cells (name, count) followed by the API names themselves.
        columns_by_library[name] = [
            name,
            info["num_public_apis"],
            *info["list_public_api"],
        ]

    # orient="index" accepts rows of unequal length; transposing then turns
    # each library into a column, and the padding gaps are blanked out.
    table = pd.DataFrame.from_dict(columns_by_library, orient="index")
    return table.T.fillna("")

In [4]:
# Let tables of up to 500 rows render in full instead of being truncated.
pd.set_option("display.max_rows", 500)
df = count_apis_dataframe([pd, pl])
# Bare last expression: the notebook renders the frame with its rich repr
# rather than print()'s plain-text dump.
df

                   pandas                    polars
0                  pandas                    polars
1                     119                       192
2             annotations              align_frames
3                     api                       all
4                   array            all_horizontal
5                  arrays                       any
6              ArrowDtype            any_horizontal
7             bdate_range                       api
8            BooleanDtype                     apply
9             Categorical           approx_n_unique
10       CategoricalDtype                    arange
11       CategoricalIndex                   arctan2
12                 compat                  arctan2d
13                 concat               arg_sort_by
14                   core                 arg_where
15               crosstab                     Array
16                    cut                ArrowError
17              DataFrame                       avg
18          

In [5]:
def count_apis_dataframe_v4(libraries):
    """
    Summarise the public APIs of several libraries, grouped by first letter.

    A name counts as public when ``dir(lib)`` lists it and it does not start
    with an underscore.

    Parameters:
        libraries: List of imported library modules to inspect.

    Returns:
        DataFrame: One column per library and a two-level row index.
            The first two rows (outer label "group") are "library" (the
            library name) and "num_public_api" (the public-name count).
            The remaining rows (outer label "") are keyed by the upper-cased
            first letter of the API names; each cell holds the list of that
            library's names starting with the letter, sorted
            case-insensitively. Letters missing for a library that appears in
            the cross-tabulation are filled with "".
            When no library exposes any public name, only the two summary
            rows are returned.
    """
    library_info = {}
    api_records = []

    # Scan each library once: the same sorted name list feeds both the summary
    # count and the per-letter records.
    for lib in libraries:
        lib_name = lib.__name__
        public_apis = sorted(
            (attr for attr in dir(lib) if not attr.startswith("_")),
            key=str.lower,
        )
        library_info[lib_name] = [lib_name, len(public_apis)]
        api_records.extend(
            {"library": lib_name, "group": api[0].upper(), "api": api}
            for api in public_apis
        )

    # Summary rows: built with one row per library, then transposed so that
    # each library becomes a column.
    df1 = pd.DataFrame.from_dict(
        library_info, orient="index", columns=["library", "num_public_api"]
    ).T

    # Without any record the frame below would have no "group"/"library"/"api"
    # columns and the cross-tabulation would fail with a KeyError.
    if not api_records:
        return pd.concat([df1], keys=["group"])

    df_api = pd.DataFrame(api_records)

    # One row per first letter, one column per library, cells are name lists.
    df2 = pd.crosstab(
        index=df_api["group"],
        columns=df_api["library"],
        values=df_api["api"],
        aggfunc=list,
    ).fillna("")

    # NOTE(review): the summary rows get the outer label "group" and the
    # per-letter rows get "" -- this looks swapped, but it is kept as-is
    # because callers may depend on the existing labels. Confirm the intent.
    return pd.concat([df1, df2], keys=["group", ""])

In [6]:
# Raise the display limit to 5000 rows so the grouped table is not truncated.
pd.options.display.max_rows = 5000
df4 = count_apis_dataframe_v4(libraries=[pd, pl])
df4

Unnamed: 0,Unnamed: 1,pandas,polars
group,library,pandas,polars
group,num_public_api,119,192
,A,"[annotations, api, array, arrays, ArrowDtype]","[align_frames, all, all_horizontal, any, any_h..."
,B,"[bdate_range, BooleanDtype]","[Binary, Boolean, build_info]"
,C,"[Categorical, CategoricalDtype, CategoricalInd...","[Categorical, ChronoFormatWarning, coalesce, c..."
,D,"[DataFrame, date_range, DateOffset, DatetimeIn...","[DataFrame, dataframe, DataType, datatypes, Da..."
,E,"[errors, eval, ExcelFile, ExcelWriter]","[element, enable_string_cache, exceptions, exc..."
,F,"[factorize, Flags, Float32Dtype, Float64Dtype,...","[Field, first, Float32, Float64, FLOAT_DTYPES,..."
,G,"[get_dummies, get_option, Grouper]","[get_index_type, groups]"
,H,[HDFStore],[head]


In [7]:
def custom_explode(df, list_cols):
    """
    Explode list-valued rows of ``df`` while leaving every other row intact.

    A row is exploded only when *every* column named in ``list_cols`` holds a
    ``list`` in that row; each list element then gets its own output row and
    the values of the remaining columns are repeated unchanged. Rows that do
    not qualify are passed through as they are.

    Parameters:
        df: DataFrame to process. It is not modified.
        list_cols: List of column names to explode together.

    Returns:
        DataFrame: Exploded and pass-through rows combined, ordered by their
                   original index label (elements of one list keep their
                   order), with a fresh 0..n-1 index.

    Raises:
        ValueError: Propagated from ``DataFrame.explode`` when the lists of one
                    row have different lengths across ``list_cols``.
    """
    # Nothing to explode; also avoids row-wise apply on an empty frame.
    if df.empty:
        return df.reset_index(drop=True)

    # True for rows in which all of the requested columns contain lists.
    all_list_rows = df[list_cols].apply(
        lambda row: all(isinstance(value, list) for value in row), axis=1
    )

    # The selected rows already hold lists in every list_cols column, so they
    # can be exploded directly. Cells of the other columns are deliberately
    # left untouched so scalars stay scalars.
    exploded_list_rows = df[all_list_rows].explode(list_cols)

    # Rows that are kept exactly as they were.
    non_list_rows = df[~all_list_rows]

    # explode() repeats the original index label for every element, so the
    # sort must be stable to keep the elements of a list in their order.
    return (
        pd.concat([exploded_list_rows, non_list_rows])
        .sort_index(kind="mergesort")
        .reset_index(drop=True)
    )

In [8]:
pd.set_option("display.max_rows", 5000)

# Skip the two summary rows and turn each library's per-letter lists into one
# API name per row.
df1 = custom_explode(df4.iloc[2:], ["pandas"])
df2 = custom_explode(df4.iloc[2:], ["polars"])

# Put the two name columns side by side on their positional index. The two
# frames have different lengths, so align with an outer join (the default of
# concat along axis=1) and blank out the gap; an inner join would silently cut
# the longer list down to the length of the shorter one.
# Rows are paired by position only, not by first letter.
merged_df4 = pd.concat([df1["pandas"], df2["polars"]], axis=1).fillna("")
merged_df4

Unnamed: 0,pandas,polars
0,annotations,align_frames
1,api,all
2,array,all_horizontal
3,arrays,any
4,ArrowDtype,any_horizontal
5,bdate_range,api
6,BooleanDtype,apply
7,Categorical,approx_n_unique
8,CategoricalDtype,arange
9,CategoricalIndex,arctan2
