# proper html tables with multiple indexes

our goal is to reduce the empty cells in tables, especially where headers should be.
empty cells diminish the experience for assistive technology users.
through this study we'll design some accessible options we could use generically to represent dataframes.

In [1]:
    import pandas, bs4, enum, numpy, midgy

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas, bs4, enum, numpy, midgy


In [2]:
%%
<style>
:is(.cell, .jp-OutputArea-output.jp-RenderedHTMLCommon) :is(td,th) {
    border: 1px solid;
}
</style>


<style>
:is(.cell, .jp-OutputArea-output.jp-RenderedHTMLCommon) :is(td,th) {
    border: 1px solid;
}
</style>


sample dataframe

In [3]:
    # three-level product index reused for both axes of the sample frame
    index = pandas.MultiIndex.from_product(
        [["A", "Z"], ["M", "N", "O"], [1, 2, 3]],
        names=list("JKL"),
    )
    # the column axis gets its own numeric level names; only the first rows are kept
    df = (
        pandas.DataFrame(columns=index, index=index)
        .rename_axis(columns=[10, 100, 1000])
        .head()
    )
    df

Unnamed: 0_level_0,Unnamed: 1_level_0,10,A,A,A,A,A,A,A,A,A,Z,Z,Z,Z,Z,Z,Z,Z,Z
Unnamed: 0_level_1,Unnamed: 1_level_1,100,M,M,M,N,N,N,O,O,O,M,M,M,N,N,N,O,O,O
Unnamed: 0_level_2,Unnamed: 1_level_2,1000,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3
J,K,L,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3
A,M,1,,,,,,,,,,,,,,,,,,
A,M,2,,,,,,,,,,,,,,,,,,
A,M,3,,,,,,,,,,,,,,,,,,
A,N,1,,,,,,,,,,,,,,,,,,
A,N,2,,,,,,,,,,,,,,,,,,


In [4]:
%%
    def index_span(index: pandas.Index) -> pandas.DataFrame: 
analyze an index to determine how different indexes span rows or columns.
        
        return pandas.concat(
            dict(
                diff=(diff := index.to_frame().pipe(diff_shift)),
                label=(label := diff.cumsum()),
                span=label.apply(
                    lambda s: s.drop_duplicates().apply(s.value_counts().get), axis=0
                )
            ), axis=1
        ).replace({numpy.nan: None})

    def diff_shift(df: pandas.DataFrame) -> pandas.DataFrame:
        """Flag, per column, the rows whose value differs from the row above.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame whose rows are compared with their predecessor.

        Returns
        -------
        pandas.DataFrame
            Boolean frame with the columns of ``df`` and a fresh default index.
            The first row is always ``True``; an empty ``df`` gives an empty frame.
            Two consecutive missing values are treated as equal.
        """
        values = df.values
        if not len(values):
            # nothing to compare: do not invent a leading row for an empty frame
            return pandas.DataFrame(numpy.zeros(values.shape, dtype=bool), columns=df.columns)
        previous, current = values[:-1], values[1:]
        changed = numpy.asarray(previous != current, dtype=bool)
        # NaN != NaN is True, which would split a run of missing labels into single rows
        changed &= ~(pandas.isna(previous) & pandas.isna(current))
        first = numpy.ones((1, values.shape[1]), dtype=bool)
        return pandas.DataFrame(numpy.concatenate((first, changed), axis=0), columns=df.columns)


    def index_span(index: pandas.Index) -> pandas.DataFrame: 
analyze an index to determine how different indexes span rows or columns.
        
        return pandas.concat(
            dict(
                diff=(diff := index.to_frame().pipe(diff_shift)),
                label=(label := diff.cumsum()),
                span=label.apply(
                    lambda s: s.drop_duplicates().apply(s.value_counts().get), axis=0
                )
            ), axis=1
        ).replace({numpy.nan: None})

    def diff_shift(df: pandas.DataFrame) -> pandas.DataFrame:
        return pandas.DataFrame(
            numpy.concatenate((numpy.array([[True]*df.shape[1]]), df.values[:-1] != df.values[1:]), 0), 
            columns=df.columns
        )


In [5]:
%%
    def column_major(df: pandas.DataFrame, caption=None, SPAN=True) -> bs4.BeautifulSoup:
convert a dataframe to a `column_major` html representation that presents the column index names first.
        
        soup = bs4.BeautifulSoup(features="html.parser")
        soup.append(table := soup.new_tag("table"))
        if caption:
            table.append(cap := soup.new_tag("caption"))
            cap.append(caption)
        ROWS, COLS = any(df.index.names), any(df.columns.names)

pre-compute the grouping structure of the indexes

        row_span, col_span = index_span(df.index), index_span(df.columns)
        
        for col_level, col_name in enumerate(df.columns.names):
1. show the column index names
            
            table.append(tr := soup.new_tag("tr"))
            if COLS:
                attrs = dict(scope="row")
                if df.index.nlevels > 1:
                    attrs.update(colspan=df.index.nlevels)
                tr.append(th := soup.new_tag("th", attrs=attrs))
                th.append(str(col_name) or F"level {col_level}")
    
            for col_index, col_value in enumerate(df.columns.get_level_values(col_level)):
1. show the column index values

                attrs = dict(scope="col")
                span = col_span["span"].iloc[col_index, col_level] if SPAN else 1
                if span:
                    if span > 1:
                        attrs.update(colspan=int(span))
                    tr.append(th := soup.new_tag("th", attrs=attrs))
                    th.append(str(col_value))
        if ROWS:
1. insert the row names below the column names 

            table.append(tr := soup.new_tag("tr"))
            attrs = dict(scope="col")
            for row_level, row_name in enumerate(df.index.names):
                tr.append(th := soup.new_tag("th", attrs=attrs))
                th.append(str(row_name) or F"index {row_level}")
                
            for col_value in df.columns.get_level_values(col_level):
   followed by a blank row, a blank row is suboptimal for assistive technology.
                
                attrs = dict(scope="col")
                tr.append(td := soup.new_tag("td"))
    
        for row_index in range(df.shape[0]):
1. write the row index headers

            table.append(tr := soup.new_tag("tr"))
            for row_level in range(df.index.nlevels):
                span = row_span["span"].iloc[row_index, row_level] if SPAN else 1
                if span:
                    attrs = dict(scope="row")
                    if span > 1:
                        attrs.update(rowspan=int(span))
                    tr.append(th := soup.new_tag("th", attrs=attrs))
                    th.append(str(df.index.get_level_values(row_level)[row_index]))
    
            for value in df.iloc[row_index].values:
1. write the values of the dataframe
                
                tr.append(td := soup.new_tag("td"))
                td.append(str(value))
        return soup


    def column_major(df: pandas.DataFrame, caption=None, SPAN=True) -> bs4.BeautifulSoup:
convert a dataframe to a `column_major` html representation that presents the column index names first.
        
        soup = bs4.BeautifulSoup(features="html.parser")
        soup.append(table := soup.new_tag("table"))
        if caption:
            table.append(cap := soup.new_tag("caption"))
            cap.append(caption)
        ROWS, COLS = any(df.index.names), any(df.columns.names)

pre-compute the grouping structure of the indexes

        row_span, col_span = index_span(df.index), index_span(df.columns)
        
        for col_level, col_name in enumerate(df.columns.names):
1. show the column index names
            
            table.append(tr := soup.new_tag("tr"))
            if COLS:
                attrs = dict(scope="row")
                if df.index.nlevels > 1:
                    attrs.update(colspan=df.index.nlevels)
                tr.append(th := soup.new_tag("th", attrs=attrs))
                th.append(str(col_name) or F"level {col_level}")
    
            for col_index, col_value in enumerate(df.columns.get_level_values(col_level)):
1. show the column index values

                attrs = dict(scope="col")
                span = col_span["span"].iloc[col_index, col_level] if SPAN else 1
                if span:
                    if span > 1:
                        attrs.update(colspan=int(span))
                    tr.append(th := soup.new_tag("th", attrs=attrs))
                    th.append(str(col_value))
        if ROWS:
1. insert the row names below the column names 

            table.append(tr := soup.new_tag("tr"))
            attrs = dict(scope="col")
            for row_level, row_name in enumerate(df.index.names):
                tr.append(th := soup.new_tag("th", attrs=attrs))
                th.append(str(row_name) or F"index {row_level}")
                
            for col_value in df.columns.get_level_values(col_level):
   followed by a blank row, a blank row is suboptimal for assistive technology.
                
                attrs = dict(scope="col")
                tr.append(td := soup.new_tag("td"))
    
        for row_index in range(df.shape[0]):
1. write the row index headers

            table.append(tr := soup.new_tag("tr"))
            for row_level in range(df.index.nlevels):
                span = row_span["span"].iloc[row_index, row_level] if SPAN else 1
                if span:
                    attrs = dict(scope="row")
                    if span > 1:
                        attrs.update(rowspan=int(span))
                    tr.append(th := soup.new_tag("th", attrs=attrs))
                    th.append(str(df.index.get_level_values(row_level)[row_index]))
    
            for value in df.iloc[row_index].values:
1. write the values of the dataframe
                
                tr.append(td := soup.new_tag("td"))
                td.append(str(value))
        return soup


In [6]:
%%
    def row_major(df, caption=None, SPAN=True):
a `row_major` representation that presents the row index names first.
    
        soup = bs4.BeautifulSoup(features="lxml")
        soup.append(table := soup.new_tag("table"))
    
        ROWS, COLS = any(df.index.names), any(df.columns.names)
1. precompute the row and column index spans

        row_span, col_span = index_span(df.index), index_span(df.columns)
        
        for col_level, col_name in enumerate(df.columns.names):
            table.append(tr := soup.new_tag("tr"))
            if not col_level:
1. write the index names on the first pass of the header rows.

                if ROWS or not COLS:
                    attrs = dict(scope="col")
                    if df.columns.nlevels > 1:
                        attrs.update(rowspan=df.columns.nlevels) 
                    for row_level, row_name in enumerate(df.index.names):
                        tr.append(th := soup.new_tag("th", attrs=attrs))
                        th.append(str(row_name) or F"index {row_level}")
                
            if COLS:
1. include the column index names if they exist

                attrs = dict(scope="row")
                if not ROWS and df.index.nlevels > 1:
                    attrs.update(colspan=df.index.nlevels)
                tr.append(th := soup.new_tag("th", attrs=attrs))
                th.append(str(col_name) or F"level {col_level}")
    
            for col_index, col_value in enumerate(df.columns.get_level_values(col_level)):
1.  write the values for the column index
                
                attrs = dict(scope="col")
                span = col_span["span"].iloc[col_index, col_level] if SPAN else 1
                if span:
                    attrs = dict(scope="col")
                    if span > 1:
                        attrs.update(colspan=int(span))
                    tr.append(th := soup.new_tag("th", attrs=attrs))
                    th.append(str(col_value))
            
    
        for row_index in range(df.shape[0]):
1.  write the index header values

            table.append(tr := soup.new_tag("tr"))
            for row_level in range(df.index.nlevels):
                span = row_span["span"].iloc[row_index, row_level] if SPAN else 1
                if span:
                    attrs = dict(scope="row")
                    if span > 1:
                        attrs.update(rowspan=int(span))
                    tr.append(th := soup.new_tag("th", attrs=attrs))
                    th.append(str(df.index.get_level_values(row_level)[row_index]))
    
            if ROWS and COLS:
1.  insert an empty column if we have column names

                tr.append(td := soup.new_tag("td"))
    
            for value in df.iloc[row_index].values:
1.  write the data

                tr.append(td := soup.new_tag("td"))
                td.append(str(value))
        return soup


    def row_major(df, caption=None, SPAN=True):
a `row_major` representation that presents the row index names first.
    
        soup = bs4.BeautifulSoup(features="lxml")
        soup.append(table := soup.new_tag("table"))
    
        ROWS, COLS = any(df.index.names), any(df.columns.names)
1. precompute the row and column index spans

        row_span, col_span = index_span(df.index), index_span(df.columns)
        
        for col_level, col_name in enumerate(df.columns.names):
            table.append(tr := soup.new_tag("tr"))
            if not col_level:
1. write the index names on the first pass of the header rows.

                if ROWS or not COLS:
                    attrs = dict(scope="col")
                    if df.columns.nlevels > 1:
                        attrs.update(rowspan=df.columns.nlevels) 
                    for row_level, row_name in enumerate(df.index.names):
                        tr.append(th := soup.new_tag("th", attrs=attrs))
                        th.append(str(row_name) or F"index {row_level}")
                
            if COLS:
1. include the column index names if they exist

                attrs = dict(scope="row")
                if not ROWS and df.index.nlevels > 1:
                    attrs.update(colspan=df.index.nlevels)
                tr.append(th := soup.new_tag("th", attrs=attrs))
                th.append(str(col_name) or F"level {col_level}")
    
            for col_index, col_value in enumerate(df.columns.get_level_values(col_level)):
1.  write the values for the column index
                
                attrs = dict(scope="col")
                span = col_span["span"].iloc[col_index, col_level] if SPAN else 1
                if span:
                    attrs = dict(scope="col")
                    if span > 1:
                        attrs.update(colspan=int(span))
                    tr.append(th := soup.new_tag("th", attrs=attrs))
                    th.append(str(col_value))
            
    
        for row_index in range(df.shape[0]):
1.  write the index header values

            table.append(tr := soup.new_tag("tr"))
            for row_level in range(df.index.nlevels):
                span = row_span["span"].iloc[row_index, row_level] if SPAN else 1
                if span:
                    attrs = dict(scope="row")
                    if span > 1:
                        attrs.update(rowspan=int(span))
                    tr.append(th := soup.new_tag("th", attrs=attrs))
                    th.append(str(df.index.get_level_values(row_level)[row_index]))
    
            if ROWS and COLS:
1.  insert an empty column if we have column names

                tr.append(td := soup.new_tag("td"))
    
            for value in df.iloc[row_index].values:
1.  write the data

                tr.append(td := soup.new_tag("td"))
                td.append(str(value))
        return soup


In [7]:
%%
replace the `bs4.BeautifulSoup` representation with an html representation

    # `for_type` registers `str` as the text/html formatter for `bs4.BeautifulSoup`,
    # so displaying a soup object emits its string form as the html output.
    get_ipython().display_formatter.formatters["text/html"].for_type(bs4.BeautifulSoup, str)


replace the `bs4.BeautifulSoup` representation with an html representation

    get_ipython().display_formatter.formatters["text/html"].for_type(bs4.BeautifulSoup, str)


In [8]:
    # drop the two outer levels of both axes and unset the column level names,
    # leaving a single-level frame on each axis
    row_major(
        df.head()
        .rename_axis((None, None, None), axis=1)
        .droplevel((0, 1), axis=1)
        .droplevel((0, 1), axis=0),
        caption="a single index row major",
    )

L,1,2,3,1.1,2.1,3.1,1.2,2.2,3.2,1.3,2.3,3.3,1.4,2.4,3.4,1.5,2.5,3.5
1,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,


In [9]:
    # column level names are unset, so only the row index names appear in the header
    row_major(
        df.head().rename_axis((None, None, None), axis=1),
        caption="spanning multiple index row major",
    )

J,K,L,A,A,A,A,A,A,A,A,A,Z,Z,Z,Z,Z,Z,Z,Z,Z
J,K,L,M,M,M,N,N,N,O,O,O,M,M,M,N,N,N,O,O,O
J,K,L,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3
A,M,1,,,,,,,,,,,,,,,,,,
A,M,2,,,,,,,,,,,,,,,,,,
A,M,3,,,,,,,,,,,,,,,,,,
A,N,1,,,,,,,,,,,,,,,,,,
A,N,2,,,,,,,,,,,,,,,,,,


In [10]:
    # row level names are unset, so no row-name header row is written
    column_major(
        df.head().rename_axis((None, None, None), axis=0),
        caption="spanning multiple index column major",
    )

10,10,10,A,A,A,A,A,A,A,A,A,Z,Z,Z,Z,Z,Z,Z,Z,Z
100,100,100,M,M,M,N,N,N,O,O,O,M,M,M,N,N,N,O,O,O
1000,1000.1,1000.2,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3
A,M,1,,,,,,,,,,,,,,,,,,
A,M,2,,,,,,,,,,,,,,,,,,
A,M,3,,,,,,,,,,,,,,,,,,
A,N,1,,,,,,,,,,,,,,,,,,
A,N,2,,,,,,,,,,,,,,,,,,


In [11]:
    # SPAN=False repeats every index label instead of merging equal neighbours
    row_major(
        df.head(),
        caption="non-spanning multiple indexes row major",
        SPAN=False,
    )

J,K,L,10,A,A,A,A,A,A,A,A,A,Z,Z,Z,Z,Z,Z,Z,Z,Z
J,K,L,100,M,M,M,N,N,N,O,O,O,M,M,M,N,N,N,O,O,O
J,K,L,1000,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3
A,M,1,,,,,,,,,,,,,,,,,,,
A,M,2,,,,,,,,,,,,,,,,,,,
A,M,3,,,,,,,,,,,,,,,,,,,
A,N,1,,,,,,,,,,,,,,,,,,,
A,N,2,,,,,,,,,,,,,,,,,,,


In [12]:
    # SPAN=False repeats every index label instead of merging equal neighbours
    column_major(
        df.head(),
        caption="non-spanning multiple index column major",
        SPAN=False,
    )

10,10,10,A,A,A,A,A,A,A,A,A,Z,Z,Z,Z,Z,Z,Z,Z,Z
100,100,100,M,M,M,N,N,N,O,O,O,M,M,M,N,N,N,O,O,O
1000,1000.1,1000.2,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3
J,K,L,,,,,,,,,,,,,,,,,,
A,M,1,,,,,,,,,,,,,,,,,,
A,M,2,,,,,,,,,,,,,,,,,,
A,M,3,,,,,,,,,,,,,,,,,,
A,N,1,,,,,,,,,,,,,,,,,,
A,N,2,,,,,,,,,,,,,,,,,,
