In [1]:
import pandas as pd
import numpy as np
import xlrd
import re

In [2]:
# Identifier and year of the Digest table processed by this notebook.
digest_table_id = "236.30"
digest_table_year = "2019"
# The workbook name embeds the table id (tabn<id>.xls); derive it so the two
# values cannot drift apart when the notebook is pointed at another table.
table_file = f"tabn{digest_table_id}.xls"

In [3]:
# Open the workbook with xlrd; formatting_info=True asks xlrd to load the
# formatting records that are inspected later via book.font_list / book.xf_list.
book = xlrd.open_workbook(table_file, formatting_info=True)
sh = book.sheet_by_index(0)
font = book.font_list

# read in the raw dataset from excel
# (use table_file rather than a repeated literal so xlrd and pandas always
# open the same workbook)
df = pd.read_excel(table_file, header=None)

In [4]:
# Output workbook name: <year>_<table id with "." turned into "_">_activate_step1.xlsx
digest_number = "_".join(digest_table_id.split("."))
output_file = f"{digest_table_year}_{digest_number}_activate_step1.xlsx"

## Create Table Info

In [5]:
def create_table_info(sh, df):
    """Build the two-row ``table_info`` frame (field names over field values).

    Parameters
    ----------
    sh : xlrd sheet
        Worksheet; cells (0, 0), (1, 0) and (2, 0) are read as the table
        title, the headnote and the stub head.
    df : pandas.DataFrame
        Raw sheet contents read with ``header=None``; column 0 is searched
        for the ``NOTE:`` and ``SOURCE:`` rows.

    Returns
    -------
    pandas.DataFrame
        Row 0 holds the field names, row 1 the matching values.

    Raises
    ------
    ValueError
        If the title cell, the ``NOTE:`` row or the ``SOURCE:`` row cannot
        be parsed.

    Notes
    -----
    Reads the notebook-level ``digest_table_id`` and ``digest_table_year``.
    """

    ## Table Title: the text that follows "Table NNN.NN. " in the first cell
    title_cell = sh.cell_value(0, 0)
    title_match = re.match(r"Table (\d{3}\.\d{2})\. (.*)", title_cell)
    if title_match is None:
        raise ValueError(f"Cell (0, 0) is not a table title: {title_cell!r}")
    table_title = title_match.group(2)

    # headnote
    headnote = sh.cell_value(1, 0)

    # stub_head
    stub_head = sh.cell_value(2, 0)

    first_col = df[0]

    # general_note: text after "NOTE: " in the first matching row
    general = first_col.str.extract(r"NOTE: (.*)").dropna()
    if general.empty:
        raise ValueError("No 'NOTE: ' row found in the first column")
    general_note = general[0].values[0].strip()

    # source: the greedy first group leaves only the final parenthesised
    # phrase of the row for the second group
    source = first_col.str.extract(r"SOURCE: (.*)\((.*)\)").dropna()
    if source.empty:
        raise ValueError(
            "No 'SOURCE: ... (...)' row found in the first column"
        )
    source_note = source[0].values[0].strip()

    # last_prepared: the parenthesised phrase captured from the SOURCE row
    last_prepared = source[1].values[0].strip()

    col_list = [
        'digest_table_id',
        'digest_table_year',
        'table_title',
        'headnote',
        'stub_head',
        'general_note',
        'source_note',
        'last_prepared'
    ]

    val_list = [
        digest_table_id,
        digest_table_year,
        table_title,
        headnote,
        stub_head,
        general_note,
        source_note,
        last_prepared
    ]

    table_info = pd.DataFrame(np.array([col_list, val_list]))
    return table_info

In [6]:
# Build the table_info sheet from the xlrd sheet and the raw pandas frame.
table_info = create_table_info(sh=sh, df=df)

## Row Info Tab

In [19]:
# Exploratory check: display the raw title cell that create_table_info parses.
sh.cell_value(0,0)

'Table 236.30. Total expenditures for public elementary and secondary education and other related programs, by function and state or jurisdiction: 2016-17'

In [22]:
# Exploratory check: display the first-column text of row 7 (zero-based),
# including its leading spaces and trailing dot leaders.
sh.cell_value(7,0)

'   United States ........'

In [23]:
# Exploratory check: look up the formatting (XF) record of cell (7, 0) and
# display the bold flag of the font that record points to.
cell_xf = book.xf_list[sh.cell_xf_index(7,0)]
font[cell_xf.font_index].bold

1

## Create Column Info Sheet

In [7]:
def AA(num, string):
    """Return the Excel-style column label for a zero-based column index.

    ``0 -> "A"``, ``25 -> "Z"``, ``26 -> "AA"``, ``701 -> "ZZ"``,
    ``702 -> "AAA"``.

    Inspired by this Stack Overflow answer:
    https://stackoverflow.com/a/54837286

    Parameters
    ----------
    num : int
        Zero-based column index; must be non-negative.
    string : str
        Suffix that the generated letters are prepended to; pass ``""`` to
        get just the label.

    Returns
    -------
    str
        The column letters followed by ``string``.

    Raises
    ------
    ValueError
        If ``num`` is negative.
    """
    if num < 0:
        raise ValueError(f"column index must be non-negative, got {num}")

    while True:
        num, r = divmod(num, 26)
        string = chr(ord("A") + r) + string
        if num == 0:
            return string
        # Column labels are bijective base-26 (there is no "zero" letter), so
        # borrow one before emitting the next, more significant letter.
        # Skipping this borrow is what turned index 702 into "ABA".
        num -= 1

In [8]:
def header_end(sheet, df):
    """Locate the row that closes the header: the one numbered 1..ncols.

    Rows are scanned top-down; the position of the first row of ``df`` whose
    values equal ``[1, 2, ..., sheet.ncols]`` is returned. When no row
    matches, a message is printed and 0 is returned.
    """
    expected = list(range(1, sheet.ncols + 1))

    matching_rows = (
        idx
        for idx in range(sheet.nrows)
        if list(df.iloc[idx, :]) == expected
    )
    found = next(matching_rows, None)
    if found is not None:
        return found

    print("End of file reached, no integer row")
    return 0

In [9]:
def attach_header(file_name, sheet, df):
    """Read the data rows of ``file_name`` and label them with the sheet's header.

    Parameters
    ----------
    file_name : str
        Path of the workbook to read.
    sheet : xlrd sheet
        Worksheet whose ``nrows`` / ``ncols`` describe the table.
    df : pandas.DataFrame
        Raw sheet contents read with ``header=None``; used to find the
        integer row that ends the header.

    Returns
    -------
    pandas.DataFrame
        The rows below the integer row (first column excluded), with a
        ``MultiIndex`` of header levels as columns.

    Raises
    ------
    ValueError
        If no integer row is found below the skipped rows.
    """

    header_n = header_end(sheet, df)
    skip = 2  # rows 0 and 1 hold the table title and headnote
    if header_n <= skip:
        # header_end returns 0 when there is no integer row; fail here with
        # a clear message instead of passing a negative nrows to pandas.
        raise ValueError(
            f"No integer row found below row {skip}; cannot locate the header"
        )

    # Use the sheet that was passed in (not the notebook-level ``sh``) and
    # leave out column 0, which is not part of the data columns.
    data_cols = list(range(1, sheet.ncols))

    header = pd.read_excel(file_name,
                           skiprows=skip,
                           header=None,
                           nrows=header_n - skip,
                           usecols=data_cols
                          )
    # Fill blank header cells from the cell above, then from the cell to the
    # left (blanks presumably come from merged header cells - TODO confirm).
    header = header.ffill(axis=0).ffill(axis=1)
    data = pd.read_excel(file_name,
                         skiprows=header_n + 1,
                         header=None,
                         usecols=data_cols
                        )
    data.columns = pd.MultiIndex.from_arrays(header.values)

    return data

In [10]:
# Re-read the workbook's data rows with the multi-level header attached.
data = attach_header(file_name=table_file, sheet=sh, df=df)

In [11]:
def create_col_info(df):
    """Return a DataFrame with one row of header information per data column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` is the header ``MultiIndex``.

    Returns
    -------
    pandas.DataFrame
        Columns ``column_level_1`` .. ``column_level_7`` (unused levels are
        empty strings), ``digest_table_id``, ``digest_table_year`` and
        ``column_index`` (spreadsheet-style letters from ``AA``).

    Notes
    -----
    Reads the notebook-level ``digest_table_id`` and ``digest_table_year``.
    """
    n_levels = 7

    # convert the header of the frame that was passed in (not the
    # notebook-level ``data``) to one row per column, one column per level
    col_info = df.columns.to_frame(index=False)

    # blank out a level that repeats a value already seen earlier in its row
    is_duplicate = col_info.apply(lambda row: row.duplicated(), axis=1)
    col_info = col_info.where(~is_duplicate, "")

    # create extra columns for unused columns index levels
    for x in range(col_info.shape[1], n_levels):
        col_info.insert(x, x, "")

    # label column levels
    col_info.columns = [f"column_level_{col+1}" for col in col_info.columns]

    # add table_id and table_year to col_info
    col_info["digest_table_id"] = digest_table_id
    col_info["digest_table_year"] = digest_table_year

    # create column_index field
    col_info["column_index"] = [AA(i, "") for i in col_info.index]

    return col_info

In [12]:
# One row per data column, describing its header levels.
col_info = create_col_info(df=data)

In [13]:
def add_footnotes(df, col_info):
    """Add footnote columns to ``col_info`` and put the columns in order.

    Header cells may carry a footnote marker such as ``\\1\\``. For every
    ``column_level_x`` the marker is removed and the matching footnote text
    is stored in ``column_ref_note_x``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet contents; column 0 is searched for footnote definitions of
        the form ``\\<number>\\<text>``.
    col_info : pandas.DataFrame
        Must contain ``column_level_1`` .. ``column_level_7``,
        ``digest_table_id``, ``digest_table_year`` and ``column_index``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ``digest_table_id``, ``digest_table_year``,
        ``column_index``, then ``column_level_x`` / ``column_ref_note_x``
        pairs for x = 1..7. Missing values are empty strings.
    """
    n_levels = 7
    # a footnote marker is a number between backslashes; "+" also accepts
    # footnotes numbered 10 and above
    marker = r"\\([0-9]+)\\"

    # work on a copy so the caller's frame is not changed in place
    col_info = col_info.copy()

    # Extract footnote definitions (number -> text) from the raw df; when a
    # number occurs more than once, the last row wins
    footnotes = df[0].str.extract(marker + r"(.*)").dropna()
    footnotes_dict = dict(zip(footnotes[0], footnotes[1]))

    # create column_ref_note columns
    for x in range(1, n_levels + 1):
        level = f"column_level_{x}"
        col = col_info[level]

        # footnote number referenced by each header cell (NaN if none)
        refs = col.str.extract(marker, expand=False)

        # look up the note text; a marker without a definition keeps its
        # bare number
        col_info[f"column_ref_note_{x}"] = refs.map(footnotes_dict).fillna(refs)

        # delete footnote marker from column_level_x; regex=True is required
        # because pandas >= 2.0 treats the pattern literally by default
        col_info[level] = col.str.replace(marker, "", regex=True)

    # cells without a footnote become empty strings
    col_info = col_info.fillna("")

    # list of columns in the desired order
    ordered = ['digest_table_id', 'digest_table_year', 'column_index']
    for x in range(1, n_levels + 1):
        ordered += [f"column_level_{x}", f"column_ref_note_{x}"]

    # rearrange column order
    return col_info[ordered]

In [14]:
# Split footnote markers out of the header levels into column_ref_note_x.
col_info = add_footnotes(df=df, col_info=col_info)

### Output to Excel

In [15]:
# Prepend the column names as the first data row so they are written out when
# the sheet is saved with header=False.
# NOTE: this rebinds col_info, so re-running this cell stacks the (by then
# integer) column labels on top again - run it once per kernel session.
col_info = pd.DataFrame(
    np.vstack((col_info.columns.to_numpy(), col_info.to_numpy()))
)

In [16]:
# Write each frame to its own sheet of the output workbook. Index and header
# are switched off because both frames already carry their labels as data.
sheets = {"table_info": table_info, "column_info": col_info}
with pd.ExcelWriter(output_file) as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False, header=False)