In [65]:
from pathlib import Path
import pandas as pd
from itertools import chain

In [25]:
# Directory holding the raw tax-receipt workbooks, relative to this notebook.
DIR = Path("../data/raw/tax_receipts")
# Sort so the processing order is reproducible (glob yields files in an
# arbitrary, filesystem-dependent order) and skip the "~$..." lock files
# that Excel creates next to a workbook while it is open.
files = sorted(fp for fp in DIR.glob("*.xlsx") if not fp.name.startswith("~$"))
files

[WindowsPath('../data/raw/tax_receipts/tr 2019-20.xlsx'),
 WindowsPath('../data/raw/tax_receipts/tr 2020-21.xlsx'),
 WindowsPath('../data/raw/tax_receipts/tr 2021-22.xlsx'),
 WindowsPath('../data/raw/tax_receipts/tr 2022-23.xlsx')]

In [4]:
def find_crores_row_idx(df: pd.DataFrame):
    """Locate the first row that mentions "crore" in any of its cells.

    Each row's values are rendered to text and searched case-insensitively.

    Returns:
        The index label of the first matching row, or ``None`` when no row
        matches.
    """
    matching_labels = (
        label
        for label, record in df.iterrows()
        if "crore" in str(record.values).lower()
    )
    return next(matching_labels, None)

In [11]:
def load_tax_receipts(fp: Path):
    """Load one tax-receipts workbook and keep only the rows below its "crore" title row.

    The sheet is read with ``pd.read_excel``, fully empty columns and rows are
    dropped, and everything up to and including the first row that mentions
    "crore" is discarded. Columns that become fully empty after that cut are
    dropped as well.

    Args:
        fp: Path of the Excel workbook to read.

    Returns:
        The cleaned DataFrame with a fresh 0-based index.

    Raises:
        ValueError: If no row mentioning "crore" is found. Previously this
            surfaced as an opaque ``TypeError`` from ``None + 1``.
    """
    raw = (
        pd.read_excel(fp)
        .dropna(axis=1, how="all")
        .dropna(axis=0, how="all")
        .reset_index(drop=True)
    )
    in_crores_title_row_idx = find_crores_row_idx(raw)
    if in_crores_title_row_idx is None:
        raise ValueError(f"No row mentioning 'crore' found in {fp}")
    # The index was just reset, so the returned label is also the row position.
    cln = raw.iloc[in_crores_title_row_idx + 1:].dropna(axis=1, how="all")
    return cln.reset_index(drop=True)

In [6]:
def extract_section_mapping(df: pd.DataFrame):
    """Map section numbers (first column) to section names (second column).

    A row is a section heading when its first-column cell is either an
    integer or a string that is purely numeric once leading/trailing dots
    are stripped (e.g. ``"2."``). Strings such as ``"1.01."`` keep an inner
    dot after stripping and are therefore not section headings.

    The check runs on the raw cell values. Going through the ``.str``
    accessor first would turn every non-string cell into NaN, silently
    dropping integer section numbers.

    Args:
        df: Frame whose first two columns hold the number and the name.

    Returns:
        dict mapping the first-column value, exactly as stored, to the
        second-column value of every section-heading row.
    """
    def is_section_number(value) -> bool:
        # bool is a subclass of int but is never a section number.
        if isinstance(value, int) and not isinstance(value, bool):
            return True
        return isinstance(value, str) and value.strip(".").isnumeric()

    # astype(bool) keeps the mask boolean even for an empty frame.
    mask = df.iloc[:, 0].map(is_section_number).astype(bool)
    heads = df.loc[mask].iloc[:, :2]
    return dict(zip(heads.iloc[:, 0], heads.iloc[:, 1]))

In [24]:
def find_major_head_cell(df: pd.DataFrame) -> tuple[int, str]:
    """Locate the single cell that holds the 'Major Head' title.

    Every cell is checked individually (case-insensitively). Checking the
    text rendering of a whole row or column is unreliable: pandas truncates
    the repr of long Series, so a title in the elided middle is missed, and
    the repr also contains index labels and the Series name.

    Args:
        df: Frame to search.

    Returns:
        ``(row_label, column_name)`` of the matching cell.

    Raises:
        AssertionError: If the title does not occur in exactly one column
            and exactly one row.
    """
    def criterion(value) -> bool:
        return "major head" in str(value).lower()

    hits = df.apply(lambda column: column.map(criterion)).astype(bool)
    row_mask = hits.any(axis=1)
    col_mask = hits.any(axis=0)
    cell = df.loc[row_mask, col_mask]

    assert len(cell.columns) == 1, "There should be only one major head column"
    assert len(cell.index) == 1, "There should be only one major head row"

    return cell.index.min(), cell.columns[0]

In [29]:
# Preview the loader and the header detection on the first workbook only.
for fp in files[:1]:
    print(fp)
    df = load_tax_receipts(fp)
    print(df.shape)
    display(df.head())
    # Row label and column name of the 'Major Head' title cell.
    head_end_idx, major_head_col = find_major_head_cell(df)
    print(head_end_idx, major_head_col)

..\data\raw\tax_receipts\tr 2019-20.xlsx
(112, 8)


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,,,,,Actual,Budget,Revised 2018-2019,Budget
1,,,,Major Head,2017-2018,2018-2019,,2019-2020
2,Tax Revenue,,,,,,,
3,1,Corporation Tax,,,,,,
4,,1.01.,Collections,20,486113.18,527750.39,575795.77,657312


1 Unnamed: 3


In [59]:
def get_rows_with_serial_number(df: pd.DataFrame):
    """Return the index labels of rows whose second column holds a dotted serial number.

    A cell qualifies when, after stripping leading and trailing dots, it still
    contains a literal "." (e.g. "1.01."). Missing or non-string cells never
    qualify.
    """
    second_column = df.iloc[:, 1]
    trimmed = second_column.str.strip(".")
    has_inner_dot = trimmed.str.contains(".", na=False, regex=False)
    return df.loc[has_inner_dot].index

In [52]:
def get_rows_with_major_head_number(df: pd.DataFrame, major_head_col: str, first_data_row: int = 2):
    """Return the index labels of data rows that have a value in the major-head column.

    Args:
        df: Frame to inspect.
        major_head_col: Name of the column holding the major-head numbers.
        first_data_row: Index label of the first row to consider; rows before
            it are skipped as header rows. Defaults to 2, the value that was
            previously hard-coded.

    Returns:
        Index of the rows from ``first_data_row`` onward whose
        ``major_head_col`` cell is not missing.
    """
    # .loc slices by label, so first_data_row is an index label, not a position.
    return df.loc[first_data_row:, major_head_col].dropna().index

In [72]:
def filter_datapoints_with_serial_and_head_no(df: pd.DataFrame, major_head_col: str):
    """Keep only the rows that carry a serial number or a major-head number.

    The selection is a boolean mask applied with ``.loc``, so rows keep their
    original order and the result is correct for any index. Feeding
    ``list(set(...))`` of index labels to ``.iloc`` gave no ordering guarantee
    and was only right for a default 0-based index.

    Args:
        df: Frame to filter.
        major_head_col: Name of the column holding the major-head numbers.

    Returns:
        The rows of ``df`` whose index label is reported by
        ``get_rows_with_serial_number`` or ``get_rows_with_major_head_number``.
    """
    wanted_labels = set(chain(
        get_rows_with_serial_number(df),
        get_rows_with_major_head_number(df, major_head_col=major_head_col),
    ))
    return df.loc[df.index.isin(wanted_labels)]