In [1]:
import camelot
import pandas as pd

In [2]:
# Read the tables from every page of the report using camelot's "lattice" flavor.
tables = camelot.read_pdf(
    "payroll_report.pdf",
    pages="all",
    flavor="lattice",
)



In [5]:
# Unwrap each camelot table into its pandas DataFrame (the `.df` attribute).
# Items that are already DataFrames are passed through unchanged, so running
# this cell a second time no longer fails on a missing `.df` attribute.
tables = [
    table if isinstance(table, pd.DataFrame) else table.df
    for table in tables
]

In [6]:
# Display the first extracted table to inspect the raw layout of the parsed PDF.
tables[0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,Age,Number of new EPF subscribers during the period,,,,,Number of members that ceased subscribing duri...,,,,,Number of exited members who rejoined and resu...,,,,
1,,Male,Female,Others,Not \nAvailable,Total,Male,Female,Others,Not \nAvailable,Total,Male,Female,Others,Not \nAvailable,Total
2,"Less than 18 43,488",,13444,16,-,56948,9121,6269,2,443,15835,339,173,-,-,512
3,18-21,1650881,305266,365,-,1956512,867108,181901,121,8657,1057787,51590,10452,7,2,62051
4,22-25,1865623,407532,390,-,2273545,1507374,364369,156,25230,1897129,144823,37742,13,21,182599
5,26-28,919748,187227,182,-,1107157,976600,221744,69,20322,1218735,133696,28103,4,28,161831
6,29-35,1214635,308803,277,-,1523715,1375062,316507,122,36964,1728655,200279,34884,16,36,235215
7,,"More than 35 12,28,968",310224,335,-,1539527,1401532,348146,93,59877,1809648,153169,27871,12,65,181117
8,Total,6923343,1532496,,"1,565 -",8457404,6136797,1438936,563,151493,7727789,683896,139225,52,152,823325


In [19]:
def is_monthly_epf_df(df: pd.DataFrame) -> bool:
    '''Tell whether ``df`` looks like a monthly "new EPF subscribers" table.

    A frame qualifies when both of the following hold:

    * some cell in its second column contains the text "new EPF subscriber"
      (case-insensitive), and
    * some cell in its first row looks like a "<word> <4 digits>" headline,
      e.g. "April 2022".

    Frames that are empty or have fewer than two columns cannot satisfy the
    first condition and are reported as ``False`` instead of raising.

    Returns:
        bool: ``True`` only when both conditions are met.
    '''
    if df.empty or df.shape[1] < 2:
        return False

    # `case=False` already makes the match case-insensitive, so no lowering of
    # the column is needed beforehand.
    is_epf_table = df.iloc[:, 1].str.contains(
        'new EPF subscriber', na=False, case=False
    ).any()

    # `na=False` matters here: without it a missing cell yields NaN, and NaN is
    # truthy, which would mark every table with a blank first-row cell as
    # monthly. The raw string keeps the regex escapes intact.
    is_monthly_table = df.iloc[0, :].str.contains(
        r"\w+\s\d{4}", regex=True, na=False
    ).any()

    return bool(is_epf_table and is_monthly_table)

def get_epf_dfs(dfs: list[pd.DataFrame]):
    """Select, in their original order, the frames accepted by ``is_monthly_epf_df``."""
    return [candidate for candidate in dfs if is_monthly_epf_df(candidate)]

In [20]:
# Keep only the extracted tables that pass the monthly-EPF check.
epf_dfs = get_epf_dfs(tables)

In [39]:
# Check which first-column rows of the first monthly table parse as a date.
# NOTE(review): `parse_date_head` is defined in a cell further down this
# notebook, so on a fresh kernel that cell has to be run before this one.
parse_date_head(epf_dfs[0])

0   2022-04-01
Name: 0, dtype: datetime64[ns]

In [38]:
def parse_date_head(df: pd.DataFrame):
    """Parse the first column as "%B %Y" dates and keep only the rows that parse.

    Cells that do not match the format are coerced to NaT and then dropped, so
    the returned Series holds just the successfully parsed timestamps, still
    carrying the index labels of the rows they came from.
    """
    first_column = df.iloc[:, 0]
    parsed = pd.to_datetime(first_column, format="%B %Y", errors="coerce")
    return parsed.dropna()
def strip_month_headline(df: pd.DataFrame):
    """Drop the month headline row and every row above it.

    The headline is the first row reported by ``parse_date_head``, i.e. the
    first row whose first-column text parses as a date. The rows below it are
    returned with a fresh 0..n-1 index.

    Raises:
        IndexError: if ``parse_date_head`` finds no headline row in ``df``.
    """
    date_rows = parse_date_head(df)
    if date_rows.empty:
        raise IndexError("no month headline row found in the first column")

    # `date_rows` carries the frame's index *labels*, whereas `iloc` slices by
    # *position*. Translate the label to a position so frames without a default
    # RangeIndex (for example, previously filtered ones) are cut at the right row.
    headline_pos = int(df.index.get_indexer_for([date_rows.index[0]])[0])
    return df.iloc[headline_pos + 1:].reset_index(drop=True)

In [27]:
def exclude_totals_row(df: pd.DataFrame):
    """Return ``df`` without the rows whose first-column text contains "total".

    The match is case-insensitive; rows with a missing first cell are kept.
    The surviving rows keep their original index labels.
    """
    first_column = df.iloc[:, 0]
    is_total = first_column.str.contains("total", na=False, case=False)
    keep = ~is_total
    return df[keep]

In [40]:
def strip_headings(df: pd.DataFrame):
    """Trim the table to the gender header row and the rows beneath it.

    The gender header is the first row whose third column contains "female"
    (case-insensitive). Only the columns at positions 1 to 4 are kept; the
    result is re-indexed from 0 and missing cells are filled with 0.

    Raises:
        IndexError: if no row has "female" in its third column.
    """
    def find_gender_header_position(df: pd.DataFrame) -> int:
        # `case=False` already makes the match case-insensitive.
        is_gender_header = df.iloc[:, 2].str.contains('female', na=False, case=False)
        # Work with row positions, not index labels: the result feeds `iloc`,
        # and the frame may have had rows filtered out without an index reset.
        positions = is_gender_header.to_numpy().nonzero()[0]
        if positions.size == 0:
            raise IndexError("no gender header row found in the third column")
        return int(positions[0])

    gender_header_pos = find_gender_header_position(df)

    return (
        df.iloc[gender_header_pos:, 1:5]
        .reset_index(drop=True)
        .fillna(0)
    )

In [30]:
def prep_column_names(df: pd.DataFrame):
    """Promote the first row to column names and drop it from the data.

    Each new name is the lower-cased first-row cell prefixed with "epfo_".
    The remaining rows are returned with a fresh 0..n-1 index.
    """
    header_cells = df.iloc[0].tolist()

    new_names = {}
    for old_name, head in zip(df.columns, header_cells):
        new_names[old_name] = "epfo_" + head.lower()

    renamed = df.rename(columns=new_names)
    header_label = renamed.index[0]
    return renamed.drop(header_label).reset_index(drop=True)

In [31]:
def cast_to_numeric(df: pd.DataFrame):
    """Convert every column to numbers after removing comma separators.

    Commas are stripped from all cells first; any cell that still cannot be
    parsed as a number (for example "-") becomes NaN.
    """
    without_commas = df.replace(",", "", regex=True)
    return without_commas.apply(pd.to_numeric, errors="coerce")

In [32]:
# Display the first monthly EPF table as selected, before any cleaning step.
epf_dfs[0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,April 2022,,,,,,,,,,,,,,,
1,Age,Number of new EPF subscribers during the month,,,,,Number of members that ceased subscribing duri...,,,,,Number of exited members who rejoined and resu...,,,,
2,,Male,Female,Others,Not \nAvailable,Total,Male,Female,Others,Not \nAvailable,Total,Male,Female,Others,Not \nAvailable,Total
3,Less than 18,6452,1352,-,-,7804,1184,654,-,-,1838,449,252,-,-,701
4,18-21,212257,55183,-,3,267443,115818,27683,1,1,143503,99019,19651,-,-,118670
5,22-25,196347,77992,1,4,274344,241411,69975,3,3,311392,289641,64049,4,-,353694
6,26-28,85038,31955,1,4,116998,172559,45397,3,8,217967,219714,45750,8,-,265472
7,29-35,121075,56148,1,4,177228,266806,68781,3,61,335651,339875,71544,6,-,411425
8,More than 35,149304,62440,4,13,211761,249613,74780,2,94,324489,306577,72229,3,1,378810
9,Total,770473,,"2,85,070 7",28,1055578,1047391,287270,12,167,1334840,1255275,273475,21,1,1528772


In [42]:
# Run the whole cleaning chain on the first monthly table and display the result.
(
    epf_dfs[0]
    .pipe(strip_month_headline)
    .pipe(exclude_totals_row)
    .pipe(strip_headings)
    .pipe(prep_column_names)
    .pipe(cast_to_numeric)
    # The source header contains a line break; give it a clean column name.
    .rename(columns={"epfo_not \navailable": "epfo_notavailable"})
)

Unnamed: 0,epfo_male,epfo_female,epfo_others,epfo_notavailable
0,6452,1352,,
1,212257,55183,,3.0
2,196347,77992,1.0,4.0
3,85038,31955,1.0,4.0
4,121075,56148,1.0,4.0
5,149304,62440,4.0,13.0


In [53]:
# Clean every monthly table and write it to a CSV named after its parsed
# headline date.
for df in epf_dfs:
    # Read the date before cleaning: the headline row is removed by the chain below.
    df_date = parse_date_head(df).iloc[0].date()
    df = (
        df.pipe(strip_month_headline)
        .pipe(exclude_totals_row)
        .pipe(strip_headings)
        .pipe(prep_column_names)
        .pipe(cast_to_numeric)
        .rename(columns={"epfo_not \navailable": "epfo_notavailable"})
    )
    df.to_csv(f"{df_date}.csv", index=False)

  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)


'2022-04-01'