In [30]:
%matplotlib widget

In [31]:
import camelot
from pathlib import Path
import pandas as pd
import numpy as np

In [32]:
# source_reports = {"march_2022", "december_2022"}
src_pdf_filename = "december_2022"
payroll_report_pdf = Path.cwd().parent / "data" / "raw" / "payroll_reports" / f"{src_pdf_filename}.pdf"

In [33]:
raw_tables = camelot.read_pdf(str(payroll_report_pdf), pages="all", flavor="lattice")



In [34]:
tables = list(map(lambda t: t.df.copy(), raw_tables))

In [35]:
def basic_cleaning(df: pd.DataFrame):
    df = df.dropna(how="all", axis=1)
    df = df.replace(r"[\r\n\s]+", " ", regex=True) \
        .replace('-', '', regex=True) \
        .replace("", np.nan)
    print("Load Table:", df.shape, "->", df.shape)
    return df

In [36]:
cln_tables = list(map(basic_cleaning, tables))

Load Table: (9, 16) -> (9, 16)
Load Table: (9, 16) -> (9, 16)
Load Table: (9, 16) -> (9, 16)
Load Table: (9, 16) -> (9, 16)
Load Table: (9, 16) -> (9, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (9, 9) -> (9, 9)
Load Table: (9, 9) -> (9, 9)
Load Table: (9, 9) -> (9, 9)
Load Table: (9, 9) -> (9, 9)
Load Table: (9, 9) -> (9, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 18) -> (10, 18)
Load Table: (10, 18) -> (10, 18)
Load Table: (10, 18) -> (10, 18)
Load Table: (10, 18) -> (

In [37]:
def is_monthly(df: pd.DataFrame):
    return df.iloc[0, :].str.contains("\w{3,9}\s\d{4}", regex=True).any()

In [38]:
def is_epf(df: pd.DataFrame):
    return df.iloc[:, 1].str.lower().str.contains('new EPF subscriber', na=False, case=False).any()

In [39]:
monthly_epf_tables = list(filter(is_monthly, filter(is_epf, cln_tables)))

In [40]:
def parse_date_head(df: pd.DataFrame) -> pd.Series:
    return pd.to_datetime(
        df.iloc[:, 0],
        format="%B %Y",
        errors="coerce"
    ).dropna()

In [41]:
def strip_month_headline(df: pd.DataFrame):
    date_df = parse_date_head(df)
    date_row_idx = date_df.index[0]
    return df.iloc[date_row_idx + 1:].reset_index(drop=True)


In [42]:
def exclude_totals_row(df: pd.DataFrame):
    return df[~df.iloc[:, 0].str.contains("total", na=False, case=False)]

In [43]:
def epf_correct_camelot(df: pd.DataFrame):
    # TODO: find a method to resolve this issue with camelot itself.
    if df.iloc[:, 0].str.match(r'([\s\w]+)(\d+) (\d[\d,.]*)$').any():
        df.iloc[7, [0,1]] = df.iloc[:, 0].str.extract(r'(.*) (\d[\d,.]*)$').iloc[7]
        df.iloc[2, [0,1]] = df.iloc[:, 1].str.extract(r'(.*) (\d[\d,.]*)$').iloc[2]
    return df

In [44]:
def prep_row_labels(df: pd.DataFrame, lheads=["head", "gender"]):
    headings = df.iloc[:, 0].str.replace(r"(\d{2})(\d{2})", r"\1-\2", regex=True)
    headings[0:len(lheads)] = lheads
    headings = headings.str.lower()
    return df.rename(index=headings).drop(0, axis=1)

In [45]:
def reshape_epf(df: pd.DataFrame, id_vars=["head", "gender"]):
    df.iloc[0] = df.iloc[0].str.replace("\n", "").replace("", np.nan).ffill()
    df = df.T.melt(id_vars=id_vars, var_name="age")
    categorical_columns = id_vars + ["age"]
    df[categorical_columns] = df[categorical_columns].astype("category")
    # TODO: convert values to integer values
    # df.value = pd.to_numeric(df.value.str.replace(",", "").str.strip(), errors="coerce", downcast="unsigned")
    return df

In [46]:
def display_df(df: pd.DataFrame):
    display(df)
    return df

In [47]:
monthly_epf_tables[0].pipe(strip_month_headline).pipe(exclude_totals_row).pipe(display_df).pipe(epf_correct_camelot).pipe(prep_row_labels).pipe(display_df).pipe(reshape_epf)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,Age,Number of new EPF subscribers during the month,,,,,Number of members that ceased subscribing duri...,,,,,Number of exited members who rejoined and resu...,,,,
1,,Male,Female,Others,Not Available,Total,Male,Female,Others,Not Available,Total,Male,Female,Others,Not Available,Total
2,,"Less than 18 6,475",1358,,,7833,1216,659,,,1875,468,257,,,725
3,1821,212534,55263,,3,267800,120147,28271,1,1,148420,101355,19997,,,121352
4,2225,196747,78152,1,4,274904,249681,71402,3,3,321089,293409,64732,4,,358145
5,2628,85285,32066,1,4,117356,178061,46293,3,8,224365,222038,46113,8,,268159
6,2935,121667,56418,1,4,178090,275107,70516,3,61,345687,343358,72223,6,,415587
7,"More than 35 1,50,424",,63300,4,13,213741,258615,77029,2,93,335739,310385,73044,3,1,383433


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
head,Number of new EPF subscribers during the month,,,,,Number of members that ceased subscribing duri...,,,,,Number of exited members who rejoined and resu...,,,,
gender,Male,Female,Others,Not Available,Total,Male,Female,Others,Not Available,Total,Male,Female,Others,Not Available,Total
less than 18,6475,1358,,,7833,1216,659,,,1875,468,257,,,725
18-21,212534,55263,,3,267800,120147,28271,1,1,148420,101355,19997,,,121352
22-25,196747,78152,1,4,274904,249681,71402,3,3,321089,293409,64732,4,,358145
26-28,85285,32066,1,4,117356,178061,46293,3,8,224365,222038,46113,8,,268159
29-35,121667,56418,1,4,178090,275107,70516,3,61,345687,343358,72223,6,,415587
more than 35,150424,63300,4,13,213741,258615,77029,2,93,335739,310385,73044,3,1,383433


Unnamed: 0,head,gender,age,value
0,Number of new EPF subscribers during the month,Male,less than 18,6475
1,Number of new EPF subscribers during the month,Female,less than 18,1358
2,Number of new EPF subscribers during the month,Others,less than 18,
3,Number of new EPF subscribers during the month,Not Available,less than 18,
4,Number of new EPF subscribers during the month,Total,less than 18,7833
...,...,...,...,...
85,Number of exited members who rejoined and resu...,Male,more than 35,310385
86,Number of exited members who rejoined and resu...,Female,more than 35,73044
87,Number of exited members who rejoined and resu...,Others,more than 35,3
88,Number of exited members who rejoined and resu...,Not Available,more than 35,1


In [48]:
parse_date_head(monthly_epf_tables[0]).iloc[0]

Timestamp('2022-04-01 00:00:00')

In [49]:
def epf_data_pipeline(df: pd.DataFrame):
    month = parse_date_head(df).iloc[0]
    df = df.pipe(strip_month_headline) \
        .pipe(exclude_totals_row) \
        .pipe(epf_correct_camelot) \
        .pipe(prep_row_labels) \
        .pipe(reshape_epf)
    df["year"] = month.year
    df["month"] = month.month
    df["sector"] = np.nan
    return df

In [50]:
fnl_epf = pd.concat(map(epf_data_pipeline, monthly_epf_tables))

In [51]:
def is_esic(df: pd.DataFrame):
    return df.iloc[:, 5].str.lower().str.contains('Number of newly registered', na=False, case=False).any()

In [52]:
monthly_esic_tables = list(filter(is_monthly, filter(is_esic, tables)))

In [53]:
monthly_esic_tables[0].pipe(strip_month_headline).pipe(exclude_totals_row).pipe(prep_row_labels).pipe(display_df).pipe(reshape_epf).head()

Unnamed: 0,1,2,3,4,5,6,7,8
head,Number of existing employees who paid during \...,,,,Number of newly registered employees & \npayin...,,,
gender,Male,Female,Others,Total,Male,Female,Others,Total
less than 18,4483,2803,,7286,1445,491,,1936
18-21,918773,198623,35,1117431,207870,38307,7,246184
22-25,3544932,885911,117,4430960,302642,71113,10,373765
26-28,3093386,601837,104,3695327,156863,31401,7,188271
29-35,5744816,1163983,196,6908995,194401,51892,8,246301
more than 35,8245820,2431948,451,10678219,177240,55942,13,233195


Unnamed: 0,head,gender,age,value
0,Number of existing employees who paid during t...,Male,less than 18,4483.0
1,Number of existing employees who paid during t...,Female,less than 18,2803.0
2,Number of existing employees who paid during t...,Others,less than 18,
3,Number of existing employees who paid during t...,Total,less than 18,7286.0
4,Number of newly registered employees & paying ...,Male,less than 18,1445.0


In [54]:
def esic_data_pipeline(df: pd.DataFrame):
    month = parse_date_head(df).iloc[0]
    df = df.pipe(strip_month_headline) \
        .pipe(exclude_totals_row) \
        .pipe(prep_row_labels) \
        .pipe(reshape_epf)
    df["year"] = month.year
    df["month"] = month.month
    df["sector"] = np.nan
    return df

In [55]:
fnl_esic = pd.concat(map(esic_data_pipeline, monthly_esic_tables))

In [56]:
fnl_esic

Unnamed: 0,head,gender,age,value,year,month,sector
0,Number of existing employees who paid during t...,Male,less than 18,4483,2022,4,
1,Number of existing employees who paid during t...,Female,less than 18,2803,2022,4,
2,Number of existing employees who paid during t...,Others,less than 18,,2022,4,
3,Number of existing employees who paid during t...,Total,less than 18,7286,2022,4,
4,Number of newly registered employees & paying ...,Male,less than 18,1445,2022,4,
...,...,...,...,...,...,...,...
43,Number of existing employees who paid during t...,Total,more than 35,10564641,2022,12,
44,Number of newly registered employees & paying ...,Male,more than 35,201668,2022,12,
45,Number of newly registered employees & paying ...,Female,more than 35,68303,2022,12,
46,Number of newly registered employees & paying ...,Others,more than 35,8,2022,12,


In [57]:
def is_nps(df: pd.DataFrame):
    return df.iloc[:, 1].str.lower().str.replace("\n", "").str.contains('existing subscribers', na=False,case=False).any()


In [58]:
monthly_nps_tables = list(filter(is_monthly, filter(is_nps, tables)))

In [59]:
from functools import partial
prep_nps_row_labels = partial(prep_row_labels, lheads=["head", "sector", "gender"])
# reshape_nps = partial(reshape_epf, id_vars=["head", "gender", "sector"])

In [60]:
def reshape_nps(df: pd.DataFrame, id_vars=["head", "gender", "sector"]):
    df = df.T.replace("", np.nan).ffill()
    df = df.melt(id_vars=id_vars, var_name="age")
    categorical_columns = id_vars + ["age"]
    df[categorical_columns] = df[categorical_columns].astype("category")
    return df

In [61]:
monthly_nps_tables[0].pipe(strip_month_headline).pipe(exclude_totals_row).pipe(prep_nps_row_labels).pipe(display_df).pipe(reshape_nps)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
head,Total Existing \nSubscribers \ncontributing \n...,New Subscribers contributing during the month,,,,,,,,,,,,,,,
sector,,Central Govt,,,,,State Govt,,,,,Non-Govt (Corporate Sector),,,,,Total New \nSubscribers
gender,,Male,Female,Transgender,Non-\nIRA,Total,Male,Female,Transgender,Non-IRA,Total,Male,Female,Transgender,Non-IRA,Total,
18-21,12538,394,66,-,,- 460,371,93,-,,- 464,79,35,-,,- 114,1038
22-25,234188,3309,621,-,,"- 3,930",1819,818,-,,"- 2,637",2319,1152,-,,"- 3,471",10038
26-28,511356,3427,542,-,,"- 3,969",2712,1508,-,,"- 4,220",2647,1075,-,,"- 3,722",11911
29-35,2121014,2350,479,-,,"- 2,829",6085,4332,-,,"- 10,417",5570,1418,1,,"- 6,989",20235
> 35,3096941,696,272,-,,- 968,6794,4574,-,,"- 11,368",7780,1230,-,,"- 9,010",21346
non-ira,1830,-,-,-,-,-,-,-,,- 1,1,-,-,-,-,,- 1


Unnamed: 0,head,gender,sector,age,value
0,Total Existing \nSubscribers \ncontributing \n...,,,18-21,12538
1,New Subscribers contributing during the month,Male,Central Govt,18-21,394
2,New Subscribers contributing during the month,Female,Central Govt,18-21,66
3,New Subscribers contributing during the month,Transgender,Central Govt,18-21,-
4,New Subscribers contributing during the month,Non-\nIRA,Central Govt,18-21,-
...,...,...,...,...,...
97,New Subscribers contributing during the month,Female,Non-Govt (Corporate Sector),non-ira,-
98,New Subscribers contributing during the month,Transgender,Non-Govt (Corporate Sector),non-ira,-
99,New Subscribers contributing during the month,Non-IRA,Non-Govt (Corporate Sector),non-ira,-
100,New Subscribers contributing during the month,Total,Non-Govt (Corporate Sector),non-ira,-


In [62]:
def nps_data_pipeline(df: pd.DataFrame):
    month = parse_date_head(df).iloc[0]
    df = df.pipe(strip_month_headline) \
        .pipe(exclude_totals_row) \
        .pipe(prep_nps_row_labels) \
        .pipe(reshape_nps)
    df["year"] = month.year
    df["month"] = month.month
    for col in ["head", 'gender', 'sector']:
        df[col] = df[col].str.replace("\n", "")
    df["age"] = df["age"].str.replace("> 35", "more than 35")
    return df

In [63]:
fnl_nps = pd.concat(map(nps_data_pipeline, monthly_nps_tables))

In [64]:
fnl_nps

Unnamed: 0,head,gender,sector,age,value,year,month
0,Total Existing Subscribers contributing during...,,,18-21,12538,2022,4
1,New Subscribers contributing during the month,Male,Central Govt,18-21,394,2022,4
2,New Subscribers contributing during the month,Female,Central Govt,18-21,66,2022,4
3,New Subscribers contributing during the month,Transgender,Central Govt,18-21,-,2022,4
4,New Subscribers contributing during the month,Non-IRA,Central Govt,18-21,-,2022,4
...,...,...,...,...,...,...,...
97,New Subscribers contributing during the month,Female,Non-Govt (Corporate Sector),non-ira,-,2022,12
98,New Subscribers contributing during the month,Transgender,Non-Govt (Corporate Sector),non-ira,-,2022,12
99,New Subscribers contributing during the month,Non-IRA,Non-Govt (Corporate Sector),non-ira,-,2022,12
100,New Subscribers contributing during the month,Total,Non-Govt (Corporate Sector),non-ira,-,2022,12


In [65]:
fnl_df = pd.concat([fnl_nps, fnl_epf, fnl_esic])

In [66]:
interim_path = Path.cwd().parent / "data" / "interim" / f"{src_pdf_filename}.csv"
fnl_df.to_csv(interim_path, index=False)