In [210]:
%matplotlib widget

In [211]:
import camelot
from pathlib import Path
import pandas as pd
import numpy as np

In [212]:
src_pdf_filename = "march_2022"
payroll_report_pdf = Path.cwd().parent / "data" / "raw" / "payroll_reports" / f"{src_pdf_filename}.pdf"

In [213]:
raw_tables = camelot.read_pdf(str(payroll_report_pdf), pages="all", flavor="lattice")



In [214]:
tables = list(map(lambda t: t.df.copy(), raw_tables))

In [215]:
def basic_cleaning(df: pd.DataFrame):
    df = df.dropna(how="all", axis=1)
    df = df.replace(r"[\r\n\s]+", " ", regex=True) \
        .replace('-', '', regex=True) \
        .replace("", np.nan)
    print("Load Table:", df.shape, "->", df.shape)
    return df

In [216]:
cln_tables = list(map(basic_cleaning, tables))

Load Table: (9, 16) -> (9, 16)
Load Table: (9, 16) -> (9, 16)
Load Table: (9, 16) -> (9, 16)
Load Table: (9, 16) -> (9, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (10, 16) -> (10, 16)
Load Table: (9, 9) -> (9, 9)
Load Table: (9, 9) -> (9, 9)
Load Table: (9, 9) -> (9, 9)
Load Table: (9, 9) -> (9, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 9) -> (10, 9)
Load Table: (10, 9) -> 

In [217]:
def is_monthly(df: pd.DataFrame):
    return df.iloc[0, :].str.contains("\w{3,9}\s\d{4}", regex=True).any()

In [218]:
def is_epf(df: pd.DataFrame):
    return df.iloc[:, 1].str.lower().str.contains('new EPF subscriber', na=False, case=False).any()

In [219]:
monthly_epf_tables = list(filter(is_monthly, filter(is_epf, cln_tables)))

In [220]:
def parse_date_head(df: pd.DataFrame) -> pd.Series:
    return pd.to_datetime(
        df.iloc[:, 0],
        format="%B %Y",
        errors="coerce"
    ).dropna()

In [221]:
def strip_month_headline(df: pd.DataFrame):
    date_df = parse_date_head(df)
    date_row_idx = date_df.index[0]
    return df.iloc[date_row_idx + 1:].reset_index(drop=True)


In [222]:
def exclude_totals_row(df: pd.DataFrame):
    return df[~df.iloc[:, 0].str.contains("total", na=False, case=False)]

In [223]:
def epf_correct_camelot(df: pd.DataFrame):
    # TODO: find a method to resolve this issue with camelot itself.
    if df.iloc[:, 0].str.match(r'([\s\w]+) (\d) (\d[\d,.]*)$').any():
        df.iloc[7, [0,1]] = df.iloc[:, 0].str.extract(r'(.*) (\d[\d,.]*)$').iloc[7]
        df.iloc[2, [0,1]] = df.iloc[:, 1].str.extract(r'(.*) (\d[\d,.]*)$').iloc[2]
    return df

In [224]:
def prep_row_labels(df: pd.DataFrame, lheads=["head", "gender"]):
    headings = df.iloc[:, 0].str.replace(r"(\d{2})(\d{2})", r"\1-\2", regex=True)
    headings[0:len(lheads)] = lheads
    headings = headings.str.lower()
    return df.rename(index=headings).drop(0, axis=1)

In [225]:
def reshape_epf(df: pd.DataFrame, id_vars=["head", "gender"]):
    df.iloc[0] = df.iloc[0].str.replace("\n", "").replace("", np.nan).ffill()
    df = df.T.melt(id_vars=id_vars, var_name="age")
    categorical_columns = id_vars + ["age"]
    df[categorical_columns] = df[categorical_columns].astype("category")
    # TODO: convert values to integer values
    # df.value = pd.to_numeric(df.value.str.replace(",", "").str.strip(), errors="coerce", downcast="unsigned")
    return df

In [226]:
def display_df(df: pd.DataFrame):
    display(df)
    return df

In [227]:
monthly_epf_tables[0].pipe(strip_month_headline).pipe(exclude_totals_row).pipe(display_df).pipe(epf_correct_camelot).pipe(prep_row_labels).pipe(display_df).pipe(reshape_epf)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,Age,Number of new EPF subscribers during the month,,,,,Number of members that ceased subscribing duri...,,,,,Number of exited members who rejoined and resu...,,,,
1,,Male,Female,Others,Not Available,Total,Male,Female,Others,Not Available,Total,Male,Female,Others,Not Available,Total
2,Less than 18,5113,2253,,2,7368,1381,961,,1,2343,583,413,,,996
3,1821,159264,45618,1,2,204885,125257,25808,1,2,151068,91712,18738,,,110450
4,2225,151502,60961,2,2,212467,244727,59732,4,6,304469,232357,51407,3,,283767
5,2628,66699,24307,1,3,91010,168592,39945,2,26,208565,174474,36473,1,,210948
6,2935,93449,42673,1,4,136127,250978,60885,5,148,312016,260081,54954,5,,315040
7,More than 35,109095,44920,2,4,154021,267823,75393,7,309,343532,238353,55431,5,1,293790


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
head,Number of new EPF subscribers during the month,,,,,Number of members that ceased subscribing duri...,,,,,Number of exited members who rejoined and resu...,,,,
gender,Male,Female,Others,Not Available,Total,Male,Female,Others,Not Available,Total,Male,Female,Others,Not Available,Total
less than 18,5113,2253,,2,7368,1381,961,,1,2343,583,413,,,996
18-21,159264,45618,1,2,204885,125257,25808,1,2,151068,91712,18738,,,110450
22-25,151502,60961,2,2,212467,244727,59732,4,6,304469,232357,51407,3,,283767
26-28,66699,24307,1,3,91010,168592,39945,2,26,208565,174474,36473,1,,210948
29-35,93449,42673,1,4,136127,250978,60885,5,148,312016,260081,54954,5,,315040
more than 35,109095,44920,2,4,154021,267823,75393,7,309,343532,238353,55431,5,1,293790


Unnamed: 0,head,gender,age,value
0,Number of new EPF subscribers during the month,Male,less than 18,5113
1,Number of new EPF subscribers during the month,Female,less than 18,2253
2,Number of new EPF subscribers during the month,Others,less than 18,
3,Number of new EPF subscribers during the month,Not Available,less than 18,2
4,Number of new EPF subscribers during the month,Total,less than 18,7368
...,...,...,...,...
85,Number of exited members who rejoined and resu...,Male,more than 35,238353
86,Number of exited members who rejoined and resu...,Female,more than 35,55431
87,Number of exited members who rejoined and resu...,Others,more than 35,5
88,Number of exited members who rejoined and resu...,Not Available,more than 35,1


In [228]:
parse_date_head(monthly_epf_tables[0]).iloc[0]

Timestamp('2021-04-01 00:00:00')

In [229]:
def epf_data_pipeline(df: pd.DataFrame):
    month = parse_date_head(df).iloc[0]
    df = df.pipe(strip_month_headline) \
        .pipe(exclude_totals_row) \
        .pipe(epf_correct_camelot) \
        .pipe(prep_row_labels) \
        .pipe(reshape_epf)
    df["year"] = month.year
    df["month"] = month.month
    df["sector"] = np.nan
    return df

In [230]:
fnl_epf = pd.concat(map(epf_data_pipeline, monthly_epf_tables))

In [231]:
def is_esic(df: pd.DataFrame):
    return df.iloc[:, 5].str.lower().str.contains('Number of newly registered', na=False, case=False).any()

In [232]:
monthly_esic_tables = list(filter(is_monthly, filter(is_esic, tables)))

In [233]:
monthly_esic_tables[0].pipe(strip_month_headline).pipe(exclude_totals_row).pipe(prep_row_labels).pipe(display_df).pipe(reshape_epf).head()

Unnamed: 0,1,2,3,4,5,6,7,8
head,Number of existing employees who paid during t...,,,,Number of newly registered employees & \npayin...,,,
gender,Male,Female,Others,Total,Male,Female,Others,Total
less than 18,3697,3293,,6990,1287,1179,,2466
18-21,659718,137628,16,797362,164651,33390,1,198042
22-25,3130525,733868,105,3864498,252353,60649,7,313009
26-28,2935539,564793,92,3500424,130367,27451,3,157821
29-35,5387458,1056183,202,6443843,163346,44383,8,207737
more than 35,7796460,2194719,420,9991599,153023,46333,11,199367


Unnamed: 0,head,gender,age,value
0,Number of existing employees who paid during t...,Male,less than 18,3697.0
1,Number of existing employees who paid during t...,Female,less than 18,3293.0
2,Number of existing employees who paid during t...,Others,less than 18,
3,Number of existing employees who paid during t...,Total,less than 18,6990.0
4,Number of newly registered employees & paying ...,Male,less than 18,1287.0


In [234]:
def esic_data_pipeline(df: pd.DataFrame):
    month = parse_date_head(df).iloc[0]
    df = df.pipe(strip_month_headline) \
        .pipe(exclude_totals_row) \
        .pipe(prep_row_labels) \
        .pipe(reshape_epf)
    df["year"] = month.year
    df["month"] = month.month
    df["sector"] = np.nan
    return df

In [235]:
fnl_esic = pd.concat(map(esic_data_pipeline, monthly_esic_tables))

In [236]:
fnl_esic

Unnamed: 0,head,gender,age,value,year,month,sector
0,Number of existing employees who paid during t...,Male,less than 18,3697,2021,4,
1,Number of existing employees who paid during t...,Female,less than 18,3293,2021,4,
2,Number of existing employees who paid during t...,Others,less than 18,,2021,4,
3,Number of existing employees who paid during t...,Total,less than 18,6990,2021,4,
4,Number of newly registered employees & paying ...,Male,less than 18,1287,2021,4,
...,...,...,...,...,...,...,...
43,Number of existing employees who paid during t...,Total,more than 35,10074164,2022,3,
44,Number of newly registered employees & paying ...,Male,more than 35,194763,2022,3,
45,Number of newly registered employees & paying ...,Female,more than 35,63191,2022,3,
46,Number of newly registered employees & paying ...,Others,more than 35,11,2022,3,


In [237]:
def is_nps(df: pd.DataFrame):
    return df.iloc[:, 1].str.lower().str.replace("\n", "").str.contains('existing subscribers', na=False,case=False).any()


In [238]:
monthly_nps_tables = list(filter(is_monthly, filter(is_nps, tables)))

In [239]:
from functools import partial
prep_nps_row_labels = partial(prep_row_labels, lheads=["head", "sector", "gender"])
# reshape_nps = partial(reshape_epf, id_vars=["head", "gender", "sector"])

In [240]:
def reshape_nps(df: pd.DataFrame, id_vars=["head", "gender", "sector"]):
    df = df.T.replace("", np.nan).ffill()
    df = df.melt(id_vars=id_vars, var_name="age")
    categorical_columns = id_vars + ["age"]
    df[categorical_columns] = df[categorical_columns].astype("category")
    return df

In [241]:
monthly_nps_tables[0].pipe(strip_month_headline).pipe(exclude_totals_row).pipe(prep_nps_row_labels).pipe(display_df).pipe(reshape_nps)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
head,Total Existing \nSubscribers \ncontributing du...,New Subscribers contributing during the month,,,,,,,,,,,,,,,
sector,,Central Govt,,,,,State Govt,,,,,Non-Govt (Corporate Sector),,,,,Total New \nSubscribers
gender,,Male,Female,Transgender,Non-IRA,Total,Male,Female,Transgender,Non-IRA,Total,Male,Female,Transgender,Non-IRA,Total,
18-21,15807,210,35,-,,- 245,607,130,-,,- 737,33,39,-,,- 72,1054
22-25,255214,2098,312,-,,"- 2,410",3522,1586,-,,"- 5,108",1190,846,-,,"- 2,036",9554
26-28,545355,2362,376,-,,"- 2,738",4078,2329,-,,"- 6,407",1404,745,-,,"- 2,149",11294
29-35,2057265,2533,606,-,,"- 3,139",10052,6240,-,,"- 16,292",4812,1035,-,,"- 5,847",25278
> 35,2576417,547,204,-,,- 751,3054,1860,-,,"- 4,914",2620,379,-,,"- 2,999",8664
non-ira,1720,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-


Unnamed: 0,head,gender,sector,age,value
0,Total Existing \nSubscribers \ncontributing du...,,,18-21,15807
1,New Subscribers contributing during the month,Male,Central Govt,18-21,210
2,New Subscribers contributing during the month,Female,Central Govt,18-21,35
3,New Subscribers contributing during the month,Transgender,Central Govt,18-21,-
4,New Subscribers contributing during the month,Non-IRA,Central Govt,18-21,-
...,...,...,...,...,...
97,New Subscribers contributing during the month,Female,Non-Govt (Corporate Sector),non-ira,-
98,New Subscribers contributing during the month,Transgender,Non-Govt (Corporate Sector),non-ira,-
99,New Subscribers contributing during the month,Non-IRA,Non-Govt (Corporate Sector),non-ira,-
100,New Subscribers contributing during the month,Total,Non-Govt (Corporate Sector),non-ira,-


In [242]:
def nps_data_pipeline(df: pd.DataFrame):
    month = parse_date_head(df).iloc[0]
    df = df.pipe(strip_month_headline) \
        .pipe(exclude_totals_row) \
        .pipe(prep_nps_row_labels) \
        .pipe(reshape_nps)
    df["year"] = month.year
    df["month"] = month.month
    for col in ["head", 'gender', 'sector']:
        df[col] = df[col].str.replace("\n", "")
    df["age"] = df["age"].str.replace("> 35", "more than 35")
    return df

In [243]:
fnl_nps = pd.concat(map(nps_data_pipeline, monthly_nps_tables))

In [244]:
fnl_nps

Unnamed: 0,head,gender,sector,age,value,year,month
0,Total Existing Subscribers contributing during...,,,18-21,15807,2021,4
1,New Subscribers contributing during the month,Male,Central Govt,18-21,210,2021,4
2,New Subscribers contributing during the month,Female,Central Govt,18-21,35,2021,4
3,New Subscribers contributing during the month,Transgender,Central Govt,18-21,-,2021,4
4,New Subscribers contributing during the month,Non-IRA,Central Govt,18-21,-,2021,4
...,...,...,...,...,...,...,...
97,New Subscribers contributing during the month,Female,Non-Govt (Corporate Sector),non-ira,-,2022,3
98,New Subscribers contributing during the month,Transgender,Non-Govt (Corporate Sector),non-ira,-,2022,3
99,New Subscribers contributing during the month,Non-IRA,Non-Govt (Corporate Sector),non-ira,-,2022,3
100,New Subscribers contributing during the month,Total,Non-Govt (Corporate Sector),non-ira,-,2022,3


In [245]:
fnl_df = pd.concat([fnl_nps, fnl_epf, fnl_esic])

In [246]:
interim_path = Path.cwd().parent / "data" / "interim" / f"{src_pdf_filename}.csv"
fnl_df.to_csv(interim_path, index=False)