In [73]:
import pandas as pd
from pathlib import Path
import numpy as np

In [74]:
data_path = Path.cwd().parent / "data"
interim_data_dir = data_path / "interim"
interim_data_dir.mkdir(parents=True, exist_ok=True)
final_data_dir = data_path / "processed"
final_data_dir.mkdir(parents=True, exist_ok=True)

In [75]:
src_files = [interim_data_dir / "december_2022.csv", interim_data_dir / "march_2022.csv"]


## Read interim data

In [76]:
df = pd.concat(map(pd.read_csv, src_files))
df

Unnamed: 0,head,gender,sector,age,value,year,month
0,Total Existing Subscribers contributing during...,,,18-21,12538,2022,4
1,New Subscribers contributing during the month,Male,Central Govt,18-21,394,2022,4
2,New Subscribers contributing during the month,Female,Central Govt,18-21,66,2022,4
3,New Subscribers contributing during the month,Transgender,Central Govt,18-21,-,2022,4
4,New Subscribers contributing during the month,Non-IRA,Central Govt,18-21,-,2022,4
...,...,...,...,...,...,...,...
2875,Number of existing employees who paid during t...,Total,,more than 35,10074164,2022,3
2876,Number of newly registered employees & paying ...,Male,,more than 35,194763,2022,3
2877,Number of newly registered employees & paying ...,Female,,more than 35,63191,2022,3
2878,Number of newly registered employees & paying ...,Others,,more than 35,11,2022,3


In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5040 entries, 0 to 2879
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   head    5040 non-null   object
 1   gender  4914 non-null   object
 2   sector  2016 non-null   object
 3   age     5040 non-null   object
 4   value   4688 non-null   object
 5   year    5040 non-null   int64 
 6   month   5040 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 315.0+ KB


## Handle categories

In [78]:
df = df.astype({
    "head": "category",
    "gender": "category",
    "sector": "category",
    "age": "category"
})

In [79]:
for h in df["head"].unique():
    print(h)

Total Existing Subscribers contributing during the month
New Subscribers contributing during the month
Number of new EPF subscribers during the month
Number of members that ceased subscribing during the month
Number of exited members who rejoined and resubscribed during the month
Number of existing employees who paid during the month
Number of newly registered employees & paying contribution during the month


In [80]:
df["head"] = df["head"].cat.rename_categories({
    "Total Existing Subscribers contributing during the month": "Total Existing Subscribers contributing to NPS during the month",
    "New Subscribers contributing during the month": "New Subscribers contributing to NPS during the month",
    "Number of members that ceased subscribing during the month": "Number of EPF members that ceased subscribing during the month",
    "Number of exited members who rejoined and resubscribed during the month": "Number of exited EPF members who rejoined and resubscribed during the month",
    "Number of existing employees who paid during the month": "Number of existing employees who paid ESIS contribuition during the month",
    "Number of newly registered employees & paying contribution during the month": "Number of newly registered employees & paying ESIS contribution during the month"
})


## Convert values to integer type

In [81]:
df.value = df.value.str.replace(r"[,\s]+", "", regex=True).str.strip()

In [82]:
df.value = df.value.str.replace(r"^\-$", "", regex=True)

In [83]:
df.value = df.value.replace("", 0).replace(np.nan, 0).astype("int")

In [89]:
df[(df.value == 0) & (df.gender.isin(["Male", "Female"]))].age.unique()

['non-ira']
Categories (7, object): ['18-21', '22-25', '26-28', '29-35', 'less than 18', 'more than 35', 'non-ira']

## Final Data

In [90]:
df

Unnamed: 0,head,gender,sector,age,value,year,month
0,Total Existing Subscribers contributing to NPS...,,,18-21,12538,2022,4
1,New Subscribers contributing to NPS during the...,Male,Central Govt,18-21,394,2022,4
2,New Subscribers contributing to NPS during the...,Female,Central Govt,18-21,66,2022,4
3,New Subscribers contributing to NPS during the...,Transgender,Central Govt,18-21,0,2022,4
4,New Subscribers contributing to NPS during the...,Non-IRA,Central Govt,18-21,0,2022,4
...,...,...,...,...,...,...,...
2875,Number of existing employees who paid ESIS con...,Total,,more than 35,10074164,2022,3
2876,Number of newly registered employees & paying ...,Male,,more than 35,194763,2022,3
2877,Number of newly registered employees & paying ...,Female,,more than 35,63191,2022,3
2878,Number of newly registered employees & paying ...,Others,,more than 35,11,2022,3


### Export Data

In [93]:
df.to_parquet(final_data_dir / "march_2021_to_december_2022.parquet")
df.to_csv(final_data_dir / "march_2021_to_december_2022.csv", index=False)