## Section 1: Imports and global settings

In [1]:
import pandas as pd
from pathlib import Path

# Notebook-wide pandas display settings, applied from a single table so the
# tunable values are visible in one place.
display_options = {
    "display.max_columns": None,  # None lifts the cap on displayed columns
    "display.width": 120,         # character width used when rendering frames
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

## Section 2: Load and concatenate enrolment data

In [2]:
# Load every enrolment CSV from the data directory into one DataFrame.
enrolment_path = Path("../data/enrolment")
# sorted() keeps the file order (and hence the row order) deterministic across runs.
enrolment_files = sorted(enrolment_path.glob("*.csv"))

# glob() yields nothing for a missing or empty directory, and pd.concat would
# then fail with an opaque "No objects to concatenate" error. Fail early and
# name the directory that was searched instead.
if not enrolment_files:
    raise FileNotFoundError(
        f"No enrolment CSV files found in {enrolment_path.resolve()}"
    )

# The generator hands the per-file frames to concat one at a time;
# ignore_index=True renumbers the rows 0..n-1 across all files.
df_enrolment = pd.concat(
    (pd.read_csv(f) for f in enrolment_files),
    ignore_index=True
)

# Last expression of the cell: rendered as (rows, columns).
df_enrolment.shape

(1006029, 7)

In [3]:
# A notebook cell renders only its final expression, so the first two
# inspections are passed to display() explicitly; as bare expressions they
# were evaluated and silently discarded.
display(df_enrolment.head())
display(df_enrolment.columns)
df_enrolment.dtypes

date              object
state             object
district          object
pincode            int64
age_0_5            int64
age_5_17           int64
age_18_greater     int64
dtype: object

## Section 3: Load and concatenate demographic updates

In [4]:
# Load every demographic-update CSV from the data directory into one DataFrame.
demo_path = Path("../data/demographic")
# sorted() keeps the file order (and hence the row order) deterministic across runs.
demo_files = sorted(demo_path.glob("*.csv"))

# glob() yields nothing for a missing or empty directory, and pd.concat would
# then fail with an opaque "No objects to concatenate" error. Fail early and
# name the directory that was searched instead.
if not demo_files:
    raise FileNotFoundError(
        f"No demographic CSV files found in {demo_path.resolve()}"
    )

# ignore_index=True renumbers the rows 0..n-1 across all files.
df_demo_updates = pd.concat(
    (pd.read_csv(f) for f in demo_files),
    ignore_index=True
)

# Only the final expression of a cell is rendered, so shape and columns are
# shown via display(); as bare expressions they were evaluated and discarded.
display(df_demo_updates.shape)
display(df_demo_updates.columns)
df_demo_updates.dtypes

date             object
state            object
district         object
pincode           int64
demo_age_5_17     int64
demo_age_17_      int64
dtype: object

## Section 4: Load and concatenate biometric updates

In [5]:
# Load every biometric-update CSV from the data directory into one DataFrame.
bio_path = Path("../data/biometric")
# sorted() keeps the file order (and hence the row order) deterministic across runs.
bio_files = sorted(bio_path.glob("*.csv"))

# glob() yields nothing for a missing or empty directory, and pd.concat would
# then fail with an opaque "No objects to concatenate" error. Fail early and
# name the directory that was searched instead.
if not bio_files:
    raise FileNotFoundError(
        f"No biometric CSV files found in {bio_path.resolve()}"
    )

# ignore_index=True renumbers the rows 0..n-1 across all files.
df_bio_updates = pd.concat(
    (pd.read_csv(f) for f in bio_files),
    ignore_index=True
)

# Only the final expression of a cell is rendered, so shape and columns are
# shown via display(); as bare expressions they were evaluated and discarded.
display(df_bio_updates.shape)
display(df_bio_updates.columns)
df_bio_updates.dtypes

date            object
state           object
district        object
pincode          int64
bio_age_5_17     int64
bio_age_17_      int64
dtype: object

## Section 5: Schema Alignment and Validation

### Columns common to all datasets

The following columns are present in all three datasets (enrolment, demographic updates, biometric updates):

- date
- state
- district
- pincode