In [206]:
import pandas as pd
import pandera as pa
import typing
from datetime import datetime, date
import requests
import json

from typing import List, Type

from pandera import Column, DataFrameSchema
from pandera.typing import Series

from pydantic import BaseModel, validator, Field
from pydantic import PositiveInt, ValidationError
from pydantic_extra_types.country import CountryAlpha2
from pydantic.functional_validators import AfterValidator
from pydantic.dataclasses import dataclass
# from pydantic.main import ModelMetaClass

### Defining the goals

1. Define a pydantic date class to ensure that date format is 'YYYY-MM-DD' ie '%Y-%m-%d'
2. complete *get_holiday_calendar(country, year)* function which requests holiday. Define the function parameters (year and country) with pydantic types
3. Define *HolidayCalendar* which inherits from pydantic *BaseModel* (or from pandera `DataFrameSchema`)
4. Use functional programming paradigm for the following:
   - *concatenate_dataframes(df1, df2)* which concatenates 2 dataframes of the same pydantic type
   - *filter_country()*, *filter_year()*, *filter_global()*
   - *get_new_holiday()* [ to define ]

More:
- Verify that date is not before 1980

Refs:
- https://www.inwt-statistics.com/blog/pandas-dataframe-validation-with-pydantic

### Define pydantic dates

In [58]:
def validate_yyyymmdd_dateformat(v: str):
    assert datetime.strptime(v, '%Y-%m-%d'), f"{v} format is not YYYY-MM-DD"
    return v

def validate_dateformat(v: str, date_format: str = '%Y-%m-%d'):
    assert datetime.strptime(v, date_format), f"{v} format is not {date_format}"
    return v

yyyymmdd_date_format = typing.Annotated[str, AfterValidator(validate_yyyymmdd_dateformat)]
calendar_date_format = typing.Annotated[str, AfterValidator(lambda v: validate_dateformat(v, date_format='%Y'))]

class YearDateModel(BaseModel):
    date: calendar_date_format

In [56]:
print(YearDateModel(date='2024')) 

date='2024'


### Requesting data

In [51]:
def get_holiday_calendar(country: CountryAlpha2, year: YearDateModel):
    HOLIDAY_URL = "https://date.nager.at/api/v2/publicholidays"
    url = f"{HOLIDAY_URL}/{year}/{country}"
    resp = requests.get(url)

    if resp.status_code != 200:
        raise requests.exceptions.RequestException(f"An error has occured when requesting {url}")

    return pd.json_normalize(resp.json())    

In [52]:
t = get_holiday_calendar(country='US', year=2024)

In [139]:
t.head()

Unnamed: 0,date,localName,name,countryCode,fixed,global,counties,launchYear,type
0,2024-01-01,New Year's Day,New Year's Day,US,False,True,,,0
1,2024-01-15,"Martin Luther King, Jr. Day","Martin Luther King, Jr. Day",US,False,True,,,0
2,2024-02-19,Washington's Birthday,Presidents Day,US,False,True,,,0
3,2024-03-29,Good Friday,Good Friday,US,False,False,"[US-CT, US-DE, US-HI, US-IN, US-KY, US-LA, US-...",,0
4,2024-03-29,Good Friday,Good Friday,US,False,False,[US-TX],,0


### Define dataclass for Holiday

In [79]:
def validate_county_format(v: str):
    assert len(v) == 2 and v.isalpha(), f"County needs to be a 2-letter symbols"
    return v

county_format = typing.Annotated[str, AfterValidator(validate_county_format)]

class CountyFormatModel(BaseModel):
    county: county_format

In [80]:
print(CountyFormatModel(county='CT'))

county='CT'


In [196]:
def validate_country_county_format(v: str):
    tsplit = v.split('-')

    ERROR_MSG = f"{v} needs to be formatted <COUNTRY>-<COUNTY> ex: 'US-CT'"
    if len(tsplit) != 2:
        raise ValueError(ERROR_MSG)

    country, county = tsplit
    is_valid_country = isinstance(CountryAlpha2(country), CountryAlpha2)
    is_valid_county = isinstance(CountyFormatModel(county=county), CountyFormatModel)
    assert is_valid_country and is_valid_county, ERROR_MSG
    return v

country_county_format = typing.Annotated[str, AfterValidator(validate_country_county_format)]

class CountryCountyFormat(BaseModel):
    country_county: country_county_format | None

In [197]:
print(CountryCountyFormat(country_county='US-CT'))
print(CountryCountyFormat(country_county=None))

country_county='US-CT'
country_county=None


In [104]:
class OptionalYYYYMMDDFormat(BaseModel):
    launch_year: yyyymmdd_date_format | None

In [105]:
print(OptionalYYYYMMDDFormat(launch_year=None))
print(OptionalYYYYMMDDFormat(launch_year='2024-10-12'))

launch_year=None
launch_year='2024-10-12'


In [150]:
## --- with pydantic dataclasses
# @dataclass
# class Holiday(BaseModel):
#     date: yyyymmdd_date_format
#     localName: str
#     name: str
#     countryCode: CountryAlpha2
#     fixed: bool
#     _global: bool
#     counties: List[CountryCountyFormat]
#     launchYear: OptionalYYYYMMDDFormat
#     _type: PositiveInt
    

In [177]:
class Holiday(BaseModel):
    date: yyyymmdd_date_format
    localName: str
    name: str
    countryCode: CountryAlpha2
    fixed: bool
    _global: bool
    # counties: List[CountryCountyFormat]
    # launchYear: OptionalYYYYMMDDFormat
    _type: PositiveInt
    

In [151]:
## --- with pandera --- WRONG
# holiday_schema = DataFrameSchema({
#     "date": Column(yyyymmdd_date_format),
#     "localName": Column(str),
#     "name": Column(str),
#     "fixed": Column(bool),
#     "global": Column(bool),
#     "counties": Column(List[CountryCountyFormat], required=False),
#     "launchYear": Column(OptionalYYYYMMDDFormat, required=False),
#     "type": Column(PositiveInt),
# })

In [152]:
## --- with pandera and pydantic
# class Holiday(pandera.DataFrameModel):
#     date: Series[yyyymmdd_date_format]
#     localName: Series[str]
#     name: Series[str]
#     countryCode: Series[CountryAlpha2]
#     fixed: Series[bool]
#     # _global: Series[bool]
#     counties: Series[List[CountryCountyFormat]]
#     launchYear: Series[OptionalYYYYMMDDFormat]
#     # _type: Series[PositiveInt]
    

In [178]:
t0 = t.rename(columns={'global': '_global', 'type': '_type'})
# t0 = t0.drop(columns=['_global', '_type', 'counties', 'launchYear'])
t0 = t0.drop(columns=['counties', 'launchYear'])

In [179]:
t1 = t0.iloc[0].to_dict()
print(t1)

{'date': '2024-01-01', 'localName': "New Year's Day", 'name': "New Year's Day", 'countryCode': 'US', 'fixed': False, '_global': True, '_type': '0'}


In [180]:

Holiday(**t1)

Holiday(date='2024-01-01', localName="New Year's Day", name="New Year's Day", countryCode='US', fixed=False)

In [212]:
Holiday(t)

TypeError: BaseModel.__init__() takes 1 positional argument but 2 were given

In [189]:
# --- list of dict
t.to_dict(orient='records')

In [190]:
class HolidayDataFrame(BaseModel):
    dct_list: List[Holiday]

In [193]:
HolidayDataFrame(dct_list=t.to_dict(orient='records'))

HolidayDataFrame(dct_list=[Holiday(date='2024-01-01', localName="New Year's Day", name="New Year's Day", countryCode='US', fixed=False), Holiday(date='2024-01-15', localName='Martin Luther King, Jr. Day', name='Martin Luther King, Jr. Day', countryCode='US', fixed=False), Holiday(date='2024-02-19', localName="Washington's Birthday", name='Presidents Day', countryCode='US', fixed=False), Holiday(date='2024-03-29', localName='Good Friday', name='Good Friday', countryCode='US', fixed=False), Holiday(date='2024-03-29', localName='Good Friday', name='Good Friday', countryCode='US', fixed=False), Holiday(date='2024-05-27', localName='Memorial Day', name='Memorial Day', countryCode='US', fixed=False), Holiday(date='2024-06-19', localName='Juneteenth National Independence Day', name='Juneteenth National Independence Day', countryCode='US', fixed=False), Holiday(date='2024-07-04', localName='Independence Day', name='Independence Day', countryCode='US', fixed=False), Holiday(date='2024-09-02', l

### Concatenate Dataframes

In [230]:
def is_basemodel_instance(df: pd.DataFrame, dataframe_class: Type[BaseModel]) -> bool:
    """
    Function that checks whether the dataframe df is an instance of the BaseModel
    """
    for index, row in df.iterrows():
        try:
            dataframe_class(**row.to_dict())  # Validate each row
        except ValidationError as e:
            # raise ValueError(f"Row {index} is invalid: {e}")
            return False
    return True

In [247]:
print(is_basemodel_instance(df=t, dataframe_class=Holiday)) # FIXME: should be strictly the same type
print(is_basemodel_instance(df=t0, dataframe_class=Holiday))
print(is_basemodel_instance(df=pd.DataFrame([]), dataframe_class=Holiday)) # base case always True

True
True
True


In [243]:
def concat_dataframes(df1: pd.DataFrame, df2: pd.DataFrame, dataframe_class: Type[BaseModel]):
    # check if df1 and df2 have the same dataframe_class
    is_df1_valid = is_basemodel_instance(df1, Holiday)
    is_df2_valid = is_basemodel_instance(df2, Holiday)

    assert is_df1_valid, f"df1 is not an instance of {dataframe_class.__name__}"
    assert is_df2_valid, f"df2 is not an instance of {dataframe_class.__name__}"

    return pd.concat([df1, df2], ignore_index=True)

In [251]:
years = ['2024']
countries = ['US', 'CA', 'GB', 'AU', 'CN'] 

In [252]:
dfs = pd.DataFrame([])
for year in years:
    for country in countries:
        df = get_holiday_calendar(year=year, country=country)
        dfs = concat_dataframes(dfs, df, Holiday)

Unnamed: 0,date,localName,name,countryCode,fixed,global,counties,launchYear,type
0,2024-01-01,New Year's Day,New Year's Day,US,False,True,,,0
1,2024-01-15,"Martin Luther King, Jr. Day","Martin Luther King, Jr. Day",US,False,True,,,0
2,2024-02-19,Washington's Birthday,Presidents Day,US,False,True,,,0
3,2024-03-29,Good Friday,Good Friday,US,False,False,"[US-CT, US-DE, US-HI, US-IN, US-KY, US-LA, US-...",,0
4,2024-03-29,Good Friday,Good Friday,US,False,False,[US-TX],,0
...,...,...,...,...,...,...,...,...,...
84,2024-04-05,清明节,Qingming Festival (Tomb-Sweeping Day),CN,False,True,,,0
85,2024-05-01,劳动节,Labour Day,CN,False,True,,,0
86,2024-06-10,端午节,Dragon Boat Festival,CN,False,True,,,0
87,2024-09-17,中秋节,Mid-Autumn Festival,CN,False,True,,,0


### Filtering

In [255]:
au_mask = dfs['countryCode'] == 'AU'
dfs[au_mask].head()

Unnamed: 0,date,localName,name,countryCode,fixed,global,counties,launchYear,type
58,2024-01-01,New Year's Day,New Year's Day,AU,False,True,,,0
59,2024-01-26,Australia Day,Australia Day,AU,False,True,,,0
60,2024-03-04,Labour Day,Labour Day,AU,False,False,[AU-WA],,0
61,2024-03-11,Canberra Day,Canberra Day,AU,False,False,[AU-ACT],,0
62,2024-03-11,March Public Holiday,March Public Holiday,AU,False,False,[AU-SA],,0


In [265]:
def filter_by_country(df: pd.DataFrame, country: CountryAlpha2, country_col: str):
    assert country_col in df.columns, f"{country_col} not in dataframe"
    assert isinstance(CountryAlpha2(country), CountryAlpha2), f"{country} not a valid country symbol"

    country_mask = df[country_col] == country
    return df[country_mask]

In [270]:
def filter_by_column_exact_value(df: pd.DataFrame, value: str | int | bool, column_name: str): # value: int, str, date
    assert column_name in df.columns, f"{column_name} not in dataframe"

    mask = df[column_name] == value
    return df[mask]

In [273]:
filter_by_column_exact_value(filter_by_column_exact_value(dfs, 'AU', 'countryCode'), False, 'global')

Unnamed: 0,date,localName,name,countryCode,fixed,global,counties,launchYear,type
60,2024-03-04,Labour Day,Labour Day,AU,False,False,[AU-WA],,0
61,2024-03-11,Canberra Day,Canberra Day,AU,False,False,[AU-ACT],,0
62,2024-03-11,March Public Holiday,March Public Holiday,AU,False,False,[AU-SA],,0
63,2024-03-11,Eight Hours Day,Eight Hours Day,AU,False,False,[AU-TAS],,0
64,2024-03-11,Labour Day,Labour Day,AU,False,False,[AU-VIC],,0
66,2024-03-30,Easter Eve,Holy Saturday,AU,False,False,"[AU-ACT, AU-NSW, AU-NT, AU-QLD, AU-SA, AU-VIC]",,0
67,2024-03-31,Easter Sunday,Easter Sunday,AU,False,False,"[AU-ACT, AU-NSW, AU-NT, AU-QLD, AU-SA, AU-VIC,...",,0
70,2024-05-06,May Day,May Day,AU,False,False,[AU-NT],,0
71,2024-05-06,Labour Day,Labour Day,AU,False,False,[AU-QLD],,0
72,2024-05-27,Reconciliation Day,Reconciliation Day,AU,False,False,[AU-ACT],,0


True