# Processing Religion Data

In [70]:
import json
from typing import Any, Iterable, Iterator, Optional

import pandas as pd

# County definitions; the county lookup further down reads the 'id' and 'ids'
# entries of every record.
filepath: str = 'out/counties.json'

# The context manager closes the file even when parsing the JSON fails.
with open(filepath, 'r', encoding='utf-8') as file:
    counties: list[dict[str, Any]] = json.load(file)

filepath = 'assets/religion/religion.csv'

# Read only the four columns that are used below from the semicolon-separated
# file; nullable integer dtypes let empty cells survive the load.
df: pd.DataFrame = pd.read_csv(
    filepath,
    delimiter=';',
    keep_default_na=True,
    dtype={
        'OBS_VALUE': 'Int64',
        'TEL_SZ_ADAT': 'category',
        'TERUL_GEO5': 'string',
        'TIME_PERIOD': 'Int64',
    },
    usecols=[
        'OBS_VALUE',
        'TEL_SZ_ADAT',
        'TERUL_GEO5',
        'TIME_PERIOD',
    ],
)

# Count missing observations as zero. The result is assigned back because an
# in-place fillna on a selected column acts on a temporary object: pandas
# warns about it and, with Copy-on-Write enabled, leaves df unchanged.
df['OBS_VALUE'] = df['OBS_VALUE'].fillna(0)

# renaming columns
df = df.rename(
    columns={
        'TERUL_GEO5': 'id',
        'TEL_SZ_ADAT': 'religion',
        'TIME_PERIOD': 'year',
        'OBS_VALUE': 'value',
    },
)

# Readable names for the religion codes found in the source data. The order
# of the entries is also the order of the religion columns in the exported
# tables.
RELIGION_CATEGORIES: dict[str, str] = {
    'RE_C': 'catholic',
    'RE_GC': 'greek_catholic',
    'RE_RC': 'roman_catholic',
    'RE_CA': 'calvinist',
    'RE_LU': 'lutheran',
    'RE_CD': 'other_christian',
    'RE_OC': 'orthodox_christian',
    'RE_J': 'jewish',
    'RE_NOT': 'atheist',
    'RE_OCD': 'other',
}

# Swap every original code for its readable name. The lookup is a plain
# dictionary access, so a code that is missing from the table raises KeyError
# instead of slipping through unnoticed.
df['religion'] = df['religion'].map(RELIGION_CATEGORIES.__getitem__)


def get_county_id(district_id: str) -> str:
    """Return the id of the first county whose 'ids' entry holds *district_id*.

    Counties are examined in the order they appear in ``counties``. When no
    county contains the given id, an empty string is returned.
    """
    matching_county_ids = (
        entry['id'] for entry in counties if district_id in entry['ids']
    )
    # The generator is lazy, so the scan stops at the first match.
    return next(matching_county_ids, '')

# Add a 'county id' column by passing every value of the 'id' column through
# get_county_id; an id that belongs to no county gets an empty string.
df['county id'] = df['id'].map(get_county_id)

        
        
def create_data_frame_by_year(
    years: Iterable[int] = (2011, 2022),
    data: Optional[pd.DataFrame] = None,
    religions: Optional[Iterable[str]] = None,
) -> Iterator[pd.DataFrame]:
    """Yield one wide table per year: a row per id, a column per religion.

    Each cell holds the largest 'value' recorded for that id and religion in
    the given year. Every id that occurs anywhere in the data gets a row in
    every yearly table; a combination without a record in that year is left
    empty (``None``) instead of aborting the whole run.

    Args:
        years: Years to build tables for, in output order.
        data: Long-format table with 'id', 'religion', 'year' and 'value'
            columns. Defaults to the module-level ``df``.
        religions: Religion names that become the output columns, in order.
            Defaults to the values of ``RELIGION_CATEGORIES``.

    Yields:
        A DataFrame with an 'id' column followed by one column per religion.
    """
    source = df if data is None else data
    columns = list(
        RELIGION_CATEGORIES.values() if religions is None else religions
    )
    # Row order follows the first appearance of each id in the whole table,
    # not only in the year being processed.
    district_ids = list(pd.unique(source['id']))

    for year in years:
        in_year = source[source['year'] == year]

        # A single pass over the year's records replaces re-filtering the
        # whole table once per id and religion.
        largest = {}
        for district, religion, value in zip(
            in_year['id'], in_year['religion'], in_year['value']
        ):
            if pd.isna(value):
                # A missing observation cannot be compared; skip it.
                continue
            key = (district, religion)
            if key not in largest or value > largest[key]:
                largest[key] = value

        records = [
            {
                'id': district,
                **{name: largest.get((district, name)) for name in columns},
            }
            for district in district_ids
        ]
        # dtype=object keeps whole numbers whole next to empty cells, which
        # would otherwise turn the column into floats.
        yield pd.DataFrame(records, columns=['id', *columns], dtype=object)

# The generator yields the yearly tables in the order 2011, 2022.
df_2011, df_2022 = create_data_frame_by_year()

# orient='records' writes one object per row and never includes the index, so
# no index argument is passed; older pandas releases reject index=False for
# this orient.
df_2011.to_json('out/religion_2011.json', orient='records', indent=4)

df_2022.to_json('out/religion_2022.json', orient='records', indent=4)
