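# Data Cleaning

Loads the raw BUX, KSH GDP and KSH core-inflation CSV files from `../data/raw`, cleans them, and writes one date-indexed CSV per dataset to `../data/processed`.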

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path

In [2]:
# Data folders, relative to the notebook's working directory:
# raw inputs are read from RAW_DIR, cleaned CSVs are written to PROCESSED_DIR.
RAW_DIR: Path = Path("..", "data", "raw")
PROCESSED_DIR: Path = Path("..", "data", "processed")

In [3]:
def load_data(raw_dir: Path, date_format: str, encoding: str) -> pd.DataFrame:
    """Load every CSV file in ``raw_dir`` into one date-indexed DataFrame.

    Each file is read with its ``"Dátum"`` column parsed as dates and used as
    the index; the per-file frames are concatenated and sorted by that index.

    Args:
        raw_dir: Directory whose ``*.csv`` files are read (non-recursive).
        date_format: ``strftime``-style format of the ``"Dátum"`` column.
        encoding: Text encoding of the CSV files.

    Returns:
        The rows of all files, sorted by the ``"Dátum"`` index. Rows with
        equal dates keep the order of the (name-sorted) files they came from.

    Raises:
        ValueError: If ``raw_dir`` contains no ``*.csv`` files.
    """
    # glob() yields files in arbitrary, filesystem-dependent order; sort them
    # so the result is reproducible from run to run.
    csv_files = sorted(raw_dir.glob("*.csv"))
    if not csv_files:
        # pd.concat([]) would otherwise fail with "No objects to concatenate",
        # which does not say which directory was empty.
        raise ValueError(f"No CSV files found in '{raw_dir}'")

    dfs: list[pd.DataFrame] = [
        pd.read_csv(
            file,
            date_format=date_format,
            encoding=encoding,
            parse_dates=["Dátum"],
            index_col="Dátum",
        )
        for file in csv_files
    ]
    combined = pd.concat(dfs, ignore_index=False)
    # The default sort is not stable; a stable sort keeps duplicate dates in
    # a deterministic order.
    return combined.sort_index(kind="stable")

def save_data(df: pd.DataFrame, output_file: "str | Path") -> None:
    """Write ``df`` as a CSV file inside ``PROCESSED_DIR``.

    Args:
        df: Frame to write; its index is written as the first column.
        output_file: File name (or relative path) below ``PROCESSED_DIR``.
            Callers in this notebook pass plain strings such as ``"bux.csv"``.

    Floats are formatted with two decimal places.
    """
    output_path = PROCESSED_DIR / output_file
    # Create the output folder on first use so the notebook also runs on a
    # fresh checkout where ../data/processed does not exist yet.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, float_format="%.2f")

## Clean BUX Data

In [4]:
# Read all raw BUX CSVs, drop the "Név" column, put the series on a
# business-day ("B") grid and fill the resulting gaps by interpolating
# with respect to the time index.
df = (
    load_data(RAW_DIR / "bux", "%Y.%m.%d.", "utf-8")
    .drop(columns=["Név"])
    .resample("B")
    .asfreq()
    .interpolate(method="time")
)
save_data(df, "bux.csv")

# Optional visual check of the cleaned series (disabled):
# df.plot();
# plt.tight_layout(); plt.show()

## Clean KSH GDP Data

### Parse the raw KSH table

In [5]:
def load_ksh_data(raw_dir: Path, column_in: str) -> pd.DataFrame:
    """Extract one quarterly series from the semicolon-separated KSH CSV files.

    Every ``*.csv`` file in ``raw_dir`` is read as ISO-8859-2 text. The first
    line is ignored and the second line holds the column headers. Each later
    line is a data row: cell 0 is the year (blank means "same year as the
    previous row"), cell 1 is the quarter as a Roman numeral, and the cell
    under the ``column_in`` header is the value (decimal commas are accepted).
    The last character of the year and quarter cells is discarded before
    parsing.

    Rows that cannot be parsed are skipped and reported on stdout.

    Args:
        raw_dir: Directory whose ``*.csv`` files are read (in name order).
        column_in: Exact header (after whitespace stripping) of the column to
            extract. If it occurs more than once, the last occurrence is used.

    Returns:
        DataFrame indexed by ``"Dátum"`` (first day of each quarter) with a
        single float column named ``column_in``.

    Raises:
        ValueError: If a file has no header equal to ``column_in``.
    """
    quarter_map = {"I": 1, "II": 2, "III": 3, "IV": 4}
    records = []

    # Sorted so that the row order does not depend on the filesystem.
    for file_name in sorted(raw_dir.glob("*.csv")):
        with open(file_name, "r", encoding="iso-8859-2") as file:
            # Ignore the first line; the second one carries the headers.
            next(file, None)
            headers = [header.strip() for header in next(file, "").split(";")]

            matches = [i for i, header in enumerate(headers) if header == column_in]
            if not matches:
                # Previously this fell back to column 0 without warning and
                # crashed later with an unrelated column-count error.
                raise ValueError(
                    f"Column '{column_in}' not found in '{file_name}'"
                )
            column_num = matches[-1]

            year = None  # no year seen yet in this file
            for num, line in enumerate(file):
                cells = [cell.strip() for cell in line.split(";")]

                # Cell 0: a non-blank value starts a new year.
                if cells[0] != "":
                    try:
                        year = int(cells[0][:-1])
                    except ValueError:
                        # Forget the old year so the following blank-year rows
                        # are not attributed to the wrong year.
                        year = None
                        print(f"Skipping row {num}: Invalid year '{cells[0][:-1]}'")
                        continue

                # Lines without a quarter cell (e.g. blank lines) carry no data.
                if len(cells) < 2:
                    continue

                # Cell 1: quarter as a Roman numeral plus one trailing character.
                quarter_label = cells[1][:-1]
                quarter = quarter_map.get(quarter_label)
                if quarter is None:
                    print(f"Skipping row {num}: Invalid quarter '{quarter_label}'")
                    continue

                if year is None:
                    print(f"Skipping row {num}: No valid year for this quarter")
                    continue

                # Rows shorter than the header have no cell for the value.
                if column_num >= len(cells):
                    print(f"Skipping row {num}: Missing value column")
                    continue

                raw_value = cells[column_num]
                try:
                    value = float(raw_value.replace(",", "."))
                except ValueError:
                    print(f"Skipping row {num}: Invalid float '{raw_value}'")
                    continue

                # Quarters 1..4 start in months 1, 4, 7 and 10.
                records.append([f"{year:04d}-{3 * quarter - 2:02d}-01", value])

    df = pd.DataFrame(records, columns=["Dátum", column_in])
    df["Dátum"] = pd.to_datetime(df["Dátum"], format="%Y-%m-%d")
    return df.set_index("Dátum")
                            

In [None]:
# `kind` selects the raw sub-folder below RAW_DIR/"ksh" and names the output file.
kind = "gdp"
raw_dir = RAW_DIR.joinpath("ksh", kind)

# Header of the KSH column to extract; it must match the file's header exactly.
column_in = "Szezonálisan és naptárhatással kiigazított és kiegyensúlyozott adatok (előző negyedév=100,0%)"

df = load_ksh_data(raw_dir=raw_dir, column_in=column_in)
save_data(df, kind + ".csv")

Skipping row 117: Invalid float ''
Skipping row 118: Invalid float ''
Skipping row 119: Invalid float ''
Skipping row 120: Invalid quarter 'IIV'


## Clean KSH Inflation Data

In [20]:
def load_inflation_data(path: Path, column_out: str = "Havi maginfláció") -> pd.DataFrame:
    """Reshape the wide KSH inflation table into one monthly series.

    The file is read as comma-separated ISO-8859-2 text. Its first line holds
    the month labels (blank header cells are dropped); every later line holds
    a year in cell 0 followed by one value per month.

    Parsing of a line stops at its first cell that is not a float: the cells
    before it are kept, the rest of the line is dropped, and the line number
    is reported on stdout. Cells beyond the last month label are ignored.

    Args:
        path: CSV file to read.
        column_out: Name of the value column in the result.

    Returns:
        DataFrame indexed by ``"Dátum"`` (parsed from ``"<year>-<month label>"``
        with format ``%Y-%m``) with a single float column ``column_out``.
    """
    records = []

    with open(path, "r", encoding="iso-8859-2") as file:
        months = [header.strip() for header in next(file, "").split(",")]
        months = [month for month in months if month != ""]

        for num, line in enumerate(file):
            year, *cells = (cell.strip() for cell in line.split(","))
            # zip() stops at the last month label, so a line wider than the
            # header can no longer index past the end of the label list.
            for month, cell in zip(months, cells):
                try:
                    # The line was split on ",", so a cell can never contain a
                    # decimal comma; no replacement is needed before float().
                    value = float(cell)
                except ValueError:
                    print(f"Skipping row {num}: Invalid float '{cell}'")
                    break
                records.append([f"{year}-{month}", value])

    df = pd.DataFrame(records, columns=["Dátum", column_out])
    df["Dátum"] = pd.to_datetime(df["Dátum"], format="%Y-%m")
    return df.set_index("Dátum")


file_name = "1995m01_2025_m04_core_inflation.csv"
# NOTE(review): `raw_dir` is also rebound to this *file* path, exactly as in
# the original cell; drop the alias once nothing later relies on it.
path = raw_dir = RAW_DIR/"ksh"/"inflation"/file_name

df = load_inflation_data(path)
save_data(df, "inflation.csv")

Skipping row 30: Invalid float ''
