In [49]:
import os
import re
import logging
import requests
from bs4 import BeautifulSoup
import logging
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor
import os
import csv
import chardet
import logging
import shutil
import polars as pl
import glob
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine, Table, MetaData
import logging

In [2]:
BASE_URL = "https://siros.anac.gov.br/siros/registros/diversos/vra/"
SAVE_DIR = "hydrogen"

# Overview

This script automates the process of scraping CSV flight data files from a website that organizes its content in year directories. Basically, it does the grunt work of fetching, filtering, and downloading CSV files so you don’t have to click around like a caveman.

## What it does

- **Logging setup:**  
  Sets up logging with timestamps and levels. If things go wrong—or even if they go right—you'll know exactly when the drama unfolded.

- **Link extraction:**  
  Fetches all hyperlinks from a webpage, then uses smart filtering to isolate year-based directories and csv file links. 

- **Parallel downloads:**  
  Leverages a thread pool (up to 4 threads) to download csv files concurrently. Because who has time to wait for downloads to finish one at a time? 

## Why it does it

This code saves you from manual labor and the tedium of clicking through endless pages. It ensures that your csv files are neatly organized and downloaded in record time, so you can focus on the real work—like analyzing data.

In short: automate the tedious parts and get on with the important stuff.


In [3]:
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()]
)

def get_links(url: str) -> list:
    resp = requests.get(url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
    return links

def get_year_links(base_url: str) -> list:
    all_links = get_links(base_url)
    year_links = [link for link in all_links if link.rstrip('/').split('/')[-1].isdigit()]
    logging.info(f"Found {len(year_links)} year directory link(s): {year_links}")
    return year_links

def get_csv_links(year_url: str) -> list:
    all_links = get_links(year_url)
    csv_links = [link for link in all_links if link.lower().endswith('.csv')]
    logging.debug(f"Found {len(csv_links)} CSV file(s) in {year_url}")
    return csv_links

def download_csv(csv_url: str, csv_path: str) -> None:
    """
    Download a CSV file from csv_url to csv_path on disk.
    Used as the 'target' for executor.submit(...) calls.
    """
    logging.info(f"Downloading {csv_url} -> {csv_path}")
    resp = requests.get(csv_url, stream=True)
    resp.raise_for_status()
    with open(csv_path, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)
    logging.debug(f"Finished downloading {csv_url}")

def scrape_flights(base_url: str, save_dir: str) -> None:
    """
    Main function:
      1. Fetch year directories
      2. For each year, fetch CSV links
      3. Use ThreadPoolExecutor to download CSVs in parallel
    """
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
        logging.info(f"Created main directory: {save_dir}")

    year_links = get_year_links(base_url)

    # Create a thread pool with up to 4 concurrent threads
    with ThreadPoolExecutor(max_workers=4) as executor:
        for year_link in year_links:
            year = year_link.rstrip('/').split('/')[-1]
            csv_links = get_csv_links(year_link)

            for csv_link in csv_links:
                csv_name = csv_link.split('/')[-1]
                # Optionally prepend the year to avoid collisions:
                # csv_name = f"{year}_{csv_name}"

                csv_path = os.path.join(save_dir, csv_name)
                # Submit each download to the thread pool
                executor.submit(download_csv, csv_link, csv_path)

    logging.info("All files downloaded successfully.")

In [4]:
scrape_flights(BASE_URL, SAVE_DIR)

2025-03-02 15:23:49,157 [INFO] Created main directory: hydrogen
2025-03-02 15:23:49,416 [INFO] Found 26 year directory link(s): ['https://siros.anac.gov.br/siros/registros/diversos/vra/2000/', 'https://siros.anac.gov.br/siros/registros/diversos/vra/2001/', 'https://siros.anac.gov.br/siros/registros/diversos/vra/2002/', 'https://siros.anac.gov.br/siros/registros/diversos/vra/2003/', 'https://siros.anac.gov.br/siros/registros/diversos/vra/2004/', 'https://siros.anac.gov.br/siros/registros/diversos/vra/2005/', 'https://siros.anac.gov.br/siros/registros/diversos/vra/2006/', 'https://siros.anac.gov.br/siros/registros/diversos/vra/2007/', 'https://siros.anac.gov.br/siros/registros/diversos/vra/2008/', 'https://siros.anac.gov.br/siros/registros/diversos/vra/2009/', 'https://siros.anac.gov.br/siros/registros/diversos/vra/2010/', 'https://siros.anac.gov.br/siros/registros/diversos/vra/2011/', 'https://siros.anac.gov.br/siros/registros/diversos/vra/2012/', 'https://siros.anac.gov.br/siros/regist

# Overview

This script converts CSV files into a standardized format by detecting their original encoding and delimiter, then rewriting them as UTF-8 files with semicolon delimiters. Basically, it does the heavy lifting of cleaning up messy CSVs so you don’t have to spend your day playing detective with file formats.

## What it does

- **Encoding detection:**  
  Reads a chunk of each file in binary mode and uses chardet to guess the file’s encoding. If it can’t figure it out, it defaults to UTF-8—because sometimes even computers need a fallback plan.

- **Delimiter detection:**  
  Samples the file in text mode and employs csv.sniffer to identify the correct delimiter. If the sniffer fails, it defaults to a comma. 

- **File conversion:**  
  Streams the CSV content row by row from the source file, using the detected settings, and writes it to a new file with forced UTF-8 encoding and semicolon delimiters. This streaming approach keeps memory usage in check, even for huge files.

- **Batch processing:**  
  Iterates over a folder of CSV files, converting each one and logging every success and misstep along the way.

## Why it does it

This code saves you from the nightmare of dealing with CSV files in every possible flavor of encoding and delimiter. It standardizes your data, making it ready for analysis without the hassle of manual cleanup. 


In [5]:
def detect_file_encoding(file_path, read_bytes=65536):
    """
    Detect the encoding of a file by reading up to `read_bytes` from it in binary mode.
    Falls back to 'utf-8' if detection is inconclusive.
    """
    with open(file_path, 'rb') as rb:
        raw_data = rb.read(read_bytes)
        detect_result = chardet.detect(raw_data)
        detected = detect_result.get("encoding")
        if not detected:
            logging.warning(f"Could not detect encoding for {file_path}; defaulting to 'utf-8'")
            return "utf-8"
        return detected

def detect_file_delimiter(file_path, encoding, possible_delimiters=[",",";","\t","|"], read_chars=8192):
    """
    Detect the delimiter by reading up to `read_chars` from the file in text mode (with `encoding`).
    Falls back to ',' if Sniffer fails.
    """
    with open(file_path, "r", encoding=encoding, errors="replace") as f:
        sample = f.read(read_chars)
    try:
        dialect = csv.Sniffer().sniff(sample, delimiters=possible_delimiters)
        return dialect.delimiter
    except csv.Error:
        logging.warning(f"Could not sniff delimiter for {file_path}; defaulting to ','")
        return ","

def convert_single_csv(
    source_path, target_path,
    force_delimiter=";", force_encoding="utf-8",
    chunk_size=8192
):
    """
    Reads the CSV at `source_path` with detected encoding/delimiter
    and writes it to `target_path` in `force_encoding` & `force_delimiter`.
    
    Uses a streaming approach so we don't load entire files into memory.
    """
    # 1) Detect encoding
    detected_encoding = detect_file_encoding(source_path)
    # 2) Detect delimiter
    detected_delimiter = detect_file_delimiter(source_path, detected_encoding)

    logging.info(
        f"Converting '{source_path}' | Detected Encoding='{detected_encoding}' "
        f"Delimiter='{detected_delimiter}' -> {target_path} (utf-8 / ';')"
    )

    # 3) Stream read + write
    with open(source_path, "r", encoding=detected_encoding, errors="replace") as csv_in, \
         open(target_path, "w", encoding=force_encoding, newline="") as csv_out:

        reader = csv.reader(csv_in, delimiter=detected_delimiter)
        writer = csv.writer(csv_out, delimiter=force_delimiter)

        for row in reader:
            writer.writerow(row)

def convert_csvs_to_utf8_semicolon(
    source_folder="hydrogen",
    target_folder="helium"
):
    """
    1. Iterates over .csv files in `source_folder`.
    2. Detects encoding/delimiter for each file.
    3. Streams the data out to `target_folder` as UTF-8 / ';' delimiter.
    """
    if not os.path.exists(target_folder):
        os.makedirs(target_folder, exist_ok=True)
        logging.info(f"Created target folder: {target_folder}")

    files_processed = 0
    for filename in os.listdir(source_folder):
        if filename.lower().endswith(".csv"):
            source_path = os.path.join(source_folder, filename)
            target_path = os.path.join(target_folder, filename)

            try:
                convert_single_csv(
                    source_path=source_path,
                    target_path=target_path,
                    force_delimiter=";",
                    force_encoding="utf-8"
                )
                files_processed += 1
            except Exception as e:
                logging.error(f"Failed to convert {filename}: {e}")

    logging.info(f"Conversion complete. {files_processed} file(s) processed.")

In [6]:
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

convert_csvs_to_utf8_semicolon(source_folder="hydrogen", target_folder="helium")

2025-03-02 16:20:48,072 [INFO] Created target folder: helium
2025-03-02 16:20:48,186 [INFO] Converting 'hydrogen/VRA_20072.csv' | Detected Encoding='ISO-8859-1' Delimiter=',' -> helium/VRA_20072.csv (utf-8 / ';')
2025-03-02 16:20:48,294 [INFO] Converting 'hydrogen/VRA_2023_02.csv' | Detected Encoding='utf-8' Delimiter=';' -> helium/VRA_2023_02.csv (utf-8 / ';')
2025-03-02 16:20:48,670 [INFO] Converting 'hydrogen/VRA_200311.csv' | Detected Encoding='ISO-8859-1' Delimiter=',' -> helium/VRA_200311.csv (utf-8 / ';')
2025-03-02 16:20:48,757 [INFO] Converting 'hydrogen/VRA_2010_06.csv' | Detected Encoding='utf-8' Delimiter=';' -> helium/VRA_2010_06.csv (utf-8 / ';')
2025-03-02 16:20:49,122 [INFO] Converting 'hydrogen/VRA_2024_04.csv' | Detected Encoding='utf-8' Delimiter=';' -> helium/VRA_2024_04.csv (utf-8 / ';')
2025-03-02 16:20:49,435 [INFO] Converting 'hydrogen/VRA_2014_09.csv' | Detected Encoding='utf-8' Delimiter=';' -> helium/VRA_2014_09.csv (utf-8 / ';')
2025-03-02 16:20:49,858 [INFO

# Overview

This script tidies up your CSV filenames by enforcing a strict naming convention: "VRA_YYYY_MM.csv", ensuring consistency across your data files.

## What it does

- **Filename evaluation:**  
  Scans through all CSV files in the designated folder and checks whether each filename matches one of the recognized patterns. If a file is already formatted correctly, it leaves it alone; otherwise, it prepares a new name.

- **Pattern matching magic:**  
  Uses regex to detect three scenarios:
  1. Files already in the ideal "VRA_YYYY_MM.csv" format.
  2. Files in a "VRA_YYYYMM.csv" format (six digits) which are split into year and month.
  3. Files in a "VRA_YYYYM.csv" format (five digits) where the month is zero-padded.
  
  If a file doesn't match any known pattern, it gets skipped with a warning.

- **In-place renaming:**  
  Renames files directly, even overwriting existing files if necessary. 

## Why it does it

With a uniform naming scheme, your data is easier to manage and reference, meaning less time cleaning up and more time focusing on the important work.

In [7]:
def standardize_csv_filenames(source_folder="helium"):
    """
    Reads all .csv files from `source_folder` and renames them in place to match 
    the pattern 'VRA_YYYY_MM.csv'. The rules are:
    
      1) If already 'VRA_YYYY_MM.csv', leave it unchanged.
      2) If 'VRA_YYYYMM.csv' (6 digits after underscore), split into year and month.
      3) If 'VRA_YYYYM.csv' (5 digits after underscore), split and zero-pad the month.
      4) Otherwise, skip (and log a warning).
      
    If a file with the new name already exists, it will be overwritten.
    """
    # Regex patterns for matching file naming conventions
    already_ok_pattern = re.compile(r'^VRA_(\d{4})_(\d{2})\.csv$', re.IGNORECASE)
    six_digit_pattern   = re.compile(r'^VRA_(\d{6})\.csv$', re.IGNORECASE)  # e.g., VRA_200010.csv
    five_digit_pattern  = re.compile(r'^VRA_(\d{5})\.csv$', re.IGNORECASE)  # e.g., VRA_20001.csv

    files_processed = 0
    for filename in os.listdir(source_folder):
        if not filename.lower().endswith(".csv"):
            continue  # Skip non-CSV files

        source_path = os.path.join(source_folder, filename)
        new_name = None

        # Check if file is already in the correct format.
        if already_ok_pattern.match(filename):
            new_name = filename  # Leave it unchanged.
        else:
            # Check for a six-digit pattern: VRA_YYYYMM.csv
            m_six = six_digit_pattern.match(filename)
            # Check for a five-digit pattern: VRA_YYYYM.csv
            m_five = five_digit_pattern.match(filename)

            if m_six:
                digits = m_six.group(1)  # e.g. '200010'
                year = digits[:4]         # '2000'
                month = digits[4:]        # '10'
                new_name = f"VRA_{year}_{month}.csv"
            elif m_five:
                digits = m_five.group(1)  # e.g. '20001'
                year = digits[:4]         # '2000'
                month = digits[4:]        # '1'
                month = month.zfill(2)    # zero-pad to '01'
                new_name = f"VRA_{year}_{month}.csv"
            else:
                logging.warning(f"Skipping file '{filename}': does not match known patterns.")
                continue

        # Rename file in place if needed.
        if new_name and new_name != filename:
            target_path = os.path.join(source_folder, new_name)
            # Overwrite if target file already exists.
            if os.path.exists(target_path):
                os.remove(target_path)
            os.rename(source_path, target_path)
            logging.info(f"Renamed '{filename}' -> '{new_name}'")
            files_processed += 1
        else:
            logging.info(f"No renaming needed for '{filename}'")
            
    logging.info(f"All done. Processed {files_processed} file(s).")

In [8]:
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
standardize_csv_filenames(source_folder="helium")

2025-03-02 16:22:28,051 [INFO] Renamed 'VRA_20072.csv' -> 'VRA_2007_02.csv'
2025-03-02 16:22:28,051 [INFO] No renaming needed for 'VRA_2023_02.csv'
2025-03-02 16:22:28,052 [INFO] Renamed 'VRA_200311.csv' -> 'VRA_2003_11.csv'
2025-03-02 16:22:28,052 [INFO] No renaming needed for 'VRA_2010_06.csv'
2025-03-02 16:22:28,052 [INFO] No renaming needed for 'VRA_2024_04.csv'
2025-03-02 16:22:28,053 [INFO] No renaming needed for 'VRA_2014_09.csv'
2025-03-02 16:22:28,053 [INFO] No renaming needed for 'VRA_2016_10.csv'
2025-03-02 16:22:28,053 [INFO] Renamed 'VRA_20004.csv' -> 'VRA_2000_04.csv'
2025-03-02 16:22:28,054 [INFO] Renamed 'VRA_20046.csv' -> 'VRA_2004_06.csv'
2025-03-02 16:22:28,054 [INFO] No renaming needed for 'VRA_2022_04.csv'
2025-03-02 16:22:28,055 [INFO] Renamed 'VRA_20075.csv' -> 'VRA_2007_05.csv'
2025-03-02 16:22:28,055 [INFO] No renaming needed for 'VRA_2020_02.csv'
2025-03-02 16:22:28,055 [INFO] No renaming needed for 'VRA_2010_02.csv'
2025-03-02 16:22:28,055 [INFO] Renamed 'VRA

In [9]:
def check_all_vra_files(folder="helium"):
    """
    Verifies that all files named 'VRA_YYYY_MM.csv' exist for
    every year from 2000 to 2025 and every month (01..12).

    Logs any missing files or confirms all are present.
    """
    missing = []
    for year in range(2000, 2026):
        for month in range(1, 13):
            month_str = str(month).zfill(2)  # zero-pad, e.g. 1 -> '01'
            expected_name = f"VRA_{year}_{month_str}.csv"
            expected_path = os.path.join(folder, expected_name)
            if not os.path.isfile(expected_path):
                missing.append(expected_name)

    if missing:
        logging.info(f"Missing {len(missing)} file(s) in '{folder}': {missing}")
    else:
        logging.info(f"All files from 2000-01 to 2025-12 are present in '{folder}'.")

In [10]:
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
check_all_vra_files(folder="helium")

2025-03-02 16:22:38,747 [INFO] Missing 11 file(s) in 'helium': ['VRA_2025_02.csv', 'VRA_2025_03.csv', 'VRA_2025_04.csv', 'VRA_2025_05.csv', 'VRA_2025_06.csv', 'VRA_2025_07.csv', 'VRA_2025_08.csv', 'VRA_2025_09.csv', 'VRA_2025_10.csv', 'VRA_2025_11.csv', 'VRA_2025_12.csv']


# Overview

This script scans a folder of CSV files to check if their columns follow the expected patterns for two different eras—files from 2000–2009 versus 2010–2025. It logs the standard column names for each period and flags any files that dare to deviate from the norm.

## What it does

- **Log folder setup:**  
  It makes sure the log folder exists, creating it if necessary. 

- **Column extraction:**  
  Iterates over every CSV file in the source folder, extracting column names using a CSV reader (powered by Polars). It uses the filename (assuming the format "VRA_YYYY...") to determine the file’s era.

- **Pattern grouping:**  
  Files are split into two groups: 2000–2009 and 2010–2025. Each group should have a consistent set of column names. If not, the script notes the discrepancies.

- **Logging discrepancies:**  
  It writes all this info into a log file, detailing the expected column names for each group and listing any files that broke the rules.

## Why it does it

Because manually checking CSV column patterns is as fun as watching paying bills. This utility automates the process, ensuring your data conforms to expected standards and letting you know exactly which files are off the reservation.

In [11]:
def log_column_patterns(source_folder="helium", log_folder="logging"):
    """
    Iterates over all CSV files in the source_folder and extracts their column names.
    Files from 2000–2009 are assumed to follow one column pattern and files from 2010–2025 a different one.
    It writes a log file (column_patterns.log) in log_folder with the following:
      - The expected column names for files in each time period.
      - Any discrepancies if some files within a group have different columns.
    """
    # Ensure the log folder exists.
    if not os.path.exists(log_folder):
        os.makedirs(log_folder, exist_ok=True)
        logging.info(f"Created log folder: {log_folder}")
    
    # Prepare containers to hold the expected columns for each pattern
    pattern1_columns = None  # for 2000-2009
    pattern2_columns = None  # for 2010-2025
    discrepancies = []       # list of (filename, group, columns)

    # Process each CSV file in the source folder.
    for filename in os.listdir(source_folder):
        if not filename.lower().endswith(".csv"):
            continue  # Skip non-CSV files
        
        source_path = os.path.join(source_folder, filename)
        
        # Try to extract the year from the filename. Assumes format "VRA_YYYY..."
        match = re.search(r'VRA_(\d{4})', filename, re.IGNORECASE)
        if not match:
            logging.warning(f"Could not extract year from filename: {filename}")
            continue
        
        year = int(match.group(1))
        
        try:
            df = pl.read_csv(source_path)
        except Exception as e:
            logging.error(f"Error reading {filename}: {e}")
            continue
        
        cols = df.columns  # list of column names
        
        # Group files by year pattern.
        if year < 2010:
            # 2000-2009
            if pattern1_columns is None:
                pattern1_columns = cols
            else:
                if pattern1_columns != cols:
                    discrepancies.append((filename, "2000-2009", cols))
        else:
            # 2010-2025
            if pattern2_columns is None:
                pattern2_columns = cols
            else:
                if pattern2_columns != cols:
                    discrepancies.append((filename, "2010-2025", cols))
    
    # Write out the log file.
    log_filename = os.path.join(log_folder, "column_patterns.log")
    with open(log_filename, "w") as log_file:
        log_file.write("Column Patterns Log\n")
        log_file.write("===================\n\n")
        
        if pattern1_columns is not None:
            log_file.write("Pattern for years 2000-2009:\n")
            log_file.write(", ".join(pattern1_columns) + "\n\n")
        else:
            log_file.write("No files found for years 2000-2009.\n\n")
        
        if pattern2_columns is not None:
            log_file.write("Pattern for years 2010-2025:\n")
            log_file.write(", ".join(pattern2_columns) + "\n\n")
        else:
            log_file.write("No files found for years 2010-2025.\n\n")
        
        if discrepancies:
            log_file.write("Discrepancies found:\n")
            for fname, group, cols in discrepancies:
                log_file.write(f"{fname} in group {group} has columns: {cols}\n")
        else:
            log_file.write("No discrepancies found among files in each group.\n")
    
    logging.info(f"Column patterns logged to {log_filename}")

In [12]:
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
log_column_patterns(source_folder="helium", log_folder="logging")

2025-03-02 16:23:14,129 [INFO] Created log folder: logging
2025-03-02 16:23:14,172 [ERROR] Error reading VRA_2023_02.csv: found more fields than defined in 'Schema'

Consider setting 'truncate_ragged_lines=True'.
2025-03-02 16:23:14,182 [ERROR] Error reading VRA_2010_06.csv: found more fields than defined in 'Schema'

Consider setting 'truncate_ragged_lines=True'.
2025-03-02 16:23:14,186 [ERROR] Error reading VRA_2024_04.csv: found more fields than defined in 'Schema'

Consider setting 'truncate_ragged_lines=True'.
2025-03-02 16:23:14,190 [ERROR] Error reading VRA_2014_09.csv: found more fields than defined in 'Schema'

Consider setting 'truncate_ragged_lines=True'.
2025-03-02 16:23:14,198 [ERROR] Error reading VRA_2016_10.csv: found more fields than defined in 'Schema'

Consider setting 'truncate_ragged_lines=True'.
2025-03-02 16:23:14,203 [ERROR] Error reading VRA_2022_04.csv: found more fields than defined in 'Schema'

Consider setting 'truncate_ragged_lines=True'.
2025-03-02 16:23:

In [29]:
old_mapping = {
    "ICAO Empresa Aérea":       "empresa_aerea",      
    "Número Voo":               "numero_voo",  
    "Código Autorização (DI)":  "codigo_DI",
    "Código Tipo Linha":        "codigo_tipo_linha", 
    "ICAO Aeródromo Origem":    "aeroporto_origem",   
    "ICAO Aeródromoo Destino":  "aeroporto_destino",  
    "Partida Prevista":         "partida_prevista", 
    "Partida Real":             "partida_real",  
    "Chegada Prevista":         "chegada_prevista", 
    "Chegada Real":             "chegada_real", 
    "Situação Voo":             "situacao_voo",
    "Código Justificativa":     "justificativa"
}

new_columns = {
    "long_empresa_aerea",
    "modelo_equipamento",
    "numero_assentos",
    "descricao_aeroporto_origem",
    "descricao_aeroporto_destino",
    "referencia",
    "situacao_partida",
    "situacao_chegada"    
}

new_mapping = {
    "Sigla ICAO Empresa Aérea":     "empresa_aerea",
    "Empresa Aérea":                "long_empresa_aerea",
    "Número Voo":                   "numero_voo",
    "Código DI":                    "codigo_DI",
    "Código Tipo Linha":            "codigo_tipo_linha",
    "Modelo Equipamento":           "modelo_equipamento",
    "Número de Assentos":           "numero_assentos",
    "Sigla ICAO Aerodromo Origem":  "aeroporto_origem",
    "Descrição Aeroporto Origem":   "descricao_aeroporto_origem",
    "Partida Prevista":             "partida_prevista",
    "Partida Real":                 "partida_real",
    "Sigla ICAO Aeroporto Destino": "aeroporto_destino",
    "Descrição Aeroporto Destino":  "descricao_aeroporto_destino",
    "Chegada Prevista":             "chegada_prevista",
    "Chegada Real":                 "chegada_real",
    "Situação Voo":                 "situacao_voo",
    "Justificativa":                "justificativa",
    "Referência":                   "referencia",
    "Situação Partida":             "situacao_partida",
    "Situação Chegada":             "situacao_chegada"
}

# Desired column order
desired_order = [
    "empresa_aerea",
    "long_empresa_aerea",
    "numero_voo",
    "codigo_DI",
    "codigo_tipo_linha",
    "modelo_equipamento",
    "numero_assentos",
    "aeroporto_origem",
    "descricao_aeroporto_origem",
    "partida_prevista",
    "partida_real",
    "aeroporto_destino",
    "descricao_aeroporto_destino",
    "chegada_prevista",
    "chegada_real",
    "situacao_voo",
    "justificativa",
    "referencia",
    "situacao_partida",
    "situacao_chegada"
]

# Overview

This script processes CSV files following the "VRA_YYYY_MM.csv" naming convention within a designated folder. It sets up detailed logging to capture every success and failure, then iterates over each file for years 2000–2009. Using Polars, it reads each CSV as strings, renames columns based on a predefined mapping, adds any missing new columns, and overwrites the file—all while recording its every move.

## What it does

- **Logging configuration:**  
  Configures logging to output to "csv_processing.log" with timestamps and log levels, ensuring that every action and error is documented.

- **File iteration:**  
  Loops through each month for each year from 2000 to 2009, constructing the expected filename and checking if it exists in the input folder ("helium").

- **CSV processing:**  
  For every found file, the script:
  - Reads the CSV with Polars, forcing all columns to be treated as strings.
  - Renames the columns based on an `old_mapping` dictionary.
  - Adds any missing columns from `new_columns`, populating them with `None`.
  - Overwrites the original CSV with the updated data using semicolon delimiters.

- **Error handling:**  
  Logs every successful overwrite, and if something goes wrong, logs an error with the file's name and the exception encountered.

## Why it does it

This script eliminates the tedious manual labor of updating CSV files by automating the process. It enforces consistency and proper formatting so you can watch PSG dominate Liverpool in the entire game and still, somehow, some inexplicable how, lose the game.

In [20]:
# Configure logging
log_file = "csv_processing.log"
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# Folder containing the files
input_folder = "helium"

def process_csv_files():
    for year in range(2000, 2010):  # From 2000 to 2009
        for month in range(1, 13):  # January to December
            file_name = f"VRA_{year}_{month:02d}.csv"
            file_path = os.path.join(input_folder, file_name)
            
            if os.path.exists(file_path):
                try:
                    # Read CSV with all columns as string using Polars
                    df = pl.read_csv(file_path, separator=";", encoding="utf-8", dtypes={col: pl.Utf8 for col in old_mapping.keys()})

                    # Rename columns based on mapping
                    df = df.rename(old_mapping)

                    # Add missing new columns filled with None
                    for col in new_columns:
                        if col not in df.columns:
                            df = df.with_columns(pl.lit(None).alias(col))

                    # Overwrite the existing CSV
                    df.write_csv(file_path, separator=";", include_header=True)

                    logging.info(f"Overwritten: {file_name}")

                except Exception as e:
                    logging.error(f"Error processing {file_name}: {e}")


In [21]:
logging.info("CSV processing (overwrite mode) started.")
process_csv_files()
logging.info("CSV processing (overwrite mode) completed.")

2025-03-02 17:00:07,179 [INFO] CSV processing (overwrite mode) started.
  df = pl.read_csv(file_path, separator=";", encoding="utf-8", dtypes={col: pl.Utf8 for col in old_mapping.keys()})
2025-03-02 17:00:07,222 [INFO] Overwritten: VRA_2000_01.csv
2025-03-02 17:00:07,259 [INFO] Overwritten: VRA_2000_02.csv
2025-03-02 17:00:07,295 [INFO] Overwritten: VRA_2000_03.csv
2025-03-02 17:00:07,334 [INFO] Overwritten: VRA_2000_04.csv
2025-03-02 17:00:07,368 [INFO] Overwritten: VRA_2000_05.csv
2025-03-02 17:00:07,399 [INFO] Overwritten: VRA_2000_06.csv
2025-03-02 17:00:07,433 [INFO] Overwritten: VRA_2000_07.csv
2025-03-02 17:00:07,470 [INFO] Overwritten: VRA_2000_08.csv
2025-03-02 17:00:07,504 [INFO] Overwritten: VRA_2000_09.csv
2025-03-02 17:00:07,538 [INFO] Overwritten: VRA_2000_10.csv
2025-03-02 17:00:07,574 [INFO] Overwritten: VRA_2000_11.csv
2025-03-02 17:00:07,608 [INFO] Overwritten: VRA_2000_12.csv
2025-03-02 17:00:07,642 [INFO] Overwritten: VRA_2001_01.csv
2025-03-02 17:00:07,677 [INFO] O

In [22]:
# Configure logging
log_file = "csv_processing_2010_beyond.log"
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# Paths
input_folder = "helium"
output_folder = "test1"

# Ensure output directory exists
os.makedirs(output_folder, exist_ok=True)

def process_csv_files():
    for year in range(2010, 2100):  # From 2010 onwards (assuming future-proofing)
        for month in range(1, 13):  # January to December
            file_name = f"VRA_{year}_{month:02d}.csv"
            file_path = os.path.join(input_folder, file_name)
            
            if os.path.exists(file_path):
                try:
                    # Read CSV with all columns as string using Polars
                    df = pl.read_csv(file_path, separator=";", encoding="utf-8", dtypes={col: pl.Utf8 for col in new_mapping.keys()})

                    # Rename columns based on mapping
                    df = df.rename(new_mapping)

                    # Save the transformed CSV
                    output_path = os.path.join(output_folder, file_name)
                    df.write_csv(output_path, separator=";", include_header=True)

                    logging.info(f"Successfully processed: {file_name}")

                except Exception as e:
                    logging.error(f"Error processing {file_name}: {e}")

In [23]:
logging.info("CSV processing (2010 and beyond) started.")
process_csv_files()
logging.info("CSV processing (2010 and beyond) completed.")

2025-03-02 17:04:42,560 [INFO] CSV processing (2010 and beyond) started.
  df = pl.read_csv(file_path, separator=";", encoding="utf-8", dtypes={col: pl.Utf8 for col in new_mapping.keys()})
2025-03-02 17:04:42,702 [INFO] Successfully processed: VRA_2010_01.csv
2025-03-02 17:04:42,809 [INFO] Successfully processed: VRA_2010_02.csv
2025-03-02 17:04:42,922 [INFO] Successfully processed: VRA_2010_03.csv
2025-03-02 17:04:43,031 [INFO] Successfully processed: VRA_2010_04.csv
2025-03-02 17:04:43,145 [INFO] Successfully processed: VRA_2010_05.csv
2025-03-02 17:04:43,260 [INFO] Successfully processed: VRA_2010_06.csv
2025-03-02 17:04:43,385 [INFO] Successfully processed: VRA_2010_07.csv
2025-03-02 17:04:43,507 [INFO] Successfully processed: VRA_2010_08.csv
2025-03-02 17:04:43,632 [INFO] Successfully processed: VRA_2010_09.csv
2025-03-02 17:04:43,761 [INFO] Successfully processed: VRA_2010_10.csv
2025-03-02 17:04:43,891 [INFO] Successfully processed: VRA_2010_11.csv
2025-03-02 17:04:44,036 [INFO]

In [24]:
# Configure logging
log_file = "csv_processing_2010_beyond.log"
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# Folder containing the files
input_folder = "helium"

def process_csv_files():
    for year in range(2010, 2100):  # From 2010 onwards
        for month in range(1, 13):  # January to December
            file_name = f"VRA_{year}_{month:02d}.csv"
            file_path = os.path.join(input_folder, file_name)
            
            if os.path.exists(file_path):
                try:
                    # Read CSV with all columns as string using Polars
                    df = pl.read_csv(file_path, separator=";", encoding="utf-8", dtypes={col: pl.Utf8 for col in new_mapping.keys()})

                    # Rename columns based on mapping
                    df = df.rename(new_mapping)

                    # Overwrite the existing file
                    df.write_csv(file_path, separator=";", include_header=True)

                    logging.info(f"Overwritten: {file_name}")

                except Exception as e:
                    logging.error(f"Error processing {file_name}: {e}")

In [25]:
logging.info("CSV processing (overwrite mode) started.")
process_csv_files()
logging.info("CSV processing (overwrite mode) completed.")

2025-03-02 17:07:01,642 [INFO] CSV processing (overwrite mode) started.
  df = pl.read_csv(file_path, separator=";", encoding="utf-8", dtypes={col: pl.Utf8 for col in new_mapping.keys()})
2025-03-02 17:07:01,792 [INFO] Overwritten: VRA_2010_01.csv
2025-03-02 17:07:01,900 [INFO] Overwritten: VRA_2010_02.csv
2025-03-02 17:07:02,023 [INFO] Overwritten: VRA_2010_03.csv
2025-03-02 17:07:02,147 [INFO] Overwritten: VRA_2010_04.csv
2025-03-02 17:07:02,269 [INFO] Overwritten: VRA_2010_05.csv
2025-03-02 17:07:02,384 [INFO] Overwritten: VRA_2010_06.csv
2025-03-02 17:07:02,514 [INFO] Overwritten: VRA_2010_07.csv
2025-03-02 17:07:02,650 [INFO] Overwritten: VRA_2010_08.csv
2025-03-02 17:07:02,786 [INFO] Overwritten: VRA_2010_09.csv
2025-03-02 17:07:02,917 [INFO] Overwritten: VRA_2010_10.csv
2025-03-02 17:07:03,055 [INFO] Overwritten: VRA_2010_11.csv
2025-03-02 17:07:03,203 [INFO] Overwritten: VRA_2010_12.csv
2025-03-02 17:07:03,342 [INFO] Overwritten: VRA_2011_01.csv
2025-03-02 17:07:03,472 [INFO] O

In [27]:
# Configure logging
log_file = "csv_column_check.log"
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# Folder containing the files
input_folder = "helium"

def check_csv_columns():
    reference_columns = None
    mismatched_files = []

    for file_name in os.listdir(input_folder):
        file_path = os.path.join(input_folder, file_name)

        if file_name.endswith(".csv") and os.path.isfile(file_path):
            try:
                # Read CSV without loading data, only header
                df = pl.read_csv(file_path, separator=";", encoding="utf-8", infer_schema_length=0)
                current_columns = set(df.columns)

                if reference_columns is None:
                    reference_columns = current_columns
                    logging.info(f"Reference columns set from: {file_name}")

                elif reference_columns != current_columns:
                    mismatched_files.append(file_name)
                    logging.warning(f"Column mismatch in {file_name}")

            except Exception as e:
                logging.error(f"Error reading {file_name}: {e}")

    if not mismatched_files:
        logging.info("All files have the same columns (ignoring order).")
    else:
        logging.warning(f"Files with column mismatches: {mismatched_files}")


In [28]:

    logging.info("CSV column consistency check started.")
    check_csv_columns()
    logging.info("CSV column consistency check completed.")


2025-03-02 17:09:25,205 [INFO] CSV column consistency check started.
2025-03-02 17:09:25,214 [INFO] Reference columns set from: VRA_2002_02.csv
2025-03-02 17:09:34,087 [INFO] All files have the same columns (ignoring order).
2025-03-02 17:09:34,089 [INFO] CSV column consistency check completed.


In [34]:
# Configure logging
log_file = "csv_column_ordering.log"
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# Folder containing the files
input_folder = "helium"

def reorder_csv_columns():
    for file_name in os.listdir(input_folder):
        file_path = os.path.join(input_folder, file_name)

        if file_name.endswith(".csv") and os.path.isfile(file_path):
            try:
                # Read CSV assuming all columns are strings
                df = pl.read_csv(file_path, separator=";", encoding="utf-8", infer_schema_length=0, dtypes={})

                # Convert all columns to string (forcing type consistency)
                df = df.with_columns([df[col].cast(pl.Utf8) for col in df.columns])

                # Ensure all desired columns are present, filling missing with None
                for col in desired_order:
                    if col not in df.columns:
                        df = df.with_columns(pl.lit(None).cast(pl.Utf8).alias(col))

                # Reorder columns
                df = df.select([col for col in desired_order if col in df.columns])

                # Overwrite original file with reordered columns
                df.write_csv(file_path, separator=";", include_header=True)

                logging.info(f"Reordered columns in: {file_name}")

            except Exception as e:
                logging.error(f"Error processing {file_name}: {e}")


In [35]:
logging.info("CSV column reordering started.")
reorder_csv_columns()
logging.info("CSV column reordering completed.")

2025-03-02 18:45:45,436 [INFO] CSV column reordering started.
  df = pl.read_csv(file_path, separator=";", encoding="utf-8", infer_schema_length=0, dtypes={})
2025-03-02 18:45:45,456 [INFO] Reordered columns in: VRA_2002_02.csv
2025-03-02 18:45:45,484 [INFO] Reordered columns in: VRA_2006_03.csv
2025-03-02 18:45:45,507 [INFO] Reordered columns in: VRA_2001_12.csv
2025-03-02 18:45:45,556 [INFO] Reordered columns in: VRA_2023_02.csv
2025-03-02 18:45:45,574 [INFO] Reordered columns in: VRA_2004_11.csv
2025-03-02 18:45:45,592 [INFO] Reordered columns in: VRA_2008_06.csv
2025-03-02 18:45:45,607 [INFO] Reordered columns in: VRA_2004_05.csv
2025-03-02 18:45:45,692 [INFO] Reordered columns in: VRA_2010_06.csv
2025-03-02 18:45:45,750 [INFO] Reordered columns in: VRA_2024_04.csv
2025-03-02 18:45:45,845 [INFO] Reordered columns in: VRA_2014_09.csv
2025-03-02 18:45:45,866 [INFO] Reordered columns in: VRA_2006_05.csv
2025-03-02 18:45:45,946 [INFO] Reordered columns in: VRA_2016_10.csv
2025-03-02 18

# Overview

This script automatically adds "ano" (year) and "mes" (month) columns to CSV files stored in the "helium" folder. It extracts these values directly from the filename, which is expected to be in the format "VRA_YYYY_MM.csv", ensuring your data carries its own temporal markers without any manual fuss.

## What it does

- **Logging configuration:**  
  Sets up logging to record actions and errors in "csv_add_ano_mes.log" with clear timestamps and log levels. Nothing slips past this log.

- **File scanning and pattern matching:**  
  Iterates over all files in the input folder, using a regex to identify CSV files that follow the "VRA_YYYY_MM.csv" naming convention. Only valid files get processed.

- **CSV processing:**  
  For each matching file:
  - Extracts the year ("ano") and month ("mes") from the filename.
  - Reads the CSV file with Polars, treating all columns as strings.
  - Converts every column in the file to string format for uniformity.
  - Adds new "ano" and "mes" columns populated with the extracted values.
  - Overwrites the original CSV with the updated data.

- **Error handling:**  
  Logs successful modifications as well as any errors encountered during processing, ensuring you're always informed about the script's performance.

## Why it does it

Because manually inserting year and month data into every CSV file is a waste of time. This script enforces consistency and ensures your datasets are self-contained and ready for analysis, because maybe, just maybe, that was exactly the kind of task Salah was doing againt PSG, being so loaded with this task that he forgot to play.

In [36]:
# Configure logging
log_file = "csv_add_ano_mes.log"
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# Folder containing the files
input_folder = "helium"

def add_ano_mes_columns():
    pattern = re.compile(r"VRA_(\d{4})_(\d{2})\.csv")  # Regex to extract year and month

    for file_name in os.listdir(input_folder):
        file_path = os.path.join(input_folder, file_name)

        match = pattern.match(file_name)
        if match and file_name.endswith(".csv") and os.path.isfile(file_path):
            try:
                # Extract year and month
                ano, mes = match.groups()

                # Read CSV assuming all columns are strings
                df = pl.read_csv(file_path, separator=";", encoding="utf-8", infer_schema_length=0, dtypes={})

                # Convert all columns to string
                df = df.with_columns([df[col].cast(pl.Utf8) for col in df.columns])

                # Add "ano" and "mes" columns
                df = df.with_columns(
                    pl.lit(ano).cast(pl.Utf8).alias("ano"),
                    pl.lit(mes).cast(pl.Utf8).alias("mes")
                )

                # Overwrite the original CSV
                df.write_csv(file_path, separator=";", include_header=True)

                logging.info(f"Added 'ano' and 'mes' to: {file_name}")

            except Exception as e:
                logging.error(f"Error processing {file_name}: {e}")

In [37]:
logging.info("Adding 'ano' and 'mes' columns started.")
add_ano_mes_columns()
logging.info("Adding 'ano' and 'mes' columns completed.")

2025-03-02 18:50:06,460 [INFO] Adding 'ano' and 'mes' columns started.
  df = pl.read_csv(file_path, separator=";", encoding="utf-8", infer_schema_length=0, dtypes={})
2025-03-02 18:50:06,493 [INFO] Added 'ano' and 'mes' to: VRA_2002_02.csv
2025-03-02 18:50:06,519 [INFO] Added 'ano' and 'mes' to: VRA_2006_03.csv
2025-03-02 18:50:06,543 [INFO] Added 'ano' and 'mes' to: VRA_2001_12.csv
2025-03-02 18:50:06,600 [INFO] Added 'ano' and 'mes' to: VRA_2023_02.csv
2025-03-02 18:50:06,621 [INFO] Added 'ano' and 'mes' to: VRA_2004_11.csv
2025-03-02 18:50:06,644 [INFO] Added 'ano' and 'mes' to: VRA_2008_06.csv
2025-03-02 18:50:06,665 [INFO] Added 'ano' and 'mes' to: VRA_2004_05.csv
2025-03-02 18:50:06,754 [INFO] Added 'ano' and 'mes' to: VRA_2010_06.csv
2025-03-02 18:50:06,815 [INFO] Added 'ano' and 'mes' to: VRA_2024_04.csv
2025-03-02 18:50:06,913 [INFO] Added 'ano' and 'mes' to: VRA_2014_09.csv
2025-03-02 18:50:06,937 [INFO] Added 'ano' and 'mes' to: VRA_2006_05.csv
2025-03-02 18:50:07,017 [INFO

# Overview

This script forces all values in CSV files located in the "helium" folder to be strings. It reads each CSV with Polars, converts every column to string type, and then overwrites the original file. All actions are logged in "csv_force_string.log" for accountability.

## What it does

- **Logging configuration:**  
  Sets up logging to record every action and error in "csv_force_string.log" with clear timestamps and log levels. Nothing escapes the log.

- **File iteration:**  
  Scans the "helium" folder for CSV files. It processes only those files that end with ".csv" and ensures they're actual files, not directories.

- **CSV conversion:**  
  For each CSV file:
  - Reads the file using Polars, initially treating all columns as strings.
  - Explicitly converts every column to string type.
  - Overwrites the original file with the new, string-only version using semicolon delimiters.

- **Error handling:**  
  Logs successful conversions and any errors encountered during processing, ensuring you’re fully informed of every move.

## Why it does it

Because ambiguity in data types is unacceptable. This script enforces strict consistency by converting every CSV value to a string, preemptively eliminating type-related issues in your data processing pipeline.

In [38]:
# Configure logging
log_file = "csv_force_string.log"
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# Folder containing the files
input_folder = "helium"

def convert_all_to_string():
    for file_name in os.listdir(input_folder):
        file_path = os.path.join(input_folder, file_name)

        if file_name.endswith(".csv") and os.path.isfile(file_path):
            try:
                # Read CSV assuming all columns as string
                df = pl.read_csv(file_path, separator=";", encoding="utf-8", infer_schema_length=0, dtypes={})

                # Convert all columns to string
                df = df.with_columns([df[col].cast(pl.Utf8) for col in df.columns])

                # Overwrite the original file
                df.write_csv(file_path, separator=";", include_header=True)

                logging.info(f"Converted all values to string in: {file_name}")

            except Exception as e:
                logging.error(f"Error processing {file_name}: {e}")

In [39]:
logging.info("Forcing all values to string started.")
convert_all_to_string()
logging.info("Forcing all values to string completed.")

2025-03-02 18:51:53,728 [INFO] Forcing all values to string started.
  df = pl.read_csv(file_path, separator=";", encoding="utf-8", infer_schema_length=0, dtypes={})
2025-03-02 18:51:53,748 [INFO] Converted all values to string in: VRA_2002_02.csv
2025-03-02 18:51:53,768 [INFO] Converted all values to string in: VRA_2006_03.csv
2025-03-02 18:51:53,791 [INFO] Converted all values to string in: VRA_2001_12.csv
2025-03-02 18:51:53,841 [INFO] Converted all values to string in: VRA_2023_02.csv
2025-03-02 18:51:53,858 [INFO] Converted all values to string in: VRA_2004_11.csv
2025-03-02 18:51:53,879 [INFO] Converted all values to string in: VRA_2008_06.csv
2025-03-02 18:51:53,898 [INFO] Converted all values to string in: VRA_2004_05.csv
2025-03-02 18:51:53,990 [INFO] Converted all values to string in: VRA_2010_06.csv
2025-03-02 18:51:54,046 [INFO] Converted all values to string in: VRA_2024_04.csv
2025-03-02 18:51:54,143 [INFO] Converted all values to string in: VRA_2014_09.csv
2025-03-02 18: