In [6]:
import os
import chardet
import pandas as pd

# Directory layout: original CSVs are read from `raw/`; their UTF-8
# re-encoded copies are written to `encoded/`.
raw_dir = "../data/raw"
encoded_dir = "../data/encoded"

# Make sure the output directory is present before anything is written to it
# (a no-op when it already exists).
os.makedirs(name=encoded_dir, exist_ok=True)

def detect_encoding(file_path, sample_size=10000):
    """Guess a file's text encoding from its leading bytes.

    Reads at most ``sample_size`` bytes from ``file_path`` in binary mode,
    hands them to ``chardet.detect`` and returns the ``"encoding"`` entry of
    the result it produces.
    """
    with open(file_path, mode="rb") as raw_file:
        head = raw_file.read(sample_size)

    guess = chardet.detect(head)
    return guess["encoding"]

def convert_to_utf8(input_file, output_file):
    """Re-encode a CSV file as UTF-8 while leaving its cell values untouched.

    The source encoding is guessed with ``detect_encoding``; the file is then
    parsed with pandas and written to ``output_file`` as UTF-8 without an
    index column.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path the UTF-8 encoded CSV is written to.

    Raises:
        UnicodeDecodeError: If the file cannot be decoded with the detected
            (or fallback) encoding.
    """
    # Detect encoding
    encoding = detect_encoding(input_file)
    print(f"Detected encoding: {encoding} for {input_file}")

    # Detection only looks at a prefix of the file. A pure-ASCII prefix makes
    # chardet report "ascii", and strict ASCII decoding would then fail on any
    # later non-ASCII byte. UTF-8 is a superset of ASCII, so decode with it in
    # that case, and also when no encoding could be determined at all.
    if encoding is None or encoding.lower() == "ascii":
        encoding = "utf-8"

    # Read every cell as a plain string and switch off NA detection so that
    # this step changes only the encoding: no stripped leading zeros
    # ("01100" -> 1100), no int -> float promotion in columns with blanks,
    # and no literal "NA"/"null" cells silently turned into empty fields.
    df = pd.read_csv(
        input_file,
        encoding=encoding,
        dtype=str,
        keep_default_na=False,
        na_filter=False,
    )

    # Save as UTF-8
    df.to_csv(output_file, encoding="utf-8", index=False)
    print(f"Saved UTF-8 file: {output_file}")

# Convert all CSV files in `raw/` to `encoded/`
def process_all_files(raw_dir, encoded_dir):
    """Convert every CSV file directly inside ``raw_dir`` to UTF-8.

    Each regular file whose name ends in ``.csv`` (case-insensitive) is passed
    to ``convert_to_utf8`` and written under the same file name in
    ``encoded_dir``. Sub-directories are not descended into.

    Args:
        raw_dir: Directory containing the source CSV files.
        encoded_dir: Directory receiving the converted files; created if it
            does not exist yet.
    """
    # Create the target here as well, so the function also works when it is
    # called with directories other than the ones prepared at module level.
    os.makedirs(encoded_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for reproducible
    # processing order and log output.
    for filename in sorted(os.listdir(raw_dir)):
        if not filename.lower().endswith(".csv"):
            continue  # Process only CSV files

        input_path = os.path.join(raw_dir, filename)
        if not os.path.isfile(input_path):
            continue  # A directory that merely has a ".csv" name

        output_path = os.path.join(encoded_dir, filename)
        convert_to_utf8(input_path, output_path)

# Run the conversion for the directories configured above
process_all_files(raw_dir=raw_dir, encoded_dir=encoded_dir)


Detected encoding: CP932 for ../data/raw/Shimane Prefecture_20231_20234.csv
Saved UTF-8 file: ../data/encoded/Shimane Prefecture_20231_20234.csv
Detected encoding: Windows-1252 for ../data/raw/Tokyo_20231_20234_en.csv
Saved UTF-8 file: ../data/encoded/Tokyo_20231_20234_en.csv
