In [2]:
import os
import chardet
import pandas as pd

# Source folder holding the raw CSV files and destination folder for their
# UTF-8 re-encoded copies. Both paths are relative, so they resolve against
# the current working directory.
raw_dir = "../data/raw/housing_market/"
encoded_dir = "../data/encoded/housing_market/"

# Ensure the destination folder exists; exist_ok=True makes this a no-op
# (instead of an error) when the folder is already present.
os.makedirs(encoded_dir, exist_ok=True)

def detect_encoding(file_path, sample_size=10000):
    """Guess the text encoding of a file from its leading bytes.

    Reads at most ``sample_size`` bytes from ``file_path`` and hands them to
    ``chardet.detect``. The detected name is accepted when it matches one of
    the expected encodings; the comparison ignores case. Otherwise a warning
    is printed and ``"Windows-1252"`` is used instead.

    Args:
        file_path: Path of the file to inspect.
        sample_size: Maximum number of bytes read for detection.

    Returns:
        The encoding name as reported by chardet, or ``"Windows-1252"`` when
        chardet reports no encoding or one outside the expected set.
    """
    # Lower-cased so the membership test below is case-insensitive.
    expected = {"utf-8", "utf-8-sig", "cp932", "shift_jis", "windows-1252"}

    with open(file_path, "rb") as f:
        raw_data = f.read(sample_size)

    encoding = chardet.detect(raw_data)["encoding"]

    # chardet does not use the casing spelled out above for every name (it
    # reports e.g. "SHIFT_JIS"), so an exact string match would wrongly push
    # Shift-JIS files into the Windows-1252 fallback. Python's codec lookup is
    # itself case-insensitive, so the detected name can be returned unchanged.
    if encoding is None or encoding.lower() not in expected:
        print(f"⚠ Warning: Unexpected encoding {encoding}. Defaulting to Windows-1252.")
        encoding = "Windows-1252"

    return encoding

def convert_to_utf8(input_file, output_file):
    """Re-encode a CSV file as UTF-8 while keeping every cell as text.

    The source encoding is obtained from ``detect_encoding``. The file is
    loaded with pandas using ``dtype=str`` so no column is converted to a
    numeric or date type, then written to ``output_file`` as UTF-8 without
    the DataFrame index.

    Any exception raised while reading or writing is caught and reported on
    stdout, so one bad file does not abort a batch run.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path the UTF-8 CSV is written to.

    Returns:
        None.
    """
    encoding = detect_encoding(input_file)
    print(f"Detected encoding: {encoding} for {input_file}")

    try:
        df = pd.read_csv(
            input_file,
            encoding=encoding,
            dtype=str,
            low_memory=False,
            # Without this, pandas treats cells such as "NA", "None" or
            # "null" as missing values and to_csv would write them back as
            # empty strings, silently altering the data.
            na_filter=False,
        )
        df.to_csv(output_file, encoding="utf-8", index=False)
        print(f"Saved UTF-8 file: {output_file}")
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller
        # continue with the remaining files.
        print(f"Error processing {input_file}: {e}")

# Process all CSV files
def process_all_files(raw_dir, encoded_dir):
    """Run ``convert_to_utf8`` on every ``.csv`` entry found in ``raw_dir``.

    Each converted file is written to ``encoded_dir`` under the same file
    name. Entries whose name does not end in ``.csv`` are ignored. Files are
    visited in the order returned by ``os.listdir``.

    Args:
        raw_dir: Directory that is scanned for CSV files.
        encoded_dir: Directory that receives the converted files.
    """
    csv_names = (name for name in os.listdir(raw_dir) if name.endswith(".csv"))
    for name in csv_names:
        convert_to_utf8(
            os.path.join(raw_dir, name),
            os.path.join(encoded_dir, name),
        )

# Run the conversion for every CSV in raw_dir, then print a completion
# message. The message is printed even if individual files failed, because
# convert_to_utf8 reports errors on stdout instead of raising.
process_all_files(raw_dir, encoded_dir)
print("All files encoded!!")

Detected encoding: Windows-1252 for ../data/raw/housing_market/28_Hyogo Prefecture_20053_20243_en.csv
Saved UTF-8 file: ../data/encoded/housing_market/28_Hyogo Prefecture_20053_20243_en.csv
Detected encoding: Windows-1252 for ../data/raw/housing_market/08_Ibaraki Prefecture_20053_20243_en.csv
Saved UTF-8 file: ../data/encoded/housing_market/08_Ibaraki Prefecture_20053_20243_en.csv
Detected encoding: Windows-1252 for ../data/raw/housing_market/36_Tokushima Prefecture_20053_20243_en.csv
Saved UTF-8 file: ../data/encoded/housing_market/36_Tokushima Prefecture_20053_20243_en.csv
Detected encoding: Windows-1252 for ../data/raw/housing_market/23_Aichi Prefecture_20053_20243_en.csv
Saved UTF-8 file: ../data/encoded/housing_market/23_Aichi Prefecture_20053_20243_en.csv
Detected encoding: Windows-1252 for ../data/raw/housing_market/38_Ehime Prefecture_20053_20243_en.csv
Saved UTF-8 file: ../data/encoded/housing_market/38_Ehime Prefecture_20053_20243_en.csv
Detected encoding: Windows-1252 for ..