In [3]:
import os
import pandas as pd
import regex as re
from unidecode import unidecode
from tqdm import tqdm

# Compile regex patterns once for efficiency
SUFFIXES_RE = re.compile(r'\b(inc|corp|ltd|llc|company|co|sas)\b', re.IGNORECASE)
SPECIAL_CHARS_RE = re.compile(r'[\p{P}\p{S}]', re.IGNORECASE)
MULTIPLE_SPACES_RE = re.compile(r'\s+')
NON_LATIN_RE = re.compile(r'[^\p{Latin}\s\d]', re.IGNORECASE)
NUMBERS_RE = re.compile(r'\d+')

def preprocess_name(name):
    if isinstance(name, float) or pd.isnull(name):
        return None, False, []

    name = name.lower()
    numbers = NUMBERS_RE.findall(name)
    name_cleaned = SUFFIXES_RE.sub('', name)
    name_cleaned = SPECIAL_CHARS_RE.sub('', name_cleaned)
    name_cleaned = MULTIPLE_SPACES_RE.sub(' ', name_cleaned).strip()

    if not name_cleaned:
        return None, False, numbers

    if NON_LATIN_RE.search(name_cleaned):
        name_transliterated = unidecode(name_cleaned)
        return name_transliterated, True, numbers
    else:
        return name_cleaned, False, numbers

def process_folder(input_folder, output_folder='orbis_processed'):
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if filename.endswith('.csv'):
            input_path = os.path.join(input_folder, filename)
            df = pd.read_csv(input_path)

            if 'name' not in df.columns:
                print(f"Skipping {filename}: no 'name' column")
                continue

            tqdm.pandas(desc=f"Processing {filename}")
            results = df['name'].progress_apply(preprocess_name)
            df['processed_name'] = results.apply(lambda x: x[0])
            df['is_transliterated'] = results.apply(lambda x: x[1])
            df['numbers'] = results.apply(lambda x: x[2])

            output_path = os.path.join(output_folder, filename)
            df.to_csv(output_path, index=False)

In [4]:
process_folder('/Users/wiktorrajca/Desktop/Research/URAP_Fedyk/data/Orbis_Data/BvD_ID_and_Name')

Processing BvD_ID_and_Name45.csv: 100%|██████████| 10000000/10000000 [00:45<00:00, 219839.67it/s]
Processing BvD_ID_and_Name44.csv: 100%|██████████| 10000000/10000000 [00:44<00:00, 225032.55it/s]
Processing BvD_ID_and_Name8.csv: 100%|██████████| 10000000/10000000 [00:45<00:00, 219303.23it/s]
Processing BvD_ID_and_Name46.csv: 100%|██████████| 10000000/10000000 [00:39<00:00, 250603.34it/s]
Processing BvD_ID_and_Name47.csv: 100%|██████████| 4249480/4249480 [00:15<00:00, 276209.08it/s]
Processing BvD_ID_and_Name9.csv: 100%|██████████| 10000000/10000000 [00:46<00:00, 215921.44it/s]
Processing BvD_ID_and_Name43.csv: 100%|██████████| 10000000/10000000 [00:40<00:00, 245503.91it/s]
Processing BvD_ID_and_Name42.csv: 100%|██████████| 10000000/10000000 [00:42<00:00, 233815.67it/s]
Processing BvD_ID_and_Name40.csv: 100%|██████████| 10000000/10000000 [00:47<00:00, 209333.77it/s]
Processing BvD_ID_and_Name41.csv: 100%|██████████| 10000000/10000000 [00:41<00:00, 241387.07it/s]
Processing BvD_ID_and_Na