In [1]:
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

### Input company names

In [2]:
test_company_names = [
    "TRASOLUX SARL",
    "DÄNISCHER TRANSPORT GmbH",
    "ABS BONIFER POLSKA S",
    "Vögele Logistik Deutschland GmbH",
    "Müller Transporte AG",
    "CARGOTRANS INTERNATIONAL LTD",
    "H&M Logistics France",
    "TNT Express Deutschland GmbH",
    "Daimler AG",
    "Daimler Group France SARL",
    "Daimler Holding GmbH",
    "MÉTROLOGÍA EXPRESS S.A.",
    "Kühne & Nagel International AG",
    "SCHENKER B.V.",
    "Schenker Deutschland GmbH",
    "A.B.C. Transport Inc.",
    "A-B-C Logistics Ltd.",
    "ABC. Transport, Polska",
    "ABC Logística España S.L.",
    "Fracht & Fuhre GmbH & Co. KG",
    "Fracht+Fuhre GmbH",
    "LKW WALTER Internationale Transportorganisation AG",
    "L.K.W. Walter GmbH",
    "LKW-WALTER GmbH Deutschland",
    "VAN DER VALK TRANSPORT B.V.",
    "VanDerValk Transp. GmbH",
    "Transpórtes Van Der Valk SARL",
    "Ñ-LOGIC TRANSPORTES SA",
    "CÔTE D’AZUR LOGISTIQUE SARL",
    "P&O Ferrymasters Ltd",
    "P & O FERRIES Deutschland",
    "P&O-Ferries GmbH",
    "GRÜNBERG EXPRESS S.A.R.L.",
    "Grunberg Logistics France",
    "GRUNBERG logistik gmbh",
    "V.T. Logistics Inc",
    "VT Logistics GmbH",
    "VT-Logistik Poland"
]


### Names Cleaning

In [3]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

PROMPT_TEMPLATE = """
You are a data cleaning assistant. Your task is to clean and standardize company names by removing noise and normalizing the format.
Given a list of company names, return only the cleaned names in the same order, one per line, without explanations.

Cleaning instructions:
- Fix obvious typos and character substitutions (e.g., "N1KE" → "nike", "Coka Cola" → "coca cola")
- Remove extra whitespace, punctuation, or special characters (e.g., ",", ".", "@", "...")
- Normalize accented characters to ASCII equivalents (e.g., é → e, ç → c)
- Remove:
  - Legal suffixes (e.g., GmbH, SARL, S.A., Inc., Ltd., AG, B.V., etc.)
  - Country or location identifiers (e.g., Polska, France, Deutschland, USA)
  - Special characters (e.g., &, -, _, ., ,, /, etc.)
  - Standalone numbers and single-letter tokens
- Normalize casing: all output should be in lowercase
- Standardize to the main company entity name (e.g., all variants of “Coca-Cola” become “coca cola”)

Return only the cleaned and standardized company name on each line in the same order as input.

Examples:
- "Coca-Cola Inc." → coca cola
- "N1KE Int’l Ltd" → nike
- "SoundWave GmbH & Co. KG" → soundwave
- "Katoen Natie Belgium" → katoen natie

Company names:
{company_list}
"""

prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
formatted_prompt = prompt_template.format(company_list=test_company_names)
model = ChatOpenAI(model_name="gpt-4o-mini", temperature=0.0)  
response_text = model.invoke(formatted_prompt)
cleaned_names = response_text.content 
cleaned_names = cleaned_names.strip().split('\n') 

In [4]:
cleaned_names

['trasolux  ',
 'danischer transport  ',
 'abs bonifer  ',
 'vogele logistik  ',
 'muller transporte  ',
 'cargotrans international  ',
 'hm logistics  ',
 'tnt express  ',
 'daimler  ',
 'daimler group  ',
 'daimler holding  ',
 'metrologia express  ',
 'kuhne nagel international  ',
 'schenker  ',
 'schenker deutschland  ',
 'abc transport  ',
 'abc logistics  ',
 'fracht fuhre  ',
 'fracht fuhre  ',
 'lkw walter internationale transportorganisation  ',
 'lkw walter  ',
 'lkw walter  ',
 'van der valk transport  ',
 'vandervalk transp  ',
 'transportes van der valk  ',
 'n logic transportes  ',
 'cote d azur logistique  ',
 'p o ferrymasters  ',
 'p o ferries  ',
 'p o ferries  ',
 'grunberg express  ',
 'grunberg logistics  ',
 'grunberg logistik  ',
 'vt logistics  ',
 'vt logistics  ',
 'vt logistik']

### Generalize names

In [5]:
PROMPT_TEMPLATE_GERNERALIZE = """
You are a name generalizer assistant. Your task is to generalize cleaned company names by normalizing them to the shortest format.
Given a list of company names, return a dictionary of give names as keys and the generalized names as values in the same order, one per line, without explanations.

Generalizing instructions:
- Keep only company name without any suffices (e.g., "vinfast automobile" → "vinfast", "vinfast auto" → "vinfast", "vinfast car" → "vinfast")
- Remove:
  - Special characters (e.g., &, -, _, ., ,, /, etc.)
  
Examples:
- "coca cola beverage" → coca cola
- "vinfast transportorganisation" → vinfast
- "vinfast transport" → vinfast
- "grab logistica" → grab

Company names:
{cleaned_company_list}
"""

prompt_template_generalize = ChatPromptTemplate.from_template(PROMPT_TEMPLATE_GERNERALIZE)
formatted_prompt_generalize = prompt_template_generalize.format(cleaned_company_list=cleaned_names)
response_text_generalize = model.invoke(formatted_prompt_generalize)
generalized_names = response_text_generalize.content 

In [6]:
import ast
group_name_dict = ast.literal_eval(generalized_names)
for key in group_name_dict.keys():
    group_name_dict[key] = group_name_dict[key].replace(" ", "")
group_name_dict

{'trasolux': 'trasolux',
 'danischer transport': 'danischer',
 'abs bonifer': 'absbonifer',
 'vogele logistik': 'vogele',
 'muller transporte': 'muller',
 'cargotrans international': 'cargotrans',
 'hm logistics': 'hm',
 'tnt express': 'tnt',
 'daimler': 'daimler',
 'daimler group': 'daimler',
 'daimler holding': 'daimler',
 'metrologia express': 'metrologia',
 'kuhne nagel international': 'kuhnenagel',
 'schenker': 'schenker',
 'schenker deutschland': 'schenker',
 'abc transport': 'abc',
 'abc logistics': 'abc',
 'fracht fuhre': 'frachtfuhre',
 'lkw walter internationale transportorganisation': 'lkwwalter',
 'lkw walter': 'lkwwalter',
 'van der valk transport': 'vandervalk',
 'vandervalk transp': 'vandervalk',
 'transportes van der valk': 'vandervalk',
 'n logic transportes': 'nlogic',
 'cote d azur logistique': 'cotedazur',
 'p o ferrymasters': 'po',
 'p o ferries': 'po',
 'grunberg express': 'grunberg',
 'grunberg logistics': 'grunberg',
 'grunberg logistik': 'grunberg',
 'vt logist

In [7]:
# group keys with the same value to a list
grouped_names = {}
for key, value in group_name_dict.items():
    if value not in grouped_names:
        grouped_names[value] = []
    grouped_names[value].append(key)

In [8]:
grouped_names

{'trasolux': ['trasolux'],
 'danischer': ['danischer transport'],
 'absbonifer': ['abs bonifer'],
 'vogele': ['vogele logistik'],
 'muller': ['muller transporte'],
 'cargotrans': ['cargotrans international'],
 'hm': ['hm logistics'],
 'tnt': ['tnt express'],
 'daimler': ['daimler', 'daimler group', 'daimler holding'],
 'metrologia': ['metrologia express'],
 'kuhnenagel': ['kuhne nagel international'],
 'schenker': ['schenker', 'schenker deutschland'],
 'abc': ['abc transport', 'abc logistics'],
 'frachtfuhre': ['fracht fuhre'],
 'lkwwalter': ['lkw walter internationale transportorganisation',
  'lkw walter'],
 'vandervalk': ['van der valk transport',
  'vandervalk transp',
  'transportes van der valk'],
 'nlogic': ['n logic transportes'],
 'cotedazur': ['cote d azur logistique'],
 'po': ['p o ferrymasters', 'p o ferries'],
 'grunberg': ['grunberg express', 'grunberg logistics', 'grunberg logistik'],
 'vt': ['vt logistics', 'vt logistik']}