## Cleaning Company Names

Let's look at the company names stored in the Excel files and clean up their formatting.

In [None]:
import glob
import os
import re

import pandas as pd
from tqdm import tqdm

In [3]:
folder_path = "excel_files"

# All the Excel files (full paths)
xlsx_files = glob.glob(os.path.join(folder_path, "*.xlsx"))

# Accumulator for the [CIN, company name] records gathered from the workbooks
data_list = []

# Workbook file names (without the folder prefix).
# - sorted(): os.listdir returns entries in arbitrary order, so sorting makes
#   the processing order reproducible from run to run.
# - "~$" prefix: Excel creates "~$<name>.xlsx" lock files while a workbook is
#   open; they are not readable workbooks, so they are skipped.
excel_files = sorted(
    f for f in os.listdir(folder_path)
    if f.endswith('.xlsx') and not f.startswith('~$')
)

In [4]:
# Extracting Company Names

def first_fact_value(frame, element):
    """Return the first 'Fact Value' whose 'Element Name' equals ``element``.

    Parameters
    ----------
    frame : pandas.DataFrame
        One workbook, with 'Element Name' and 'Fact Value' columns.
    element : str
        The element name to look up.

    Returns
    -------
    The first matching value, or '' when no row matches.
    """
    values = frame.loc[frame['Element Name'] == element, 'Fact Value'].values
    return values[0] if len(values) > 0 else ''


# Rebuilt from scratch so that re-running this cell does not append a second
# copy of every record to the list.
data_list = []

for file_name in tqdm(excel_files, desc="Processing files", unit="file"):
    file_path = os.path.join(folder_path, file_name)
    df = pd.read_excel(file_path)

    # Extract all required fields
    corporate_identity = first_fact_value(df, 'CorporateIdentityNumber')
    company_name = first_fact_value(df, 'NameOfTheCompany')

    # One [CIN, company name] record per workbook
    data_list.append([corporate_identity, company_name])



Processing files: 100%|██████████████████████████████████████████████████████████| 1174/1174 [07:56<00:00,  2.46file/s]


In [40]:
# Assemble the collected [CIN, company name] records into a two-column table
column_names = ['CIN', 'Company Name']
df = pd.DataFrame(data=data_list, columns=column_names)
df

Unnamed: 0,CIN,Company Name
0,L74140MH2008PLC177884,360 ONE WAM LIMITED
1,L67120MH1993PLC074411,3I Infotech Limited
2,L31300KA1987PLC013543,3M INDIA LIMITED
3,L67190MH2007PLC289249,5paisa Capital Limited
4,L29142TN1988PLC015586,63 moons technologies limited
5,L37060MH1984PLC055433,Aarti Drugs Limited
6,L24110GJ1984PLC007301,Aarti Industries Limited
7,L24100GJ2019PLC110964,Aarti Pharmalabs Limited
8,L65922RJ2011PLC034297,Aavas Financiers Limited
9,L74900MH2009PLC231660,ABANS HOLDINGS LIMITED


In [41]:
# (rows, columns) of the extracted table
df.shape

(1174, 2)

In [42]:
# Number of missing (NaN/None) entries in each column
df.isna().sum()

CIN             0
Company Name    0
dtype: int64

In [43]:
# Treat missing values and whitespace-only strings alike as "blank".
# The mask is built once and reused; astype(str) plus the .str accessor
# replaces the deprecated DataFrame.applymap calls.
blank_mask = (
    df.fillna('')
    .astype(str)
    .apply(lambda column: column.str.strip())
    .eq('')
)

# Blank cells per column
empty_counts = blank_mask.sum()
print(empty_counts)

# Rows that contain at least one blank cell
df[blank_mask.any(axis=1)]

CIN             0
Company Name    0
dtype: int64


  empty_counts = (df.fillna('').applymap(str).applymap(str.strip) == '').sum()
  df[df.fillna('').applymap(str).applymap(str.strip).eq('').any(axis=1)]


Unnamed: 0,CIN,Company Name


In [44]:
# Peek at rows 10-19 of the raw company names (positional slice)
df['Company Name'].iloc[10:20]

10                                ABB India Limited
11                  Accelya Solutions India Limited
12                                      ACC Limited
13            Action Construction Equipment Limited
14                   Adani Energy Solutions Limited
15                        Adani Enterprises Limited
16                       Adani Green Energy Limited
17    Adani Ports and Special Economic Zone Limited
18                              ADANI POWER LIMITED
19                          Adani Total Gas Limited
Name: Company Name, dtype: object

In [47]:
# Optional manual substitutions for specific cases
manual_subs = {
    'Eih Limited': 'EIH Limited',
    'Gfl Limited': 'GFL Limited',
    'Iti Limited': 'ITI Limited',
    'Hmt Limited': 'HMT Limited',
    'Iifl Securities Limited': 'IIFL Securities Limited',
    'Ksb Limited': 'KSB Limited',
    'Nhpc Limited': 'NHPC Limited',
    'Pcbl Limited': 'PCBL Limited',
    'Sis Limited': 'SIS Limited',
}


def clean_name(name):
    if pd.isna(name) or not isinstance(name, str):
        return ''

    name = name.strip()

    # Convert all-uppercase or all-lowercase to title case
    if name.isupper() or name.islower():
        name = name.title()

    # Apply substitutions if matched
    name = manual_subs.get(name, name)

    # Standard formatting
    name = (
        name
        .replace('Ltd.', 'Limited')
    )
    return name



In [48]:
# Add the cleaned names next to the originals for a side-by-side comparison
raw_names = df['Company Name']
df['Cleaned Name'] = raw_names.apply(clean_name)

# Lift the display limits so the comparison is not truncated
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, None)

df[['Company Name', 'Cleaned Name']]


Unnamed: 0,Company Name,Cleaned Name
0,360 ONE WAM LIMITED,360 One Wam Limited
1,3I Infotech Limited,3I Infotech Limited
2,3M INDIA LIMITED,3M India Limited
3,5paisa Capital Limited,5paisa Capital Limited
4,63 moons technologies limited,63 Moons Technologies Limited
5,Aarti Drugs Limited,Aarti Drugs Limited
6,Aarti Industries Limited,Aarti Industries Limited
7,Aarti Pharmalabs Limited,Aarti Pharmalabs Limited
8,Aavas Financiers Limited,Aavas Financiers Limited
9,ABANS HOLDINGS LIMITED,Abans Holdings Limited


In [49]:
# Move the cleaned names into 'Company Name' and discard the helper column
cleaned_column = df['Cleaned Name']
del df['Cleaned Name']
df['Company Name'] = cleaned_column

# Order the table alphabetically by company name, in place
df.sort_values('Company Name', inplace=True)

# Renumber the rows 0..n-1 to match the new order
df.reset_index(inplace=True, drop=True)


In [50]:
# Lookup table: CIN -> cleaned company name (a repeated CIN keeps its last name)
clean_name_dict = {
    cin: company
    for cin, company in zip(df['CIN'], df['Company Name'])
}

In [51]:
# Write the cleaned company name back into each workbook.
# NOTE(review): the workbook is overwritten in place with the frame returned
# by pd.read_excel; anything read_excel did not load is presumably not kept
# (confirm, and keep a backup of the folder). To limit that exposure and the
# run time, a file is rewritten only when its name actually changes.
for file_name in tqdm(excel_files, desc="Updating company names", unit="file"):
    file_path = os.path.join(folder_path, file_name)
    df_file = pd.read_excel(file_path)

    # Find matching CIN
    cin = df_file.loc[df_file['Element Name'] == 'CorporateIdentityNumber', 'Fact Value']
    if cin.empty:
        continue

    # No cleaned name recorded for this CIN (or it is blank): leave the file alone
    cleaned_name = clean_name_dict.get(cin.values[0])
    if not cleaned_name:
        continue

    # Company-name rows whose current value differs from the cleaned one
    is_name_row = df_file['Element Name'] == 'NameOfTheCompany'
    needs_update = is_name_row & (df_file['Fact Value'] != cleaned_name)
    if not needs_update.any():
        continue

    df_file.loc[needs_update, 'Fact Value'] = cleaned_name

    # Save back to Excel
    df_file.to_excel(file_path, index=False)

Updating company names: 100%|████████████████████████████████████████████████████| 1174/1174 [15:53<00:00,  1.23file/s]
