In [18]:
import pandas as pd
import numpy as np
from datetime import datetime

!pip install fuzzywuzzy python-Levenshtein
from fuzzywuzzy import process



In [19]:
df= pd.read_csv('branch_dim.csv')

In [20]:
df.shape

(200, 5)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   branch_id    200 non-null    int64 
 1   branch_name  200 non-null    object
 2   city         200 non-null    object
 3   state        200 non-null    object
 4   region       188 non-null    object
dtypes: int64(1), object(4)
memory usage: 7.9+ KB


In [22]:
df.describe

<bound method NDFrame.describe of      branch_id             branch_name        city             state  region
0            1     Jaipur North Branch      Jaipur         Rajasthan   oNrth
1            2   Bengaluru East Branch   Bengaluru       Maharashtra    West
2            3      Thane South Branch       thane           haryana   North
3            4    Srinagar West Branch    Srinagar            Odisha    East
4            5  Vijayawada East Branch  Vijayawada    Madhya Pradesh    West
..         ...                     ...         ...               ...     ...
195        196  Hyderabad South Branch   Hyderabad           Mizoram    East
196        197     Nashik North Branch      Nashik        Chandigarh   North
197        198     Chennai East Branch     Chennai  Himachal Pradesh   Norht
198        199     Mumbai South Branch      Mumbai           Gujarat     NaN
199        200      Rajkot East Branch      Rajkot        CHANDIGARH   North

[200 rows x 5 columns]>

In [23]:
df.head()

Unnamed: 0,branch_id,branch_name,city,state,region
0,1,Jaipur North Branch,Jaipur,Rajasthan,oNrth
1,2,Bengaluru East Branch,Bengaluru,Maharashtra,West
2,3,Thane South Branch,thane,haryana,North
3,4,Srinagar West Branch,Srinagar,Odisha,East
4,5,Vijayawada East Branch,Vijayawada,Madhya Pradesh,West


In [24]:
df.isnull().sum()

branch_id       0
branch_name     0
city            0
state           0
region         12
dtype: int64

In [25]:
df.duplicated().sum()

np.int64(0)

In [26]:
# clean city names (this alone will fix 'thane', 'LUDHIANA', ' mumbai ', etc.)
df['city'] = df['city'].astype(str).str.strip().str.title()

In [27]:
df['city'].unique()

array(['Jaipur', 'Bengaluru', 'Thane', 'Srinagar', 'Vijayawada',
       'Ludhiana', 'Chennai', 'Surat', 'Ahmedabad', 'Nashik', 'Hyderabad',
       'Lucknow', 'Mumbai', 'Pune', 'Varanasi', 'Agra', 'Indore',
       'Madurai', 'Kochi', 'Nagpur', 'Ranchi', 'Aurangabad', 'Mysuru',
       'Kanpur', 'Kolkata', 'Ghaziabad', 'Delhi', 'Visakhapatnam',
       'Bhopal', 'Rajkot', 'Patna', 'Vadodara', 'Guwahati', 'Coimbatore'],
      dtype=object)

In [28]:
city_state_map = {
    'Jaipur': 'Rajasthan',
    'Bengaluru': 'Karnataka',
    'Thane': 'Maharashtra',
    'Srinagar': 'Jammu & Kashmir',
    'Vijayawada': 'Andhra Pradesh',
    'Ludhiana': 'Punjab',
    'Chennai': 'Tamil Nadu',
    'Surat': 'Gujarat',
    'Ahmedabad': 'Gujarat',
    'Nashik': 'Maharashtra',
    'Hyderabad': 'Telangana',
    'Lucknow': 'Uttar Pradesh',
    'Mumbai': 'Maharashtra',
    'Pune': 'Maharashtra',
    'Varanasi': 'Uttar Pradesh',
    'Agra': 'Uttar Pradesh',
    'Indore': 'Madhya Pradesh',
    'Madurai': 'Tamil Nadu',
    'Kochi': 'Kerala',
    'Nagpur': 'Maharashtra',
    'Ranchi': 'Jharkhand',
    'Aurangabad': 'Maharashtra',
    'Mysuru': 'Karnataka',
    'Kanpur': 'Uttar Pradesh',
    'Kolkata': 'West Bengal',
    'Ghaziabad': 'Uttar Pradesh',
    'Delhi': 'Delhi',
    'Visakhapatnam': 'Andhra Pradesh',
    'Bhopal': 'Madhya Pradesh',
    'Rajkot': 'Gujarat',
    'Patna': 'Bihar',
    'Vadodara': 'Gujarat',
    'Guwahati': 'Assam',
    'Coimbatore': 'Tamil Nadu'
}

df['state'] = df['city'].map(city_state_map)


In [29]:
df['city'].unique()

array(['Jaipur', 'Bengaluru', 'Thane', 'Srinagar', 'Vijayawada',
       'Ludhiana', 'Chennai', 'Surat', 'Ahmedabad', 'Nashik', 'Hyderabad',
       'Lucknow', 'Mumbai', 'Pune', 'Varanasi', 'Agra', 'Indore',
       'Madurai', 'Kochi', 'Nagpur', 'Ranchi', 'Aurangabad', 'Mysuru',
       'Kanpur', 'Kolkata', 'Ghaziabad', 'Delhi', 'Visakhapatnam',
       'Bhopal', 'Rajkot', 'Patna', 'Vadodara', 'Guwahati', 'Coimbatore'],
      dtype=object)

In [30]:
df['region'].unique()

array(['oNrth', 'West', ' North', 'East', 'North', nan, 'South', ' East',
       'EAST', '  North  ', 'WEST', 'eWst', 'Eats', 'West ', 'South ',
       'Nroth', ' West', 'SOUTH', 'Esat', 'NORTH', 'south', 'oSuth',
       'Norht', '  East  ', '  South  '], dtype=object)

In [32]:
df['region'] = None

In [33]:
state_region_map = {
    'Rajasthan': 'North',
    'Uttar Pradesh': 'North',
    'Delhi': 'North',
    'Punjab': 'North',
    'Jammu & Kashmir': 'North',

    'Tamil Nadu': 'South',
    'Kerala': 'South',
    'Karnataka': 'South',
    'Andhra Pradesh': 'South',
    'Telangana': 'South',

    'Maharashtra': 'West',
    'Gujarat': 'West',

    'Bihar': 'East',
    'Jharkhand': 'East',
    'West Bengal': 'East',
    'Assam': 'East'
}

df['region'] = df['state'].map(state_region_map)

In [34]:
df['region'].unique()

array(['North', 'South', 'West', nan, 'East'], dtype=object)

In [43]:
df.head(67)

Unnamed: 0,branch_id,branch_name,city,state,region
0,1,Jaipur North Branch,Jaipur,Rajasthan,North
1,2,Bengaluru East Branch,Bengaluru,Karnataka,South
2,3,Thane South Branch,Thane,Maharashtra,West
3,4,Srinagar West Branch,Srinagar,Jammu & Kashmir,North
4,5,Vijayawada East Branch,Vijayawada,Andhra Pradesh,South
...,...,...,...,...,...
62,63,Visakhapatnam West Branch,Visakhapatnam,Andhra Pradesh,South
63,64,Ahmedabad East Branch,Ahmedabad,Gujarat,West
64,65,Agra East Branch,Agra,Uttar Pradesh,North
65,66,Indore South Branch,Indore,Madhya Pradesh,


In [36]:
df.isnull().sum()

branch_id       0
branch_name     0
city            0
state           0
region         12
dtype: int64

In [39]:
df.dtypes 

branch_id       int64
branch_name    object
city           object
state          object
region         object
dtype: object

In [40]:
df[df['region'].isna()]['state'].unique()

array(['Madhya Pradesh'], dtype=object)

In [44]:
state_region_map = {
    # North
    'Rajasthan': 'North',
    'Uttar Pradesh': 'North',
    'Delhi': 'North',
    'Punjab': 'North',
    'Jammu & Kashmir': 'North',

    # South
    'Tamil Nadu': 'South',
    'Kerala': 'South',
    'Karnataka': 'South',
    'Andhra Pradesh': 'South',
    'Telangana': 'South',

    # West
    'Maharashtra': 'West',
    'Gujarat': 'West',
    'Madhya Pradesh': 'West',   # ⭐ ADD THIS LINE

    # East
    'Bihar': 'East',
    'Jharkhand': 'East',
    'West Bengal': 'East',
    'Assam': 'East'
}


In [45]:
df['region'] = df['state'].map(state_region_map)


In [46]:
df['region'].isnull().sum()


np.int64(0)

In [47]:
df.head(60)

Unnamed: 0,branch_id,branch_name,city,state,region
0,1,Jaipur North Branch,Jaipur,Rajasthan,North
1,2,Bengaluru East Branch,Bengaluru,Karnataka,South
2,3,Thane South Branch,Thane,Maharashtra,West
3,4,Srinagar West Branch,Srinagar,Jammu & Kashmir,North
4,5,Vijayawada East Branch,Vijayawada,Andhra Pradesh,South
5,6,Ludhiana Central Branch,Ludhiana,Punjab,North
6,7,Chennai West Branch,Chennai,Tamil Nadu,South
7,8,Surat South Branch,Surat,Gujarat,West
8,9,Ludhiana South Branch,Ludhiana,Punjab,North
9,10,Ahmedabad West Branch,Ahmedabad,Gujarat,West


In [48]:
df.dtypes

branch_id       int64
branch_name    object
city           object
state          object
region         object
dtype: object

In [49]:
df.isnull().sum()

branch_id      0
branch_name    0
city           0
state          0
region         0
dtype: int64

In [51]:
df.to_csv(r"C:\Users\swath\Downloads\branch.csv", index=False)