In [1]:
import pandas as pd
import numpy as np

!pip install fuzzywuzzy python-Levenshtein
from fuzzywuzzy import process



In [2]:
# Load the banker dimension table; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('banker_dim.csv')

In [3]:
# Size of the loaded table as (rows, columns).
df.shape

(800, 5)

In [4]:
# Summary of column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   banker_id         800 non-null    int64 
 1   name              800 non-null    object
 2   role              800 non-null    object
 3   branch_id         800 non-null    int64 
 4   experience_years  800 non-null    int64 
dtypes: int64(3), object(2)
memory usage: 31.4+ KB


In [5]:
# Preview the first five rows of the raw data.
df.head()

Unnamed: 0,banker_id,name,role,branch_id,experience_years
0,1,Aarav Singh,Relationship Manager,101,0
1,2,Vihaan Kapoor,Loan Officer,31,8
2,3,Aadhya Jain,Relationship Manager,125,3
3,4,Aditya Sharma,rBanch Manager,109,11
4,5,Ananya Gupta,Data Analyst,182,-1


In [5]:
# Number of missing values in each column.
df.isnull().sum()

banker_id           0
name                0
role                0
branch_id           0
experience_years    0
dtype: int64

In [6]:
# Data type of each column.
df.dtypes

banker_id            int64
name                object
role                object
branch_id            int64
experience_years     int64
dtype: object

In [7]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

In [8]:
# Basic name cleaning, one step per line:
#   1. cast to str so the .str accessor works on every value
#   2. trim leading/trailing whitespace
#   3. collapse any run of whitespace into a single space
#   4. title-case each word
# NOTE(review): astype(str) would turn a missing name into the text 'nan';
# confirm the column has no nulls before relying on this.
df['name'] = (
    df['name']
    .astype(str)
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
    .str.title()
)
    


In [9]:
# Inspect the distinct raw role values to see which typos, casing and
# whitespace variants need fixing (e.g. 'rBanch Manager').
df['role'].unique()


array(['Relationship Manager', 'Loan Officer', 'rBanch Manager',
       'Data Analyst', 'Branch Manager', 'Fraud Analyst',
       'relationship manager', 'Teller', 'Data Analyst ',
       'RELATIONSHIP MANAGER', 'teller', 'Dtaa Analyst',
       '  Data Analyst  ', 'Tleler', 'Relationship aMnager',
       ' Fraud Analyst', 'Data Analyts', 'Loa nOfficer', 'Loan fOficer',
       ' Loan Officer', 'FRAUD ANALYST', '  Fraud Analyst  ',
       '  Branch Manager  ', 'Fraud Anlayst', 'Teller ', 'Fraud Analyst ',
       ' Data Analyst', 'TELLER', 'Fraud nAalyst',
       ' Relationship Manager', 'Barnch Manager',
       '  Relationship Manager  ', ' Branch Mnaager', 'Data Aanlyst',
       'Lona Officer', 'Telelr', 'BRANCH MANAGER', ' Branch Manager',
       'Data Anlayst', 'Loan Officer ', 'Reltaionship Manager',
       'Fraud Aanlyst', 'Relationship Manager ', 'loan officer',
       'Laon Officer', 'fraud analyst', 'erlationship manager',
       'Data nAalyst', 'data analyst', 'Farud Analyst', '

In [10]:
# Canonical role spellings used as the targets for role correction.
# Order is kept as-is because it is the order candidates are offered to the matcher.
valid_roles = [
    "Branch Manager",
    "Relationship Manager",
    "Data Analyst",
    "Fraud Analyst",
    "Loan Officer",
    "Teller",
]

In [11]:
# Helper column with a normalized copy of each role (trimmed, lower-cased) so
# that casing and surrounding spaces do not affect the later fuzzy matching.
df['role_clean'] = (
    df['role']
    .astype(str)
    .str.strip()
    .str.lower()
)
    

In [12]:
# Fuzzy-match a role string to its canonical spelling
def fuzzy_fix_role(text, threshold=70):
    """Map a possibly misspelled role string to its canonical spelling.

    Parameters
    ----------
    text : str
        Role string to correct. The notebook passes values that have already
        been stripped and lower-cased.
    threshold : int, default 70
        Minimum similarity score (0-100) the best candidate must reach to be
        accepted as the correction.

    Returns
    -------
    str
        The matching entry of ``valid_roles`` when the best score is at least
        ``threshold``; otherwise ``text.title()``.
    """
    # Lower-cased spelling -> canonical spelling, in the order of valid_roles.
    canonical_by_lower = {role.lower(): role for role in valid_roles}
    # Pass a plain list: with a dict-like `choices`, extractOne returns
    # 3-tuples (match, score, key) and the unpacking below would fail.
    match, score = process.extractOne(text, list(canonical_by_lower))
    if score >= threshold:
        return canonical_by_lower[match]
    # No sufficiently close canonical role: keep the input, title-cased.
    return text.title()

In [13]:
# Fuzzy-match each distinct normalized role once, then broadcast the result to
# every row; this avoids repeating the same fuzzy search for duplicate values.
role_fixes = {raw: fuzzy_fix_role(raw) for raw in df['role_clean'].unique()}
df['role'] = df['role_clean'].map(role_fixes)

In [14]:
# Spot-check the corrected 'role' values next to the normalized 'role_clean' input.
df.head(25)

Unnamed: 0,banker_id,name,role,branch_id,experience_years,role_clean
0,1,Aarav Singh,Relationship Manager,101,0,relationship manager
1,2,Vihaan Kapoor,Loan Officer,31,8,loan officer
2,3,Aadhya Jain,Relationship Manager,125,3,relationship manager
3,4,Aditya Sharma,Branch Manager,109,11,rbanch manager
4,5,Ananya Gupta,Data Analyst,182,-1,data analyst
5,6,Rohan Singh,Data Analyst,92,6,data analyst
6,7,Kavya Das,Branch Manager,130,7,branch manager
7,8,Vivaan Naidu,Relationship Manager,26,2,relationship manager
8,9,Aadhya Gowda,Relationship Manager,172,8,relationship manager
9,10,Vihaan Chopra,Loan Officer,29,6,loan officer


In [15]:
# Check which distinct role values remain after the fuzzy correction.
df['role'].unique()


array(['Relationship Manager', 'Loan Officer', 'Branch Manager',
       'Data Analyst', 'Fraud Analyst', 'Teller'], dtype=object)

In [16]:
# Remove the helper column now that 'role' holds the corrected values.
# Reassigning (instead of inplace=True) with errors='ignore' keeps the cell
# re-runnable: a second run no longer raises KeyError for the missing column.
df = df.drop(columns=['role_clean'], errors='ignore')

In [17]:
# Re-check column dtypes after the cleaning steps.
df.dtypes

banker_id            int64
name                object
role                object
branch_id            int64
experience_years     int64
dtype: object

In [18]:
# Re-check for missing values after the cleaning steps.
df.isnull().sum()

banker_id           0
name                0
role                0
branch_id           0
experience_years    0
dtype: int64

In [19]:
# Negative experience values are replaced with 0 rather than being dropped or
# set to missing, so the column keeps a value in every row.
df['experience_years'] = df['experience_years'].mask(df['experience_years'] < 0, 0)



In [20]:
# Preview the first 30 rows of the cleaned table.
df.head(30)

Unnamed: 0,banker_id,name,role,branch_id,experience_years
0,1,Aarav Singh,Relationship Manager,101,0
1,2,Vihaan Kapoor,Loan Officer,31,8
2,3,Aadhya Jain,Relationship Manager,125,3
3,4,Aditya Sharma,Branch Manager,109,11
4,5,Ananya Gupta,Data Analyst,182,0
5,6,Rohan Singh,Data Analyst,92,6
6,7,Kavya Das,Branch Manager,130,7
7,8,Vivaan Naidu,Relationship Manager,26,2
8,9,Aadhya Gowda,Relationship Manager,172,8
9,10,Vihaan Chopra,Loan Officer,29,6


In [21]:
# Write the cleaned table to the current user's Downloads folder. The path is
# derived from the home directory instead of a hard-coded user profile, so the
# notebook also runs on other machines and operating systems.
from pathlib import Path

output_path = Path.home() / "Downloads" / "banker.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)  # create the folder if it is missing
df.to_csv(output_path, index=False)

In [22]:
# Final size of the table as (rows, columns).
df.shape

(800, 5)