### Data Cleaning Pipeline for Sri Lankan Employee Data

In [49]:
import pandas as pd
import numpy as np

## Load the dataset

In [50]:
# Read the raw employee CSV (path is relative to the notebook's working directory).
df = pd.read_csv('sri_lanka_employees.csv')
# Preview the first rows to sanity-check the column names and values.
df.head()

Unnamed: 0,Employee_ID,Name,Join_Date,City,Department,Salary
0,25795,Nuwan Gunawardena,4/1/2022,Jaffna,HR,73200.0
1,10860,Rashmi Karunaratne,4/1/2024,Colombo,Finance,59800.0
2,86820,Nuwan Jayasinghe,12/1/2021,Kandy,Finance,59500.0
3,64886,Ruwan Perera,12/1/2019,Colombo,Finance,77600.0
4,16265,Ruwan Rathnayake,6/1/2019,Jaffna,IT,


## Basic overview and structure

In [51]:
# info() prints dtypes and non-null counts per column; use it to spot columns with missing values.
df.info()
# describe(include='all') summarizes numeric and object columns together;
# statistics that do not apply to a column's dtype are reported as NaN.
df.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Employee_ID  100 non-null    int64  
 1   Name         100 non-null    object 
 2   Join_Date    100 non-null    object 
 3   City         100 non-null    object 
 4   Department   100 non-null    object 
 5   Salary       80 non-null     float64
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


Unnamed: 0,Employee_ID,Name,Join_Date,City,Department,Salary
count,100.0,100,100,100,100,80.0
unique,,69,68,4,5,
top,,Ishara Rathnayake,4/1/2022,Jaffna,HR,
freq,,3,4,29,25,
mean,58544.92,,,,,96157.5
std,27846.019844,,,,,31583.971961
min,10769.0,,,,,50400.0
25%,33793.5,,,,,69875.0
50%,63351.0,,,,,88750.0
75%,79912.0,,,,,122975.0


In [52]:
# Column dtypes only; Join_Date is still an object (string) column before the date-parsing step.
df.dtypes

Employee_ID      int64
Name            object
Join_Date       object
City            object
Department      object
Salary         float64
dtype: object

In [53]:
# Frequency of each name. A repeated name is not necessarily a duplicate row:
# the other columns (e.g. Employee_ID) may still differ.
df["Name"].value_counts()

Name
Ishara Rathnayake     3
Nuwan Jayasinghe      3
Malsha Bandara        3
Rashmi Karunaratne    3
Janith Gunawardena    3
                     ..
Ruwan Silva           1
Suresh Karunaratne    1
Suresh De Alwis       1
Tharushi Fernando     1
Kasun De Alwis        1
Name: count, Length: 69, dtype: int64

## Handling duplicates

In [54]:
# Remove rows that are identical across every column (no subset= is given,
# so rows that merely share a Name are kept).
df = df.drop_duplicates()
# (rows, columns) after de-duplication.
df.shape

(100, 6)

## Handling missing values

In [55]:
# Impute missing Salary values with the median of the observed salaries.
salary_median = df['Salary'].median()
df['Salary'] = df['Salary'].fillna(salary_median)
df

Unnamed: 0,Employee_ID,Name,Join_Date,City,Department,Salary
0,25795,Nuwan Gunawardena,4/1/2022,Jaffna,HR,73200.0
1,10860,Rashmi Karunaratne,4/1/2024,Colombo,Finance,59800.0
2,86820,Nuwan Jayasinghe,12/1/2021,Kandy,Finance,59500.0
3,64886,Ruwan Perera,12/1/2019,Colombo,Finance,77600.0
4,16265,Ruwan Rathnayake,6/1/2019,Jaffna,IT,88750.0
...,...,...,...,...,...,...
95,77563,Thilini De Alwis,2/1/2017,Kandy,HR,71400.0
96,12695,Kasun De Alwis,9/1/2022,Galle,Marketing,73900.0
97,58190,Suresh Fernando,5/1/2019,Galle,Operations,79700.0
98,15258,Rashmi Rathnayake,1/1/2021,Kandy,HR,88750.0


## String cleaning and standardization

In [56]:
# Trim surrounding whitespace and title-case the names, then write the result
# back to the frame. Without the assignment the cleaned Series is only displayed
# and df['Name'] itself stays unchanged.
df["Name"] = df["Name"].str.strip().str.title()
# Show the cleaned column.
df["Name"]

0      Nuwan Gunawardena
1     Rashmi Karunaratne
2       Nuwan Jayasinghe
3           Ruwan Perera
4       Ruwan Rathnayake
             ...        
95      Thilini De Alwis
96        Kasun De Alwis
97       Suresh Fernando
98     Rashmi Rathnayake
99        Malsha Bandara
Name: Name, Length: 100, dtype: object

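Department labels may appear in different cases, for example:

- `FINANCE`
- `finance`
- `Finance`

Converting every label to upper case collapses these variants into a single category: `FINANCE`.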

In [57]:
# Upper-case department labels so case variants collapse into one category.
df['Department'] = df['Department'].str.upper()

In [58]:
# Inspect the frame after the Department labels were upper-cased.
df

Unnamed: 0,Employee_ID,Name,Join_Date,City,Department,Salary
0,25795,Nuwan Gunawardena,4/1/2022,Jaffna,HR,73200.0
1,10860,Rashmi Karunaratne,4/1/2024,Colombo,FINANCE,59800.0
2,86820,Nuwan Jayasinghe,12/1/2021,Kandy,FINANCE,59500.0
3,64886,Ruwan Perera,12/1/2019,Colombo,FINANCE,77600.0
4,16265,Ruwan Rathnayake,6/1/2019,Jaffna,IT,88750.0
...,...,...,...,...,...,...
95,77563,Thilini De Alwis,2/1/2017,Kandy,HR,71400.0
96,12695,Kasun De Alwis,9/1/2022,Galle,MARKETING,73900.0
97,58190,Suresh Fernando,5/1/2019,Galle,OPERATIONS,79700.0
98,15258,Rashmi Rathnayake,1/1/2021,Kandy,HR,88750.0


## Date/time handling

In [59]:
# Convert Join_Date from strings to datetime64. errors='coerce' turns values
# that cannot be parsed into NaT instead of raising.
# NOTE(review): no explicit format= is passed; the sample rows look month-first
# (M/D/YYYY) — confirm and consider pinning the format to avoid ambiguity.
df['Join_Date'] = pd.to_datetime(df['Join_Date'], errors='coerce')
df

Unnamed: 0,Employee_ID,Name,Join_Date,City,Department,Salary
0,25795,Nuwan Gunawardena,2022-04-01,Jaffna,HR,73200.0
1,10860,Rashmi Karunaratne,2024-04-01,Colombo,FINANCE,59800.0
2,86820,Nuwan Jayasinghe,2021-12-01,Kandy,FINANCE,59500.0
3,64886,Ruwan Perera,2019-12-01,Colombo,FINANCE,77600.0
4,16265,Ruwan Rathnayake,2019-06-01,Jaffna,IT,88750.0
...,...,...,...,...,...,...
95,77563,Thilini De Alwis,2017-02-01,Kandy,HR,71400.0
96,12695,Kasun De Alwis,2022-09-01,Galle,MARKETING,73900.0
97,58190,Suresh Fernando,2019-05-01,Galle,OPERATIONS,79700.0
98,15258,Rashmi Rathnayake,2021-01-01,Kandy,HR,88750.0


In [60]:
# Derive the joining year from Join_Date; the .dt accessor requires the column
# to already be a datetime dtype.
df['Year_Joined'] = df['Join_Date'].dt.year
df

Unnamed: 0,Employee_ID,Name,Join_Date,City,Department,Salary,Year_Joined
0,25795,Nuwan Gunawardena,2022-04-01,Jaffna,HR,73200.0,2022
1,10860,Rashmi Karunaratne,2024-04-01,Colombo,FINANCE,59800.0,2024
2,86820,Nuwan Jayasinghe,2021-12-01,Kandy,FINANCE,59500.0,2021
3,64886,Ruwan Perera,2019-12-01,Colombo,FINANCE,77600.0,2019
4,16265,Ruwan Rathnayake,2019-06-01,Jaffna,IT,88750.0,2019
...,...,...,...,...,...,...,...
95,77563,Thilini De Alwis,2017-02-01,Kandy,HR,71400.0,2017
96,12695,Kasun De Alwis,2022-09-01,Galle,MARKETING,73900.0,2022
97,58190,Suresh Fernando,2019-05-01,Galle,OPERATIONS,79700.0,2019
98,15258,Rashmi Rathnayake,2021-01-01,Kandy,HR,88750.0,2021


## Data transformation examples

In [61]:
# Average salary per department, one row per Department label.
pivot_df = df.pivot_table(
    values='Salary',
    index='Department',
    aggfunc='mean',
)
pivot_df.head()

Unnamed: 0_level_0,Salary
Department,Unnamed: 1_level_1
FINANCE,97077.777778
HR,96058.0
IT,75800.0
MARKETING,96768.75
OPERATIONS,97317.391304


## Filtering and selection examples

In [62]:
# Build one boolean mask per condition, then use the masks to select rows.
is_high_salary = df['Salary'] > 90000
joined_after_2020 = df['Join_Date'] > '2020-01-01'
in_finance = df['Department'] == 'FINANCE'

high_salary = df[is_high_salary]
recent_joiners = df[joined_after_2020]
# .loc takes the row mask and the column list in a single indexing step.
subset = df.loc[in_finance, ['Name', 'Salary', 'Join_Date']]

subset.head()

Unnamed: 0,Name,Salary,Join_Date
1,Rashmi Karunaratne,59800.0,2024-04-01
2,Nuwan Jayasinghe,59500.0,2021-12-01
3,Ruwan Perera,77600.0,2019-12-01
10,Dinusha Karunaratne,143600.0,2016-12-01
14,Dinusha Karunaratne,88750.0,2021-11-01


The same selections using `DataFrame.query`

In [63]:
# Columns kept for the Finance-only view.
finance_columns = ['Name', 'Salary', 'Join_Date']

# Employees earning more than 90,000.
high_salary = df.query('Salary > 90000')
# Employees who joined after 1 January 2020.
recent_joiners = df.query("Join_Date > '2020-01-01'")
# Finance employees, restricted to the selected columns.
finance_dept = df.query("Department == 'FINANCE'")[finance_columns]

# Report how many rows each selection returned.
print(f"High salary employees: {len(high_salary)}")
print(f"Recent joiners: {len(recent_joiners)}")
print(f"Finance department subset: {len(finance_dept)}")

High salary employees: 39
Recent joiners: 59
Finance department subset: 18


### Exporting Final Cleaned Dataset

In [64]:
# Final look at the stepwise-cleaned frame before it is exported.
df

Unnamed: 0,Employee_ID,Name,Join_Date,City,Department,Salary,Year_Joined
0,25795,Nuwan Gunawardena,2022-04-01,Jaffna,HR,73200.0,2022
1,10860,Rashmi Karunaratne,2024-04-01,Colombo,FINANCE,59800.0,2024
2,86820,Nuwan Jayasinghe,2021-12-01,Kandy,FINANCE,59500.0,2021
3,64886,Ruwan Perera,2019-12-01,Colombo,FINANCE,77600.0,2019
4,16265,Ruwan Rathnayake,2019-06-01,Jaffna,IT,88750.0,2019
...,...,...,...,...,...,...,...
95,77563,Thilini De Alwis,2017-02-01,Kandy,HR,71400.0,2017
96,12695,Kasun De Alwis,2022-09-01,Galle,MARKETING,73900.0,2022
97,58190,Suresh Fernando,2019-05-01,Galle,OPERATIONS,79700.0,2019
98,15258,Rashmi Rathnayake,2021-01-01,Kandy,HR,88750.0,2021


In [65]:
# Export the stepwise-cleaned frame. index=False keeps the RangeIndex out of the
# file, so reloading it does not produce an extra unnamed column and the layout
# matches the method-chaining export.
df.to_csv("employees_data_cleaned_stepwise.csv", index=False)

### Single pipeline for all cleaning tasks (Method chaining)

In [None]:
import pandas as pd

# Build the cleaned frame in one method chain and keep a reference to it.
# DataFrame.to_csv returns None when given a path, so the export must not be
# the last call of the expression assigned to df_new.
df_new = (
    pd.read_csv('sri_lanka_employees.csv')
    .drop_duplicates()
    .assign(
        # Impute missing salaries with the column median.
        Salary=lambda x: x['Salary'].fillna(x['Salary'].median()),
        # Trim whitespace and title-case names.
        Name=lambda x: x['Name'].str.strip().str.title(),
        # Upper-case department labels so case variants collapse.
        Department=lambda x: x['Department'].str.upper(),
        # Parse join dates; unparseable values become NaT.
        Join_Date=lambda x: pd.to_datetime(x['Join_Date'], errors='coerce'),
        # assign() evaluates keyword arguments in order, so Join_Date is
        # already a datetime column here and does not need to be parsed again.
        Year_Joined=lambda x: x['Join_Date'].dt.year,
    )
)

# Write the cleaned frame without the index column.
df_new.to_csv('sri_lanka_employees_cleaned_chain.csv', index=False)

Compare the two data frames to check that both methods produce the same result.

In [91]:
# Frame produced by the step-by-step cleaning cells.
df

Unnamed: 0,Employee_ID,Name,Join_Date,City,Department,Salary,Year_Joined
0,25795,Nuwan Gunawardena,2022-04-01,Jaffna,HR,73200.0,2022
1,10860,Rashmi Karunaratne,2024-04-01,Colombo,FINANCE,59800.0,2024
2,86820,Nuwan Jayasinghe,2021-12-01,Kandy,FINANCE,59500.0,2021
3,64886,Ruwan Perera,2019-12-01,Colombo,FINANCE,77600.0,2019
4,16265,Ruwan Rathnayake,2019-06-01,Jaffna,IT,88750.0,2019
...,...,...,...,...,...,...,...
95,77563,Thilini De Alwis,2017-02-01,Kandy,HR,71400.0,2017
96,12695,Kasun De Alwis,2022-09-01,Galle,MARKETING,73900.0,2022
97,58190,Suresh Fernando,2019-05-01,Galle,OPERATIONS,79700.0,2019
98,15258,Rashmi Rathnayake,2021-01-01,Kandy,HR,88750.0,2021


In [92]:
# Reload the method-chaining export. read_csv does not parse dates by default,
# so Join_Date is parsed explicitly to get the same datetime dtype as the
# in-memory frame it is compared against.
df_new = pd.read_csv("sri_lanka_employees_cleaned_chain.csv", parse_dates=["Join_Date"])

In [93]:
# Stepwise result, displayed again for side-by-side reading with df_new.
df

Unnamed: 0,Employee_ID,Name,Join_Date,City,Department,Salary,Year_Joined
0,25795,Nuwan Gunawardena,2022-04-01,Jaffna,HR,73200.0,2022
1,10860,Rashmi Karunaratne,2024-04-01,Colombo,FINANCE,59800.0,2024
2,86820,Nuwan Jayasinghe,2021-12-01,Kandy,FINANCE,59500.0,2021
3,64886,Ruwan Perera,2019-12-01,Colombo,FINANCE,77600.0,2019
4,16265,Ruwan Rathnayake,2019-06-01,Jaffna,IT,88750.0,2019
...,...,...,...,...,...,...,...
95,77563,Thilini De Alwis,2017-02-01,Kandy,HR,71400.0,2017
96,12695,Kasun De Alwis,2022-09-01,Galle,MARKETING,73900.0,2022
97,58190,Suresh Fernando,2019-05-01,Galle,OPERATIONS,79700.0,2019
98,15258,Rashmi Rathnayake,2021-01-01,Kandy,HR,88750.0,2021


In [86]:
# Frame reloaded from the method-chaining CSV export.
df_new

Unnamed: 0,Employee_ID,Name,Join_Date,City,Department,Salary,Year_Joined
0,25795,Nuwan Gunawardena,2022-04-01,Jaffna,HR,73200.0,2022
1,10860,Rashmi Karunaratne,2024-04-01,Colombo,FINANCE,59800.0,2024
2,86820,Nuwan Jayasinghe,2021-12-01,Kandy,FINANCE,59500.0,2021
3,64886,Ruwan Perera,2019-12-01,Colombo,FINANCE,77600.0,2019
4,16265,Ruwan Rathnayake,2019-06-01,Jaffna,IT,88750.0,2019
...,...,...,...,...,...,...,...
95,77563,Thilini De Alwis,2017-02-01,Kandy,HR,71400.0,2017
96,12695,Kasun De Alwis,2022-09-01,Galle,MARKETING,73900.0,2022
97,58190,Suresh Fernando,2019-05-01,Galle,OPERATIONS,79700.0,2019
98,15258,Rashmi Rathnayake,2021-01-01,Kandy,HR,88750.0,2021


In [94]:
# Element-wise equality; both frames must have identical row and column labels.
# Caveat: NaN never equals NaN, so cells missing in both frames would show False.
df == df_new

Unnamed: 0,Employee_ID,Name,Join_Date,City,Department,Salary,Year_Joined
0,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...
95,True,True,True,True,True,True,True
96,True,True,True,True,True,True,True
97,True,True,True,True,True,True,True
98,True,True,True,True,True,True,True


In [88]:
# Boolean frame: True where the two results agree cell by cell.
df_comparison = df == df_new
# Summing booleans counts the matching rows per column; a full match equals the row count.
df_comparison.sum()

Employee_ID    100
Name           100
Join_Date      100
City           100
Department     100
Salary         100
Year_Joined    100
dtype: int64