### Data Cleaning


In [1]:
# Load the sample dataset from disk and dump the raw frame for inspection
import pandas as pd
data = pd.read_csv(filepath_or_buffer='sample_data.csv')
print(data)

                                   Name   Age  Gender   Salary   Join_Date  \
0                    John Doe    \n\n\n  25.0  Female    50000  15-01-2020   
1                            Jane Smith   NaN  Female    60000  10-03-2022   
2  Alice \n\nJohnson                     32.0     NaN    75000  20-07-2021   
3             Bob    \n\n\n       Brown  45.0    Male  $80,000  02-05-2019   
4                            Ella Davis  28.0  Femule    90000  30-01-2023   
5                    John Doe    \n\n\n  25.0  Female    50000  15-01-2020   

                                         Description  
0  Data Scientist with a background in machine le...  
1  Software Engineer passionate about web develop...  
2                                                NaN  
3  Marketing Manager with experience in digital m...  
4                     Experienced Financial Analyst.  
5  Data Scientist with a background in machine le...  


#### Removing Leading and Trailing Whitespace

In [2]:
# Trim whitespace from both ends of every name (interior whitespace is kept)
data = data.assign(Name=data['Name'].str.strip())
print(data)

                        Name   Age  Gender   Salary   Join_Date  \
0                   John Doe  25.0  Female    50000  15-01-2020   
1                 Jane Smith   NaN  Female    60000  10-03-2022   
2          Alice \n\nJohnson  32.0     NaN    75000  20-07-2021   
3  Bob    \n\n\n       Brown  45.0    Male  $80,000  02-05-2019   
4                 Ella Davis  28.0  Femule    90000  30-01-2023   
5                   John Doe  25.0  Female    50000  15-01-2020   

                                         Description  
0  Data Scientist with a background in machine le...  
1  Software Engineer passionate about web develop...  
2                                                NaN  
3  Marketing Manager with experience in digital m...  
4                     Experienced Financial Analyst.  
5  Data Scientist with a background in machine le...  


#### Addressing Inconsistent Formatting:

In [3]:
# Collapse every run of whitespace inside a name (spaces, tabs, newlines)
# into a single space. Doing this in one pass keeps words separated even when
# a newline is the only thing between them, and the raw string avoids
# Python's invalid-escape warning for the "\s" sequence.
data['Name'] = data['Name'].str.replace(r"\s+", " ", regex=True)


# Convert strings to lowercase so equal values compare equal regardless of case
data['Description'] = data['Description'].str.lower()
data['Gender'] = data['Gender'].str.lower()
print(data)

            Name   Age  Gender   Salary   Join_Date  \
0       John Doe  25.0  female    50000  15-01-2020   
1     Jane Smith   NaN  female    60000  10-03-2022   
2  Alice Johnson  32.0     NaN    75000  20-07-2021   
3      Bob Brown  45.0    male  $80,000  02-05-2019   
4     Ella Davis  28.0  femule    90000  30-01-2023   
5       John Doe  25.0  female    50000  15-01-2020   

                                         Description  
0  data scientist with a background in machine le...  
1  software engineer passionate about web develop...  
2                                                NaN  
3  marketing manager with experience in digital m...  
4                     experienced financial analyst.  
5  data scientist with a background in machine le...  


In [4]:
# Python type of the Salary value stored in the row labelled 0
type(data.loc[0, 'Salary'])

str

In [5]:
# Strip the currency symbol and thousands separators, then convert to float.
# regex=False makes both patterns literal: "$" is an end-of-string anchor when
# interpreted as a regex, and the default value of `regex` differs between
# pandas versions, so relying on it is not portable.
data['Salary'] = (
    data['Salary']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)
data

Unnamed: 0,Name,Age,Gender,Salary,Join_Date,Description
0,John Doe,25.0,female,50000.0,15-01-2020,data scientist with a background in machine le...
1,Jane Smith,,female,60000.0,10-03-2022,software engineer passionate about web develop...
2,Alice Johnson,32.0,,75000.0,20-07-2021,
3,Bob Brown,45.0,male,80000.0,02-05-2019,marketing manager with experience in digital m...
4,Ella Davis,28.0,femule,90000.0,30-01-2023,experienced financial analyst.
5,John Doe,25.0,female,50000.0,15-01-2020,data scientist with a background in machine le...


In [6]:
# Re-check the type of the Salary value in the row labelled 0 after conversion
type(data.loc[0, 'Salary'])

numpy.float64

#### Remove Duplicates

In [7]:
# Keep only the first occurrence of each fully identical row
data = data[~data.duplicated()]
data

Unnamed: 0,Name,Age,Gender,Salary,Join_Date,Description
0,John Doe,25.0,female,50000.0,15-01-2020,data scientist with a background in machine le...
1,Jane Smith,,female,60000.0,10-03-2022,software engineer passionate about web develop...
2,Alice Johnson,32.0,,75000.0,20-07-2021,
3,Bob Brown,45.0,male,80000.0,02-05-2019,marketing manager with experience in digital m...
4,Ella Davis,28.0,femule,90000.0,30-01-2023,experienced financial analyst.


#### Handle Missing Values

In [8]:
# Count the missing (NaN) entries in each column
print(data.isna().sum())

Name           0
Age            1
Gender         1
Salary         0
Join_Date      0
Description    1
dtype: int64


In [9]:
# Keep only the rows in which every column holds a value
data_cleaned = data[data.notna().all(axis=1)]
data_cleaned

Unnamed: 0,Name,Age,Gender,Salary,Join_Date,Description
0,John Doe,25.0,female,50000.0,15-01-2020,data scientist with a background in machine le...
3,Bob Brown,45.0,male,80000.0,02-05-2019,marketing manager with experience in digital m...
4,Ella Davis,28.0,femule,90000.0,30-01-2023,experienced financial analyst.


In [10]:
# Substitute the constant 0 for every missing entry, whatever the column
# (text columns such as Gender and Description receive 0 as well)
data_filled = data.fillna(value=0)
data_filled

Unnamed: 0,Name,Age,Gender,Salary,Join_Date,Description
0,John Doe,25.0,female,50000.0,15-01-2020,data scientist with a background in machine le...
1,Jane Smith,0.0,female,60000.0,10-03-2022,software engineer passionate about web develop...
2,Alice Johnson,32.0,0,75000.0,20-07-2021,0
3,Bob Brown,45.0,male,80000.0,02-05-2019,marketing manager with experience in digital m...
4,Ella Davis,28.0,femule,90000.0,30-01-2023,experienced financial analyst.


In [11]:
# Interpolate missing values.
# Interpolation is only meaningful for numbers, so apply it to the numeric
# columns explicitly and carry the text columns over unchanged. Calling
# interpolate() on a frame that still contains object columns is deprecated
# in recent pandas releases.
numeric_cols = data.select_dtypes(include='number').columns
data_interpolated = data.copy()
data_interpolated[numeric_cols] = data[numeric_cols].interpolate()

data_interpolated

Unnamed: 0,Name,Age,Gender,Salary,Join_Date,Description
0,John Doe,25.0,female,50000.0,15-01-2020,data scientist with a background in machine le...
1,Jane Smith,28.5,female,60000.0,10-03-2022,software engineer passionate about web develop...
2,Alice Johnson,32.0,,75000.0,20-07-2021,
3,Bob Brown,45.0,male,80000.0,02-05-2019,marketing manager with experience in digital m...
4,Ella Davis,28.0,femule,90000.0,30-01-2023,experienced financial analyst.


In [12]:
# Replace missing values with the mean of the column.
# NOTE(review): this cell raises a TypeError (see the output below); judging
# by the trailing question it appears to be kept as a deliberate demonstration.
data_mean_filled = data.fillna(data.mean())
data_mean_filled
# Error.. Why?
# data.mean() is asked to average every column, including the text ones; the
# traceback shows it failing on the Join_Date strings. Restricting the mean to
# numeric columns (e.g. data.mean(numeric_only=True)) should avoid this --
# confirm against the installed pandas version.

TypeError: Could not convert ['15-01-202010-03-202220-07-202102-05-201930-01-2023'] to numeric

In [13]:
# Work on a copy so the de-duplicated `data` frame stays available as-is
data_imputed = data.copy()

# Impute categorical columns with the mode (most frequent value).
# The result is assigned back to the column rather than using
# `inplace=True` on a column selection: that chained in-place form emits a
# warning in pandas 2.x and silently does nothing under Copy-on-Write.
data_imputed['Gender'] = data_imputed['Gender'].fillna(data_imputed['Gender'].mode()[0])
data_imputed['Description'] = data_imputed['Description'].fillna(data_imputed['Description'].mode()[0])

# Impute numerical columns with mean
data_imputed['Age'] = data_imputed['Age'].fillna(data_imputed['Age'].mean())

data_imputed

Unnamed: 0,Name,Age,Gender,Salary,Join_Date,Description
0,John Doe,25.0,female,50000.0,15-01-2020,data scientist with a background in machine le...
1,Jane Smith,32.5,female,60000.0,10-03-2022,software engineer passionate about web develop...
2,Alice Johnson,32.0,female,75000.0,20-07-2021,data scientist with a background in machine le...
3,Bob Brown,45.0,male,80000.0,02-05-2019,marketing manager with experience in digital m...
4,Ella Davis,28.0,femule,90000.0,30-01-2023,experienced financial analyst.


In [14]:
# Continue the pipeline with the imputed frame.
# Plain assignment: `data` and `data_imputed` now name the same object (no
# copy is made), so later changes made through `data` are visible through
# `data_imputed` as well.
data = data_imputed
data

Unnamed: 0,Name,Age,Gender,Salary,Join_Date,Description
0,John Doe,25.0,female,50000.0,15-01-2020,data scientist with a background in machine le...
1,Jane Smith,32.5,female,60000.0,10-03-2022,software engineer passionate about web develop...
2,Alice Johnson,32.0,female,75000.0,20-07-2021,data scientist with a background in machine le...
3,Bob Brown,45.0,male,80000.0,02-05-2019,marketing manager with experience in digital m...
4,Ella Davis,28.0,femule,90000.0,30-01-2023,experienced financial analyst.


In [None]:
!pip install fuzzywuzzy

In [17]:
# fix typing errors
from fuzzywuzzy import fuzz


def correct_typos(word, word_list):
    """Return the entry of ``word_list`` that most closely matches ``word``.

    Each candidate is scored against ``word`` with ``fuzz.ratio`` and the
    highest-scoring candidate is returned (on a tie, the first such candidate
    in ``word_list``, which is how ``max`` resolves ties).

    Args:
        word: Value to correct. Non-string values (e.g. ``None`` or ``NaN``)
            are returned unchanged instead of being passed to the scorer.
        word_list: Iterable of accepted spellings.

    Returns:
        The best-matching candidate, or ``word`` itself when it is not a
        string or when ``word_list`` is empty.
    """
    if not isinstance(word, str):
        # Missing values cannot be matched meaningfully; leave them alone.
        return word
    return max(
        word_list,
        key=lambda candidate: fuzz.ratio(word, candidate),
        default=word,  # empty candidate list: nothing to correct to
    )

# Accepted spellings for the Gender column
list_of_possible_gender = ["male", "female"]
# Snap every Gender entry to its closest accepted spelling
data['Gender'] = data['Gender'].apply(correct_typos, args=(list_of_possible_gender,))
data

Unnamed: 0,Name,Age,Gender,Salary,Join_Date,Description
0,John Doe,25.0,female,50000.0,15-01-2020,data scientist with a background in machine le...
1,Jane Smith,32.5,female,60000.0,10-03-2022,software engineer passionate about web develop...
2,Alice Johnson,32.0,female,75000.0,20-07-2021,data scientist with a background in machine le...
3,Bob Brown,45.0,male,80000.0,02-05-2019,marketing manager with experience in digital m...
4,Ella Davis,28.0,female,90000.0,30-01-2023,experienced financial analyst.


In [15]:
# Similarity score between the misspelling "femule" and the accepted value
# "female" (the recorded output below is 83).
from fuzzywuzzy import fuzz
fuzz.ratio("femule", "female")



83

In [16]:
# Same comparison against "male": the recorded score (60) is lower than the
# one for "female", which is why correct_typos maps "femule" to "female".
fuzz.ratio("femule", "male")

60