[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wasim/Data-Science/blob/main/data-analyst-roadmap/08_data_cleaning_projects/01_cleaning_walkthrough.ipynb)

# Data Cleaning Walkthrough
Real-world data is messy. In this project, we'll clean a dataset containing missing values, duplicates, and inconsistent formatting.


In [None]:
import pandas as pd
import numpy as np


## 1. Create Messy Data


In [None]:
# Raw employee records, listed column by column. The flaws are deliberate:
# a repeated row (Alice), one missing Name and one missing Salary, an Age
# stored as text ('45'), and a Join_Date written with '/' instead of '-'.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'Alice', 'Eve', 'Frank', 'Grace', np.nan],
    Age=[25, 30, 35, 25, 40, '45', 28, 22],
    Salary=[50000, 60000, 70000, 50000, 80000, 55000, np.nan, 45000],
    Join_Date=['2020-01-01', '2019-05-15', '2021/02/10', '2020-01-01', '2018-08-20', '2022-01-01', '2020-11-11', '2023-03-30'],
)
# Department is appended as the last column; several values carry stray spaces
departments = ['HR', ' IT ', 'Finance', 'HR', 'IT', 'Finance', ' HR', 'IT']
df = pd.DataFrame(data).assign(Department=departments)
df


## 2. Initial Inspection


In [None]:
# info() prints each column's dtype and non-null count
df.info()
# describe() returns summary statistics; naming the result and ending the
# cell with it shows the table as the cell output
summary_stats = df.describe()
summary_stats


## 3. Handling Duplicates


In [None]:
# Flag every row that exactly repeats an earlier row, then count the flags
duplicate_flags = df.duplicated()
duplicate_flags.sum()


In [None]:
# Keep the first occurrence of each fully identical row and discard the repeats
deduplicated = df.drop_duplicates(keep='first')
df = deduplicated
df


## 4. Handling Missing Values


In [None]:
# Count the missing (NaN) entries in each column
missing_mask = df.isna()
missing_mask.sum()


In [None]:
# Fill missing Salary with the median of the known salaries.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on df['Salary']: that chained in-place call acts on an intermediate Series,
# which pandas 2.x flags with a FutureWarning and which leaves df unchanged
# once Copy-on-Write is enabled (the default from pandas 3.0).
median_salary = df['Salary'].median()
df['Salary'] = df['Salary'].fillna(median_salary)


In [None]:
# Drop rows with missing Name
# subset=['Name'] limits the check to that one column; inplace=True mutates
# df directly, so the result is not reassigned.
df.dropna(subset=['Name'], inplace=True)


## 5. Cleaning Text Data


In [None]:
# Trim leading and trailing whitespace from every Department value so that
# variants which differ only by padding collapse into one label
trimmed_departments = df['Department'].str.strip()
df['Department'] = trimmed_departments


In [None]:
# List the distinct Department labels that remain
df.loc[:, 'Department'].unique()


## 6. Type Conversion


In [None]:
# Convert Age to numeric. errors='coerce' turns any value that cannot be
# parsed into NaN instead of raising, so bad entries can be inspected or
# imputed afterwards (the default, errors='raise', would abort the cell).
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df['Age'].dtype


In [None]:
# Convert Join_Date to datetime.
# The column mixes 'YYYY-MM-DD' and 'YYYY/MM/DD'. pandas >= 2.0 infers a single
# format from the first value and raises ValueError on entries that do not
# match it, so unify the separator first and then parse with an explicit format.
normalized_dates = df['Join_Date'].str.replace('/', '-', regex=False)
df['Join_Date'] = pd.to_datetime(normalized_dates, format='%Y-%m-%d')
df.info()


## 7. Final Check


In [None]:
# Show the DataFrame as the cell output for a final visual check
df
