In [None]:
import pandas as pd
import numpy as np

In [None]:
# Deliberately messy sample records, one tuple per employee row. The mess is
# intentional: a repeated row (ID 102), a missing name, a missing salary,
# three different date layouts, currency-formatted salaries and inconsistent
# department casing.
employee_columns = ['Employee_ID', 'Name', 'Join_Date', 'Salary', 'Department']
employee_rows = [
    (101, 'Alice', '2023-01-15', '50000', 'HR'),
    (102, 'Bob', '2023/02/20', '$60,000', 'IT'),
    (103, 'Charlie', '15-03-2023', '70000', 'Finance'),
    (104, 'David', '2023-04-10', np.nan, 'IT'),
    (102, 'Bob', '2023/02/20', '$60,000', 'IT'),
    (105, 'Eve', '2023-05-05', '85000', 'hr'),
    (106, np.nan, '2023-06-01', '90000', 'Finance'),
]

# Transpose the rows into the column-oriented mapping the DataFrame is built from.
data = {
    column: list(values)
    for column, values in zip(employee_columns, zip(*employee_rows))
}

df_raw = pd.DataFrame(data)

# Persist the raw sample so the cleaning step can read it back from disk.
df_raw.to_csv('messy_employee_data.csv', index=False)
print(" Created 'messy_employee_data.csv' successfully.\n")

 Created 'messy_employee_data.csv' successfully.



In [None]:
def clean_data_pipeline(file_path):
    """Read the CSV at ``file_path`` and report its dimensions.

    Prints a start banner followed by the ``(rows, columns)`` shape of the
    loaded frame. Nothing is returned.

    NOTE(review): a fuller function with this same name is defined in a later
    cell and replaces this one as soon as that cell runs.
    """
    print("--- Starting Data Cleaning Pipeline ---")
    loaded = pd.read_csv(file_path)
    dims = loaded.shape
    print(f"Original Data Shape: {dims}")

In [None]:
def clean_data_pipeline(file_path):
    """Load a raw employee CSV and return a cleaned DataFrame.

    Steps, in order:

    1. Drop exact duplicate rows.
    2. Drop rows whose ``Name`` is missing.
    3. Strip ``$`` and ``,`` from ``Salary``, convert it to numeric, and fill
       missing or unparseable salaries with the median of the remaining rows.
    4. Upper-case ``Department``.
    5. Parse ``Join_Date`` to datetimes one value at a time, so a column that
       mixes layouts (``2023-01-15``, ``2023/02/20``, ``15-03-2023``) is
       parsed in full instead of being coerced to ``NaT``.

    Progress messages are printed after each stage.

    Parameters
    ----------
    file_path : str or path-like
        CSV file containing at least the columns ``Name``, ``Salary``,
        ``Department`` and ``Join_Date``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. The row index of the surviving rows is kept as
        read from the file (it is not reset).

    Notes
    -----
    Dates are parsed month-first (``dayfirst=False``); a value whose first
    field cannot be a month (e.g. ``15-03-2023``) falls back to day-first.
    Ambiguous values such as ``01-02-2023`` are therefore read as 2 January.
    Values that cannot be parsed at all become ``NaT``.
    """
    print("--- Starting Data Cleaning Pipeline ---")
    df = pd.read_csv(file_path)
    print(f"Original Data Shape: {df.shape}")

    df = df.drop_duplicates()
    print("-> Duplicates removed.")

    # .copy() gives an independent frame so the column assignments below never
    # write into a view of the filtered data.
    df = df.dropna(subset=['Name']).copy()

    # astype(str) lets the .str accessor work even if the column was read as
    # numeric; a missing salary becomes the text 'nan', which to_numeric then
    # coerces back to NaN.
    df['Salary'] = df['Salary'].astype(str).str.replace(r'[$,]', '', regex=True)
    df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')

    median_salary = df['Salary'].median()
    df['Salary'] = df['Salary'].fillna(median_salary)
    print("-> Missing values handled and Salary formatted.")

    df['Department'] = df['Department'].str.upper()
    print("-> Text capitalization standardized.")

    # Parse each value independently. A single vectorised to_datetime call
    # infers one format from the first value and, with errors='coerce', turns
    # every value in a different layout into NaT.
    parsed_dates = df['Join_Date'].map(
        lambda raw: pd.to_datetime(raw, errors='coerce', dayfirst=False)
    )
    # The outer call only guarantees a datetime64 dtype (e.g. for an empty or
    # all-NaT column); the values are already Timestamps/NaT.
    df['Join_Date'] = pd.to_datetime(parsed_dates, errors='coerce')
    print("-> Dates standardized.")

    print(f"Final Data Shape: {df.shape}")
    return df


# Run the cleaning pipeline against the sample file written earlier in the notebook.
df_cleaned = clean_data_pipeline('messy_employee_data.csv')

# Heading and frame are emitted on consecutive lines.
print("\n--- Final Cleaned Data Preview ---", df_cleaned, sep="\n")

# Save the cleaned result next to the raw file, without the index column.
df_cleaned.to_csv('cleaned_employee_data.csv', index=False)
print("\n Cleaned dataset saved as 'cleaned_employee_data.csv'")

--- Starting Data Cleaning Pipeline ---
Original Data Shape: (7, 5)
-> Duplicates removed.
-> Missing values handled and Salary formatted.
-> Text capitalization standardized.
-> Dates standardized.
Final Data Shape: (5, 5)

--- Final Cleaned Data Preview ---
   Employee_ID     Name  Join_Date   Salary Department
0          101    Alice 2023-01-15  50000.0         HR
1          102      Bob        NaT  60000.0         IT
2          103  Charlie        NaT  70000.0    FINANCE
3          104    David 2023-04-10  65000.0         IT
5          105      Eve 2023-05-05  85000.0         HR

 Cleaned dataset saved as 'cleaned_employee_data.csv'
