<a href="https://colab.research.google.com/github/usmanaly03/Syntecxhub_Data_Cleaning_Utility/blob/main/Syntecxhub_Data_Cleaning_Utility.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

def clean_dataframe(
    frame: pd.DataFrame, date_column: str = "date"
) -> tuple[pd.DataFrame, list[str]]:
    """Return a cleaned copy of *frame* together with a log of the steps taken.

    Steps, in execution order:

    1. Column names are lower-cased and spaces replaced with underscores.
    2. ``date_column`` is parsed with ``pd.to_datetime(errors="coerce")``, so
       unparseable entries become ``NaT``.
    3. Missing values in float64/int64 columns are filled with the column
       mean; rows that still contain a missing value are dropped.
    4. Exact duplicate rows are dropped (first occurrence kept).

    Args:
        frame: Input data. It is not modified; all work happens on a copy.
        date_column: Name of the date column *after* name standardization.

    Returns:
        A ``(cleaned_frame, log_messages)`` tuple.

    Raises:
        KeyError: If ``date_column`` is not present after standardization.
    """
    cleaned = frame.copy()

    # Standardize names first so the date column can be addressed by its
    # normalized name in the following steps.
    cleaned.columns = cleaned.columns.str.lower().str.replace(" ", "_")

    # Parse dates BEFORE handling missing values: errors="coerce" turns bad
    # strings into NaT, and doing this after dropna() would let those NaT rows
    # slip into the cleaned output uncounted. Parsing first also lets duplicate
    # detection compare real dates rather than raw strings.
    cleaned[date_column] = pd.to_datetime(cleaned[date_column], errors="coerce")

    missing_before = int(cleaned.isnull().sum().sum())
    for col in cleaned.select_dtypes(include=["float64", "int64"]).columns:
        cleaned[col] = cleaned[col].fillna(cleaned[col].mean())
    # Anything still missing (non-numeric columns, NaT dates) removes the row.
    cleaned = cleaned.dropna()
    missing_after = int(cleaned.isnull().sum().sum())

    duplicates_removed = int(cleaned.duplicated().sum())
    cleaned = cleaned.drop_duplicates()

    # Messages keep the wording and order of the utility's established report.
    messages = [
        f"Missing values handled: {missing_before - missing_after}",
        f"Duplicates removed: {duplicates_removed}",
        "Column names standardized",
        "Date column converted",
    ]
    return cleaned, messages


print("Data Cleaning Utility Project")

# creating sample dataset
data = {
    "Name": ["Ali", "Sara", "Ali", "Ahmed", "Fatima"],
    "Age": [22, 25, 22, None, 28],
    "Salary": [50000, 60000, 50000, 55000, None],
    "Date": ["2023-01-01", "2023-02-15", "2023-01-01", "2023-03-10", None],
}

df = pd.DataFrame(data)

print("\nOriginal Data:")
print(df)

# run the cleaning pipeline; `df` becomes the cleaned frame, `log` the report
df, log = clean_dataframe(df)

# saving cleaned file
df.to_csv("cleaned_data.csv", index=False)

print("\nCleaned Data:")
print(df)

print("\nCleaning Log:")
for item in log:
    print("-", item)


Data Cleaning Utility Project

Original Data:
     Name   Age   Salary        Date
0     Ali  22.0  50000.0  2023-01-01
1    Sara  25.0  60000.0  2023-02-15
2     Ali  22.0  50000.0  2023-01-01
3   Ahmed   NaN  55000.0  2023-03-10
4  Fatima  28.0      NaN        None

Cleaned Data:
    name    age   salary       date
0    Ali  22.00  50000.0 2023-01-01
1   Sara  25.00  60000.0 2023-02-15
3  Ahmed  24.25  55000.0 2023-03-10

Cleaning Log:
- Missing values handled: 3
- Duplicates removed: 1
- Column names standardized
- Date column converted
