In [2]:
# Data Cleaning Basics 🔧🧹

import pandas as pd
import numpy as np

# Small, deliberately incomplete dataset: every column has exactly one gap
# so that each cleaning step further down has something to act on.
#   - Name:  row 3 is None
#   - Age:   row 1 is np.nan
#   - Score: row 2 is None (shown as NaN once loaded into the numeric column)
data = dict(
    Name=['Alice', 'Bob', 'Charlie', None],
    Age=[25, np.nan, 30, 22],
    Score=[85, 90, None, 70],
)

# Keep the untouched frame under `df`; later cells derive new frames from it
# instead of modifying it.
df = pd.DataFrame(data)

# print() places its separator (a single space) after the "\n", so the first
# printed line of the frame starts one column to the right of the rows below.
print("Original DataFrame:\n", df)


Original DataFrame:
       Name   Age  Score
0    Alice  25.0   85.0
1      Bob   NaN   90.0
2  Charlie  30.0    NaN
3     None  22.0   70.0


In [4]:
# 🔍 Handling Missing Values

# Strategy 1 - discard: keep only the rows in which no column is missing.
# `df` itself is left untouched; the result goes into a new frame.
df_dropna = df.dropna()
print("\nAfter dropna():\n", df_dropna)

# Strategy 2 - impute: give every column its own replacement value.
#   Name  -> a fixed placeholder string
#   Age   -> mean of the ages that are present
#   Score -> median of the scores that are present
df_fillna = df.fillna(
    value=dict(
        Name='Unknown',
        Age=df['Age'].mean(),
        Score=df['Score'].median(),
    )
)
print("\nAfter fillna():\n", df_fillna)



After dropna():
     Name   Age  Score
0  Alice  25.0   85.0

After fillna():
       Name        Age  Score
0    Alice  25.000000   85.0
1      Bob  25.666667   90.0
2  Charlie  30.000000   85.0
3  Unknown  22.000000   70.0


In [6]:
# ⚠️ Outlier Detection Using IQR Method

# Only the scores that are actually present take part in the computation.
scores = df['Score'].dropna()

# First and third quartile in one call, then the interquartile range
# (the width of the middle half of the data).
Q1, Q3 = scores.quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fences sit 1.5 * IQR outside the quartiles.
upper_bound = Q3 + 1.5 * IQR
lower_bound = Q1 - 1.5 * IQR

# A score is an outlier when it is NOT inside [lower_bound, upper_bound];
# values exactly on a fence are kept as inliers.
outliers = scores[~scores.between(lower_bound, upper_bound)]
print("\nOutliers in Score column:\n", outliers)



Outliers in Score column:
 Series([], Name: Score, dtype: float64)
