# Data Cleaning & Preprocessing

This notebook demonstrates a typical data cleaning pipeline: importing and inspecting the data, handling missing values, encoding categorical variables, scaling numerical features, detecting and removing outliers, and saving the cleaned dataset.

**Tools:** pandas, numpy, matplotlib, seaborn, scikit-learn



In [None]:
# Imports and load data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Read the dataset from the CSV file into a DataFrame, then preview its first rows.
df = pd.read_csv(filepath_or_buffer='Titanic Dataset.csv')
df.head(n=5)

In [1]:
# Basic exploration: dimensions, per-column summary, and missing-value counts.
print('Shape:', df.shape)
print('\nInfo:')
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df.info()
print('\nMissing values per column:')
print(df.isnull().sum())
# Last expression of the cell, so it is rendered as the cell's output.
# include='all' summarizes the non-numeric columns as well as the numeric ones.
df.describe(include='all')

NameError: name 'df' is not defined

In [None]:
# Visualize distributions and missingness
# pyplot and seaborn are already imported as `plt` / `sns` in the first cell,
# so they are not imported again here.
# NOTE(review): assumes the loaded CSV has 'Gender' and 'Age' columns - confirm.

# Number of rows per Gender category.
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(data=df, x='Gender', ax=ax)
ax.set_title('Gender counts')
plt.show()

# Histogram of Age with a kernel-density curve overlaid (kde=True).
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(df['Age'], kde=True, ax=ax)
ax.set_title('Age distribution')
plt.show()

# Missing values heatmap: df.isnull() is a boolean frame of the same shape as
# df, so each heatmap cell shows whether that value is missing. Row labels are
# switched off (yticklabels=False), as is the colour bar.
fig, ax = plt.subplots(figsize=(6, 3))
sns.heatmap(df.isnull(), cbar=False, yticklabels=False, cmap='viridis', ax=ax)
ax.set_title('Missing values heatmap')
plt.show()

In [None]:
# Handling missing values
# Strategy used here (demonstration):
# - Numerical columns: median imputation
# - Categorical columns: most frequent (mode)
# Split the column names by dtype so that each group can get its own imputer.
num_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
cat_cols = list(df.select_dtypes(include=['object', 'category']).columns)
# Display both lists as the cell output.
(num_cols, cat_cols)

In [None]:
# Impute numerical with median, categorical with mode
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Each group is imputed only when it actually contains columns: fitting an
# imputer on a frame with zero columns is rejected by scikit-learn's input
# validation, which would abort the cell for an all-numeric or
# all-categorical dataset.
# The imputed values are written back into df, overwriting the original columns.
# NOTE(review): the imputer returns a plain array, so integer columns may come
# back as floats - confirm if the original dtypes matter downstream.
if num_cols:
    df[num_cols] = num_imputer.fit_transform(df[num_cols])
if cat_cols:
    df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

print('Missing values after imputation:')
print(df.isnull().sum())

In [None]:
# Encoding categorical variables
# Example: Label encoding for Gender (binary). For multi-class, use OneHotEncoder/pd.get_dummies.
# Fit the encoder on the column first, then apply the fitted encoder to that
# same column; the integer codes are stored in a new column next to the original.
le = LabelEncoder().fit(df['Gender'])
df['Gender_encoded'] = le.transform(df['Gender'])
df.head(n=5)

In [None]:
# Scaling numerical features
# The scaled values go into a copy (df_scaled); df itself is not modified here.
# NOTE(review): assumes 'Age', 'Salary' and 'Experience' exist in the loaded
# CSV - confirm against the file.
scaled_cols = ['Age', 'Salary', 'Experience']
scaler = StandardScaler().fit(df[scaled_cols])
df_scaled = df.copy()
df_scaled[scaled_cols] = scaler.transform(df[scaled_cols])
df_scaled.head(n=5)

In [None]:
# Visualize outliers with boxplots before removal
# pyplot is already imported as `plt` in the first cell, so it is not imported
# again here.
# One panel per entry in scaled_cols, so the grid follows the list instead of
# being fixed at three panels. Values are taken from df (unscaled).
# squeeze=False keeps `axes` two-dimensional even for a single column, so
# axes[0] is always the row of panels.
fig, axes = plt.subplots(1, len(scaled_cols), figsize=(12, 4), squeeze=False)
for ax, col in zip(axes[0], scaled_cols):
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
fig.tight_layout()
plt.show()

In [None]:
# Remove outliers using IQR method on original (non-scaled) data
# Start from an independent deep copy of df.
clean_df = df.copy(deep=True)
def remove_outliers_iqr(data, col, factor=1.5):
    """Return the rows of ``data`` whose ``col`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``data[col]`` and ``IQR = Q3 - Q1``.
    Both fences are inclusive.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified.
    col : str
        Name of the numeric column used to decide which rows to keep.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences. The default
        reproduces the usual 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` within the fences, with their original index labels.
        Rows where ``col`` is NaN are not within any range and are dropped too.
    """
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr
    # between() is inclusive on both ends by default, matching >= lower and <= upper.
    return data[data[col].between(lower, upper)]

# Apply the IQR filter one column at a time. The filters are cumulative: each
# column's quartiles are computed on the rows that survived the previous columns.
for col in ('Age', 'Salary', 'Experience'):
    clean_df = remove_outliers_iqr(data=clean_df, col=col)

print('Shape after outlier removal:', clean_df.shape)
clean_df.head(n=5)

In [None]:
# Save cleaned dataset
# index=False leaves the DataFrame's row index out of the written CSV.
cleaned_path = 'cleaned_data.csv'
clean_df.to_csv(path_or_buf=cleaned_path, index=False)
print(f'Cleaned dataset saved to {cleaned_path}')

In [None]:
# Summary of steps
# The "original" shape is taken by re-reading the same file that was loaded
# into df at the top of the notebook ('Titanic Dataset.csv'), so that all three
# shapes describe the same data. Reading a different file here would make the
# comparison meaningless.
# NOTE(review): keep this path in sync with the pd.read_csv call in the load cell.
print('Original shape:', pd.read_csv('Titanic Dataset.csv').shape)
print('After imputation:', df.shape)
print('After outlier removal:', clean_df.shape)