### **1.	Data Cleaning**

**a.	Install Libraries**

In [None]:
# After changing kernel, install directly
%conda install numpy pandas matplotlib seaborn scikit-learn tensorflow pytorch scikit-learn jupyter -y

**b.	Import Libraries**

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

**c.	Load Dataset (Seaborn Library or Local Directory)**

In [6]:
# Load the Titanic example dataset through seaborn
# (NOTE(review): load_dataset presumably downloads the file on first use — confirm network access)
df = sns.load_dataset('titanic')

# Alternative if loading from CSV
# df = pd.read_csv('titanic.csv')

# Alternative if loading from JSON
# df = pd.read_json('titanic.json')

# Alternative if loading from Excel
# df = pd.read_excel('titanic.xlsx')

# Alternative if loading from SQL (requires an open database connection object)
# df = pd.read_sql('SELECT * FROM titanic_table', connection)

**d.	Display first 5 rows of the dataset**

In [None]:
# Preview the top of the frame; DataFrame.head() defaults to n=5 rows
df.head()

#### **1.1.	Exploring/Identifying Missing Values in the Dataset**

**a.	Method 1: Missing Values Analysis**

In [None]:
# Number of missing entries per column, most incomplete columns first.
# Unsorted variant: df.isna().sum()
df.isna().sum().sort_values(ascending=False)

**b.	Method 2: Percentage of missing values**

In [None]:
# Share of missing values per column as a percentage, rounded to two decimals
# and ordered from most to least incomplete.
# Shorter equivalents without rounding or sorting:
#   df.isnull().mean() * 100
#   df.isnull().sum() / len(df) * 100
(
    df.isna()
    .sum()
    .div(len(df))
    .mul(100)
    .round(2)
    .sort_values(ascending=False)
)

**c.	Method 3: Visualizing Missing Values**

In [None]:
from matplotlib.colors import ListedColormap

# Two-colour map over the boolean null mask: present cells (False) are drawn
# black, missing cells (True) are drawn white.
plt.figure(figsize=(10, 6))
sns.heatmap(
    data=df.isnull(),
    cmap=ListedColormap(['black', 'white']),
    cbar=False,
)
plt.title('Missing Values Heatmap')
plt.show()

**d.	Method 4: DataFrame Info Missing Values Summary**

In [None]:
# info() prints each column's dtype and non-null count; a non-null count below
# the total number of entries means that column has missing values.
df.info()

#### **1.3.	How to Identify Missing Values?**

**● Missingno Library**

In [None]:
# Install missingno if not already installed
# NOTE(review): depending on the configured channels this may need `-c conda-forge` — confirm
%conda install missingno -y

In [None]:
import missingno as msno
import matplotlib.pyplot as plt

# msno.matrix() opens its own figure when no `ax` is supplied, so the size is
# passed to it directly. A separate plt.figure(...) call beforehand would only
# leave an empty figure behind and would not resize the matrix plot.
ax = msno.matrix(df, figsize=(10, 6))

# matrix() returns the main Axes, so the title is attached to it explicitly.
ax.set_title('Missing Values Matrix')
plt.show()

#### **1.8.	Dealing/Impute Missing Values Using Pandas Library**

**● Calculate median for age column**

In [16]:
# Median age; NaN entries are skipped by default (skipna=True)
df['age'].median()

np.float64(28.0)

**● Calculate the mean of the age column (note: the median is less affected by outliers than the mean)**

In [17]:
# Mean age rounded to two decimals; NaN entries are skipped by default (skipna=True)
df['age'].mean().round(2)

np.float64(29.7)

**● Mean or Median Imputation to fill age column missing values using pandas**

In [None]:
# Fill the missing ages with a single summary statistic.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['age']: that form operates on an intermediate
# Series, raises a chained-assignment FutureWarning in pandas 2.x, and does not
# update df at all once Copy-on-Write is active (the default from pandas 3).

# df['age'] = df['age'].fillna(df['age'].mean())  # Mean

df['age'] = df['age'].fillna(df['age'].median())  # Median

# Missing Values Analysis
df.isnull().sum().sort_values(ascending=False)

**● Drop (remove) the deck column, which has a high share of missing values**

In [None]:
# Remove the 'deck' column (columns=... is the explicit form of axis=1).
# Reassigning the result together with errors='ignore' keeps the cell re-runnable:
# with the in-place form, executing the cell a second time raises KeyError
# because 'deck' is already gone.
df = df.drop(columns='deck', errors='ignore')

# Missing Values Analysis
df.isnull().sum().sort_values(ascending=False)

**● Calculate the mode of the embarked column**

In [None]:
# mode() returns a Series (several values can tie), so [0] picks the first one
df['embarked'].mode()[0]

**● Calculate the mode of the embark_town column**

In [None]:
# mode() returns a Series (several values can tie), so [0] picks the first one
df['embark_town'].mode()[0]

**● Value counts for the embarked column**

In [None]:
# Frequency of each category, most common first; NaN is excluded by default (dropna=True)
df['embarked'].value_counts() 

**● embarked and embark_town are categorical columns with few missing values, we can use Mode Imputation to fill missing values**

In [None]:
# Mode imputation for the two categorical columns.
# Each result is assigned back to its column instead of using
# fillna(..., inplace=True) on df[...]: the in-place call on a selected column is
# chained assignment, which warns in pandas 2.x and leaves df unchanged under
# Copy-on-Write (the default from pandas 3).

# Mode Imputation to fill embarked missing values using pandas
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])

# Mode Imputation to fill embark_town missing values using pandas
df['embark_town'] = df['embark_town'].fillna(df['embark_town'].mode()[0])

# Missing Values Analysis
df.isnull().sum().sort_values(ascending=False)

**● Using dropna method to remove rows with missing values in embarked column**

In [None]:
# Fresh copy of the data with every row lacking an 'embarked' value removed
dna = sns.load_dataset('titanic').dropna(subset=['embarked'])

# Missing Values Analysis
dna.isna().sum().sort_values(ascending=False)

#### **1.9.	Impute missing values in the age column using the forward-fill method (pandas)**

In [None]:
# Reload the data and replace 'age' with its forward-filled version:
# each gap takes the most recent preceding non-missing value.
ff = sns.load_dataset('titanic').assign(age=lambda frame: frame['age'].ffill())

# Missing values analysis
ff.isna().sum().sort_values(ascending=False)

#### **1.10.	Impute missing values in the age column using the backward-fill method (pandas)**

In [None]:
# Load dataset
bf = sns.load_dataset('titanic')

# Backward fill missing values in 'age': each gap takes the next non-missing
# value below it. The fill must read from bf itself — using the forward-filled
# frame `ff` here would simply copy the forward-fill result and also make this
# cell depend on the previous one having been run.
bf['age'] = bf['age'].bfill()

# Missing values analysis
bf.isnull().sum().sort_values(ascending=False)

#### **1.11.	Imputation of Missing Values Using scikit-learn Library**

In [None]:
import numpy
import scipy
import sklearn

# Report the installed versions, one per line, in the order numpy, scipy, scikit-learn
print(numpy.__version__, scipy.__version__, sklearn.__version__, sep='\n')

**● Import SimpleImputer from sklearn.impute**

In [29]:
from sklearn.impute import SimpleImputer

##### **1.11.1.	Univariate Feature Imputation (SimpleImputer)**

In [None]:
# Fresh copy of the Titanic data for this example
si = sns.load_dataset('titanic')

# Learn the median of 'age' first, then substitute it for every missing entry
imputer = SimpleImputer(strategy='median')
imputer.fit(si[['age']])
si['age'] = imputer.transform(si[['age']])

# Missing Values Analysis
si.isna().sum().sort_values(ascending=False)

##### **1.11.2.	Multivariate Feature Imputation (IterativeImputer)**

In [None]:
# Load Dataset
ii = sns.load_dataset('titanic')

# IterativeImputer is experimental; importing this flag is what enables it
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

# Multivariate imputation predicts each incomplete feature from the other
# features, so 'age' is passed together with related numeric columns. If only
# the single 'age' column is supplied there is nothing to regress on and the
# imputer falls back to its initial strategy (the column mean).
feature_cols = ['age', 'pclass', 'sibsp', 'parch', 'fare']

# random_state pins any randomised steps so the cell is reproducible
imputer = IterativeImputer(max_iter=10, random_state=0)
# imputer = IterativeImputer(max_iter=10, random_state=0, n_nearest_features=5)
imputed = imputer.fit_transform(ii[feature_cols])

# Only 'age' is written back; the other columns served as predictors
ii['age'] = imputed[:, feature_cols.index('age')]

# Missing Values Analysis
ii.isnull().sum().sort_values(ascending=False)

##### **1.11.3.	Nearest Neighbors Imputation (KNNImputer)**

In [None]:
# Load Dataset
knni = sns.load_dataset('titanic')

# Impute age column missing values using KNNImputer
from sklearn.impute import KNNImputer

# Neighbours are found from the features that are present in a row, so 'age'
# is passed together with other numeric columns. With 'age' as the only column,
# a row missing its age has no observed feature to measure distance on and the
# imputer can only fall back to the column mean.
# NOTE(review): the features are used unscaled; consider scaling them so that
# 'fare' does not dominate the distance — confirm whether that matters here.
feature_cols = ['age', 'pclass', 'sibsp', 'parch', 'fare']

imputer = KNNImputer(n_neighbors=5, weights='uniform')
imputed = imputer.fit_transform(knni[feature_cols])

# Only 'age' is written back; the other columns served to locate neighbours
knni['age'] = imputed[:, feature_cols.index('age')]

# Missing Values Analysis
knni.isnull().sum().sort_values(ascending=False)

##### **1.11.4.	Marking Imputed Values (SimpleImputer, MissingIndicator)**


In [33]:
from sklearn.impute import MissingIndicator, SimpleImputer

# Fresh copy of the Titanic data for this example
mi = sns.load_dataset('titanic')

# 'missing-only' limits the indicator to features that contain missing values when fitted
indicator = MissingIndicator(features='missing-only')

# Fit on 'age', then build the boolean mask (True where the value is missing)
indicator.fit(mi[['age']])
missing_mask = indicator.transform(mi[['age']])

In [None]:
# Keep the missing-value mask as a column before 'age' is overwritten, so the
# imputed rows remain identifiable afterwards. missing_mask is the boolean
# array produced by the MissingIndicator cell above (one column, for 'age').
mi['age_was_missing'] = missing_mask.ravel()

# Impute missing age values using median
imputer = SimpleImputer(strategy='median')
mi['age'] = imputer.fit_transform(mi[['age']])

# Check remaining missing values
mi.isnull().sum().sort_values(ascending=False)