In [1]:
# =====================================================================================
# üìò DAY 3 ‚Äì DATA CLEANING BASICS
# Author: Tanuja Mannem
# Description:
#     Learn essential Pandas data cleaning operations ‚Äî checking for missing values,
#     handling NaN with dropna() and fillna(), replacing values, removing duplicates,
#     and converting data types. Includes theory, examples, practice tasks, and Q&A.
# =====================================================================================

import pandas as pd
import numpy as np

# =====================================================================================
# 1Ô∏è‚É£ INTRODUCTION
# =====================================================================================
# Data cleaning is the process of detecting and correcting (or removing) corrupt or
# inaccurate records from a dataset.
#
# Importance:
# - Ensures data accuracy and consistency.
# - Makes analysis reliable and meaningful.
# - Prevents errors during transformations or aggregations.
#
# Common data cleaning operations:
#   1. Handling Missing Values (NaN)
#   2. Replacing Invalid Data
#   3. Removing Duplicates
#   4. Converting Data Types
# =====================================================================================


# =====================================================================================
# 2Ô∏è‚É£ SAMPLE DATASET
# =====================================================================================
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Esha', 'Frank', 'George'],
    'Age': [25, np.nan, 30, 35, 40, np.nan, 28],
    'Department': ['HR', 'IT', np.nan, 'Finance', 'IT', 'HR', 'Finance'],
    'Salary': [50000, 60000, np.nan, 80000, 75000, 62000, 80000],
    'City': ['Hyderabad', 'Chennai', 'Delhi', np.nan, 'Pune', 'Hyderabad', 'Delhi']
}
df = pd.DataFrame(data)
print("\n--- Original DataFrame ---")
print(df)


# =====================================================================================
# 3Ô∏è‚É£ CHECKING FOR MISSING DATA
# =====================================================================================
# Theory:
#   - Missing data is usually represented as NaN (Not a Number).
#   - Use df.isnull() or df.isna() ‚Üí returns True for NaN values.
#   - df.notnull() or df.notna() ‚Üí returns True for non-missing values.
#   - df.isnull().sum() ‚Üí count of missing values per column.
# =====================================================================================

print("\n--- Checking Missing Values ---")
print(df.isnull().sum())

print("\nAre there any missing values? ‚Üí", df.isnull().values.any())


# =====================================================================================
# 4Ô∏è‚É£ DROPPING MISSING DATA (dropna)
# =====================================================================================
# Theory:
#   - df.dropna() removes rows (or columns) with NaN values.
#   - axis=0 ‚Üí drop rows (default)
#   - axis=1 ‚Üí drop columns
#   - how='any' ‚Üí drop if any NaN exists
#   - how='all' ‚Üí drop if all values are NaN
#   - subset=['col1','col2'] ‚Üí check only specific columns
# =====================================================================================

print("\n--- Drop Rows with Any NaN ---")
print(df.dropna())

print("\n--- Drop Columns with Any NaN ---")
print(df.dropna(axis=1))

print("\n--- Drop Rows where Salary is NaN ---")
print(df.dropna(subset=['Salary']))


# =====================================================================================
# 5Ô∏è‚É£ FILLING MISSING DATA (fillna)
# =====================================================================================
# Theory:
#   - Replace NaN with a specific value.
#   - Common options:
#       ‚Üí fillna(value)
#       ‚Üí fillna(method='ffill')  ‚Üí forward fill (use previous value)
#       ‚Üí fillna(method='bfill')  ‚Üí backward fill (use next value)
#   - inplace=True modifies the DataFrame directly.
# =====================================================================================

print("\n--- Fill NaN with Specific Values ---")
df_fill = df.fillna({
    'Age': df['Age'].mean(),
    'Department': 'Unknown',
    'Salary': df['Salary'].median(),
    'City': 'Not Provided'
})
print(df_fill)

print("\n--- Forward Fill Example ---")
print(df.fillna(method='ffill'))

print("\n--- Backward Fill Example ---")
print(df.fillna(method='bfill'))


# =====================================================================================
# 6Ô∏è‚É£ REPLACING VALUES (replace)
# =====================================================================================
# Theory:
#   - Replace specific values using df.replace().
#   - Can replace single or multiple values.
#   - Commonly used to correct typos or convert outliers.
# =====================================================================================

print("\n--- Replace Example ---")
df_replaced = df_fill.replace({'Department': {'Unknown': 'General'}})
print(df_replaced)


# =====================================================================================
# 7Ô∏è‚É£ HANDLING DUPLICATES (drop_duplicates)
# =====================================================================================
# Theory:
#   - df.duplicated() ‚Üí returns True for duplicate rows.
#   - df.drop_duplicates() removes duplicates.
#   - subset=['col1',...] checks duplicates based on specific columns.
#   - keep='first' (default), 'last', or False (drop all duplicates).
# =====================================================================================

# Add a duplicate row for demonstration
df_dup = pd.concat([df_replaced, df_replaced.iloc[[1]]], ignore_index=True)
print("\n--- With Duplicate Row ---")
print(df_dup)

print("\nCheck Duplicates:\n", df_dup.duplicated())

print("\nDrop Duplicates:\n", df_dup.drop_duplicates())


# =====================================================================================
# 8Ô∏è‚É£ DATA TYPE CONVERSION (astype)
# =====================================================================================
# Theory:
#   - Convert data types for columns using .astype().
#   - Example: df['col'] = df['col'].astype('int')
#   - Common conversions:
#       ‚Üí str, int, float, bool, category, datetime
# =====================================================================================

print("\n--- Data Type Conversion ---")
df_converted = df_fill.copy()
df_converted['Age'] = df_converted['Age'].astype(int)
print(df_converted.dtypes)


# =====================================================================================
# 9Ô∏è‚É£ PRACTICE TASKS (with answers)
# =====================================================================================

print("\n================= PRACTICE TASKS & ANSWERS =================")

# Task 1: Find total missing values in each column
print("\nTask 1 ‚Äî Count missing values per column:")
print(df.isnull().sum())

# Task 2: Fill missing 'Department' with 'Others'
df_task = df.copy()
df_task['Department'].fillna('Others', inplace=True)
print("\nTask 2 ‚Äî Filled Department Missing Values:")
print(df_task)

# Task 3: Replace all 'IT' departments with 'Tech'
df_task['Department'].replace('IT', 'Tech', inplace=True)
print("\nTask 3 ‚Äî Replace IT ‚Üí Tech:")
print(df_task)

# Task 4: Drop all rows where Salary is missing
print("\nTask 4 ‚Äî Drop rows with missing Salary:")
print(df.dropna(subset=['Salary']))

# Task 5: Convert 'Age' column to integer after filling missing values
df_task['Age'].fillna(df_task['Age'].mean(), inplace=True)
df_task['Age'] = df_task['Age'].astype(int)
print("\nTask 5 ‚Äî Age converted to int after filling NaN:")
print(df_task)

# Task 6: Check and remove duplicate rows
df_task_dup = pd.concat([df_task, df_task.iloc[[0]]], ignore_index=True)
print("\nTask 6 ‚Äî Before Removing Duplicates:")
print(df_task_dup)
print("\nAfter Removing Duplicates:")
print(df_task_dup.drop_duplicates())


# =====================================================================================
# üîü INTERVIEW QUESTIONS (with short answers)
# =====================================================================================
# Q1. How do you detect missing values in Pandas?
#     ‚Üí df.isnull() or df.isna()
#
# Q2. How do you count missing values per column?
#     ‚Üí df.isnull().sum()
#
# Q3. Difference between dropna() and fillna()?
#     ‚Üí dropna() removes missing rows/columns, fillna() replaces them.
#
# Q4. How to fill missing numeric data with mean?
#     ‚Üí df['col'].fillna(df['col'].mean(), inplace=True)
#
# Q5. How to replace all occurrences of a value?
#     ‚Üí df.replace(old_value, new_value, inplace=True)
#
# Q6. How to find and remove duplicates?
#     ‚Üí df.duplicated() / df.drop_duplicates()
#
# Q7. What is the use of astype() in Pandas?
#     ‚Üí Converts a column‚Äôs data type.
#
# Q8. How to forward fill missing values?
#     ‚Üí df.fillna(method='ffill')
#
# Q9. How to backward fill missing values?
#     ‚Üí df.fillna(method='bfill')
#
# Q10. How to check if entire DataFrame has any NaN?
#     ‚Üí df.isnull().values.any()



--- Original DataFrame ---
      Name   Age Department   Salary       City
0    Alice  25.0         HR  50000.0  Hyderabad
1      Bob   NaN         IT  60000.0    Chennai
2  Charlie  30.0        NaN      NaN      Delhi
3    David  35.0    Finance  80000.0        NaN
4     Esha  40.0         IT  75000.0       Pune
5    Frank   NaN         HR  62000.0  Hyderabad
6   George  28.0    Finance  80000.0      Delhi

--- Checking Missing Values ---
Name          0
Age           2
Department    1
Salary        1
City          1
dtype: int64

Are there any missing values? ‚Üí True

--- Drop Rows with Any NaN ---
     Name   Age Department   Salary       City
0   Alice  25.0         HR  50000.0  Hyderabad
4    Esha  40.0         IT  75000.0       Pune
6  George  28.0    Finance  80000.0      Delhi

--- Drop Columns with Any NaN ---
      Name
0    Alice
1      Bob
2  Charlie
3    David
4     Esha
5    Frank
6   George

--- Drop Rows where Salary is NaN ---
     Name   Age Department   Salary     

  print(df.fillna(method='ffill'))
  print(df.fillna(method='bfill'))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_task['Department'].fillna('Others', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_task['Age'].fillna(df_task['Age'].mean(), inplace=True)
