# Data Cleaning Notebook
This notebook demonstrates the data cleaning process for the student dataset.

In [None]:
# Import required libraries
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Make the sibling `src` directory importable so the local `data_cleaning`
# module can be found. The relative path is resolved against the current
# working directory, so this assumes the notebook is started from its own folder.
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

from data_cleaning import DataCleaner

## Load and Explore Raw Data

In [None]:
# Where the raw CSV is read from and where the cleaned CSV is written later
raw_data_path = os.path.join('..', 'data', 'raw', 'students_raw.csv')
output_path = os.path.join('..', 'data', 'students_cleaned.csv')

# Make sure the parent folder of each file exists (no error if already there)
for _csv_path in (raw_data_path, output_path):
    os.makedirs(os.path.dirname(_csv_path), exist_ok=True)

# Build the cleaner for the raw file and grab the object it exposes as `df`
# (presumably a pandas DataFrame loaded from the CSV; see DataCleaner)
cleaner = DataCleaner(raw_data_path)
df = cleaner.df

# Structural overview of the raw data
print("Raw Data Info:")
df.info()

# Preview; kept as the cell's final expression so the notebook renders it
print("\nFirst 5 rows of raw data:")
df.head()

## Clean and Preprocess Data

In [None]:
# Run the cleaning step; whatever it returns is not used here
cleaner.clean_data()

# Read the `df` attribute again now that cleaning has run
cleaned_df = cleaner.df

# Structural overview of the cleaned data
print("Cleaned Data Info:")
cleaned_df.info()

# Descriptive statistics; the final expression is what the notebook renders
print("\nSummary Statistics:")
summary_stats = cleaned_df.describe()
summary_stats

## Inspect Missing Values

In [None]:
# Boolean mask of missing cells, computed once and shared by the table and plot
missing_mask = cleaned_df.isnull()

# Number of missing entries per column
print("Missing values in each column:")
print(missing_mask.sum())

# Heatmap of the mask itself (colour bar hidden)
plt.figure(figsize=(10, 4))
sns.heatmap(missing_mask, cbar=False, cmap='viridis')
plt.title('Missing Values Heatmap')
plt.show()

## Feature Engineering

In [None]:
# Add an `age_bucket` column only when one is not already present, so an
# existing column is never overwritten.
if 'age_bucket' not in cleaned_df.columns:
    # Bins are left-closed (right=False):
    #   [0, 18), [18, 25), [25, 35), [35, 50), [50, inf)
    # The last edge is open-ended so that '50+' covers every age from 50 up;
    # a finite final edge such as 100 would leave ages at or above it outside
    # all bins, and pd.cut would silently turn them into NaN.
    # Negative or missing ages still fall outside the bins and stay NaN.
    # NOTE(review): with right=False the '35-50' bucket actually holds ages
    # 35-49 (50 goes to '50+'). The label is left unchanged in case other code
    # or saved data relies on it; confirm before renaming.
    bins = [0, 18, 25, 35, 50, float('inf')]
    labels = ['<18', '18-24', '25-34', '35-50', '50+']
    cleaned_df['age_bucket'] = pd.cut(cleaned_df['age'], bins=bins, labels=labels, right=False)

# Count of rows per bucket, ordered by bucket rather than by frequency
print("Age Distribution:")
print(cleaned_df['age_bucket'].value_counts().sort_index())

# Preview including the new column; final expression so the notebook renders it
print("\nFirst 5 rows with engineered features:")
cleaned_df.head()

## Save Cleaned Data

In [None]:
# Folder that will hold the cleaned CSV; create it if it is not there yet
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)

# Write the cleaned data to disk, leaving the index out of the file
cleaned_df.to_csv(output_path, index=False)
print(f"Cleaned data saved to {output_path}")

# Structural overview of what was just saved
print("\nFinal data structure:")
cleaned_df.info()

## Data Quality Check

In [None]:
# Numbered report of basic quality indicators for the cleaned data
print("Data Quality Check:")

# 1. Missing entries per column
print("\n1. Missing values in each column:")
null_counts = cleaned_df.isnull().sum()
print(null_counts)

# 2. Number of rows flagged by DataFrame.duplicated()
duplicate_count = cleaned_df.duplicated().sum()
print("\n2. Duplicate rows:", duplicate_count)

# 3. Column dtypes
print("\n3. Data types:")
column_types = cleaned_df.dtypes
print(column_types)

# 4. Descriptive statistics
print("\n4. Basic statistics:")
numeric_summary = cleaned_df.describe()
print(numeric_summary)