# OULAD Dataset Exploration

This notebook explores the structure of the OULAD dataset so that we understand the data before building the pipeline.

## Goals:
- Understand dataset structure
- Check data quality (missing values, outliers)
- Explore target variable distribution
- Identify key features for modeling


In [None]:
# Import libraries
# Data handling (pandas, numpy) and plotting (matplotlib, seaborn).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the whole session.
# NOTE(review): this also hides pandas/numpy warnings that can point at
# data-quality problems; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Import our custom loader
import sys
# Add the parent directory to the import search path *before* importing from
# `src`; presumably the notebook is run from a folder one level below the
# project root - TODO confirm.
sys.path.append('..')
from src.data.loader import load_oulad_data

print("Libraries imported successfully")


In [None]:
# Load OULAD data
# The loader returns a pair: `data` is iterated below as a name -> table
# mapping; `info` is not used in this cell.
print("Loading OULAD dataset...")
data, info = load_oulad_data()

# One summary line per loaded table: "- <name>: <shape>".
file_count = len(data)
print(f"\nLoaded {file_count} files:")
for name in data:
    df = data[name]
    print(f"- {name}: {df.shape}")


In [None]:
# Explore main dataset - studentInfo
# Pull the studentInfo table out of the loaded mapping and summarise it.
student_info = data['studentInfo']

table_shape = student_info.shape
column_names = list(student_info.columns)

print("StudentInfo dataset overview:")
print(f"Shape: {table_shape}")
print(f"\nColumns: {column_names}")
print("\nFirst few rows:")
# Keep this expression last so the notebook renders the preview.
student_info.head()


In [None]:
# Check target variable distribution
# Show the raw count and the percentage share of every `final_result` value.
print("Final result distribution:")
print(student_info['final_result'].value_counts())
print("\nPercentage:")
print(student_info['final_result'].value_counts(normalize=True) * 100)

# Create binary target
# Outcomes counted as "at risk"; every other value (including missing ones)
# maps to 0.
AT_RISK_OUTCOMES = ['Fail', 'Withdrawn']

# Vectorized membership test instead of a per-row Python lambda. The explicit
# int64 cast keeps the column numeric even for an empty frame, where `.apply`
# would hand back the input column's dtype instead of integers.
# NOTE: this adds a column in place, so the table `student_info` was taken
# from gains `is_at_risk` too if both names refer to the same object.
student_info['is_at_risk'] = (
    student_info['final_result'].isin(AT_RISK_OUTCOMES).astype('int64')
)

print("\nBinary target (is_at_risk):")
print(student_info['is_at_risk'].value_counts())
print(f"At-risk rate: {student_info['is_at_risk'].mean():.2%}")
