In [None]:
# src/data_processing.py
import pandas as pd
import numpy as np
from datetime import datetime


def load_sleep_data(file_path):
    """
    Read a sleep-data CSV file into a DataFrame.

    Loading is best-effort: any exception raised while reading is caught,
    reported on stdout, and signalled to the caller by returning None
    instead of propagating.

    Parameters:
    -----------
    file_path : str
        Path to the sleep data CSV file

    Returns:
    --------
    pandas.DataFrame or None
        The loaded data, or None if the file could not be read
    """
    try:
        sleep_frame = pd.read_csv(file_path)
        row_count, column_count = sleep_frame.shape
        print(f"Successfully loaded data with {row_count} rows and {column_count} columns")
    except Exception as load_error:
        # Callers detect failure by checking for None.
        print(f"Error loading data: {load_error}")
        return None
    return sleep_frame


def clean_sleep_data(df):
    """
    Clean sleep data by handling missing values and outliers.

    Works on a copy, so the input frame is left untouched. Steps:

    1. Missing values in int64/float64 columns are replaced with the
       column median.
    2. Missing values in object columns are replaced with the column mode
       (the first mode when there are ties). Columns with no non-missing
       value at all are left as they are, since there is no mode to use.
    3. Values in int64/float64 columns outside
       [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are capped to those bounds
       (rows are never dropped).

    Before/after row counts and per-column missing-value counts are
    printed to stdout.

    Parameters:
    -----------
    df : pandas.DataFrame
        Raw sleep data

    Returns:
    --------
    pandas.DataFrame
        Cleaned sleep data
    """
    # Create a copy to avoid modifying the original dataframe
    df_clean = df.copy()

    # Print initial data information
    print("Initial data info:")
    print(f"Number of rows: {df_clean.shape[0]}")
    print(f"Number of missing values:\n{df_clean.isnull().sum()}")

    # Handle missing values
    # For numerical columns: replace with median
    numeric_cols = df_clean.select_dtypes(include=['int64', 'float64']).columns
    for col in numeric_cols:
        if df_clean[col].isnull().any():
            # Assign the result back rather than calling
            # fillna(inplace=True) on the column selection: the in-place
            # form operates on an intermediate object and does not update
            # the frame when pandas Copy-on-Write is active.
            df_clean[col] = df_clean[col].fillna(df_clean[col].median())

    # For categorical columns: replace with mode
    # NOTE(review): if a missing value is itself meaningful in a column
    # (e.g. a "None" category that the CSV reader parsed as NaN), filling
    # it with the mode overwrites that information - confirm with callers.
    cat_cols = df_clean.select_dtypes(include=['object']).columns
    for col in cat_cols:
        if df_clean[col].isnull().any():
            modes = df_clean[col].mode()
            if modes.empty:
                # Entirely missing column: nothing to fill with.
                continue
            df_clean[col] = df_clean[col].fillna(modes.iloc[0])

    # Handle outliers using IQR method for numerical columns
    for col in numeric_cols:
        q1 = df_clean[col].quantile(0.25)
        q3 = df_clean[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Cap outliers instead of removing them
        df_clean[col] = df_clean[col].clip(lower_bound, upper_bound)

    print("\nAfter cleaning:")
    print(f"Number of rows: {df_clean.shape[0]}")
    print(f"Number of missing values:\n{df_clean.isnull().sum()}")

    return df_clean


def create_sleep_features(df):
    """
    Create additional features for sleep analysis specific to the Sleep Health and Lifestyle dataset.

    Works on a copy of the input. Each feature group is created only when
    all of its source columns are present, so partial frames are accepted.
    The number of added columns is printed to stdout.

    Added columns (source columns in parentheses):

    - 'Sleep Duration Score', 'Sleep Score'
      ('Sleep Duration', 'Quality of Sleep')
    - 'Stress-Activity Ratio' ('Stress Level', 'Physical Activity Level')
    - 'BMI Numeric' ('BMI Category')
    - 'Systolic', 'Diastolic', 'BP Risk' ('Blood Pressure', "sys/dia" text)
    - 'Heart Rate Score', 'Activity Score', 'Cardio Health Score'
      ('Heart Rate', 'Daily Steps')
    - 'Has Sleep Disorder' ('Sleep Disorder')
    - 'Age Group' ('Age')

    Parameters:
    -----------
    df : pandas.DataFrame
        Cleaned sleep data

    Returns:
    --------
    pandas.DataFrame
        Sleep data with additional features
    """
    df_features = df.copy()

    # Sleep efficiency score - combining duration and quality
    if 'Sleep Duration' in df_features.columns and 'Quality of Sleep' in df_features.columns:
        duration = df_features['Sleep Duration']
        # 7-9 hours scores a full 10; otherwise lose 2 points per hour of
        # distance from 8 hours, limited to the 0-10 range.
        duration_penalised = 10 - 2 * (duration - 8).abs()
        df_features['Sleep Duration Score'] = duration_penalised.where(
            ~duration.between(7, 9), 10
        ).clip(0, 10)

        # Combined sleep score (50% quality, 50% duration)
        df_features['Sleep Score'] = (df_features['Quality of Sleep'] + df_features['Sleep Duration Score']) / 2

    # Stress-activity balance (stress level relative to physical activity)
    if 'Physical Activity Level' in df_features.columns and 'Stress Level' in df_features.columns:
        activity = df_features['Physical Activity Level']
        # A zero activity level would produce +/-inf; treat the ratio as
        # undefined (NaN) for those rows instead.
        df_features['Stress-Activity Ratio'] = df_features['Stress Level'] / activity.where(activity != 0)

    # Health metrics composite
    if 'BMI Category' in df_features.columns:
        # Convert BMI category to numeric; 'Normal' and 'Normal Weight' are
        # treated as the same category. Labels not listed here map to NaN.
        bmi_mapping = {
            'Normal': 0,
            'Normal Weight': 0,
            'Overweight': 1,
            'Obese': 2,
            'Underweight': 1,
        }
        df_features['BMI Numeric'] = df_features['BMI Category'].map(bmi_mapping)

    # Blood pressure risk factor
    if 'Blood Pressure' in df_features.columns:
        # Extract systolic and diastolic from "systolic/diastolic" text.
        # Missing or malformed readings become NaN instead of raising.
        bp_parts = (
            df_features['Blood Pressure']
            .astype(str)
            .str.split('/', n=1, expand=True)
            .reindex(columns=[0, 1])
        )
        systolic = pd.to_numeric(bp_parts[0], errors='coerce')
        diastolic = pd.to_numeric(bp_parts[1], errors='coerce')
        df_features['Systolic'] = systolic
        df_features['Diastolic'] = diastolic

        # Create blood pressure risk score: one point for systolic > 130
        # and one for diastolic > 85 (0, 1 or 2).
        bp_risk = (systolic > 130).astype(int) + (diastolic > 85).astype(int)
        unreadable = systolic.isna() | diastolic.isna()
        if unreadable.any():
            # An unparsable reading must not be reported as "no risk".
            bp_risk = bp_risk.where(~unreadable)
        df_features['BP Risk'] = bp_risk

    # Cardiovascular health score
    if 'Heart Rate' in df_features.columns and 'Daily Steps' in df_features.columns:
        # Heart rate: 65 bpm scores 10, losing 1 point per 5 bpm away from
        # it; limited to 0-10 like the other component scores.
        heart_rate = df_features['Heart Rate']
        df_features['Heart Rate Score'] = (10 - (heart_rate - 65).abs() / 5).clip(0, 10)

        # Daily steps: 8000-12000 scores 10; otherwise lose 1 point per
        # 1000 steps of distance from 10000, limited to 0-10.
        steps = df_features['Daily Steps']
        steps_penalised = 10 - (steps - 10000).abs() / 1000
        df_features['Activity Score'] = steps_penalised.where(
            ~steps.between(8000, 12000), 10
        ).clip(0, 10)

        # Combined cardiovascular score
        df_features['Cardio Health Score'] = (df_features['Heart Rate Score'] + df_features['Activity Score']) / 2

    # Sleep disorder binary flag
    if 'Sleep Disorder' in df_features.columns:
        disorder = df_features['Sleep Disorder']
        # pandas.read_csv parses the literal text "None" as a missing value
        # by default, so a missing entry is treated as "no disorder" here
        # rather than being flagged (NaN != 'None' evaluates to True).
        df_features['Has Sleep Disorder'] = (disorder.notna() & (disorder != 'None')).astype(int)

    # Age groups
    if 'Age' in df_features.columns:
        # Left-closed bins so the labels are accurate at the edges
        # (30 -> '30-40', 60 -> '60+'); the last bin is open-ended.
        df_features['Age Group'] = pd.cut(
            df_features['Age'],
            bins=[0, 30, 40, 50, 60, float('inf')],
            labels=['<30', '30-40', '40-50', '50-60', '60+'],
            right=False,
        )

    print(f"Created {len(df_features.columns) - len(df.columns)} new features")
    return df_features


def save_processed_data(df, output_path):
    """
    Save processed data to CSV.

    Missing parent directories of output_path are created first. The frame
    is written without its index. Saving is best-effort: any exception is
    caught, reported on stdout, and signalled by returning False.

    Parameters:
    -----------
    df : pandas.DataFrame
        Processed sleep data
    output_path : str
        Path to save the processed data; may be a bare file name, in which
        case the file is written to the current working directory

    Returns:
    --------
    bool
        True if successful, False otherwise
    """
    try:
        import os

        # Create directory if it doesn't exist. A bare file name has an
        # empty dirname, and os.makedirs('') raises, so skip it then.
        target_dir = os.path.dirname(output_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        df.to_csv(output_path, index=False)
        print(f"Successfully saved processed data to {output_path}")
        return True
    except Exception as e:
        print(f"Error saving data: {e}")
        return False