In [1]:
import logging
import random
import pickle
import os
from collections import defaultdict
import numpy as np
import pandas as pd
import torch

from ucimlrepo import fetch_ucirepo, list_available_datasets
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from sklearn.model_selection import train_test_split

# Root logger setup: every record at INFO or above goes to the console and
# to a log file that is truncated at the start of each run (mode='w').
log_filename = "training.log"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[
        # Console output
        logging.StreamHandler(),
        # File output
        logging.FileHandler(log_filename, mode='w'),
    ],
)

def save_checkpoint(agent, episode, imputed_data, checkpoint_dir="checkpoints"):
    """Persist the agent's learning state together with the current imputed table.

    Two files are written into ``checkpoint_dir``:

    * ``checkpoint_<episode>.pth`` - a dict with the Q-table, the episode
      number and the current epsilon, serialized with ``torch.save``.
    * ``checkpoint_<episode>.csv`` - ``imputed_data`` written without its index.

    Args:
        agent: Object exposing ``q_table`` (a mapping) and ``epsilon``.
        episode: Episode number; embedded in both file names.
        imputed_data: DataFrame to store next to the checkpoint.
        checkpoint_dir: Target directory; created when it does not exist.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(checkpoint_dir, exist_ok=True)

    checkpoint = {
        # Only the outer mapping is converted to a plain dict; the inner
        # per-state mappings are stored as they are.
        'q_table': dict(agent.q_table),
        'episode': episode,
        'epsilon': agent.epsilon
    }
    checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_{episode}.pth")
    torch.save(checkpoint, checkpoint_path)

    imputed_data_path = os.path.join(checkpoint_dir, f"checkpoint_{episode}.csv")
    imputed_data.to_csv(imputed_data_path, index=False)
    logging.info(f"Checkpoint saved at episode {episode} with CSV file.")


def load_checkpoint(checkpoint_dir="checkpoints", checkpoint_file=None):
    """Load a checkpoint previously written by ``save_checkpoint``.

    Args:
        checkpoint_dir: Directory that holds the ``.pth`` checkpoint files.
        checkpoint_file: File name inside ``checkpoint_dir``. When ``None``,
            the ``.pth`` file with the highest episode number in its name
            (``<prefix>_<episode>.pth``) is used.

    Returns:
        ``None`` when no file name was given and the directory is missing or
        contains no numbered ``.pth`` file. Otherwise a dict with the keys
        ``'q_table'`` (nested ``defaultdict``), ``'episode'`` and ``'epsilon'``.
    """

    def _episode_number(file_name):
        # Same parsing as the file-name scheme "<prefix>_<episode>.pth";
        # names that do not follow it yield None instead of raising.
        try:
            return int(file_name.split('_')[1].split('.')[0])
        except (IndexError, ValueError):
            return None

    if checkpoint_file is None:
        # A missing directory simply means "nothing saved yet".
        if not os.path.isdir(checkpoint_dir):
            return None

        numbered = []
        for file_name in os.listdir(checkpoint_dir):
            if not file_name.endswith(".pth"):
                continue
            number = _episode_number(file_name)
            if number is not None:
                numbered.append((number, file_name))
        if not numbered:
            return None
        checkpoint_file = max(numbered)[1]

    checkpoint_path = os.path.join(checkpoint_dir, checkpoint_file)
    # The payload is a pickled dict of Python containers, not tensors.
    # PyTorch >= 2.6 defaults to weights_only=True, which refuses to unpickle
    # such objects, so the full unpickler is requested explicitly.
    # SECURITY: this executes pickle; only load checkpoint files you created.
    checkpoint = torch.load(checkpoint_path, weights_only=False)
    logging.info(f"Checkpoint loaded from {checkpoint_file}")

    # Convert q_table back to defaultdict so unseen states get an empty row
    q_table = defaultdict(lambda: defaultdict(float), checkpoint['q_table'])
    return {
        'q_table': q_table,
        'episode': checkpoint['episode'],
        'epsilon': checkpoint['epsilon']
    }


def calculate_metrics(env, agent):
    """Score the environment's current state against its complete data.

    Any cell of ``env.state`` that is still NaN is replaced by the mean of
    its column before scoring. Both frames are flattened, so the returned
    MAE and RMSE are computed over all cells at once.

    Args:
        env: Environment exposing ``state`` and ``complete_data`` DataFrames.
        agent: Not used by the computation.

    Returns:
        Tuple ``(mae, rmse)``.
    """
    predicted = env.state

    # Conservative fallback: column means stand in for cells never imputed
    leftover_nans = predicted.isna().sum().sum()
    if leftover_nans > 0:
        predicted = predicted.fillna(predicted.mean())

    y_true = env.complete_data.values.flatten()
    y_pred = predicted.values.flatten()

    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    return mae, rmse


def get_toy_data():
    """Return a tiny 4x4 example as ``(complete_data, incomplete_data)``.

    The incomplete frame equals the complete one except for one NaN per
    column, located at (row, column) positions (2, 0), (0, 1), (1, 2), (3, 3).
    """
    column_names = ['Col1', 'Col2', 'Col3', 'Col4']

    # Rows of the fully observed table
    full_rows = [
        [1, 4, 1, 2],
        [2, 1, 1, 2],
        [3, 2, 1, 3],
        [4, 3, 1, 4],
    ]
    # Same rows with one value knocked out in each column
    holed_rows = [
        [1, np.nan, 1, 2],
        [2, 1, np.nan, 2],
        [np.nan, 2, 1, 3],
        [4, 3, 1, np.nan],
    ]

    complete_data = pd.DataFrame(full_rows, columns=column_names)
    incomplete_data = pd.DataFrame(holed_rows, columns=column_names)
    return complete_data, incomplete_data

def preprocess_columns_for_missing(df):
    """Return a copy of ``df`` whose integer columns are cast to float64.

    Integer columns cannot hold NaN, so they are widened before missing
    values are injected. The input frame is left untouched; previously the
    cast was applied in place, which silently altered the caller's data.

    Args:
        df: DataFrame to prepare.

    Returns:
        A new DataFrame with every integer-dtype column converted to
        ``np.float64``; all other columns are copied unchanged.
    """
    converted = df.copy()
    for col in converted.columns:
        if pd.api.types.is_integer_dtype(converted[col]):
            converted[col] = converted[col].astype(np.float64)
    return converted

def generate_missing_df(df, missing_rate):
    """Return a copy of ``df`` with a random fraction of its cells set to NaN.

    The input frame is never modified. Earlier the NaNs were written into
    the very frame that was passed in, so a caller keeping that frame as the
    "complete" reference ended up with the holes in it as well.

    Args:
        df: Source DataFrame.
        missing_rate: Fraction of all cells (rows * columns) to blank out,
            between 0 and 1. The cell count is truncated to an integer.

    Returns:
        A new DataFrame of the same shape with ``int(missing_rate * df.size)``
        distinct cells replaced by NaN. Integer columns are cast to float64.

    Raises:
        ValueError: If ``missing_rate`` is outside ``[0, 1]``.
    """
    if not 0 <= missing_rate <= 1:
        raise ValueError(f"missing_rate must be between 0 and 1, got {missing_rate}")

    # Copy first: the preprocessing helper may cast columns on the frame it
    # receives, and the caller's data must stay intact either way.
    df_with_missing = preprocess_columns_for_missing(df.copy())

    total_elements = df_with_missing.size
    num_missing = int(missing_rate * total_elements)
    if num_missing == 0:
        return df_with_missing

    # Draw distinct flat (row-major) cell indices uses the global NumPy RNG
    missing_indices = np.random.choice(total_elements, num_missing, replace=False)

    # Turn the flat indices into a boolean mask shaped like the frame and
    # blank all selected cells in one vectorized operation.
    nan_mask = np.zeros(df_with_missing.shape, dtype=bool)
    nan_mask.flat[missing_indices] = True
    return df_with_missing.mask(nan_mask)

def load_dataset(datasetid, missing_rate):
    """Fetch a UCI dataset and build aligned complete / incomplete frames.

    The target columns named in the dataset metadata are dropped (when they
    exist in the data), then only numeric columns are kept. The incomplete
    frame is derived from that same numeric frame, so both results have
    identical columns and shape.

    Args:
        datasetid: UCI repository id passed to ``fetch_ucirepo``.
        missing_rate: Fraction of cells to blank out in the incomplete frame.

    Returns:
        Tuple ``(complete_data, incomplete_data)`` of DataFrames.
    """
    dataset = fetch_ucirepo(id=datasetid)
    df = dataset.data.original

    # Drop the target columns before generating missing values
    # NOTE(review): assumes metadata.target_col is a list of column names
    # (or None/empty) - confirm against ucimlrepo.
    target_columns = dataset.metadata.target_col
    logging.info(f"Target columns: {target_columns}")

    # Ensure target_columns is valid
    if target_columns and set(target_columns).issubset(df.columns):
        df_dropped = df.drop(columns=target_columns)
        logging.info(f"Dropped target columns: {target_columns}")
    else:
        # Fixed: the message used an undefined name instead of the parameter.
        logging.warning(f"Target columns are missing or not valid for dataset {datasetid}. Proceeding without dropping any columns.")
        df_dropped = df  # If no valid target columns, don't drop any columns

    # Drop all non-numerical columns
    df_numerical = df_dropped.select_dtypes(include=[np.number])
    logging.info(f"Retained numerical columns: {df_numerical.columns.tolist()}")

    # Reference data the imputation is scored against
    complete_data = df_numerical.copy()

    # Build the incomplete frame from the SAME numeric columns. It used to be
    # built from df_dropped, which can still contain non-numeric columns and
    # then no longer lines up cell-by-cell with complete_data.
    # A separate copy is passed because generate_missing_df may modify the
    # frame it is given.
    incomplete_data = generate_missing_df(df_numerical.copy(), missing_rate)

    # Report missing values already present in the reference data
    complete_values_count = complete_data.isna().sum().sum()
    logging.info(f"The complete DataFrame contains {complete_values_count} missing values after load_dataset()")

    # Report how many cells are missing in the incomplete data
    missing_values_count = incomplete_data.isna().sum().sum()
    logging.info(f"The incomplete DataFrame contains {missing_values_count} missing values after load_dataset()")

    return complete_data, incomplete_data


class RLImputer:
    """Tabular, epsilon-greedy Q-learning agent for filling missing cells.

    A state is the ``(row, col)`` position of a missing cell and an action is
    one of the candidate values the environment offers for that column
    (``env.get_possible_actions(col)``).
    """

    def __init__(self, env, alpha=0.1, gamma=0.9, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01):
        """Store the environment and the learning hyperparameters.

        Args:
            env: Environment providing ``reset``, ``step``,
                ``get_possible_actions``, ``missing_indices``, ``state`` and
                ``missing_rate``.
            alpha: Learning rate.
            gamma: Discount factor.
            epsilon: Initial exploration probability.
            epsilon_decay: Multiplicative decay applied after every episode.
            epsilon_min: Lower bound for epsilon.
        """
        logging.info("Initializing Q-Learning Agent")
        self.env = env
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        # Simplified Q-table: q_table[position][action] -> Q-value (0.0 by default)
        self.q_table = defaultdict(lambda: defaultdict(float))

    @staticmethod
    def _results_file_name(dataset_id, missing_rate):
        """Build the CSV file name used for the final imputed data."""
        # round() before int(): 0.29 * 100 evaluates to 28.999999999999996,
        # which int() alone would truncate to 28.
        return f"{dataset_id}_missing_rate_{int(round(missing_rate * 100))}.csv"

    def choose_action(self, position):
        """Choose an action for ``position`` using an epsilon-greedy policy.

        Args:
            position: ``(row, col)`` of the cell to fill.

        Returns:
            One of ``env.get_possible_actions(col)``: a random one with
            probability ``epsilon``, otherwise the one with the highest
            Q-value (the first such action on ties).
        """
        state_key = tuple(position)  # Simplify state representation

        explore = random.uniform(0, 1) < self.epsilon
        actions = self.env.get_possible_actions(position[1])
        if explore:
            return random.choice(actions)

        # Exploitation: choose the best known action
        q_values = {a: self.q_table[state_key][a] for a in actions}
        return max(q_values, key=q_values.get)

    def learn(self, position, action, reward, next_position):
        """Apply one Q-learning update for ``(position, action)``.

        Args:
            position: State the action was taken in.
            action: Action that was taken.
            reward: Reward received.
            next_position: Iterable converted with ``tuple()`` into the key
                of the follow-up state used for the bootstrap term.
        """
        state_key = tuple(position)
        next_state_key = tuple(next_position)

        q_predict = self.q_table[state_key][action]
        # default=0 covers follow-up states that have no recorded actions yet
        q_target = reward + self.gamma * max(self.q_table[next_state_key].values(), default=0)
        self.q_table[state_key][action] += self.alpha * (q_target - q_predict)

    def train_with_logging(self, episodes=10000, log_interval=1000, results_dir="./results",
                           experiment_table=None, progress_data=None, index=None,
                           dataset_id=None):
        """Train the agent, log progress and save the final imputed data.

        Each episode resets the environment and repeatedly imputes a randomly
        chosen missing cell until the environment reports completion or the
        per-episode step limit (1000) is hit. Epsilon decays after every
        episode. When all episodes are done, MAE/RMSE are logged, optionally
        pushed to a progress table, and the environment state is written to
        ``<results_dir>/<dataset_id>_missing_rate_<percent>.csv``.

        A ``KeyboardInterrupt`` stops training; in that case nothing is saved.

        Args:
            episodes: Number of training episodes.
            log_interval: Log progress and metrics every this many episodes.
            results_dir: Output directory for the CSV; created if missing.
            experiment_table: Optional object with a ``dataframe(df)`` method
                used to display ``progress_data`` (presumably a Streamlit
                placeholder - confirm with the caller).
            progress_data: Optional DataFrame receiving 'MAE' and 'RMSE'.
            index: Row label in ``progress_data`` to update.
            dataset_id: Identifier used in the output file name. When
                omitted, a module-level ``dataset_id`` is used if one exists
                (the previous behaviour), otherwise ``"dataset"``.
        """
        if dataset_id is None:
            # Previously a global `dataset_id` was required and its absence
            # only surfaced as a NameError after all training had finished.
            dataset_id = globals().get("dataset_id", "dataset")

        max_steps_per_episode = 1000  # Limit for maximum steps per episode
        # Without missing cells there is nothing to sample from.
        has_missing = len(self.env.missing_indices) > 0

        try:
            for episode in range(1, episodes + 1):
                # Reset environment at the start of each episode
                self.env.reset()
                done = not has_missing
                step = 0

                # Loop over steps within an episode until done or max_steps_per_episode is reached
                while not done and step < max_steps_per_episode:
                    position = random.choice(self.env.missing_indices)
                    action = self.choose_action(position)
                    next_state, reward, done = self.env.step(action, position)
                    # NOTE(review): `next_state` is whatever env.step returns
                    # first. If that is the state table rather than a
                    # position, tuple() of it does not identify a cell and
                    # the bootstrap term in learn() stays at 0 - confirm the
                    # intended follow-up state.
                    self.learn(position, action, reward, next_state)
                    step += 1

                # Decay epsilon
                self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

                # Periodic progress report
                if episode % log_interval == 0:
                    logging.info(
                        f"Episode {episode} completed with {step} steps, epsilon={self.epsilon:.4f}")
                    mae, rmse = calculate_metrics(self.env, self)
                    logging.info(f"Final MAE = {mae:.6f}, Final RMSE = {rmse:.6f}")

            # Calculate MAE and RMSE once the entire imputation is done
            mae, rmse = calculate_metrics(self.env, self)
            logging.info(f"Final MAE = {mae:.6f}, Final RMSE = {rmse:.6f}")

            # Update the progress table with MAE and RMSE at the end of all episodes
            if experiment_table is not None and progress_data is not None:
                progress_data.at[index, 'MAE'] = mae
                progress_data.at[index, 'RMSE'] = rmse
                experiment_table.dataframe(progress_data)

            # Make sure the output directory exists before writing into it
            os.makedirs(results_dir, exist_ok=True)
            file_name = self._results_file_name(dataset_id, self.env.missing_rate)
            file_path = os.path.join(results_dir, file_name)
            self.env.state.to_csv(file_path, index=False)
            logging.info(f"Final imputed data saved to {file_path}")

        except KeyboardInterrupt:
            logging.info("Training interrupted.")


class ImputationEnvironment:
    """Environment in which an agent fills the NaN cells of a table.

    ``state`` is the working copy being imputed, ``incomplete_data`` the
    pristine starting point restored by ``reset`` and ``complete_data`` the
    reference used to compute rewards and candidate values.
    """

    def __init__(self, incomplete_data, complete_data, missing_rate):
        self.missing_rate = missing_rate
        self.complete_data = complete_data
        # Two independent copies: one kept pristine, one to be modified
        self.incomplete_data = incomplete_data.copy()
        self.state = incomplete_data.copy()
        # (row, col) positions of every NaN cell in the starting table
        nan_mask = pd.isna(self.incomplete_data.values)
        self.missing_indices = np.argwhere(nan_mask)

    def reset(self):
        """Restore the working state to the initial incomplete table and return it."""
        self.state = self.incomplete_data.copy()
        return self.state

    def step(self, action, position):
        """Write ``action`` into the cell at ``position``.

        Returns:
            Tuple ``(state, reward, done)``: the working table, the negative
            absolute difference to the true value, and whether the table no
            longer contains any NaN.
        """
        row, col = position
        self.state.iat[row, col] = action
        true_value = self.complete_data.iat[row, col]
        reward = -abs(true_value - action)
        any_nan_left = pd.isna(self.state.values).any()
        return self.state, reward, not any_nan_left

    def get_possible_actions(self, col):
        """Return the distinct non-NaN values of column ``col`` of the complete data."""
        reference_column = self.complete_data.iloc[:, col]
        return reference_column.dropna().unique()


# Function to split dataset into training and test sets
def split_dataset(complete_data, missing_rate, test_size=0.3, random_state=42):
    """Split the complete data and derive an incomplete version of each part.

    Args:
        complete_data: Fully observed DataFrame.
        missing_rate: Fraction of cells to blank out in each incomplete part.
        test_size: Share of rows assigned to the test part.
        random_state: Seed for the row split. The positions of the injected
            missing values are drawn separately by ``generate_missing_df``.

    Returns:
        Tuple ``(complete_data_train, incomplete_data_train,
        complete_data_test, incomplete_data_test)``.
    """
    complete_data_train, complete_data_test = train_test_split(
        complete_data, test_size=test_size, random_state=random_state
    )

    # Generate missing values for both the training and test sets.
    # Copies are passed because generate_missing_df may modify the frame it
    # is given; without them the "complete" parts would receive the NaNs too
    # and be the very same objects as the incomplete parts.
    incomplete_data_train = generate_missing_df(complete_data_train.copy(), missing_rate)
    incomplete_data_test = generate_missing_df(complete_data_test.copy(), missing_rate)

    return complete_data_train, incomplete_data_train, complete_data_test, incomplete_data_test

In [None]:
from sklearn.model_selection import ParameterSampler
import numpy as np
import logging

# Candidate values for every RLImputer hyperparameter covered by the search
param_grid = dict(
    alpha=[0.001, 0.01, 0.05, 0.1, 0.5],
    gamma=[0.9, 0.91, 0.95, 0.99],
    epsilon=[1.0, 0.5, 0.1],
    epsilon_decay=[0.995, 0.99, 0.975],
)

# How many random hyperparameter combinations the search tries
n_iter_search = 10

# Draw the combinations up front (fixed seed) so they can be iterated later
random_search = [*ParameterSampler(param_grid, n_iter=n_iter_search, random_state=42)]

# Function to evaluate a single set of hyperparameters
def evaluate_hyperparameters(params, train_env, test_env, episodes=10000):
    """Train an agent with ``params`` and score it on the test environment.

    Args:
        params: Dict with the keys 'alpha', 'gamma', 'epsilon' and
            'epsilon_decay'.
        train_env: Environment the agent is trained on.
        test_env: Environment whose missing cells are imputed for scoring.
        episodes: Number of training episodes.

    Returns:
        Tuple ``(mae, rmse)`` computed on the imputed test environment.
    """
    # Initialize the RLImputer with current hyperparameters
    agent = RLImputer(
        env=train_env,
        alpha=params['alpha'],
        gamma=params['gamma'],
        epsilon=params['epsilon'],
        epsilon_decay=params['epsilon_decay']
    )

    # Train the agent
    agent.train_with_logging(episodes=episodes, log_interval=1000)

    # Impute every missing test cell exactly once. The earlier version drew
    # random cells until the table happened to be full, which re-imputed
    # cells an unpredictable number of times and raised on a test set
    # without missing cells.
    test_env.reset()
    trained_epsilon = agent.epsilon
    # Evaluate the learned policy greedily: exploration noise would otherwise
    # leak into the reported metrics.
    agent.epsilon = 0.0
    try:
        for position in test_env.missing_indices:
            action = agent.choose_action(position)
            test_env.step(action, position)
    finally:
        agent.epsilon = trained_epsilon

    # Calculate MAE and RMSE
    mae, rmse = calculate_metrics(test_env, agent)

    return mae, rmse

# Store the best hyperparameters and results
best_params = None
best_mae = np.inf
best_rmse = np.inf

# NOTE(review): `complete_data` and `missing_rate` are not defined in this
# cell; they must already exist in the session (e.g. from load_dataset and
# the evaluation settings) before this search is run.

# Split once, outside the loop, so every hyperparameter candidate is trained
# and scored on exactly the same rows and the same missing cells.
complete_data_train, incomplete_data_train, complete_data_test, incomplete_data_test = split_dataset(
    complete_data, missing_rate
)

# Run random search for hyperparameter tuning
for params in random_search:
    logging.info(f"Evaluating hyperparameters: {params}")

    # Fresh environments per candidate; each keeps its own copy of the
    # incomplete data. `missing_rate` is a required constructor argument and
    # was missing here, which raised a TypeError.
    train_env = ImputationEnvironment(incomplete_data_train, complete_data_train, missing_rate)
    test_env = ImputationEnvironment(incomplete_data_test, complete_data_test, missing_rate)

    # Evaluate the agent with the current hyperparameters
    mae, rmse = evaluate_hyperparameters(params, train_env, test_env)

    # Log and track the best results (lowest MAE wins)
    logging.info(f"MAE: {mae:.6f}, RMSE: {rmse:.6f} for hyperparameters: {params}")
    if mae < best_mae:
        best_mae = mae
        best_rmse = rmse
        best_params = params

# After tuning, log the best hyperparameters
logging.info(f"Best hyperparameters: {best_params} with MAE: {best_mae:.6f}, RMSE: {best_rmse:.6f}")


In [None]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Build the held-out split of a dataset with the shared loading/splitting helpers
def prepare_test_data(dataset_id, missing_rate, random_state=42):
    """Return ``(complete_data_test, incomplete_data_test)`` for a dataset.

    The dataset is loaded with ``load_dataset`` and its complete frame is
    split with ``split_dataset`` (30% test rows); the training parts are
    discarded.
    """
    full_frame = load_dataset(dataset_id, missing_rate)[0]
    *_, held_out_complete, held_out_incomplete = split_dataset(
        full_frame, missing_rate, test_size=0.3, random_state=random_state
    )
    return held_out_complete, held_out_incomplete

# Load the final imputed data from your saved CSV
def evaluate_on_test_set(dataset_id, missing_rate, final_imputed_path, random_state=42):
    """Score a saved imputation result against the complete test split.

    Args:
        dataset_id: Dataset identifier forwarded to ``prepare_test_data``.
        missing_rate: Missing rate forwarded to ``prepare_test_data``.
        final_imputed_path: Path of the CSV holding the imputed table.
        random_state: Seed for the train/test row split.

    Returns:
        Tuple ``(mae_test, rmse_test)``; both values are also printed.

    Raises:
        ValueError: If the imputed table and the test split differ in shape,
            i.e. they cannot be compared cell by cell.
    """
    # Prepare test set
    complete_data_test, _ = prepare_test_data(dataset_id, missing_rate, random_state)

    # Load final imputed results
    final_imputed = pd.read_csv(final_imputed_path)

    # Fail with a clear message instead of comparing misaligned cells
    if final_imputed.shape != complete_data_test.shape:
        raise ValueError(
            f"Imputed data shape {final_imputed.shape} does not match "
            f"test set shape {complete_data_test.shape}"
        )

    # Flatten and calculate MAE/RMSE on test set
    y_true = complete_data_test.values.flatten()
    y_pred = final_imputed.values.flatten()
    mae_test = mean_absolute_error(y_true, y_pred)
    # root_mean_squared_error (imported at the top of the file) replaces
    # mean_squared_error(..., squared=False); the `squared` argument has been
    # removed from recent scikit-learn releases.
    rmse_test = root_mean_squared_error(y_true, y_pred)

    print(f"Test Set MAE: {mae_test}, RMSE: {rmse_test}")
    return mae_test, rmse_test

# Example run: score the saved imputation of dataset 94 at a 5% missing rate
final_imputed_path = './results/94_missing_rate_5.csv'
dataset_id, missing_rate = 94, 0.05
evaluate_on_test_set(dataset_id, missing_rate, final_imputed_path)
