In [None]:
import pandas as pd  
import numpy as np  

In [None]:
# Input: output of the data model step.
data_model_file_name = "../data/processed/data_model_output.csv"
# Output: where the cleaned data set is written.
clean_output_file_name = "../data/processed/clean_data_model_output.csv"
# Input: table with a per-column status used to drop or rename columns.
column_filter_file_name = "../data/processed/columns_to_keep.csv"

In [None]:
# Read the raw data model output and the column filter table from CSV.
data_model_output = pd.read_csv(filepath_or_buffer=data_model_file_name)
column_filter = pd.read_csv(filepath_or_buffer=column_filter_file_name)

In [None]:
# Step 1: Keep only completed records whose person-level and trip-level
# validation severities are both something other than 'Critical'.
# Rows with a missing severity are kept, because NaN != "Critical" is True.
# The element-wise `== True` is deliberate: it yields False for every value
# that is not True (including missing values), so the mask is always boolean.
is_valid_row = (
    (data_model_output["validation_severity_person"] != "Critical")
    & (data_model_output["validation_severity_trip"] != "Critical")
    & (data_model_output["is_completed"] == True)  # noqa: E712
)
filtered_df = data_model_output[is_valid_row]

# Step 2: Build a lookup from column name to its status in the column filter.
# A status of 'keep' leaves the column untouched, 'delete' removes it, and any
# other non-missing value is the new name for that column.
column_mapping = dict(zip(column_filter["column"], column_filter["status"]))

# Step 3: Drop columns marked as 'delete'. errors="ignore" skips names listed
# in the filter that are not present in the data.
columns_to_drop = [col for col, status in column_mapping.items() if status == "delete"]
filtered_df = filtered_df.drop(columns=columns_to_drop, errors="ignore")

# Step 4: Rename columns whose status is a new name. A blank status cell is
# read from the CSV as NaN; skip those so a column is never renamed to NaN
# (a blank status is treated like 'keep').
columns_to_rename = {
    col: new_name
    for col, new_name in column_mapping.items()
    if pd.notna(new_name) and new_name not in ("keep", "delete")
}
filtered_df = filtered_df.rename(columns=columns_to_rename)



In [None]:
# Show (rows, columns) of the cleaned frame next to the raw input frame.
(
    filtered_df.shape,
    data_model_output.shape,
)

In [None]:
# Step 5: Write the cleaned frame to CSV, leaving out the index column.
filtered_df.to_csv(path_or_buf=clean_output_file_name, index=False)