In [None]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

In [None]:
def _fill_with_sentinel(series, sentinel=-1):
    """Return `series` with its missing values replaced by `sentinel`."""
    # Non-numeric columns are widened to object first so that the numeric
    # sentinel can be stored next to text values regardless of the text dtype.
    if not pd.api.types.is_numeric_dtype(series.dtype):
        series = series.astype(object)
    return series.fillna(sentinel)


def _standardize(frame):
    """Return `frame` with each column rescaled to zero mean and unit variance."""
    values = frame.astype(float)
    # Population standard deviation (ddof=0). A spread of zero (constant
    # column) or NaN is replaced by 1 so the division yields 0/NaN, not inf.
    spread = values.std(ddof=0)
    spread = spread.where(spread > 0, 1.0)
    return (values - values.mean()) / spread


def preprocess_data(filename, columns_to_drop, categorical_mappings=None,
                    median_fill_columns=None, continuous_variables=None):
    """Load a CSV file and return a cleaned, partially standardized DataFrame.

    Steps, in order:

    1. Read ``filename`` with ``,`` as the decimal separator and use the
       first column as the index.
    2. Drop ``columns_to_drop``.
    3. For every column in ``categorical_mappings``: replace missing values
       with -1, then replace values according to the column's mapping.
       Values without an entry in the mapping are left unchanged.
    4. Fill missing values of the median-fill columns with the column median.
    5. Fill missing values of every other column with -1.
    6. Standardize the continuous columns (zero mean, unit population
       variance; constant columns become 0).

    Parameters
    ----------
    filename : str or path-like
        CSV file to read.
    columns_to_drop : list
        Column labels removed right after loading.
    categorical_mappings : dict, optional
        ``{column: {old_value: new_value}}`` replacement tables.
    median_fill_columns : list, optional
        Columns whose gaps are filled with the median. Defaults to
        ``['AGE', 'WBC', 'HB', 'PLT', 'LDH', 'BMB', 'PBB', 'FLT3R']``.
    continuous_variables : list, optional
        Columns to standardize. Defaults to the median-fill columns; pass an
        empty list to skip standardization.
        NOTE(review): the previous implementation used an empty list here and
        could not run; defaulting to the median-fill columns is an assumption
        about the intended behavior -- confirm.

    Returns
    -------
    pandas.DataFrame
        The processed data, indexed by the first column of the CSV file.

    Raises
    ------
    KeyError
        If a column to drop, median-fill, map, or standardize is not present.
    """
    default_median_columns = ['AGE', 'WBC', 'HB', 'PLT', 'LDH', 'BMB', 'PBB', 'FLT3R']

    # Load the dataset and index it by its first column.
    data = pd.read_csv(filename, decimal=",")
    data = data.set_index(data.columns[0])

    # Drop unused columns.
    data = data.drop(columns=columns_to_drop)

    # Handle categorical variables.
    for column, mapping in (categorical_mappings or {}).items():
        encoded = _fill_with_sentinel(data[column]).replace(mapping)
        # Recover a numeric dtype when every value ended up numeric.
        data[column] = encoded.infer_objects()

    # Resolve the median-fill columns (order kept, duplicates removed).
    if median_fill_columns is None:
        median_fill_columns = default_median_columns
    median_columns = list(dict.fromkeys(median_fill_columns))
    absent = [column for column in median_columns if column not in data.columns]
    if absent:
        raise KeyError(f"Columns to median-fill not found in data: {absent}")

    # Fill missing values: median for the selected columns, -1 elsewhere.
    # Plain assignment is used instead of chained in-place fillna, which does
    # not update the parent frame under pandas copy-on-write.
    if median_columns:
        selected = data[median_columns]
        data[median_columns] = selected.fillna(selected.median())
    for column in data.columns.difference(median_columns):
        data[column] = _fill_with_sentinel(data[column])

    # Normalize continuous variables.
    if continuous_variables is None:
        continuous_variables = median_columns
    scaled_columns = list(dict.fromkeys(continuous_variables))
    if scaled_columns:
        data[scaled_columns] = _standardize(data[scaled_columns])

    return data

In [None]:
def save_processed_data(data, output_path):
    """Write `data` to `output_path` in CSV format.

    Parameters
    ----------
    data : object with a ``to_csv`` method (e.g. a pandas DataFrame)
        The table to store; ``to_csv`` is called with its default options.
    output_path : str or path-like
        Destination handed to ``to_csv``.
    """
    data.to_csv(path_or_buf=output_path)

In [None]:
# Define parameters
# NOTE: both paths are placeholders; point them at the real files before running.
input_filename = 'path_to_input_file.csv'
output_filename = 'path_to_output_file.csv'

# Columns removed from the dataset before any other processing.
# TODO: extend this list with the remaining unused columns. The former `...`
# placeholder was an Ellipsis object, which is not a column label.
columns_to_drop = ['Sex male', 'sex female']

# Replacement tables for categorical columns: {column: {old_value: new_value}}.
# TODO: complete the 'CEBPASTAT' table. The former bare `...` entry inside the
# dict literal was a syntax error.
categorical_mappings = {'SEX': {"f": 1, "m": 0}, 'CEBPASTAT': {"WT": 1}}

# Preprocess the data
processed_data = preprocess_data(input_filename, columns_to_drop, categorical_mappings)

# Save the processed data
save_processed_data(processed_data, output_filename)