In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the raw CSV file into a DataFrame.
# The path is relative, so it is resolved against the notebook's working directory.
df = pd.read_csv('data/data_raw.csv')
# A bare name as the last expression renders the frame as the cell output.
df

In [None]:
# Collect the names of every column (candidate input feature)
def get_inputs_list(df):
    """Return the column labels of ``df`` as a plain Python list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are listed.

    Returns
    -------
    list
        The labels in the same order as ``df.columns``.
    """
    column_index = df.columns
    return column_index.tolist()

# List the column names of the raw frame; the bare name below displays the list.
inputs_list = get_inputs_list(df)
inputs_list

In [None]:
# Tally the missing entries (NaN / NA) found in every column
def get_missing_counts(df):
    """Return the number of missing values per column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column label; each value is how many entries of that
        column ``DataFrame.isna`` reports as missing.
    """
    missing_mask = df.isna()
    return missing_mask.sum()

# Count the missing values per column of the raw frame; the bare name below displays the counts.
missing_counts = get_missing_counts(df)
missing_counts

In [None]:
# Compute correlation matrix between all numeric columns
def get_corr_matrix(df):
    """Return the pairwise correlation matrix of the numeric columns of ``df``.

    Non-numeric columns (strings, categories, dates, ...) are dropped before
    the correlation is computed: ``DataFrame.corr()`` raises on them in
    pandas >= 2.0 instead of silently skipping them as older versions did.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Square matrix indexed and labelled by the numeric/boolean columns of
        ``df``, computed with pandas' default correlation method.
    """
    # select_dtypes works on every pandas version, unlike corr(numeric_only=...).
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    return numeric_df.corr()

# Compute the correlation matrix of the raw frame; the bare name below displays it.
corr_matrix = get_corr_matrix(df)
corr_matrix

In [None]:
# Listwise deletion (use with caution): keep only the rows that have no missing
# value in any column. The result is stored under a new name, so `df` is untouched.
# NOTE(review): the later cells visible in this notebook pass `df`, not `df_avai`,
# to preprocess_fct — confirm which frame is meant to be preprocessed.
df_avai = df.dropna()
df_avai

In [None]:
# List of input features that will be used for prediction/training.
# Fill in the column names before running the preprocessing cell: an empty list
# is not None, so preprocess_fct would select zero columns instead of all of them.
inputs_selected = [
                  ]
# Number of selected features (displayed as the cell output)
len(inputs_selected)

In [None]:
# Preprocessing function
def preprocess_fct(df_raw, inputs_selected=None):
    """Apply a template of common cleaning steps to a copy of ``df_raw``.

    Every ``'TODO'`` below stands in for a real column name (and ``'NAME'``
    for a real category value): replace each one with the column the step
    should act on and delete the steps that are not needed. Left unedited,
    the one-hot step removes the ``'TODO'`` column, so the final slicing
    step raises ``KeyError``.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Raw input frame. It is never modified; all work happens on a copy.
    inputs_selected : list of column labels, optional
        Columns to keep. ``None`` (default) keeps every column. An empty
        list selects zero columns.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Raises
    ------
    KeyError
        If a column referenced by a step (or listed in ``inputs_selected``)
        is not present.
    """
    if inputs_selected is None:
        df = df_raw.copy()
    else:
        df = df_raw[inputs_selected].copy()

    # All steps reassign the column instead of using `inplace=True` or chained
    # indexing on `df['col']`: those act on a temporary Series, are deprecated,
    # and silently do nothing under pandas copy-on-write.

    # Masking: flag missing entries with the sentinel value -1
    df['TODO'] = df['TODO'].fillna(-1)

    # Masking: binary indicator, 0 where the value is missing (or already 0),
    # 1 everywhere else
    df['TODO'] = (df['TODO'].notna() & (df['TODO'] != 0)).astype(float)

    # Masking: 1.0 where the entry equals 'NAME', 0.0 otherwise
    df['TODO'] = (df['TODO'] == 'NAME').astype(float)

    # Mean imputation: replace missing entries with the column mean
    mean_value = df['TODO'].mean()
    df['TODO'] = df['TODO'].fillna(mean_value)

    # One-hot encoding for categorical data
    one_hot_encoded = pd.get_dummies(df['TODO'], prefix='TODO')
    # Drop original column and concatenate the one-hot encoded columns
    df = pd.concat([df.drop('TODO', axis=1), one_hot_encoded], axis=1)

    # Slicing strings, e.g. for selecting dates: keep the last two characters
    # and convert them to a number
    df['TODO'] = df['TODO'].str[-2:].astype(float)

    return df

In [None]:
# Run the preprocessing on the raw frame, restricted to the selected input features;
# the bare name below displays the cleaned frame.
df_clean = preprocess_fct(df, inputs_selected)
df_clean

In [None]:
# Normalize selected columns (e.g. for NN training)
def normalize_fct(df_raw, columns=None, missing_value=-1):
    """Standardize columns of a copy of ``df_raw``, leaving masked entries alone.

    For each column a ``StandardScaler`` is fitted on the entries that differ
    from ``missing_value`` only, the whole column is transformed, and the
    masked entries are then reset to ``missing_value``.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Input frame. It is not modified; a copy is normalized and returned.
    columns : list of column labels, optional
        Columns to normalize. ``None`` (default) normalizes every column.
    missing_value : scalar, default -1
        Sentinel that marks masked/missing entries.

    Returns
    -------
    df : pandas.DataFrame
        Copy of ``df_raw`` with the requested columns normalized.
    scalers : dict
        Maps each normalized column label to its fitted ``StandardScaler``.

    Raises
    ------
    ValueError
        If a column has no entry different from ``missing_value`` (this
        includes an empty frame), so no scaler can be fitted.

    Notes
    -----
    A genuine value can itself be transformed to ``missing_value``, in which
    case it can no longer be told apart from a masked entry in the output.
    """
    if columns is None:
        columns = df_raw.columns.tolist()

    df = df_raw.copy()
    scalers = {}

    for column in columns:
        # StandardScaler expects a 2-D array with one feature per column.
        values = df[column].to_numpy().reshape(-1, 1)
        missing = (df[column] == missing_value).to_numpy()
        if missing.all():
            raise ValueError(
                f"Column {column!r} has no non-missing values to fit a scaler on."
            )

        # Fit on observed entries only so the sentinel does not skew the statistics.
        scaler = StandardScaler().fit(values[~missing])
        data_norm = scaler.transform(values)
        # Restore the sentinel on the rows that were masked before scaling.
        data_norm[missing] = missing_value
        df[column] = data_norm.ravel()
        scalers[column] = scaler

    return df, scalers

In [None]:
# Optionally restrict normalization to a subset of columns:
# columns = 
# NOTE: this rebinds `df_clean`, so re-running this cell normalizes data that is
# already normalized — re-run the preprocessing cell first to start from clean data.
df_clean, scalers = normalize_fct(df_clean) #, columns)
df_clean

In [None]:
# TODO: preprocess the test data (not implemented yet)

In [None]:
# Save preprocessed data
# CSV copy, written without the index column.
df_clean.to_csv('data/data_clean.csv', index=False)
# np.save writes a real .npy file that np.load reads back with its default
# settings. ndarray.dump wrote a pickle under a .npy name, which np.load
# rejects unless allow_pickle=True is passed.
# NOTE(review): this file goes to the working directory while the CSV goes
# under data/ — confirm the location is intended.
np.save('data_clean.npy', df_clean.to_numpy())