In [2]:
import os
import numpy as np
import pandas as pd
from scipy.stats import bernoulli, multivariate_normal
from scipy.sparse import csr_matrix
from feature_engine.selection import DropCorrelatedFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [117]:
def preprocess_data(df, missing_threshold=0.0, correlation_threshold=0.8):
    """
    Preprocess a dataset whose label is stored in a column named 'target'.

    Steps:
    1. Drop every column whose fraction of missing values is strictly greater
       than ``missing_threshold`` (with the default 0.0 this removes every
       column that contains at least one missing value).
    2. Fill the remaining missing values: median for numeric columns, most
       frequent value for all other columns.
    3. Remove collinear features: a numeric feature is dropped when its
       absolute correlation with any feature that appears earlier in the
       column order exceeds ``correlation_threshold``. The 'target' column is
       excluded from this analysis; non-numeric features are ignored by it.

    The input frame is not modified; a new DataFrame is returned.

    Parameters:
    - df: pandas DataFrame containing a 'target' column
    - missing_threshold: maximum fraction of missing values allowed in a column
    - correlation_threshold: threshold for removing correlated features

    Returns:
    - Preprocessed DataFrame

    Raises:
    - KeyError: if there is no 'target' column when the correlation step runs
      (this includes the case where 'target' itself was dropped in step 1)
    """
    print(f"Original shape: {df.shape}")

    # 1. Handle missing values
    # Count missing values per column
    missing_count = df.isnull().sum()
    print(f"Columns with missing values: {int((missing_count > 0).sum())}")

    # Remove columns with too many missing values.
    # The comparison is strict, so the message reports ">" (not ">=").
    cols_to_drop = missing_count[missing_count > missing_threshold * len(df)].index
    print(f"Dropping {len(cols_to_drop)} columns with >{missing_threshold*100}% missing values")
    df = df.drop(columns=cols_to_drop)

    # Impute whatever is still missing in the surviving columns
    for col in df.columns[df.isnull().any()]:
        if df[col].dtype.kind in 'ifc':  # numeric column -> median
            fill_value = df[col].median()
        else:  # anything else -> most frequent value
            modes = df[col].mode()
            if modes.empty:
                # No observed value at all in this column: nothing to impute with
                continue
            fill_value = modes.iloc[0]
        df[col] = df[col].fillna(fill_value)

    # 2. Remove collinear variables
    # Only numeric features can be correlated; numeric_only=True keeps the
    # mode-imputed non-numeric columns from breaking the computation.
    corr_matrix = df.drop(columns='target').corr(numeric_only=True).abs()

    # Keep the strict upper triangle so every pair is inspected exactly once
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

    # A column is flagged when it is too correlated with any earlier column
    to_drop = upper.columns[(upper > correlation_threshold).any().to_numpy()].tolist()
    print(f"Dropping {len(to_drop)} collinear features with correlation > {correlation_threshold}")

    # Drop collinear features
    df = df.drop(columns=to_drop)

    print(f"Final shape: {df.shape}")
    return df

## Arrhythmia

In [None]:
# Read the raw arrhythmia table (no header row; '?' entries are parsed as missing)
data = pd.read_csv("raw_data/arrhythmia/arrhythmia.data", header=None, na_values='?')

# The label sits in the last column; expose it under the name 'target'
data = data.rename(columns={data.columns[-1]: 'target'})

# Binarise the label: class 1 (Normal) -> 0, every other class -> 1
data['target'] = data['target'].ne(1).astype('int64')

# Make sure the output directory exists before anything is written to it
os.makedirs('data', exist_ok=True)

In [119]:
# Clean the arrhythmia table: with missing_threshold=0.0 every column that contains
# a missing value is dropped, then features correlated above 0.8 are removed.
# Note: `data` is overwritten, so re-running this cell operates on the already cleaned frame.
data = preprocess_data(data, missing_threshold=0.0, correlation_threshold=0.8)

Original shape: (452, 280)
Columns with missing values: 5
Dropping 5 columns with >=0.0% missing values
Dropping 60 collinear features with correlation > 0.8
Final shape: (452, 215)


In [120]:
# Save the preprocessed arrhythmia data to CSV (row index omitted) and confirm on stdout
data.to_csv('data/arrhythmia.csv', index=False)
print("Data saved to 'data/arrhythmia.csv'.")

Data saved to 'data/arrhythmia.csv'.


## Speech

In [121]:
# Load the speech features, discard the 'id' column and name the last column 'target'
data = (
    pd.read_csv("raw_data/speech/pd_speech_features.csv")
    .drop(columns=['id'])
    .pipe(lambda frame: frame.rename(columns={frame.columns[-1]: 'target'}))
)

In [122]:
# Absolute pairwise correlation between the features (label excluded)
corr_matrix = data.drop(columns=['target']).corr().abs()

# Keep only the strict upper triangle so each pair is looked at once
strict_upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(strict_upper_mask)

# Drop every feature whose correlation with an earlier feature exceeds 0.9
to_drop = upper.columns[(upper > 0.9).any().to_numpy()].tolist()
data = data.drop(columns=to_drop)

In [123]:
# Separate the label from the features
y = data['target']
X = data.drop(columns=['target'])

# Features with exactly two distinct values count as binary; all others as numeric
binary_cols = X.nunique().loc[lambda n_unique: n_unique == 2].index
numeric_cols = X.columns.difference(binary_cols)

In [124]:
# Replace the feature names with consecutive integers; the last column ('target') keeps its name
new_column_names = dict(zip(data.columns[:-1], range(data.shape[1] - 1)))
data = data.rename(columns=new_column_names)

In [125]:
# Run the shared preprocessing helper on the speech data; this is a second collinearity
# pass at threshold 0.8 (the manual filter in the earlier cell used 0.9).
data = preprocess_data(data, missing_threshold=0.0, correlation_threshold=0.8)

Original shape: (756, 390)
Columns with missing values: 0
Dropping 0 columns with >=0.0% missing values
Dropping 91 collinear features with correlation > 0.8
Final shape: (756, 299)


In [126]:
# Persist the preprocessed speech data as CSV, without the row index
data.to_csv('data/speech.csv', index=False)

## Secom

In [127]:
""" https://archive.ics.uci.edu/dataset/179/secom """

' https://archive.ics.uci.edu/dataset/179/secom '

In [3]:
# Both SECOM files are read as space-separated tables without a header row
X_secom = pd.read_csv("raw_data/secom/secom.data", header=None, sep=' ')
# Only the first column of the labels file is kept as the label.
# NOTE(review): any further columns of secom_labels.data are discarded here — confirm that is intended
y_secom = pd.read_csv("raw_data/secom/secom_labels.data", header=None, sep=' ')[0]

In [4]:
# Floor the labels at 0, i.e. change -1 to 0 and leave non-negative labels untouched
y_secom = y_secom.clip(lower=0)

In [None]:
# NOTE: every step below overwrites X_secom, so re-running this cell without
# reloading the raw data applies the steps to an already processed frame.

# fill nans with the mean of the respective column
X_secom = X_secom.fillna(X_secom.mean())

# drop correlated columns (threshold 0.9)
# NOTE(review): the positional None is presumably the `variables` argument
# ("consider all columns") — confirm against the feature_engine documentation
tr = DropCorrelatedFeatures(None, threshold=0.9)
X_secom = tr.fit_transform(X_secom)
X_secom = pd.DataFrame(X_secom)

# drop columns with only one unique value
X_secom = X_secom.loc[:, X_secom.nunique() > 1]

In [6]:
# concatenate
newdata = X_secom.copy()
newdata['target'] = y_secom

In [15]:
# Balance the dataset: remove as many randomly chosen class-0 rows as class 0 exceeds class 1
counts = y_secom.value_counts()
diff = counts[0] - counts[1]

surplus_rows = newdata.loc[newdata['target'] == 0].sample(n=diff, random_state=42)
subset = newdata.drop(index=surplus_rows.index)

In [16]:
# Separate the balanced subset into features and labels
X_subset = subset.drop(columns="target")
y_subset = subset.loc[:, "target"]

# Report the size of the feature matrix and show the class balance
print("instances x features:", X_subset.shape)
y_subset.value_counts()

instances x features: (208, 268)


target
1    104
0    104
Name: count, dtype: int64

In [17]:
# Write the balanced SECOM subset (features plus 'target') to CSV, without the row index
subset.to_csv("data/secom.csv", index=False)

## Ionosphere

In [134]:
# Load the raw ionosphere table (no header row); the class label is the last column
ionosphere = pd.read_csv("raw_data/ionosphere/ionosphere.data", header=None)

print(f"Original data shape: {ionosphere.shape}")

target_col = ionosphere.columns[-1]
class_counts = ionosphere[target_col].value_counts()
print(f"Class distribution:\n{class_counts}")

# Classes ordered from most to least frequent (value_counts order)
class_values = class_counts.index.tolist()

# Number of rows drawn from each class for the balanced subset
n_per_class = 35

# Sample every class with the same seed and concatenate once; growing a frame
# with repeated pd.concat calls starting from an empty DataFrame is slow and
# relies on pandas' deprecated handling of empty frames.
balanced_df = pd.concat(
    [
        ionosphere[ionosphere[target_col] == cls].sample(n=n_per_class, random_state=42)
        for cls in class_values
    ]
)

# Shuffle the rows so the classes are interleaved, then renumber the index
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Features are named 0..k-1, the label column is named 'target'
column_names = list(range(balanced_df.shape[1] - 1)) + ['target']
balanced_df.columns = column_names

if not pd.api.types.is_numeric_dtype(balanced_df['target']):
    # Encode labels in class_values order so the mapping does not depend on
    # which label happens to come first after shuffling.
    label_mapping = {label: i for i, label in enumerate(class_values)}
    balanced_df['target'] = balanced_df['target'].map(label_mapping)
    print(f"Label mapping: {label_mapping}")



Original data shape: (351, 35)
Class distribution:
34
g    225
b    126
Name: count, dtype: int64
Label mapping: {'g': 0, 'b': 1}


In [135]:
# Run the shared preprocessing helper on the balanced ionosphere sample
# (drops columns with missing values, then features correlated above 0.8).
balanced_df = preprocess_data(balanced_df, missing_threshold=0.0, correlation_threshold=0.8)

Original shape: (70, 35)
Columns with missing values: 0
Dropping 0 columns with >=0.0% missing values
Dropping 3 collinear features with correlation > 0.8
Final shape: (70, 32)


In [136]:
# Write the balanced, preprocessed ionosphere data to CSV, without the row index
balanced_df.to_csv("data/ionosphere.csv", index=False)

# Report the final shape and class balance as a sanity check
print(f"Saved balanced dataset with shape: {balanced_df.shape}")
print(f"New class distribution:\n{balanced_df['target'].value_counts()}")

Saved balanced dataset with shape: (70, 32)
New class distribution:
target
0    35
1    35
Name: count, dtype: int64


## Synthetic

In [137]:
def generate_dataset(p=0.5, n=1000, d=10, g=0.5, random_state=None) -> tuple[np.ndarray, np.ndarray]:
    """
    Generates a synthetic binary-classification dataset.

    Labels are drawn from Bernoulli(p). Features are drawn from a
    d-dimensional Gaussian whose mean depends on the label (the zero vector
    for y=0, [1, 1/2, ..., 1/d] for y=1) and whose covariance matrix is shared
    by both classes, with entries S[i, j] = g**|i - j|.

    Args:
        p: prior probability for y=1
        n: number of instances
        d: number of features
        g: param for cov matrix (S[i, j] = g**|i - j|)
        random_state: integer seed or NumPy random generator used for all
            draws, for reproducible output; None (default) keeps SciPy's
            default random-state behaviour.

    Returns:
        X, y: feature matrix of shape (n, d) and label vector of shape (n,)
    """
    # Turn an integer seed into ONE generator shared by every draw; handing the
    # same integer to each rvs call would re-seed every draw identically.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.default_rng(random_state)
    else:
        rng = random_state

    y = bernoulli.rvs(p, size=n, random_state=rng)

    # mean vectors
    m0 = np.zeros(d)
    m1 = np.array([1/(i+1) for i in range(d)])

    # cov matrix
    S = np.array([[g**abs(i - j) for j in range(d)] for i in range(d)])

    X = np.zeros((n, d))
    for label, mean in ((0, m0), (1, m1)):
        mask = y == label
        count = int(mask.sum())
        if count == 0:
            # No instance of this class was drawn; nothing to sample
            continue
        draws = multivariate_normal.rvs(mean=mean, cov=S, size=count, random_state=rng)
        # rvs squeezes singleton dimensions (count == 1 or d == 1); restore (count, d)
        X[mask] = np.reshape(draws, (count, d))

    return X, y