In [None]:
import os
import numpy as np
import pandas as pd
from scipy.stats import bernoulli, multivariate_normal
from scipy.sparse import csr_matrix
from feature_engine.selection import DropCorrelatedFeatures
from sklearn.preprocessing import StandardScaler

## Arrhythmia

In [None]:
# Read the raw arrhythmia table: no header row, '?' is treated as a missing value
data = pd.read_csv("raw_data/arrhythmia/arrhythmia.data", header=None, na_values='?')

# The final column holds the class label, so name it 'target'
data = data.rename(columns={data.columns[-1]: 'target'})

# Binarise the label: class 1 (Normal) becomes 0, every other value becomes 1
data['target'] = (data['target'] != 1).astype(int)

# Make sure the output directory exists, then write the processed table
os.makedirs('data', exist_ok=True)
data.to_csv('data/arrhythmia.csv', index=False)
print("Data saved to 'data/arrhythmia.csv'.")

## Dexter

In [None]:
# Function to load Dexter data in sparse format
def load_dexter_data_sparse(filepath_data, filepath_labels, num_features=20000):
    """Load a sparse Dexter-style data file and its labels into one DataFrame.

    Each line of ``filepath_data`` describes one sample as whitespace-separated
    ``index:value`` tokens, where ``index`` is 1-based and ``value`` is an
    integer. Features that are not listed on a line are zero.

    Args:
        filepath_data: path to the sparse feature file.
        filepath_labels: path to a header-less file read as a single
            ``'target'`` column.
        num_features: number of feature columns in the dense table.

    Returns:
        A dense ``pandas.DataFrame`` with ``num_features`` feature columns
        (labelled ``0 .. num_features - 1``) followed by a ``'target'`` column.
        An empty data file yields a frame with zero rows.
    """
    rows, cols, vals = [], [], []
    # Counted explicitly: relying on the loop variable after the loop leaves the
    # row count undefined when the file contains no lines at all.
    n_rows = 0
    with open(filepath_data, 'r') as file:
        for row_idx, line in enumerate(file):
            n_rows = row_idx + 1
            for el in line.split():
                col_idx, val = el.split(":")
                rows.append(row_idx)
                cols.append(int(col_idx) - 1)  # indices in the file start at 1
                vals.append(int(val))

    X_sparse = csr_matrix((vals, (rows, cols)), shape=(n_rows, num_features))
    df = pd.DataFrame(X_sparse.toarray())

    y = pd.read_csv(filepath_labels, header=None, names=['target'])
    # The assignment aligns on the row index, i.e. label k is attached to sample k.
    df['target'] = y['target']
    return df

# Input files of the Dexter training set
train_data_path = 'raw_data/dexter/DEXTER/dexter_train.data'
train_labels_path = 'raw_data/dexter/DEXTER/dexter_train.labels'

dexter_df = load_dexter_data_sparse(train_data_path, train_labels_path)

# Sanity report: preview, summary statistics and the distinct labels.
# Each report is built lazily, right before it is printed under its heading.
reports = (
    ("Training data table (first 5 rows):", dexter_df.head),
    ("\nBasic statistics for training data:", dexter_df.describe),
    ("\nUnique classes in training data:", dexter_df['target'].unique),
)
for heading, build_report in reports:
    print(heading)
    print(build_report())

# Write the assembled table, creating the output directory if it does not exist yet
os.makedirs('data', exist_ok=True)
dexter_df.to_csv('data/dexter.csv', index=False)
print("Data saved to 'data/dexter.csv'.")

## Speech

In [None]:
# Load the speech features, discard the 'id' column and name the last column 'target'
data = pd.read_csv("raw_data/pd_speech_features.csv")
data = data.drop(columns=['id'])
data = data.rename(columns={data.columns[-1]: 'target'})

In [None]:
# Absolute pairwise correlations between the features (target excluded)
corr_matrix = data.drop(columns=['target']).corr().abs()

# Keep only the strict upper triangle so that every feature pair is inspected once
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A column is dropped when it correlates above 0.9 with any column placed before it
to_drop = upper.columns[(upper > 0.9).any(axis=0)].tolist()
data = data.drop(columns=to_drop)

In [None]:
# Split the frame into the label vector and the feature matrix
y = data['target']
X = data.drop(columns=['target'])

# Columns with exactly two distinct values are treated as binary and are not scaled
n_unique = X.nunique()
binary_cols = n_unique[n_unique == 2].index
numeric_cols = X.columns.difference(binary_cols)

# Scale the remaining columns with StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[numeric_cols])
X_scaled_df = pd.DataFrame(X_scaled, columns=numeric_cols)

# Reassemble: scaled columns first, then the binary ones, then the target
X_final = pd.concat([X_scaled_df, X[binary_cols].reset_index(drop=True)], axis=1)
data_scaled = pd.concat([X_final, y], axis=1)
# NOTE(review): the later cells of this section rename and save `data`, not
# `data_scaled`, so the scaled frame appears to be unused. Confirm which
# of the two is meant to be written to data/speech.csv.

In [None]:
# Replace every column name except the last one with its positional index
feature_names = data.columns[:-1]
new_column_names = dict(zip(feature_names, range(len(feature_names))))
data = data.rename(columns=new_column_names)

In [None]:
# Create the output directory if needed (the Arrhythmia and Dexter sections do the
# same before writing) so that this section also works when it is run on its own.
os.makedirs('data', exist_ok=True)
data.to_csv('data/speech.csv', index=False)

## Secom

In [None]:
""" https://archive.ics.uci.edu/dataset/179/secom """

In [4]:
# Both SECOM files are read as space-separated tables without a header row
secom_read_options = dict(header=None, sep=' ')
X_secom = pd.read_csv("raw_data/secom/secom.data", **secom_read_options)
# Only the first column of the labels file is kept as the target
y_secom = pd.read_csv("raw_data/secom/secom_labels.data", **secom_read_options)[0]

In [5]:
# Raise negative labels (-1) to 0; non-negative labels are left unchanged
y_secom = y_secom.clip(lower=0)

In [6]:
# Impute missing entries with the mean of their column
column_means = X_secom.mean()
X_secom = X_secom.fillna(column_means)

# Drop correlated columns with feature_engine's DropCorrelatedFeatures (threshold 0.9)
tr = DropCorrelatedFeatures(None, threshold=0.9)
tr.fit(X_secom)
X_secom = tr.transform(X_secom)

In [7]:
# Display how many samples carry each label value
y_secom.value_counts()

0
0    1463
1     104
Name: count, dtype: int64

In [9]:
# Subsample rows so that the number of instances is twice the number of features
# (i.e. the feature count equals 50% of the instance count).
new_n = X_secom.shape[1] * 2

data = X_secom.copy()
data["target"] = y_secom

y1 = data[data["target"] == 1]
y0 = data[data["target"] == 0]

# Keep every y=1 row, as the data is highly imbalanced and we want to balance it a little;
# the remaining slots are filled with a seeded random sample of y=0 rows.
rest_n = new_n - len(y1)
y0_sample = y0.sample(n=rest_n, random_state=42)

# Shuffle the row order. The shuffle is seeded like the sampling above; without a seed
# the order of the rows would differ on every run of the notebook.
subset = pd.concat([y1, y0_sample]).sample(frac=1, random_state=42).reset_index(drop=True)
y_subset = subset["target"]
X_subset = subset.drop(columns=["target"])

In [10]:
# Report the shape of the subsampled feature matrix, then display its label counts
print("instances x features:", X_subset.shape)
y_subset.value_counts()

instances x features: (768, 384)


target
0    664
1    104
Name: count, dtype: int64

In [11]:
# Create the output directory if needed (the Arrhythmia and Dexter sections do the
# same before writing) so that this section also works when it is run on its own.
os.makedirs("data", exist_ok=True)
subset.to_csv("data/secom.csv", index=False)

## Synthetic

In [None]:
def generate_dataset(p=0.5, n=1000, d=10, g=0.5, random_state=None) -> tuple[np.ndarray, np.ndarray]:
    """
    Generates a synthetic binary classification dataset.

    Labels are drawn from Bernoulli(p). Features are drawn from a multivariate
    normal distribution whose mean depends on the class (zero vector for y=0,
    ``[1, 1/2, ..., 1/d]`` for y=1) and whose covariance matrix is shared by
    both classes: ``S[i, j] = g ** |i - j|``.

    Args:
        p: prior probability for y=1
        n: number of instances
        d: number of features
        g: param for cov matrix
        random_state: optional int seed, ``numpy.random.Generator`` or
            ``numpy.random.RandomState`` used for every draw, which makes the
            output reproducible. With the default ``None`` no seeding is done
            by this function.

    Returns:
        X, y: feature array of shape (n, d) and label array of shape (n,)
    """
    if random_state is None or isinstance(random_state, (np.random.Generator, np.random.RandomState)):
        rng = random_state
    else:
        # Build a single generator and share it between all draws; passing the same
        # integer seed to each call separately would restart the stream every time.
        rng = np.random.default_rng(random_state)

    y = bernoulli.rvs(p, size=n, random_state=rng)

    # mean vectors
    m0 = np.zeros(d)
    m1 = 1.0 / np.arange(1, d + 1)

    # cov matrix
    S = np.array([[g**abs(i - j) for j in range(d)] for i in range(d)])

    X = np.zeros((n, d))
    for label, mean in ((0, m0), (1, m1)):
        mask = y == label
        count = int(mask.sum())
        if count == 0:
            continue  # this class was not sampled, nothing to draw
        draws = multivariate_normal.rvs(mean=mean, cov=S, size=count, random_state=rng)
        # rvs squeezes singleton dimensions (count == 1 or d == 1); restore (count, d)
        # so the assignment into the masked rows always has a matching shape.
        X[mask] = np.asarray(draws).reshape(count, d)

    return X, y