In [None]:
import pandas as pd
import numpy as np
from scipy.stats import bernoulli, multivariate_normal
from feature_engine.selection import DropCorrelatedFeatures

## Real

### dataset 1 - secom

https://archive.ics.uci.edu/dataset/179/secom

In [76]:
# Both SECOM files are space-delimited and have no header row.
X_secom = pd.read_csv("raw_data/secom/secom.data", sep=" ", header=None)
# Only the first column of the labels file is kept as the target series.
y_secom = pd.read_csv(
    "raw_data/secom/secom_labels.data",
    sep=" ",
    header=None,
).iloc[:, 0]

In [77]:
# Floor the labels at 0 so that -1 becomes 0 (values >= 0 are left unchanged).
y_secom = y_secom.clip(lower=0)

In [78]:
# Replace missing values with the mean of their own column.
X_secom = X_secom.fillna(value=X_secom.mean())

# Drop correlated columns: fit the selector (threshold 0.9, all variables
# considered) and then keep only the columns it retains.
tr = DropCorrelatedFeatures(variables=None, threshold=0.9)
tr.fit(X_secom)
X_secom = tr.transform(X_secom)

In [80]:
# Show how many instances fall into each class of the label series.
y_secom.value_counts()

0
0    1463
1     104
Name: count, dtype: int64

In [None]:
# Subset so that the number of features is at least 50% of the number of
# instances, i.e. keep 2 * n_features rows in total.
new_n = X_secom.shape[1] * 2

# Work on a copy so the label column is not added to X_secom itself.
data = X_secom.copy()
data["y"] = y_secom

y1 = data[data["y"] == 1]
y0 = data[data["y"] == 0]

# We keep all rows with y=1, as the data is highly imbalanced and we want to
# balance it a little bit; the remaining rows are sampled from the y=0 rows.
rest_n = new_n - len(y1)
y0_sample = y0.sample(n=rest_n, random_state=42)

# Shuffle to randomize the order of rows. The shuffle is seeded as well, so
# that re-running the notebook yields the same row order in the saved subset.
subset = (
    pd.concat([y1, y0_sample])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
y_subset = subset["y"]
X_subset = subset.drop(columns=["y"])

In [103]:
# Sanity check of the subset: its (instances, features) shape and class counts.
print("instances x features:", X_subset.shape)
y_subset.value_counts()

instances x features: (768, 384)


y
0    664
1    104
Name: count, dtype: int64

In [100]:
# Write the subset to CSV; index=False leaves the row index out of the file.
subset.to_csv("data/secom.csv", index=False)

### dataset 2

Proposed dataset: https://archive.ics.uci.edu/dataset/604/gait+classification

## Synthetic

In [None]:
def generate_dataset(p=0.5, n=1000, d=10, g=0.5, random_state=None) -> tuple[np.ndarray, np.ndarray]:
    """
    Generates a synthetic binary-classification dataset.

    Labels are drawn from a Bernoulli(p) distribution. Features are drawn from
    a d-dimensional normal distribution whose mean depends on the label
    (all zeros for y=0, (1, 1/2, ..., 1/d) for y=1) and whose covariance
    matrix S[i, j] = g**|i - j| is shared by both classes.

    Args:
        p: prior probability for y=1
        n: number of instances
        d: number of features
        g: param for cov matrix (S[i, j] = g**|i - j|)
        random_state: optional seed, numpy Generator or RandomState used for
            all draws, making the output reproducible. With None (default)
            no random state is passed to scipy, as before.

    Returns:
        X, y: feature matrix of shape (n, d) and label vector of shape (n,)
    """
    # Build a single random source shared by every draw; seeding each call
    # separately with the same integer would correlate the samples.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.default_rng(random_state)

    y = bernoulli.rvs(p, size=n, random_state=rng)

    idx = np.arange(d)

    # mean vectors
    m0 = np.zeros(d)
    m1 = 1.0 / (idx + 1)

    # cov matrix
    S = np.power(float(g), np.abs(idx[:, None] - idx[None, :]))

    X = np.zeros((n, d))
    for label, mean in ((0, m0), (1, m1)):
        mask = y == label
        count = int(mask.sum())
        if count == 0:
            # nothing to fill for a class that was never drawn
            continue
        draws = multivariate_normal.rvs(mean=mean, cov=S, size=count, random_state=rng)
        # rvs squeezes singleton dimensions (d == 1 or count == 1), so restore
        # the (count, d) shape before assigning into X.
        X[mask] = np.reshape(draws, (count, d))

    return X, y