In [None]:
import pandas as pd
import numpy as np
from scipy.stats import bernoulli, multivariate_normal

## Real

### Dataset 4: SECOM

In [4]:
# Feature table: space-separated values with no header row.
X_secom = pd.read_csv(
    "raw_data/secom/secom.data",
    sep=" ",
    header=None,
)

# Label file: parsed the same way; only its first column (label 0) is kept
# as the target Series.
y_secom = pd.read_csv(
    "raw_data/secom/secom_labels.data",
    sep=" ",
    header=None,
).loc[:, 0]

## Synthetic

In [None]:
def generate_dataset(p=0.5, n=1000, d=10, g=0.5, random_state=None) -> tuple[np.ndarray, np.ndarray]:
    """
    Generates a synthetic binary-classification dataset.

    Labels are drawn as Bernoulli(p). Features are drawn from a
    class-conditional multivariate normal that shares one covariance matrix:
    class 0 has mean 0 and class 1 has mean (1, 1/2, ..., 1/d). The
    covariance entries are S[i, j] = g ** |i - j|.

    Args:
        p: prior probability for y=1
        n: number of instances
        d: number of features
        g: param for cov matrix (S[i, j] = g ** |i - j|); |g| < 1 keeps S
            positive definite
        random_state: optional seed or numpy random generator for
            reproducible output. ``None`` (default) uses numpy's global
            random state, exactly as before this parameter existed.

    Returns:
        X, y: feature matrix of shape (n, d) and label vector of shape (n,)
    """
    # Turn an integer seed into ONE shared generator. Passing the raw int to
    # every scipy call would re-seed each call identically and correlate the
    # noise of the two classes.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    y = bernoulli.rvs(p, size=n, random_state=rng)

    feature_idx = np.arange(d)

    # mean vectors
    m0 = np.zeros(d)
    m1 = 1.0 / (feature_idx + 1)

    # cov matrix
    S = np.power(float(g), np.abs(feature_idx[:, None] - feature_idx[None, :]))

    X = np.zeros((n, d))
    # Class 0 is sampled before class 1 to keep the original draw order.
    for label, mean in ((0, m0), (1, m1)):
        mask = y == label
        count = int(mask.sum())
        if count == 0:
            # Nothing to sample for a class that does not occur.
            continue
        samples = multivariate_normal.rvs(mean=mean, cov=S, size=count, random_state=rng)
        # scipy squeezes singleton axes (d == 1 or count == 1), so restore the
        # 2-D (count, d) layout before assigning into the masked rows.
        X[mask] = np.reshape(samples, (count, d))

    return X, y