### indicator

In [42]:
import random

import numpy as np
import torch

from sklearn.model_selection import train_test_split

In [43]:
# Pin NumPy's legacy global RNG and PyTorch's default generator to the same
# seed so the sampled dataset below is reproducible from a fresh kernel.
np.random.seed(seed=42)
torch.manual_seed(seed=42)

<torch._C.Generator at 0x754207f8c0d0>

In [44]:
# d: number of candidate feature columns the label can be read from.
# n: total number of samples to generate.
d, n = 6, 10_000

In [45]:
# Draw an (n, d) matrix of fair coin flips: the candidate label features.
X = np.random.binomial(1, 0.5, (n, d))
# For every row, pick (with replacement) which of the d columns holds its label.
indicators = np.random.choice(d, (n, 1), True)
# The label of row i is the bit stored in the column chosen for row i.
y = np.take_along_axis(X, indicators, axis=1).ravel()
# Append the chosen column index as the last feature, giving shape (n, d + 1).
X = np.hstack((X, indicators))

In [46]:
# 8:1:1 split into train / validation / test.

# Step 1: hold out 20% of the rows; the remaining 80% is the training set.
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, test_size=0.20, random_state=42, shuffle=True
)

# Step 2: halve the held-out 20% into validation and test (10% each).
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.50, random_state=42, shuffle=True
)

In [47]:
# Convert every split to a float32 tensor.
#
# torch.as_tensor is used instead of torch.from_numpy(...).float() because this
# cell rebinds the same names: on a second run the inputs are already tensors,
# and torch.from_numpy would raise TypeError for a non-ndarray argument.
# torch.as_tensor accepts both ndarrays and tensors, so the cell is safe to
# re-run and yields the same float32 values either way.
X_train = torch.as_tensor(X_train, dtype=torch.float32)
X_val   = torch.as_tensor(X_val,   dtype=torch.float32)
X_test  = torch.as_tensor(X_test,  dtype=torch.float32)

y_train = torch.as_tensor(y_train, dtype=torch.float32)
y_val   = torch.as_tensor(y_val,   dtype=torch.float32)
y_test  = torch.as_tensor(y_test,  dtype=torch.float32)

In [48]:
# Persist each split to the current working directory, features first and
# labels second, one .pt file per tensor.
for _tensor, _path in (
    (X_train, "X_train.pt"),
    (X_val,   "X_val.pt"),
    (X_test,  "X_test.pt"),
    (y_train, "y_train.pt"),
    (y_val,   "y_val.pt"),
    (y_test,  "y_test.pt"),
):
    torch.save(_tensor, _path)

### XOR

In [49]:
def create_XOR(n, noisy_x, delta, seed=42, prefix=""):
    '''
    Generate a synthetic XOR dataset and save its 8:1:1 splits to disk.

    Each row is laid out as [x_1, x_2, noisy_1, ..., noisy_{noisy_x}], where
    x_1 and x_2 are fair coin flips and the label is y = x_1 ^ x_2 (XOR).
    Every noisy feature is an independent Bernoulli draw that equals y with
    probability 0.5 + delta: delta = 0 gives pure noise, delta = 0.5 makes each
    noisy feature an exact copy of y.

    Args:
        n: total number of samples generated before splitting.
        noisy_x: number of noisy feature columns appended after x_1 and x_2.
        delta: agreement strength between noisy features and y, in [0, 0.5].
        seed: value used to reseed the global `random`, NumPy and PyTorch RNGs
            before sampling. The train/val/test split itself always uses
            random_state=42, independent of this argument.
        prefix: string prepended to every output file name. The default ""
            writes "X_train.pt", "X_val.pt", "X_test.pt", "y_train.pt",
            "y_val.pt" and "y_test.pt"; pass e.g. "xor_" to avoid overwriting
            files of the same name produced elsewhere.

    Returns:
        None. The six splits are written as int64 tensors via torch.save.

    Raises:
        AssertionError: if delta is outside [0, 0.5].
    '''
    assert 0.0 <= delta <= 0.5

    # Reseed every global RNG so repeated calls with the same seed produce the
    # same dataset. Note this is a side effect visible to the caller.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    x1 = np.random.binomial(1, 0.5, size=n)
    x2 = np.random.binomial(1, 0.5, size=n)
    y = (x1 ^ x2).astype(int)

    # Per-row success probability: 0.5 + delta when y == 1, 0.5 - delta when
    # y == 0. Shaped (n, 1) so it broadcasts across the noisy columns.
    p = (0.5 + delta * (2 * y - 1)).reshape(-1, 1)
    noisy = np.random.binomial(1, p, size=(n, noisy_x))

    X = np.concatenate(
        [x1.reshape(-1, 1), x2.reshape(-1, 1), noisy],
        axis=-1,
    )

    # 8:1:1 split. First hold out 20% of the rows ...
    X_train, X_tmp, y_train, y_tmp = train_test_split(
        X,
        y,
        test_size=0.20,
        random_state=42,
        shuffle=True,
    )

    # ... then halve the held-out part into validation and test.
    X_val, X_test, y_val, y_test = train_test_split(
        X_tmp,
        y_tmp,
        test_size=0.50,
        random_state=42,
        shuffle=True,
    )

    # Convert each split to an int64 tensor and write it out, features first.
    splits = {
        "X_train": X_train,
        "X_val": X_val,
        "X_test": X_test,
        "y_train": y_train,
        "y_val": y_val,
        "y_test": y_test,
    }
    for name, array in splits.items():
        torch.save(torch.from_numpy(array).long(), f"{prefix}{name}.pt")

    return None