In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt

In [2]:
# Directories: inputs are read from data_path, generated splits are written to gen_data_path.
data_path, gen_data_path = "data/", "generated/"

# Mean-vector files, three mixture components per class.
# The original annotations mark the c1 files "positive" and the c2 files "negative".
# NOTE(review): get_data labels c1-derived rows 0 and c2-derived rows 1 — confirm
# which convention is intended.
c1_m1_file, c1_m2_file, c1_m3_file = "DS2_c1_m1.txt", "DS2_c1_m2.txt", "DS2_c1_m3.txt"
c2_m1_file, c2_m2_file, c2_m3_file = "DS2_c2_m1.txt", "DS2_c2_m2.txt", "DS2_c2_m3.txt"

# Covariance files, one per mixture component.
cov1_file, cov2_file, cov3_file = "DS2_Cov1.txt", "DS2_Cov2.txt", "DS2_Cov3.txt"

# Weight of each mixture component when deciding how many rows it contributes.
probability_mixture = [0.1,0.42,0.48]

# num_features is passed to get_data as `feat`, i.e. the column label dropped from
# every input file (presumably an empty trailing column — TODO confirm).
num_class, num_features, num_obs = 2, 20, 2000

# Fractions of the shuffled data used for train / validation / test, in that order.
train_val_test_sizes = [0.6, 0.2, 0.2]

In [3]:
def get_matrix(feat, file_path, squeeze = True):
    """Load a headerless CSV file as a NumPy array, discarding column ``feat``.

    Parameters
    ----------
    feat
        Label of the column to drop after reading. The file is read with
        ``header=None``, so columns are labelled 0, 1, 2, ...
    file_path
        Path of the CSV file to read.
    squeeze
        When truthy (default) the leading axis is removed with
        ``np.squeeze(..., axis=0)``; NumPy raises ``ValueError`` unless that
        axis has length one, i.e. the file holds a single row. When falsy the
        2-D array is returned unchanged.

    Returns
    -------
    numpy.ndarray
        The remaining values, 1-D when ``squeeze`` is truthy, 2-D otherwise.
    """
    values = pd.read_csv(file_path, header = None).drop(columns = feat).values
    if not squeeze:
        return values
    return np.squeeze(values, axis = 0)

In [4]:
def get_data(feat, obs, class_, mixture):
    """Draw a labelled dataset in which each class is a three-component Gaussian mixture.

    The three mean vectors of each class are read from the ``c1_m*`` / ``c2_m*``
    files and paired, component by component, with the three covariance
    matrices (the same covariances are used for both classes). Component ``k``
    of each class contributes ``int(obs * mixture[k] / class_)`` rows drawn with
    ``np.random.multivariate_normal``.

    Parameters
    ----------
    feat
        Column label dropped from every input file by ``get_matrix``
        (presumably the empty column left by a trailing delimiter — TODO confirm).
    obs
        Total number of rows used in the per-component count formula above.
    class_
        Divisor applied to ``obs`` in that formula. Exactly two classes are
        generated regardless of this value.
    mixture
        Sequence whose first three entries are the component weights.

    Returns
    -------
    pandas.DataFrame
        One column per sampled feature plus a ``'class'`` column: 0 for rows
        drawn around the c1 means, 1 for rows drawn around the c2 means. Rows
        are ordered class 0 (components 1-3) then class 1 (components 1-3) and
        the index is reset.

    Notes
    -----
    Sampling uses NumPy's global random state; call ``np.random.seed`` first
    for reproducible output.

    NOTE(review): the file-name constants are annotated the other way round
    (c1 "positive", c2 "negative"). The labels assigned here are kept as they
    were; confirm which convention is intended.
    """
    # Mean vectors, one per mixture component, for each class.
    c1_means = [get_matrix(feat, data_path + name)
                for name in (c1_m1_file, c1_m2_file, c1_m3_file)]
    c2_means = [get_matrix(feat, data_path + name)
                for name in (c2_m1_file, c2_m2_file, c2_m3_file)]

    # Covariance matrices stay 2-D, hence squeeze=False.
    covs = [get_matrix(feat, data_path + name, False)
            for name in (cov1_file, cov2_file, cov3_file)]

    # Rows contributed by each component; indexing keeps the IndexError for
    # a mixture with fewer than three weights.
    counts = [int(obs * mixture[k] / class_) for k in range(3)]

    frames = []
    # Class 0 is sampled before class 1 so the RNG draw order is unchanged.
    for label, means in ((0, c1_means), (1, c2_means)):
        for mean, cov, count in zip(means, covs, counts):
            samples = pd.DataFrame(np.random.multivariate_normal(mean, cov, count))
            # Fix: every component is labelled from its own samples. Previously
            # the first class-1 frame was built from the class-0 samples, so the
            # class-1 draw was discarded and class-0 rows were duplicated as 1.
            samples['class'] = label
            frames.append(samples)

    # concat into a single df
    return pd.concat(frames, ignore_index = True)

In [5]:
# Generate the full labelled dataset from the configuration constants.
df = get_data(
    feat = num_features,
    obs = num_obs,
    class_ = num_class,
    mixture = probability_mixture,
)

In [6]:
def train_val_test_slipt(data, sizes, random_state = None):
    """Shuffle ``data`` and cut it into train, validation and test frames.

    The function name keeps its original spelling so existing callers work.

    Parameters
    ----------
    data
        DataFrame to split; it is not modified.
    sizes
        Three fractions ``[train, validation, test]``. Each split holds
        ``int(fraction * len(data))`` rows.
    random_state
        Optional seed forwarded to ``DataFrame.sample``. The default ``None``
        keeps the previous behaviour of an unseeded shuffle.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validation, test)``. Train is the first rows of the shuffled
        frame, validation the rows right after it, and test the last rows.
        Because each size is truncated separately, a few rows between
        validation and test can be left out of every split.
    """
    # shuffle the data
    shuffled = data.sample(frac = 1, random_state = random_state)
    n_rows = len(shuffled)

    # get split sizes
    tr_size = int(sizes[0] * n_rows)
    val_size = int(sizes[1] * n_rows)
    test_size = int(sizes[2] * n_rows)

    # divide the data (explicitly positional)
    data_tr = shuffled.iloc[:tr_size]
    data_val = shuffled.iloc[tr_size:tr_size + val_size]
    # Fix: `data[-test_size:]` returned every row when test_size was 0,
    # because -0 == 0. An explicit start gives an empty test split instead.
    data_test = shuffled.iloc[max(n_rows - test_size, 0):]

    return data_tr, data_val, data_test

In [7]:
# Shuffle the generated dataset and cut it into train / validation / test frames.
df_tr, df_val, df_test = train_val_test_slipt(
    data = df,
    sizes = train_val_test_sizes,
)

In [9]:
# Write each split under gen_data_path (test first, then validation, then train).
# The test split is written under the bare 'DS2' name.
for file_name, frame in (('DS2', df_test), ('DS2_val', df_val), ('DS2_train', df_tr)):
    frame.to_csv(gen_data_path + file_name)