In [1]:
import numpy as np
import os
from os.path import join

(1) Dataset Synthesis 
-------------------------
We have two classes, each with 20 features. Each class follows a multivariate normal distribution; the mean vectors for class 0 and class 1 are given, along with a covariance matrix shared by both classes. We generate 2000 examples per class using np.random.multivariate_normal, labelling class 0 as negative (-1) and class 1 as positive (+1). Then, we randomly split each class 1200/400/400 into training/validation/test sets (2400/800/800 examples in total) and write each set to its own .csv file. Performance results will be reported on the test set.

In [2]:
# Synthesize dataset DS1: two Gaussian classes sharing one covariance matrix,
# labelled -1 / +1, split per class into train/val/test and written to CSV.

# Tunable constants for this cell.
N_FEATURES = 20          # columns read from each parameter file
N_PER_CLASS = 2000       # examples drawn for each class
N_TEST_PER_CLASS = 400   # first slice of each shuffled class -> test set
N_VAL_PER_CLASS = 400    # second slice of each shuffled class -> validation set
SEED = 1                 # fixed seed for reproducibility & testing


def _with_label(samples, label):
    """Return a new float array holding `samples` plus a trailing column set to `label`.

    The output is preallocated already filled with `label`, then the feature
    columns are copied in, so no append/concatenate pass is needed.
    """
    labelled = np.full((samples.shape[0], samples.shape[1] + 1), label, dtype=float)
    labelled[:, :-1] = samples
    return labelled


# Load the class mean vectors (mean_0, mean_1) and the covariance matrix.
dataset_dir = join(os.getcwd(), 'hwk2_datasets')
mean_0 = np.loadtxt(join(dataset_dir, 'DS1_m_0.txt'), delimiter=',', usecols=range(N_FEATURES))
mean_1 = np.loadtxt(join(dataset_dir, 'DS1_m_1.txt'), delimiter=',', usecols=range(N_FEATURES))
cov = np.loadtxt(join(dataset_dir, 'DS1_Cov.txt'), delimiter=',', usecols=range(N_FEATURES))

# Draw the examples for each class from its multivariate normal distribution.
# NOTE: the order of the np.random calls below determines the generated data;
# reordering them changes the output files even with the same seed.
np.random.seed(SEED)
class_0 = np.random.multivariate_normal(mean_0, cov, N_PER_CLASS)
class_1 = np.random.multivariate_normal(mean_1, cov, N_PER_CLASS)

# Label data generated with mean_0 as negative (-1) and mean_1 as positive (+1);
# the label is stored in the last column.
class_0 = _with_label(class_0, -1)
class_1 = _with_label(class_1, 1)

# Shuffle the rows of each class so assignment to train/val/test is random.
np.random.shuffle(class_0)
np.random.shuffle(class_1)

# Split each class: the first rows go to test, the next to validation, and
# the remainder to training. Each set holds an equal number of rows per class.
val_end = N_TEST_PER_CLASS + N_VAL_PER_CLASS
test_set = np.concatenate((class_0[:N_TEST_PER_CLASS], class_1[:N_TEST_PER_CLASS]), axis=0)
val_set = np.concatenate((class_0[N_TEST_PER_CLASS:val_end], class_1[N_TEST_PER_CLASS:val_end]), axis=0)
train_set = np.concatenate((class_0[val_end:], class_1[val_end:]), axis=0)

# Every generated example must land in exactly one of the three sets.
assert len(train_set) + len(val_set) + len(test_set) == 2 * N_PER_CLASS

# Finally, shuffle each set so class_0/class_1 rows are intermixed randomly.
np.random.shuffle(train_set)
np.random.shuffle(val_set)
np.random.shuffle(test_set)

# Write datasets to .csv files (in the current working directory).
np.savetxt('DS1_train_set.csv', train_set, delimiter=',')
np.savetxt('DS1_val_set.csv', val_set, delimiter=',')
np.savetxt('DS1_test_set.csv', test_set, delimiter=',')