In [1]:
import numpy as np
import os
from os.path import join

(4) Dataset Synthesis 
-------------------------
We have two classes, each with 20 features. Each class follows a mixture of three multivariate normal distributions with mixing proportions 0.1, 0.42 and 0.48. Every mixture component has its own given mean vector for class 0 and for class 1, together with a covariance matrix that is shared between the two classes. We generate 2000 examples per class using np.random.multivariate_normal as before, labelling class 0 as negative (-1) and class 1 as positive (+1). Then, we randomly place the examples of each class in a 1200/400/400 training/validation/test set split, and write each of the three sets to its own .csv file. Performance results reported in (5) will be on the test set.

In [2]:
# Read the mixture parameters from the comma-delimited text files in ./hwk2_datasets:
# three mean arrays per class (one per mixture component) and three covariance arrays
# (one per component, used for both classes). Only the first 20 columns of each file
# are kept.
dataset_dir = join(os.getcwd(), 'hwk2_datasets')
test = os.getcwd()  # NOTE(review): not referenced anywhere else in this cell


def _load_first_20_columns(filename):
    """Return columns 0-19 of the comma-delimited file `filename` inside dataset_dir."""
    return np.loadtxt(join(dataset_dir, filename), delimiter=',', usecols=range(20))


# Component means for class 1
class1_m1 = _load_first_20_columns('DS2_c1_m1.txt')
class1_m2 = _load_first_20_columns('DS2_c1_m2.txt')
class1_m3 = _load_first_20_columns('DS2_c1_m3.txt')

# Component means for class 2
class2_m1 = _load_first_20_columns('DS2_c2_m1.txt')
class2_m2 = _load_first_20_columns('DS2_c2_m2.txt')
class2_m3 = _load_first_20_columns('DS2_c2_m3.txt')

# Per-component covariances
cov1 = _load_first_20_columns('DS2_Cov1.txt')
cov2 = _load_first_20_columns('DS2_Cov2.txt')
cov3 = _load_first_20_columns('DS2_Cov3.txt')

# Seed the global NumPy RNG so every run of this cell draws the same dataset.
np.random.seed(1)

# Mixture weights of the three Gaussian components. A multinomial draw turns them into
# the number of examples (out of 2000 per class) to take from each component.
p_dist = [0.1, 0.42, 0.48]
class1_dist = np.random.multinomial(2000, p_dist)
class2_dist = np.random.multinomial(2000, p_dist)

# Draw the per-component samples. Component k pairs the class-specific mean with
# covariance k; the draws happen in component order, class 1 first and then class 2.
class1_m1_sample, class1_m2_sample, class1_m3_sample = (
    np.random.multivariate_normal(mean, cov, count)
    for mean, cov, count in zip(
        (class1_m1, class1_m2, class1_m3), (cov1, cov2, cov3), class1_dist
    )
)
class2_m1_sample, class2_m2_sample, class2_m3_sample = (
    np.random.multivariate_normal(mean, cov, count)
    for mean, cov, count in zip(
        (class2_m1, class2_m2, class2_m3), (cov1, cov2, cov3), class2_dist
    )
)

# Stack the component samples row-wise into one array per class. The rows stay grouped
# by component: all of component 1, then component 2, then component 3.
class1 = np.concatenate(
    (class1_m1_sample, class1_m2_sample, class1_m3_sample), axis=0
)
class2 = np.concatenate(
    (class2_m1_sample, class2_m2_sample, class2_m3_sample), axis=0
)

# Append a label column to each class: -1 for class 1 (negative), +1 for class 2
# (positive). The widened array is allocated once and filled in place rather than
# grown with append/concatenate.

n_rows, n_features = class1.shape
negative_labels = np.empty((n_rows, n_features + 1))
negative_labels[:, :n_features] = class1   # feature columns
negative_labels[:, n_features] = -1        # label column (last)
class1 = negative_labels

n_rows, n_features = class2.shape
positive_labels = np.empty((n_rows, n_features + 1))
positive_labels[:, :n_features] = class2   # feature columns
positive_labels[:, n_features] = 1         # label column (last)
class2 = positive_labels

# Split each class into 400 test / 400 validation / remaining training examples
# (1200 training examples per class when each class holds 2000 rows).
N_TEST_PER_CLASS = 400
N_VAL_PER_CLASS = 400
val_end = N_TEST_PER_CLASS + N_VAL_PER_CLASS

# class1 and class2 were built by concatenating the samples of mixture component 1,
# then component 2, then component 3, so their rows are ordered by component. Slicing
# them directly would fill the test set with component-1 rows and leave the training
# set almost entirely component 3, i.e. the splits would not share a distribution.
# Permute the rows of each class first so every split is a random draw from the whole
# mixture. np.random.permutation returns a shuffled copy and leaves class1/class2 as is.
class1_shuffled = np.random.permutation(class1)
class2_shuffled = np.random.permutation(class2)

test_set = np.concatenate(
    (class1_shuffled[:N_TEST_PER_CLASS], class2_shuffled[:N_TEST_PER_CLASS]), axis=0
)
val_set = np.concatenate(
    (
        class1_shuffled[N_TEST_PER_CLASS:val_end],
        class2_shuffled[N_TEST_PER_CLASS:val_end],
    ),
    axis=0,
)
train_set = np.concatenate(
    (class1_shuffled[val_end:], class2_shuffled[val_end:]), axis=0
)

# Finally, shuffle the rows of each set so that class 1 and class 2 examples are
# intermixed instead of sitting in two contiguous halves.
np.random.shuffle(train_set)
np.random.shuffle(val_set)
np.random.shuffle(test_set)

# Save each split as a comma-delimited CSV file in the current working directory;
# every row holds the feature columns followed by the label column.
for csv_name, split_rows in (
    ('DS2_train_set.csv', train_set),
    ('DS2_val_set.csv', val_set),
    ('DS2_test_set.csv', test_set),
):
    np.savetxt(csv_name, split_rows, delimiter=',')