In [None]:
import sklearn.datasets as datasets
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [None]:
def generate(n_instances, n_features, n_informative_features, label_noise, seed=0, n_classes=2, n_clusters=1):
    """
    Generate a synthetic classification dataset.

    Wraps sklearn's ``make_classification`` with redundant and repeated
    features switched off, balanced classes, and unit class separation/scale.

    Args:
        n_instances: int. Number of samples to generate.
        n_features: int. Total number of features per sample.
        n_informative_features: int. Number of informative features; the
            remaining ``n_features - n_informative_features`` are noisy features.
        label_noise: float. Passed as ``flip_y`` (fraction of samples whose
            class is randomly assigned).
        seed: int. Random state used by the generator.
        n_classes: int. Number of classes.
        n_clusters: int. Number of clusters per class.

    Return:
        numpy array with one row per sample: the feature columns followed by
        the class label as the last column.
    """
    features, labels = datasets.make_classification(
        # dataset dimensions
        n_samples=n_instances,
        n_features=n_features,
        n_informative=n_informative_features,
        n_classes=n_classes,
        n_clusters_per_class=n_clusters,
        # neither redundant nor duplicated features are generated
        n_redundant=0,
        n_repeated=0,
        # balanced classes, optional label flipping
        weights=None,
        flip_y=label_noise,
        # default geometry: unit class separation, no feature scaling
        class_sep=1.0,
        scale=1.0,
        # shuffle samples and features, reproducibly
        shuffle=True,
        random_state=seed,
    )

    # attach the labels to the features as the last column
    return np.column_stack((features, labels))

In [None]:
def create_test(data, test_size, seed=0):
    """
    Split a stratified test set off the generated synthetic data.

    Args:
        data: numpy array whose last column holds the class labels
            (the labels are used for stratification).
        test_size: float or int, forwarded to StratifiedShuffleSplit.
        seed: int, random state of the splitter.

    Return:
        (test_set, remain): the sampled test rows and all remaining rows
        (both are numpy arrays).
    """
    labels = data[:, -1]
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)

    # n_splits=1, so the splitter yields exactly one (remain, test) index pair
    remain_index, test_index = next(splitter.split(data, labels))

    return data[test_index], data[remain_index]

In [None]:
def stratified_split(data, train_size, seed=0):
    """
    Stratified sampling to create a training set of the desired size.

    Args:
        data: numpy array whose last column holds the class labels
            (the labels are used for stratification).
        train_size: float or int, forwarded to StratifiedShuffleSplit.
        seed: int, random state of the splitter.

    Return:
        (training_set, test_set): two numpy arrays. The first holds the
        sampled training rows; the second holds the rows the splitter assigned
        to the test side (test_size=None, i.e. per the scikit-learn docs the
        complement of the training set).
    """
    labels = data[:, -1]
    splitter = StratifiedShuffleSplit(
        n_splits=1, train_size=train_size, test_size=None, random_state=seed)

    # n_splits=1, so the splitter yields exactly one (train, test) index pair
    train_index, test_index = next(splitter.split(data, labels))

    return data[train_index], data[test_index]

### Create datasets with different numbers of instances, but no noisy features or noisy labels

In [None]:
# Reproducibility
SEED = 0

# Dataset dimensions
N_INSTANCES = 30000
N_FEATURES = 25
N_CLASSES = 2
N_CLUSTERS = 1

# Noise settings: number of noisy features = N_FEATURES - N_INFORMATIVE (none here),
# and no label noise
N_INFORMATIVE = 25
LABEL_NOISE = 0.0

# Directory the csv files of this experiment are written to
DIR_PATH = "/home/alex/gitrepos/project_activation_function/generalisation/synthetic/size"

In [None]:
# Generate the full synthetic dataset from the settings of the configuration cell;
# the class label is the last column of `data`.
data = generate(
    N_INSTANCES, N_FEATURES, N_INFORMATIVE, LABEL_NOISE,
    seed=SEED, n_classes=N_CLASSES, n_clusters=N_CLUSTERS)

In [None]:
# Hold out a stratified test set and write it to a csv file.
# `remain` keeps every row that did not go into the test set.
TEST_SIZE = 5000
FILENAME = "test_set.csv"
WRITE_PATH = "/".join([DIR_PATH, FILENAME])

test_set, remain = create_test(data, TEST_SIZE)

# save the test set as comma-separated values
np.savetxt(WRITE_PATH, test_set, delimiter=",")

In [None]:
# Draw one stratified training set per requested size from `remain`
# and write each of them to its own csv file.
TRAIN_SIZE = [100, 500, 1000, 5000, 10000]

for train_size in TRAIN_SIZE:
    FILENAME = "{}_instances.csv".format(train_size)
    WRITE_PATH = "/".join([DIR_PATH, FILENAME])

    # only the sampled training rows are kept; the second return value is discarded
    train_set, _ = stratified_split(remain, train_size, seed=0)

    # save the training set to a csv file
    np.savetxt(WRITE_PATH, train_set, delimiter=",")


### Increasing number of noisy features but fixed number of training instances

In [None]:
# Reproducibility
SEED = 0

# Dataset dimensions
N_INSTANCES = 10000
N_FEATURES = 25
N_CLASSES = 2
N_CLUSTERS = 1

# One dataset (training + test file) is created per entry of N_INFORMATIVE;
# number of noisy features = N_FEATURES - N_INFORMATIVE
# N_INFORMATIVE = [25, 20, 15, 10, 5]
# NOTE(review): scikit-learn's make_classification requires
# n_classes * n_clusters_per_class <= 2 ** n_informative, so with 2 classes an
# entry of 0 is expected to raise ValueError -- confirm whether [0] is intended
# or the list above should be restored.
N_INFORMATIVE = [0]

# Fixed training-set size and no label noise
TRAIN_SIZE = 5000
LABEL_NOISE = 0.0

# Directory the csv files of this experiment are written to
DIR_PATH = "/home/alex/gitrepos/project_activation_function/generalisation/synthetic/noise_feature"

In [None]:
# For every informative-feature count: generate a dataset, split it into a
# training and a test part, and write both parts to csv files.
# NOTE: TEST_SIZE is assigned here but not referenced in this cell; the test
# part is the second array returned by stratified_split.
TEST_SIZE = 5000

for n_informative in N_INFORMATIVE:
    # Generate synthetic data
    data = generate(
        N_INSTANCES, N_FEATURES, n_informative, LABEL_NOISE,
        seed=SEED, n_classes=N_CLASSES, n_clusters=N_CLUSTERS)

    train_set, test_set = stratified_split(data, TRAIN_SIZE, seed=0)

    # training part; the file name starts with the number of noisy features
    FILENAME = "{}_noisy_features.csv".format(N_FEATURES - n_informative)
    WRITE_PATH = "/".join([DIR_PATH, FILENAME])
    np.savetxt(WRITE_PATH, train_set, delimiter=",")

    # test part
    FILENAME = "{}_noisy_features_test.csv".format(N_FEATURES - n_informative)
    WRITE_PATH = "/".join([DIR_PATH, FILENAME])
    np.savetxt(WRITE_PATH, test_set, delimiter=",")


### Increasing label noise, no noisy features, fixed number of instances

In [None]:
# Reproducibility
SEED = 0

# Dataset dimensions
N_INSTANCES = 10000
N_FEATURES = 25
N_CLASSES = 2
N_CLUSTERS = 1

# All features are informative:
# number of noisy features = N_FEATURES - N_INFORMATIVE
N_INFORMATIVE = 25

# One dataset (training + test file) is created per label-noise level
LABEL_NOISE = [0.0, 0.1, 0.01, 0.001]

# Fixed training-set size
TRAIN_SIZE = 5000

# Directory the csv files of this experiment are written to
DIR_PATH = "/home/alex/gitrepos/project_activation_function/generalisation/synthetic/noise_label"

In [None]:
# For every label-noise level: generate a dataset, split it into a training
# and a test part, and write both parts to csv files.
# NOTE: TEST_SIZE is assigned here but not referenced in this cell; the test
# part is the second array returned by stratified_split.
TEST_SIZE = 5000

for label_noise in LABEL_NOISE:
    # Generate synthetic data
    data = generate(
        N_INSTANCES, N_FEATURES, N_INFORMATIVE, label_noise,
        seed=SEED, n_classes=N_CLASSES, n_clusters=N_CLUSTERS)

    train_set, test_set = stratified_split(data, TRAIN_SIZE, seed=0)

    # training part; the file name starts with the noise level with its dot removed
    FILENAME = "{}_noisy_labels.csv".format(str(label_noise).replace('.', ''))
    WRITE_PATH = "/".join([DIR_PATH, FILENAME])
    np.savetxt(WRITE_PATH, train_set, delimiter=",")

    # test part
    FILENAME = "{}_noisy_labels_test.csv".format(str(label_noise).replace('.', ''))
    WRITE_PATH = "/".join([DIR_PATH, FILENAME])
    np.savetxt(WRITE_PATH, test_set, delimiter=",")

### Test loading the data

In [None]:
def load_csv(path):
    """
    Read a headerless csv file and split it into features and one-hot labels.
    Note: this function should only be used for dataset that can fit into memory.

    Args:
        path: a string that specifies the path to the csv file. The file must
            have no header row; its last column is taken as the class label.

    Return:
        (features, labels): two numpy arrays. `features` holds every column
        except the last one; `labels` holds the one-hot encoding of the last
        column (one column per distinct label value, dtype uint8).
    """

    # load raw data into memory; header=None because the files carry no header row
    frame = pd.read_csv(path, header=None)

    # the label is stored in the last column
    target_index = frame.columns[-1]

    # One-hot encode the labels. The dtype is pinned to uint8 because
    # get_dummies switched its default from uint8 to bool in pandas 2.0.
    labels = pd.get_dummies(frame[target_index], dtype=np.uint8)

    # build the feature frame without mutating `frame` in place
    features = frame.drop(target_index, axis=1)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same ndarray
    return features.values, labels.values
