In [1]:
import sklearn.datasets as datasets
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
import pandas as pd
import errno
import os


In [2]:
def generate(n_instances, n_features, n_informative, n_redundant, n_classes, class_sep=1.0, n_clusters=1, seed=0):
    """Build a synthetic classification dataset with the label as last column.

    Thin wrapper around ``sklearn.datasets.make_classification`` that pins
    some of its options (no repeated features, no explicit class weights,
    no label flipping, unit scale, shuffling enabled) and exposes the rest.

    Parameters
    ----------
    n_instances : int
        Number of samples (rows) to generate.
    n_features : int
        Total number of feature columns.
    n_informative : int
        Number of informative features.
    n_redundant : int
        Number of redundant features.
    n_classes : int
        Number of target classes.
    class_sep : float, default 1.0
        Class separation; larger values spread the clusters/classes out and
        make the classification task easier.
    n_clusters : int, default 1
        Number of clusters per class.
    seed : int, default 0
        Passed as ``random_state`` so the generated data is reproducible.

    Returns
    -------
    numpy.ndarray
        The feature matrix with the label vector appended as one extra,
        final column.
    """
    options = dict(
        n_samples=n_instances,
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=n_redundant,
        n_repeated=0,                     # never duplicate features
        n_classes=n_classes,
        n_clusters_per_class=n_clusters,
        weights=None,                     # no explicit weights -> balanced classes
        flip_y=0,                         # no randomly reassigned labels
        class_sep=class_sep,
        scale=1.0,                        # features are left unscaled
        shuffle=True,                     # shuffle both samples and features
        random_state=seed,
    )
    features, labels = datasets.make_classification(**options)

    # Glue the labels onto the right-hand side of the feature matrix.
    return np.column_stack((features, labels))

# Start generating synthetic data

In [None]:
# Settings for this dataset: every one of the 10 features is informative,
# two classes, class separation of 2. Arguments not listed here keep the
# defaults declared by generate().
generation_settings = {
    "n_instances": 10000,
    "n_features": 10,
    "n_informative": 10,
    "n_redundant": 0,
    "n_classes": 2,
    "class_sep": 2,
}
data = generate(**generation_settings)

In [None]:
# Sanity check: show the dimensions of `data` as the cell output.
data.shape

In [None]:
# Directory that receives the generated dataset.
WRITE_PATH = "/home/alex/gitrepos/project_activation_function/generalisation/synthetic/easy_informative_10/"

# Create the directory tree. An "already exists" error is tolerated so the
# cell can be re-run; every other OSError is propagated unchanged.
try:
    os.makedirs(WRITE_PATH)
except OSError as err:
    if err.errno == errno.EEXIST:
        pass
    else:
        raise

# Persist the array as comma-separated text.
np.savetxt(os.path.join(WRITE_PATH, "data.csv"), data, delimiter=",")

In [18]:
# Name of the dataset directory to work on.
DATA = "difficult_informative_10"

# Location of that dataset's rest.csv, derived from the directory name.
PATH_TEMPLATE = "/home/alex/gitrepos/project_activation_function/generalisation/synthetic/{}/rest.csv"
PATH = PATH_TEMPLATE.format(DATA)

In [19]:
# Read the comma-separated file at PATH. It has no header row, so pandas
# assigns integer column labels; no rows are skipped.
read_options = {
    "header": None,
    "sep": ',',
    "skiprows": None,
}
data = pd.read_csv(PATH, **read_options)

In [20]:
# Sanity check: show the (rows, columns) of the loaded frame as the cell output.
data.shape

(7000, 11)

In [21]:
# Stratify on the last column of the frame (presumably the class label, as
# appended by generate() -- confirm this holds for the file being loaded).
target_index = data.columns[-1]

# One stratified draw of 2000 training rows; the fixed random_state makes the
# sample reproducible.
split = StratifiedShuffleSplit(n_splits=1, train_size=2000, random_state=0)

# With n_splits=1 the splitter yields exactly one (train, test) pair, so take
# it directly instead of looping. The test indices are not needed.
train_index, _ = next(split.split(data, data[target_index]))

# split() returns *positional* row indices, so they must be applied with
# .iloc. Label-based .loc only gives the same rows while the frame still has
# its default 0..n-1 index and would silently pick wrong rows (or raise)
# after any filtering or re-indexing.
train_set = data.iloc[train_index]



In [22]:
# Store the sampled training rows inside the dataset's directory, written
# without a header row and without the index column.
train_file = "/home/alex/gitrepos/project_activation_function/generalisation/synthetic/{}/train_2000.csv".format(DATA)
train_set.to_csv(train_file, header=False, index=False)