In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
import utils
from sklearn.model_selection import train_test_split

### Set path to data

In [None]:
# Name of the dataset folder; it is substituted into every input/output path in this notebook.
DATA = "easy_informative_10"

In [None]:
# Location of data.csv for the dataset selected via DATA.
DATA_CSV_TEMPLATE = "/home/alex/gitrepos/project_activation_function/generalisation/synthetic/{}/data.csv"
PATH = DATA_CSV_TEMPLATE.format(DATA)

### Load data into pandas data frame

In [None]:
# Read the comma-separated file at PATH; there is no header row, so pandas
# assigns integer column labels.
# Options that are currently disabled: sep='\s+' (whitespace-separated input),
# dtype=np.float32, na_values=["?"].
data = pd.read_csv(PATH, sep=',', header=None, skiprows=None)

Basic data information

### Preprocessing
Convert categorical variables to dummy variables <br/>
Delete columns <br/>
Extract binary classes

In [None]:
# # create dummy variables for categorical variables
# seismic_dummies = pd.get_dummies(data[0])
# seismoacoustic_dummies = pd.get_dummies(data[1])
# shift_dummies = pd.get_dummies(data[2])
# ghazard_dummies = pd.get_dummies(data[7])
# # delete original categorical variables
# data.drop([0, 1, 2, 7], axis=1, inplace=True)

In [None]:
# data.isnull().any().any()

In [None]:
# data = data[(data[40] == 1) | (data[40] == 2)]

In [None]:
# data = pd.concat([seismic_dummies, seismoacoustic_dummies, shift_dummies, ghazard_dummies, data], axis=1)

In [None]:
# data.drop(["Study", "Run"], axis=1, inplace=True)

In [None]:
# data[10] = data[10].astype('category').cat.codes

In [None]:
# dummies = pd.get_dummies(data[0])
# data.drop(0, axis=1, inplace=True)
# data = pd.concat([dummies, data], axis=1)

In [None]:
# data.to_csv("/home/ypen260/gitrepos/project_activationfunction/generalisation/uci_data/magic04_new/data.csv",
#                 header=False,
#                 index=False)

### Stratified sampling to split the data into a test set and the remaining data

In [None]:
# Reload the dataset before splitting. PATH (defined in the path-configuration
# cell) already holds this exact location, so reuse it rather than repeating
# the literal path, which could drift out of sync with PATH.
data = pd.read_csv(PATH, header=None, sep=',', skiprows=None)

In [None]:
# Stratify on the last column (the class label) and hold out 30% as the test set.
target_index = data.columns[-1]
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
# n_splits=1, so the generator yields exactly one (rest, test) pair.
rest_index, test_index = next(split.split(data, data[target_index]))
# split() returns *positional* row indices, so select with .iloc; label-based
# .loc is only equivalent while the frame has a default 0..n-1 index.
rest_set = data.iloc[rest_index]
test_set = data.iloc[test_index]


In [None]:
# Persist both partitions without a header row or index column, matching the
# layout of the input file.
csv_layout = dict(header=False, index=False)
rest_set.to_csv(
    "/home/alex/gitrepos/project_activation_function/generalisation/synthetic/{}/rest.csv".format(DATA),
    **csv_layout)
test_set.to_csv(
    "/home/alex/gitrepos/project_activation_function/generalisation/synthetic/{}/test.csv".format(DATA),
    **csv_layout)

### Generate training set and unlabelled data

In [None]:
# Read back the non-test portion written by the previous section (no header row).
rest_csv = "/home/alex/gitrepos/project_activation_function/generalisation/synthetic/{}/rest.csv".format(DATA)
remaining_data = pd.read_csv(rest_csv, sep=',', header=None, skiprows=None)

In [None]:
# Split remaining data into training data and unlabelled data.
# Stratified on the last column; the 10% "test" side of the split becomes the
# labelled training set and the other 90% becomes the unlabelled pool.
target_index = remaining_data.columns[-1]
split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
# n_splits=1, so the generator yields exactly one pair of index arrays.
unlabelled_index, train_index = next(split.split(remaining_data, remaining_data[target_index]))
# split() returns *positional* row indices, so select with .iloc; label-based
# .loc is only equivalent while the frame has a default 0..n-1 index.
# `unlabelled` is copied because a later cell drops its label column in place.
unlabelled = remaining_data.iloc[unlabelled_index].copy()
train_set = remaining_data.iloc[train_index]


In [None]:
# delete all the labels for unlabelled data
target_index = remaining_data.columns[-1]
# Rebind instead of mutating in place, and tolerate an already-removed label
# column so that re-running this cell is a no-op rather than a KeyError.
unlabelled = unlabelled.drop(target_index, axis=1, errors="ignore")

In [None]:
# Write the labelled training rows and the label-free rows to disk
# (no header row, no index column).
outputs = (
    (train_set, "/home/alex/gitrepos/project_activation_function/generalisation/synthetic/{}/train.csv"),
    (unlabelled, "/home/alex/gitrepos/project_activation_function/generalisation/synthetic/{}/unlabelled.csv"),
)
for frame, csv_template in outputs:
    frame.to_csv(csv_template.format(DATA), header=False, index=False)

### Generate training and validation sets for pretraining

In [None]:
# Paths of the train / unlabelled CSV files for the dataset selected via DATA.
train_path, unlabelled_path = (
    template.format(DATA)
    for template in (
        "/home/alex/gitrepos/project_activation_function/generalisation/synthetic/{}/train.csv",
        "/home/alex/gitrepos/project_activation_function/generalisation/synthetic/{}/unlabelled.csv",
    )
)

In [None]:
# Reload both files from disk; neither has a header row.
train_set, unlabelled = [
    pd.read_csv(csv_path, header=None)
    for csv_path in (train_path, unlabelled_path)
]

In [None]:
# Strip the label column from the reloaded training data so its columns match
# `unlabelled`. The label is identified as whatever `train_set` has that
# `unlabelled` lacks, so this cell relies only on the two frames loaded in this
# section (not on `remaining_data` from an earlier section), and re-running it
# is a no-op instead of a KeyError.
label_columns = train_set.columns.difference(unlabelled.columns)
train_set = train_set.drop(label_columns, axis=1)

In [None]:
# Stack the training rows on top of the unlabelled rows (row-wise concat;
# the original row labels of each frame are kept).
combined = pd.concat((train_set, unlabelled))

In [None]:
# Random 70/30 split of the combined rows (fixed seed) into
# unlabelled_train and unlabelled_val.
pretrain_parts = train_test_split(combined, test_size=0.3, random_state=0)
unlabelled_train, unlabelled_val = pretrain_parts

In [None]:
# Output locations for the two halves of the unlabelled split.
train_template = "/home/alex/gitrepos/project_activation_function/generalisation/synthetic/{}/unlabelled_train.csv"
val_template = "/home/alex/gitrepos/project_activation_function/generalisation/synthetic/{}/unlabelled_val.csv"
unlabelled_train_path = train_template.format(DATA)
unlabelled_val_path = val_template.format(DATA)

In [None]:
# Save both halves without a header row or index column.
for frame, destination in (
    (unlabelled_train, unlabelled_train_path),
    (unlabelled_val, unlabelled_val_path),
):
    frame.to_csv(destination, header=False, index=False)

In [None]:
# Build the supervised-pretraining train and validation sets from the two
# unlabelled CSV files, using the same seed for both calls.
pretrain_seed = 0
pre_train = utils.get_pretrain_data(unlabelled_train_path, seed=pretrain_seed)
pre_val = utils.get_pretrain_data(unlabelled_val_path, seed=pretrain_seed)

In [None]:
# Sanity checks: each pretraining array must have twice as many rows and one
# more column than the frame it was built from.
# NOTE(review): presumably the extra rows/column come from utils.get_pretrain_data
# adding generated samples plus a label column -- confirm against utils.
shape_pairs = ((unlabelled_train, pre_train), (unlabelled_val, pre_val))
for source_frame, pretrain_array in shape_pairs:
    assert pretrain_array.shape[0] == 2 * source_frame.shape[0]
for source_frame, pretrain_array in shape_pairs:
    assert pretrain_array.shape[1] == source_frame.shape[1] + 1

In [None]:
# Write the pretraining arrays as comma-delimited text files.
pretrain_outputs = (
    ("/home/alex/gitrepos/project_activation_function/generalisation/synthetic/{}/pre_train.csv", pre_train),
    ("/home/alex/gitrepos/project_activation_function/generalisation/synthetic/{}/pre_val.csv", pre_val),
)
for path_template, pretrain_array in pretrain_outputs:
    np.savetxt(path_template.format(DATA), pretrain_array, delimiter=",")
