In [34]:
import os
import argparse
import math as m
from collections import Counter
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [35]:
datasetname = 'unedited_data/Seawater.csv'
prefix = 'Seawater'
percent = 0.25
verbose = 1

In [36]:
# Read original dataset
if os.path.exists(datasetname): # -d
    df = pd.read_csv(datasetname, header=None, encoding='latin-1')
else:
    raise FileNotFoundError('File %s not found!' % (datasetname))

# Making sure NAs are taken care of
df.dropna(axis=1, how='all', inplace=True)
df.fillna(0, inplace=True)

df = df.rename(columns={df.columns[0]: 'label'})
df = df.reset_index(drop=True)

In [37]:
# Dealing with labels and classes
classes = dict(enumerate(np.unique(df.label)))
classes = {v: k for k, v in classes.items()}
df.label = df.label.map(classes)
df_classes_counts = Counter(df.label).values()
# Save dictionary
np.save('../data/{}_labels_dict.npy'.format(prefix), classes) 

In [38]:
X_train, X_test, y_train, y_test = train_test_split(df, df.label,
                                                stratify=df.label, 
                                                test_size=percent)

In [39]:
X_train_classes_counts = Counter(X_train.label).values()
X_test_classes_counts = Counter(X_test.label).values()

# Saving to CSV files
X_train.to_csv('../data/{}_TRAIN'.format(prefix), header=None, index_label=None, index=None) # -p
X_test.to_csv('../data/{}_TEST'.format(prefix), header=None, index_label=None, index=None)

if verbose == 1: # -v
    print()
    print("Finished loading dataset...")
    print("------------------------")
    print("Number of samples:  ", df.shape[0])
    print("Number of classes:  ", len(np.unique(df.label)))
    print("Number of samples per class:  ", df_classes_counts)
    print("Sequence length:  ", df.shape[-1])
    print("------------------------")
    print("Number of samples in training set:  ", X_train.shape[0])
    print("Number of classes in training set:  ", len(X_train_classes_counts))
    print("Number of training samples per class:  ", X_train_classes_counts)
    print("------------------------")
    print("Number of samples in test set:  ", X_test.shape[0])
    print("Number of classes in test set:  ", len(X_test_classes_counts))
    print("Number of testing samples per class:  ", X_test_classes_counts)


Finished loading dataset...
------------------------
Number of samples:   351
Number of classes:   11
Number of samples per class:   dict_values([8, 32, 23, 12, 18, 60, 50, 23, 37, 8, 80])
Sequence length:   1003
------------------------
Number of samples in training set:   263
Number of classes in training set:   11
Number of training samples per class:   dict_values([45, 37, 24, 60, 6, 17, 14, 28, 17, 9, 6])
------------------------
Number of samples in test set:   88
Number of classes in test set:   11
Number of testing samples per class:   dict_values([9, 13, 15, 20, 8, 2, 6, 6, 2, 4, 3])
