In [None]:
# import libraries
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Build a small synthetic dataset so the effect of each split is easy to see.

# 'fakedata' is a 10x4 integer matrix built by broadcasting:
#   - the column offsets [1, 2, 3, 4] are shared by every row
#   - row r (counting from 1) adds 10*r to each of those offsets
# so the first row is [11, 12, 13, 14] and the last is [101, 102, 103, 104].
# Every value encodes its own row and column, which makes it obvious where a
# sample ended up after shuffling.
column_offsets = np.arange(1, 5)
row_offsets = 10 * np.arange(1, 11)
fakedata = column_offsets[np.newaxis, :] + row_offsets[:, np.newaxis]

# 'fakelabels' is a Boolean vector with one entry per row of 'fakedata':
# False for row positions 0-4 and True for row positions 5-9.
fakelabels = np.arange(10) >= 5

# Show the data matrix, a blank separator line, and then the labels.
print(fakedata, ' ', fakelabels, sep='\n')


# Using train_test_split

In [None]:
# Partition proportions, in the order: training, development set (devset), test.
partitions = [.8, .1, .1]

# Stage 1: carve off the training portion. Everything that is left over is
# parked in the temporary 'testTMP' variables and divided again below.
train_data, testTMP_data, train_labels, testTMP_labels = train_test_split(
    fakedata, fakelabels, train_size=partitions[0])

# Stage 2: divide the leftover samples into devset and test.
# The fraction handed to train_test_split is the devset's share of what
# remains (devset + test), not its share of the full dataset.
devset_fraction = partitions[1] / np.sum(partitions[1:])
devset_data, test_data, devset_labels, test_labels = train_test_split(
    testTMP_data, testTMP_labels, train_size=devset_fraction)

# Report the shape of each partition.
print(f'Training data size: {train_data.shape}')
print(f'Devset data size: {devset_data.shape}')
print(f'Test data size: {test_data.shape}')
print(' ')

# Show the rows that landed in each partition, separated by blank lines.
print('Training data: ', train_data, ' ', sep='\n')
print('Devset data: ', devset_data, ' ', sep='\n')
print('Test data: ', test_data, sep='\n')


# Splitting the data manually using numpy

In [None]:
# Specify sizes of the partitions
# The order is train, devset, test
partitions = [.8, .1, .1]

# The proportions have to cover the whole dataset; otherwise samples would be
# silently left out or the boundaries would run past the end of the data.
assert np.isclose(np.sum(partitions), 1), 'partition proportions must sum to 1'

# Convert the proportions into integer row boundaries.
# The cumulative sum gives the end point of each partition as a fraction of
# the dataset; scaling by the number of samples turns that into a row count.
# Round before casting: plain truncation can land one row short when the
# floating-point cumulative sum is a hair below the intended value.
partitionBnd = np.round(np.cumsum(partitions) * len(fakelabels)).astype(int)

# Shuffle the row indices so the partitions are drawn at random instead of
# following the original row order. The same permutation is applied to both
# the data and the labels in the next cell, which keeps them aligned.
randindices = np.random.permutation(len(fakelabels))

# Print out the quantities that drive the manual split
print('Partition proportions:')
print(partitions)
print(' ')

print('Partition boundaries:')
print(partitionBnd)
print(' ')

print('Randomized data indices:')
print(randindices)


In [None]:
# Cut the index vector `randindices` at the first two partition boundaries.
# This yields three consecutive index groups:
#   positions [0, partitionBnd[0])               -> training rows
#   positions [partitionBnd[0], partitionBnd[1]) -> devset rows
#   positions [partitionBnd[1], end)             -> test rows
# NOTE(review): `randindices` and `partitionBnd` are not defined in this cell;
# they are expected to come from an earlier cell (presumably a shuffled vector
# of row indices and the cumulative partition end points) -- confirm.
train_rows, devset_rows, test_rows = np.split(
    randindices, [partitionBnd[0], partitionBnd[1]])

# Training partition: index the data and the labels with the same row group
# so that every sample keeps its own label.
train_dataN = fakedata[train_rows, :]
train_labelsN = fakelabels[train_rows]

# Devset partition.
devset_dataN = fakedata[devset_rows, :]
devset_labelsN = fakelabels[devset_rows]

# Test partition: whatever rows remain after the devset boundary.
test_dataN = fakedata[test_rows, :]
test_labelsN = fakelabels[test_rows]


In [None]:
# Report the shape (rows, columns) of each manually built partition.
# `train_dataN`, `devset_dataN`, and `test_dataN` hold the training, devset,
# and test rows; the f-string formats each shape tuple exactly as str() would.
print(f'Training data size: {train_dataN.shape}')
print(f'Devset size: {devset_dataN.shape}')
print(f'Test data size: {test_dataN.shape}')
print(' ')

# Show the contents of each partition so the split can be inspected by eye.
# Each call prints a title, the array, and (except for the last partition)
# a blank separator line.
print('Training data: ', train_dataN, ' ', sep='\n')
print('Devset data: ', devset_dataN, ' ', sep='\n')
print('Test data: ', test_dataN, sep='\n')
