In [1]:
# Import necessary libraries for the task.
import torch  # Import PyTorch for deep learning.
import torch.nn as nn  # Import the neural network module from PyTorch.
import numpy as np  # Import NumPy for numerical operations.


In [2]:
# Load the Iris dataset using Seaborn.
import seaborn as sns
iris_data = sns.load_dataset('iris')

# Convert the dataset from a pandas dataframe to a PyTorch tensor.
data = torch.tensor(iris_data[iris_data.columns[0:4]].values).float()

# Transform the species labels into numerical values.
labels = torch.zeros(len(data), dtype=torch.long)
labels[iris_data.species == 'versicolor'] = 1
labels[iris_data.species == 'virginica'] = 2


In [15]:
labels.unique(return_counts=True)

(tensor([0, 1, 2]), tensor([50, 50, 50]))

# Separate data into train and test

In [6]:
# Set the proportion of data for training (e.g., 80% for training).
training_proportion = 0.8

# Calculate the number of training examples based on the proportion.
num_training = int(len(labels) * training_proportion)

# Initialize a boolean vector to select data and labels for training.
train_test_bool = np.zeros(len(labels), dtype=bool)

# Randomly select samples for training.
train_indices = np.random.choice(range(len(labels)), num_training, replace=False)
train_test_bool[train_indices] = True


In [17]:
train_test_bool

array([False,  True, False,  True,  True,  True,  True,  True,  True,
       False,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True, False,  True, False, False, False,  True,  True,
        True, False,  True, False,  True,  True,  True,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True, False, False,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True, False, False,
        True,  True,

In [8]:
# Check the balance of species labels in the dataset.
print('Average of full dataset:')
print(torch.mean(labels.float()))  # Should be 1 by definition
print(' ')

print('Average of training dataset:')
print(torch.mean(labels[train_test_bool].float()))  # Should also be 1
print(' ')

print('Average of testing dataset:')
print(torch.mean(labels[~train_test_bool].float()))  # Should also be 1


Average of full dataset:
tensor(1.)
 
Average of training dataset:
tensor(0.9750)
 
Average of testing dataset:
tensor(1.1000)


In [9]:
# Define the architecture of the artificial neural network (ANN) model.
custom_ann_model = nn.Sequential(
    nn.Linear(4, 64),   # Input layer
    nn.ReLU(),          # ReLU activation
    nn.Linear(64, 64),  # Hidden layer
    nn.ReLU(),          # ReLU activation
    nn.Linear(64, 3)    # Output layer
)

# Define the loss function (Cross-Entropy Loss).
loss_function = nn.CrossEntropyLoss()

# Define the optimizer (Stochastic Gradient Descent).
optimizer = torch.optim.SGD(custom_ann_model.parameters(), lr=0.01)


In [10]:
# Print the shapes of the entire dataset, training set, and test set.
print('Shape of the entire dataset:')
print(data.shape)

print('Shape of the training set:')
print(data[train_test_bool, :].shape)

print('Shape of the test set:')
print(data[~train_test_bool, :].shape)


Shape of the entire dataset:
torch.Size([150, 4])
Shape of the training set:
torch.Size([120, 4])
Shape of the test set:
torch.Size([30, 4])


# Train and test the model

In [11]:
# Set the number of training epochs.
num_epochs = 1000

# Initialize losses and ongoing accuracy.
losses = torch.zeros(num_epochs)
ongoing_accuracy = []

# Loop over epochs for training.
for epoch_idx in range(num_epochs):

    # Forward pass through the model using the training data.
    predictions = custom_ann_model(data[train_test_bool, :])

    # Compute accuracy for the current epoch.
    epoch_accuracy = 100 * torch.mean(
        (torch.argmax(predictions, axis=1) == labels[train_test_bool]).float())
    ongoing_accuracy.append(epoch_accuracy)

    # Compute the Cross-Entropy loss.
    loss = loss_function(predictions, labels[train_test_bool])
    losses[epoch_idx] = loss

    # Backpropagation.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


In [12]:
# Final forward pass using the training data.
train_predictions = custom_ann_model(data[train_test_bool, :])
training_accuracy = 100 * torch.mean(
    (torch.argmax(train_predictions, axis=1) == labels[train_test_bool]).float())

# Final forward pass using the test data.
test_predictions = custom_ann_model(data[~train_test_bool, :])
testing_accuracy = 100 * torch.mean(
    (torch.argmax(test_predictions, axis=1) == labels[~train_test_bool]).float())


In [13]:
# Report the final training and testing accuracies.
print('Final TRAIN accuracy: %g%%' % training_accuracy)
print('Final TEST accuracy:  %g%%' % testing_accuracy)


Final TRAIN accuracy: 99.1667%
Final TEST accuracy:  96.6667%


In [None]:
# normally also inspect losses and accuracy by epoch, etc etc etc.

# Additional explorations

In [None]:
# 1) Randomly assigning data samples to be in the train vs test phase produced a statistical balance, but it was 
#    not perfect. Write an algorithm that will guarantee a balance of flower types while also randomly assigning
#    samples to be in train vs. test.
# 
# 2) Revert the code to its original form -- with the strong imbalance in flower types. Then train the model. What are
#    the train and test accuracies? Compute the accuracy separately for each type of flower to see whether the model
#    learned some categories, or whether it performed equally on all three categories. Are you surprised at the results? 
# 

## Question 1

In [36]:
import pandas as pd

In [37]:
# Shuffle data to induce randomness
shuffled_data = iris_data.sample(frac=1, random_state=740)
shuffled_data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
15,5.7,4.4,1.5,0.4,setosa
2,4.7,3.2,1.3,0.2,setosa
68,6.2,2.2,4.5,1.5,versicolor
37,4.9,3.6,1.4,0.1,setosa
65,6.7,3.1,4.4,1.4,versicolor


In [38]:
# Define variables
train_list = []
test_list = []
train_size = 0.8

for label in shuffled_data['species'].unique():
    # Get subset of dataframe of one label
    label_subset = shuffled_data[shuffled_data['species'] == label]

    # Find index to split dataframe on
    split_idx = int(len(label_subset) * train_size)

    # Append the subsets to the lists
    train_list.append(label_subset.iloc[:split_idx])
    test_list.append(label_subset.iloc[split_idx:])

# Concat the lists into dataframes, then shuffle to randomize
train_df = pd.concat(train_list).sample(frac=1, random_state=740).reset_index(drop=True)
test_df = pd.concat(test_list).sample(frac=1, random_state=740).reset_index(drop=True)



In [41]:
train_df['species'].value_counts()

species
setosa        40
versicolor    40
virginica     40
Name: count, dtype: int64

In [43]:
test_df['species'].value_counts()

species
versicolor    10
virginica     10
setosa        10
Name: count, dtype: int64

## Question 2

In [46]:
train_preds = torch.argmax(train_predictions, dim=1)
test_preds = torch.argmax(test_predictions, dim=1)

In [50]:
labels[train_test_bool]
labels[~train_test_bool]

tensor([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2])

In [64]:
num_classes = len(iris_data['species'].unique())
correct_counts = np.zeros(num_classes)
total_counts = np.zeros(num_classes)

for i in range(num_classes):
    correct_counts[i] += ((train_preds == labels[train_test_bool]) & (labels[train_test_bool] == i)).sum().item()
    total_counts[i] += (labels[train_test_bool] == i).sum().item()

class_train_accuracies = {f'Class {i}': correct_counts[i] / total_counts[i] if total_counts[i] > 0 else 0 for i in range(num_classes)}
class_train_accuracies

{'Class 0': np.float64(1.0),
 'Class 1': np.float64(0.972972972972973),
 'Class 2': np.float64(1.0)}

In [65]:
num_classes = len(iris_data['species'].unique())
correct_counts = np.zeros(num_classes)
total_counts = np.zeros(num_classes)

for i in range(num_classes):
    correct_counts[i] += ((test_preds == labels[~train_test_bool]) & (labels[~train_test_bool] == i)).sum().item()
    total_counts[i] += (labels[~train_test_bool] == i).sum().item()

class_train_accuracies = {f'Class {i}': correct_counts[i] / total_counts[i] if total_counts[i] > 0 else 0 for i in range(num_classes)}
class_train_accuracies

{'Class 0': np.float64(1.0),
 'Class 1': np.float64(0.9230769230769231),
 'Class 2': np.float64(1.0)}