# COURSE: A deep understanding of deep learning
## SECTION: Overfitting, cross-validation, regularization
### LECTURE: Cross-validation -- manual separation
#### TEACHER: Mike X Cohen, sincxpress.com
##### COURSE URL: udemy.com/course/deeplearning_x/?couponCode=202305

In [160]:
# import libraries
import torch
import torch.nn as nn
import numpy as np

In [161]:
# load the iris dataset (bundled with seaborn) as a pandas dataframe
import seaborn as sns
iris = sns.load_dataset('iris')


# take the first four columns and convert them to a float tensor
featureColumns = iris.columns[0:4]
data = torch.tensor( iris[featureColumns].values ).float()

# encode species as integer class labels;
# 'setosa' keeps the 0 it gets from the zeros initialization
labels = torch.zeros(len(data), dtype=torch.long)
for classCode,speciesName in enumerate(['versicolor','virginica'],start=1):
  labels[iris.species==speciesName] = classCode

# Separate data into train and test

In [162]:
#  (no devset here)

# fraction of the samples used for training (a proportion, not a percent)
propTraining = .8
nSamples     = len(labels)
nTraining    = int(nSamples*propTraining)

# boolean mask over all samples: True -> training set, False -> test set
traintestBool = np.zeros(nSamples,dtype=bool)

# is this the correct way to select samples?
# (it simply marks the FIRST nTraining rows as training data)
traintestBool[:nTraining] = True

# this is better, but why?
#items2use4train = np.random.choice(range(len(labels)),nTraining,replace=False)
#traintestBool[items2use4train] = True
#traintestBool

# items2use4train = np.random.choice(range(len(labels)//3),nTraining//3,replace=False)
# items2use4trainExtended = np.append(items2use4train,50+items2use4train)
# items2use4trainExtended = np.append(items2use4trainExtended,100+items2use4train)
# traintestBool[items2use4trainExtended] = True
# items2use4trainExtended

In [163]:
# test whether it's balanced: compare the mean label of each partition
partitions = (
    ('Average of full data:',     labels),                  # =1 by definition
    ('Average of training data:', labels[traintestBool]),   # should be 1...
    ('Average of test data:',     labels[~traintestBool]),  # should also be 1...
)

for idx,(title,subset) in enumerate(partitions):
  # blank separator line between reports (not before the first one)
  if idx > 0:
    print(' ')
  print(title)
  print( subset.float().mean() )

Average of full data:
tensor(1.)
 
Average of training data:
tensor(0.7500)
 
Average of test data:
tensor(2.)


In [164]:
# show which class labels ended up in the training partition
trainLabels = labels[traintestBool]
print(trainLabels.float())

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
        2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.])


In [165]:
# create the ANN model

# layer widths: 4 inputs -> 64 -> 64 -> 3 outputs
nInputs,nHidden,nOutputs = 4,64,3

# model architecture (layers are applied in the order listed)
layers = [
    nn.Linear(nInputs,nHidden),   # input layer
    nn.ReLU(),                    # activation unit
    nn.Linear(nHidden,nHidden),   # hidden layer
    nn.ReLU(),                    # activation unit
    nn.Linear(nHidden,nOutputs),  # output units
  ]
ANNiris = nn.Sequential(*layers)

# loss function
lossfun = nn.CrossEntropyLoss()

# optimizer: SGD over all model parameters
optimizer = torch.optim.SGD(params=ANNiris.parameters(),lr=.01)

In [166]:
# sizes of, in order: the entire dataset, the training set, the test set
for subset in ( data, data[traintestBool,:], data[~traintestBool,:] ):
  print( subset.shape )

torch.Size([150, 4])
torch.Size([120, 4])
torch.Size([30, 4])


# Train and test the model

In [167]:
# train the model (every epoch uses the whole training partition at once)

numepochs = 1000

# the training partition never changes, so slice it once instead of every epoch
trainData   = data[traintestBool,:]
trainLabels = labels[traintestBool]

# initialize losses
losses = torch.zeros(numepochs)
ongoingAcc = []

# loop over epochs
for epochi in range(numepochs):

  # forward pass
  yHat = ANNiris(trainData)

  # compute accuracy: percent of training samples whose highest-scoring
  # output unit matches the true label
  ongoingAcc.append( 100*torch.mean(
              (torch.argmax(yHat,dim=1) == trainLabels).float()) )

  # compute loss
  loss = lossfun(yHat,trainLabels)

  # log only the scalar value: assigning the loss tensor itself would attach
  # `losses` to the autograd graph (it would then need .detach() before
  # plotting or converting to numpy)
  losses[epochi] = loss.item()

  # backprop
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

In [168]:
# compute train and test accuracies

# evaluation only: no_grad() avoids building an autograd graph that would
# never be used for a backward pass
with torch.no_grad():

  # final forward pass USING TRAINING DATA
  predictionsTrain = ANNiris(data[traintestBool,:])

  # final forward pass USING TEST DATA!
  predictionsTest = ANNiris(data[~traintestBool,:])

# accuracy = percent of samples whose highest-scoring output unit equals the true label
trainacc = 100*torch.mean((torch.argmax(predictionsTrain,dim=1) == labels[traintestBool]).float())
testacc  = 100*torch.mean((torch.argmax(predictionsTest,dim=1) == labels[~traintestBool]).float())

In [169]:
# report accuracies (train first, then test)
accuracyReport = (
    ('Final TRAIN accuracy: %g%%', trainacc),
    ('Final TEST accuracy:  %g%%', testacc),
)

for template,accuracy in accuracyReport:
  print(template %accuracy)

Final TRAIN accuracy: 98.3333%
Final TEST accuracy:  70%


In [170]:
# normally also inspect losses and accuracy by epoch, etc etc etc.

In [171]:
# number of correctly classified samples per class, separately for test and train
predictedTest  = torch.argmax(predictionsTest,axis=1)
predictedTrain = torch.argmax(predictionsTrain,axis=1)
actualTest     = labels[~traintestBool]
actualTrain    = labels[traintestBool]

def _countHits(predicted,actual,classLabel):
  """Count samples whose predicted AND true label both equal classLabel."""
  hits = (predicted == classLabel) & (actual == classLabel)
  return hits.sum().item()

label0Testaccurate,label1Testaccurate,label2Testaccurate = (
    _countHits(predictedTest,actualTest,classLabel) for classLabel in range(3) )

label0Trainaccurate,label1Trainaccurate,label2Trainaccurate = (
    _countHits(predictedTrain,actualTrain,classLabel) for classLabel in range(3) )

In [182]:
# per-class correct counts, train next to test for each label
countReport = (
    ('label0Trainaccurate: ',label0Trainaccurate),
    ('label0Testaccurate: ',label0Testaccurate),
    ('label1Trainaccurate: ',label1Trainaccurate),
    ('label1Testaccurate: ',label1Testaccurate),
    ('label2Trainaccurate: ',label2Trainaccurate),
    ('label2Testaccurate: ',label2Testaccurate),
)

for caption,count in countReport:
  print(caption,count)

label0Trainaccurate:  50
label0Testaccurate:  0
label1Trainaccurate:  49
label1Testaccurate:  0
label2Trainaccurate:  19
label2Testaccurate:  21


In [173]:
# correctly classified label-2 samples in the test set (left as a tensor for display)
((torch.argmax(predictionsTest,axis=1) == 2) & (labels[~traintestBool] == 2)).sum()

tensor(21)

# Additional explorations

In [174]:
# 1) Randomly assigning data samples to be in the train vs test phase produced a statistical balance, but it was 
#    not perfect. Write an algorithm that will guarantee a balance of flower types while also randomly assigning
#    samples to be in train vs. test.
# MY COMMENTS: Final label0 accuracy: 100%
#              Final label1 accuracy: 100%
#              Final label2 accuracy: 100%
# 
# 2) Revert the code to its original form -- with the strong imbalance in flower types. Then train the model. What are
#    the train and test accuracies? Compute the accuracy separately for each type of flower to see whether the model
#    learned some categories, or whether it performed equally on all three categories. Are you surprised at the results? 
# MY COMMENTS: The test set consisted only of the last 30 samples, all of which are label 2; 21 of them were
#              classified correctly, giving a test accuracy of 70%.