# Deep Learning Lab
Stephen Casper, scasper@college.harvard.edu

### Imports and Google Drive Filesystem Configuration
Also, make sure to connect to a GPU runtime. Above, select Runtime > Change runtime type > Hardware accelerator > GPU.

In [0]:
# Numpy is great for working with arrays and algebra.
import numpy as np

# SKLearn is useful for machine learning.
import sklearn as skl

# Pandas is helpful for working with tables of data.
import pandas as pd

# PyTorch is a state of the art deep learning library.
import torch
import torch.utils.data  # datasets
import torchvision.datasets as dsets  # datasets
import torchvision.transforms as transforms  # data transformations
import torchvision  # for working with images 
import torch.nn as nn  # networks
import torch.nn.functional as F  # key functions
import torch.optim as optim  # optimizers

# Matplotlib is great for plotting
import matplotlib.pyplot as plt

# For reproducibility
np.random.seed(0)
torch.manual_seed(0)

# from google.colab import drive
# drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


### Train and Test Functions

In [0]:
# Function to train a network
def train(model, train_loader, epochs, device):

    print('Training...')

    model.train()  # Put model in training mode

    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

    train_accs = []  # Initialize list for epoch accuracies.

    # Iterate over epochs/passes of the training set.
    for epoch in range(epochs):
        print('Training epoch ' + str(epoch+1) + '...')

        # To keep track of accuracy 
        epoch_total = 0
        epoch_correct = 0

        # Iterate over dataset
        for X, y in train_loader:
            X = X.to(device) # Pass X, y to the GPU. X may be augmented depending on the train loader. 
            y = y.to(device)
            optimizer.zero_grad()  # Reset the gradeints.
            y_hat = model(X)  # Get the outputs of the model.
            loss = criterion(y_hat, y)  # Get the loss according to the criterion.
            loss.backward()  # Backpropagate to calculate the gradients.
            optimizer.step()  # Take an optimization step and adjust the model parameters.

            _, pred_labels = torch.max(y_hat.data, 1)  # Get labels from the outputs.
            epoch_total += y.size(0)  # Add to total.
            epoch_correct += (pred_labels == y).sum()  # Add to correct.
        
        train_accs.append(float(int(epoch_correct)/epoch_total))  # Update list of epoch accuracies.

    return train_accs

# Function to test a network
def test(model, test_loader, device):

    print('Testing...')

    model.eval()  # Put model in testing mode.

    # To keep track of accuracy 
    test_total = 0
    test_correct = 0

    # Iterate over dataset
    for X, y in test_loader:
        X = X.to(device) # Pass X, y to the GPU.
        y = y.to(device)
        y_hat = model(X)  # Get the outputs of the model.
        _, pred_labels = torch.max(y_hat.data, 1)  # Get labels from the outputs.
        test_total += y.size(0)  # Add to total.
        test_correct += (pred_labels == y).sum()  # Add to correct.
        
    return float(int(test_correct)/test_total)

### Multilayer Perceptron for [Wisconsin Breast Cancer](https://www.kaggle.com/uciml/breast-cancer-wisconsin-data/kernels) Dataset
You will need to make a Kaggle account, download the data, and upload it to google drive with the menu on the left. 

In [0]:
# Get raw Wisconsin Breast Cancer data as pandas dataframe, extract labels, and drop useless columns.
raw_df = pd.read_csv('/content/data.csv', delimiter=",")  # TODO CASPER CHANGE
raw_shuffled_df = raw_df.sample(frac=1, random_state=0)  # Shuffle rows.
labels_df = raw_shuffled_df['diagnosis']  # Get labels. They are 'M' for malignent and 'B' for benign.
raw_shuffled_df = raw_shuffled_df.drop(columns=['id', 'diagnosis', 'Unnamed: 32'])  # drop columns we don't want in X.

# Get data in np.array form.
X = raw_shuffled_df.to_numpy()
y = np.zeros(X.shape[0])
y[labels_df == 'M'] = 1  # 0s and 1s

# Split train and test data.
n_total = X.shape[0]
dims = X.shape[1]
n_train = int(n_total * 0.5)
X_train = X[:n_train]
y_train = y[:n_train]
X_test = X[n_train:]
y_test = y[n_train:]

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=4, random_state=0)
clf.fit(X_train, y_train)
yclf_hat = clf.predict(X_test)
print(np.sum(yclf_hat==y_test) / n_train)

# Dataset class
class basic_dataset(torch.utils.data.Dataset):  # This is a subclass of torch.utils.data.Dataset.

    def __init__(self, X, y):  # Initialization function

        # Standardize and get from numpy.
        self.X = torch.from_numpy((X - np.mean(X, axis=0)) / np.std(X, axis=0)).float()
        self.y = torch.from_numpy(y).long()

    def __len__(self):
        return self.X.size(0)  # Return length.

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]  # Return item.

# Neural Network
class WBC_MLP(nn.Module):  # This is a subclass of torch.nn.Module.

    def __init__(self):  # Initialization function
        super(WBC_MLP, self).__init__()  # Initialization function for parent class.

        # Instantiate l2 hidden layers.
        self.linear1 = nn.Linear(dims, 10)
        self.linear2 = nn.Linear(10, 2)

    def forward(self, X):  # Function to get network's output.

        X = F.relu(self.linear1(X))
        X = self.linear2(X)
        return X

# Use GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # use GPU if possible

# Get model.
wbc_model = WBC_MLP().to(device)

# Get data loaders.
wbc_train_loader = torch.utils.data.DataLoader(dataset=basic_dataset(X_train, y_train), batch_size=8, shuffle=True)
wbc_test_loader = torch.utils.data.DataLoader(dataset=basic_dataset(X_test, y_test), batch_size=8)

# Train and test.
wbc_train_accs = train(model=wbc_model, train_loader=wbc_train_loader, epochs=10, device=device)
wbc_test_acc = test(model=wbc_model, test_loader=wbc_test_loader, device=device)

# Report results.
print('Random guess baseline:', 1-np.mean(y))
print('Train accuracies:', wbc_train_accs)
print('Test accuracy:', wbc_test_acc)



0.9471830985915493
Training...
Training epoch 1...
Training epoch 2...
Training epoch 3...
Training epoch 4...
Training epoch 5...
Training epoch 6...
Training epoch 7...
Training epoch 8...
Training epoch 9...
Training epoch 10...
Testing...
Random guess baseline: 0.6274165202108963
Train accuracies: [0.795774647887324, 0.9366197183098591, 0.9647887323943662, 0.9823943661971831, 0.9859154929577465, 0.9894366197183099, 0.9894366197183099, 0.9929577464788732, 0.9894366197183099, 0.9894366197183099]
Test accuracy: 0.9578947368421052


### Convolutional Neural Network for [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html)

In [0]:
# Set up transformations
train_transform_cifar10 = transforms.Compose([transforms.RandomCrop(32, padding=4),  # Random crop
                                      transforms.RandomHorizontalFlip(p=0.5), # Random flip
                                      transforms.ToTensor(), 
                                      transforms.Normalize((0, 0, 0), (1, 1, 1))])  # Normalize
test_transform_cifar10 = transforms.Compose([transforms.ToTensor(), 
                                      transforms.Normalize((0, 0, 0), (1, 1, 1))])  # Normalize

# Get CIFAR-10 data. PyTorch already has this one available.
trainset_cifar10 = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transform_cifar10)  # 50000 images
testset_cifar10 = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transform_cifar10)  # 10000 images

# Standard convolutional net with 2 conv/pooling layers and 3 fc layers
class CIFAR10_CNN(nn.Module):  
    
    def __init__(self, p=0.2):
        super(CIFAR10_CNN, self).__init__()  # Initialize parent class

        self.conv1 = nn.Conv2d(3, 8, 4)  # First convolution
        self.conv2 = nn.Conv2d(8, 16, 4)  # Second convolution
        self.pool = nn.MaxPool2d(2, 2)  # Max pooling 
        self.fc1 = nn.Linear(400, 200)  # Fully connected 1
        self.fc2 = nn.Linear(200, 100)  # Fully connected 2
        self.fc3 = nn.Linear(100, 10)  # Fully connected 2
        self.drop = nn.Dropout(p=p)  # Dropout for fully connected layers

    def forward(self, X):  # Function to get network's output.
        X = self.pool(F.relu(self.conv1(X)))
        X = self.pool(F.relu(self.conv2(X)))
        X = X.view(-1, 400)
        X = F.relu(self.drop(self.fc1(X)))
        X = F.relu(self.drop(self.fc2(X)))
        X = self.fc3(X)
        return X

# Use GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # use GPU if possible

# Get model.
cifar10_model = CIFAR10_CNN().to(device)

# Get data loaders.
cifar10_train_loader = torch.utils.data.DataLoader(trainset_cifar10, batch_size=32, shuffle=True)
cifar10_test_loader = torch.utils.data.DataLoader(testset_cifar10, batch_size=32, shuffle=False)

# Train and test.
cifar10_train_accs = train(model=cifar10_model, train_loader=cifar10_train_loader, epochs=10, device=device)
cifar10_test_acc = test(model=cifar10_model, test_loader=cifar10_test_loader, device=device)

# Report results.
print('random guess baseline:', 0.1)
print('train accuracies:', cifar10_train_accs)
print('test accuracy:', cifar10_test_acc)

  0%|          | 0/170498071 [00:00<?, ?it/s]

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


170500096it [00:01, 90254472.20it/s]                               


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
Training...
Training epoch 1...
Training epoch 2...
Training epoch 3...
Training epoch 4...
Training epoch 5...
Training epoch 6...
Training epoch 7...
Training epoch 8...
Training epoch 9...
Training epoch 10...
Testing...
random guess baseline: 0.1
train accuracies: [0.24242, 0.37892, 0.42652, 0.44852, 0.4675, 0.48124, 0.49542, 0.49958, 0.5057, 0.51322]
test accuracy: 0.5701


### Activity: Implement Something!

- Preprocess Winsconsin data with [PCA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html) or [t-SNE](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html).
- Use another algorithm with the Wisconsin data like a [linear](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html) or [random forest](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) classifier.
- Experiment with batch size and learning rate in the CIFAR-10 network.
- Experiment with the size and depth of layers in the CIFAR-10 network. Also consider adding residual (skip) connections. 
- Experiment with another [activation function](https://pytorch.org/docs/stable/nn.functional.html) (other than ReLU) and another [optimizer](https://pytorch.org/docs/stable/optim.html) other than SGD with the CIFAR-10 network.
- Implement [batch normalization](https://discuss.pytorch.org/t/batch-normalization-of-linear-layers/20989) with the CIFAR-10 network. 
