<a href="https://colab.research.google.com/github/yahui624/CompArchFinalProject/blob/main/cudaGPUTraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import time
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from IPython.display import display, clear_output
from itertools import product
from collections import namedtuple
from collections import OrderedDict
     


In [2]:
class Network(nn.Module):
  """Small convolutional classifier: two conv/pool stages, then three linear layers.

  The flatten width ``12*4*4`` used in ``forward`` is what a single-channel
  28x28 input produces (28 -> 24 -> 12 -> 8 -> 4 through the two unpadded 5x5
  convolutions and 2x2 poolings). The final layer emits 10 raw logits.
  """

  def __init__(self):
    super().__init__()
    # Convolutional feature extractor (5x5 kernels, no padding).
    self.conv1 = nn.Conv2d(1, 6, kernel_size=5)
    self.conv2 = nn.Conv2d(6, 12, kernel_size=5)

    # Fully connected head: 192 -> 120 -> 60 -> 10.
    self.fc1 = nn.Linear(12 * 4 * 4, 120)
    self.fc2 = nn.Linear(120, 60)
    self.out = nn.Linear(60, 10)

  def forward(self, t):
    """Map a batch of images ``t`` to a ``(batch, 10)`` tensor of logits."""
    # Stage 1 and 2: convolution -> ReLU -> 2x2 max pooling.
    features = F.max_pool2d(F.relu(self.conv1(t)), kernel_size=2, stride=2)
    features = F.max_pool2d(F.relu(self.conv2(features)), kernel_size=2, stride=2)

    # Collapse channel and spatial dimensions before the dense layers.
    flat = features.reshape(-1, 12 * 4 * 4)
    hidden = F.relu(self.fc1(flat))
    hidden = F.relu(self.fc2(hidden))

    # No softmax here: the logits are returned as-is.
    return self.out(hidden)


In [None]:
# FashionMNIST training split, stored under ./data (downloaded on first use);
# every sample is converted to a tensor by the transform pipeline.
train_set = torchvision.datasets.FashionMNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.Compose([transforms.ToTensor()]),
)

In [5]:
class RunBuilder():
    """Expands a hyper-parameter grid into one record per combination."""

    @staticmethod
    def get_runs(params):
        """Return a list of ``Run`` namedtuples covering the Cartesian product.

        ``params`` maps a field name to the candidate values for that field.
        The fields of ``Run`` follow the order of ``params.keys()``, and the
        last key varies fastest in the returned list.
        """
        run_type = namedtuple('Run', params.keys())
        return [run_type(*combo) for combo in product(*params.values())]

In [6]:
# Hyper-parameter grid expanded by RunBuilder.get_runs(); key order determines
# the field order of each Run and the nesting order of the sweep.
params = OrderedDict(
    lr=[0.01, 0.001, 0.0001],
    batch_size=[100, 1000, 10000, 20000],
    num_workers=[0, 1, 2, 4],
    # Only sweep over the GPU when one is actually usable; otherwise moving the
    # network to 'cuda' would raise and abort the whole sweep.
    device=(['cuda', 'cpu'] if torch.cuda.is_available() else ['cpu']),
)

In [7]:
# Preview every hyper-parameter combination that the sweep will execute.
for run in RunBuilder.get_runs(params):
  settings = (run.lr, run.batch_size, run.num_workers, run.device)
  print(*settings)

0.01 100 0 cuda
0.01 100 0 cpu
0.01 100 1 cuda
0.01 100 1 cpu
0.01 100 2 cuda
0.01 100 2 cpu
0.01 100 4 cuda
0.01 100 4 cpu
0.01 1000 0 cuda
0.01 1000 0 cpu
0.01 1000 1 cuda
0.01 1000 1 cpu
0.01 1000 2 cuda
0.01 1000 2 cpu
0.01 1000 4 cuda
0.01 1000 4 cpu
0.01 10000 0 cuda
0.01 10000 0 cpu
0.01 10000 1 cuda
0.01 10000 1 cpu
0.01 10000 2 cuda
0.01 10000 2 cpu
0.01 10000 4 cuda
0.01 10000 4 cpu
0.01 20000 0 cuda
0.01 20000 0 cpu
0.01 20000 1 cuda
0.01 20000 1 cpu
0.01 20000 2 cuda
0.01 20000 2 cpu
0.01 20000 4 cuda
0.01 20000 4 cpu
0.001 100 0 cuda
0.001 100 0 cpu
0.001 100 1 cuda
0.001 100 1 cpu
0.001 100 2 cuda
0.001 100 2 cpu
0.001 100 4 cuda
0.001 100 4 cpu
0.001 1000 0 cuda
0.001 1000 0 cpu
0.001 1000 1 cuda
0.001 1000 1 cpu
0.001 1000 2 cuda
0.001 1000 2 cpu
0.001 1000 4 cuda
0.001 1000 4 cpu
0.001 10000 0 cuda
0.001 10000 0 cpu
0.001 10000 1 cuda
0.001 10000 1 cpu
0.001 10000 2 cuda
0.001 10000 2 cpu
0.001 10000 4 cuda
0.001 10000 4 cpu
0.001 20000 0 cuda
0.001 20000 0 cpu
0.001 2

In [8]:
class RunManager():
    """Collects per-epoch training metrics and mirrors them to TensorBoard.

    Expected call order for one run::

        begin_run -> (begin_epoch -> track_loss/track_num_correct per batch
                      -> end_epoch) per epoch -> end_run

    One row per finished epoch is appended to ``run_data``; rows accumulate
    across runs and can be written to disk with ``save``.
    """

    def __init__(self):
        # Per-epoch accumulators (reset in begin_epoch).
        self.epoch_count = 0
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.epoch_start_time = None
        self.epoch_num_workers = 0

        # Per-run bookkeeping.
        self.run_params = None
        self.run_count = 0
        self.run_data = []
        self.run_start_time = None

        # Objects attached by begin_run.
        self.network = None
        self.loader = None
        self.tb = None

    def _run_device(self):
        """Device named by the current run, or 'cpu' when the run has no ``device`` field."""
        return getattr(self.run_params, 'device', 'cpu')

    def begin_run(self, run, network, loader):
        """Start timing a run, open a SummaryWriter and log a sample batch and the graph.

        Args:
            run: record of hyper-parameters; it must provide ``_asdict()``
                (used in ``end_epoch``) and may provide a ``device`` attribute.
            network: the model to be trained; it is traced by ``add_graph``.
            loader: the data loader for this run; one batch is drawn from it here.
        """
        self.run_start_time = time.time()

        self.run_params = run
        self.run_count += 1

        self.network = network
        self.loader = loader
        self.tb = SummaryWriter(comment=f'-{run}')

        # A single batch is enough to log an image grid and trace the graph.
        images, _ = next(iter(self.loader))
        grid = torchvision.utils.make_grid(images)

        self.tb.add_image('images', grid)
        # The sample input has to be on the same device as the network.
        self.tb.add_graph(self.network, images.to(self._run_device()))

    def end_run(self):
        """Close the SummaryWriter (if one is open) and reset the epoch counter."""
        # Guarded so that cleanup is safe even when no writer was opened.
        if self.tb is not None:
            self.tb.close()
            self.tb = None
        self.epoch_count = 0

    def begin_epoch(self):
        """Start timing an epoch and zero the loss / correct-prediction accumulators."""
        self.epoch_start_time = time.time()
        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0

    def end_epoch(self):
        """Finalize the epoch: log scalars and histograms, record a row, redraw the table."""
        epoch_duration = time.time() - self.epoch_start_time
        run_duration = time.time() - self.run_start_time

        # Both metrics are averaged over the full dataset size.
        num_samples = len(self.loader.dataset)
        loss = self.epoch_loss / num_samples
        accuracy = self.epoch_num_correct / num_samples

        self.tb.add_scalar('Loss', loss, self.epoch_count)
        self.tb.add_scalar('Accuracy', accuracy, self.epoch_count)

        for name, param in self.network.named_parameters():
            self.tb.add_histogram(name, param, self.epoch_count)
            # A parameter that has not received a gradient has grad == None,
            # which cannot be logged as a histogram.
            if param.grad is not None:
                self.tb.add_histogram(f'{name}.grad', param.grad, self.epoch_count)

        results = OrderedDict()
        results["run"] = self.run_count
        results["epoch"] = self.epoch_count
        results['loss'] = loss
        results["accuracy"] = accuracy
        results['epoch duration'] = epoch_duration
        results['run duration'] = run_duration
        results['num_workers'] = self.epoch_num_workers
        # Same fallback as begin_run, so a run without a device field still works.
        results['device'] = self._run_device()
        # Run parameters overwrite same-named keys above but keep their position.
        for key, value in self.run_params._asdict().items():
            results[key] = value
        self.run_data.append(results)

        df = pd.DataFrame.from_dict(self.run_data, orient='columns')

        # Replace the previously displayed table instead of stacking outputs.
        clear_output(wait=True)
        display(df)

    def get_num_workers(self, num_workers):
        """Record ``num_workers`` for the result row (a setter, despite the name)."""
        self.epoch_num_workers = num_workers

    def track_loss(self, loss, batch):
        """Add this batch's loss, weighted by the batch size, to the epoch total."""
        self.epoch_loss += loss.item() * batch[0].shape[0]

    def track_num_correct(self, preds, labels):
        """Add the number of correct predictions in this batch to the epoch total."""
        self.epoch_num_correct += self.get_num_correct(preds, labels)

    def get_num_correct(self, preds, labels):
        """Return how many rows of ``preds`` have their argmax equal to ``labels``."""
        return preds.argmax(dim=1).eq(labels).sum().item()

    def save(self, fileName):
        """Write all recorded rows to ``<fileName>.csv`` and ``<fileName>.json``."""
        pd.DataFrame.from_dict(
            self.run_data, orient='columns'
        ).to_csv(f'{fileName}.csv')

        with open(f'{fileName}.json', 'w', encoding='utf-8') as f:
            json.dump(self.run_data, f, ensure_ascii=False, indent=4)

In [9]:
# Sweep over every hyper-parameter combination, training a fresh network for
# one epoch per combination and recording timing / accuracy for each.
# NOTE: the name `runManger` (sic) is kept because later cells refer to it.
runManger = RunManager()
for run in RunBuilder.get_runs(params):
  print(run)
  network = Network()
  network.to(run.device)
  train_loader = DataLoader(
      train_set,
      batch_size=run.batch_size,
      num_workers=run.num_workers
  )
  optimizer = optim.Adam(network.parameters(), lr=run.lr)
  runManger.begin_run(run, network, train_loader)
  try:
    for epoch in range(1):
      runManger.begin_epoch()
      runManger.get_num_workers(run.num_workers)

      for batch in train_loader:
        images, labels = batch
        # Inputs must live on the same device as the network.
        images = images.to(run.device)
        labels = labels.to(run.device)
        preds = network(images)

        loss = F.cross_entropy(preds, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        runManger.track_loss(loss, batch)
        runManger.track_num_correct(preds, labels)
      runManger.end_epoch()
  finally:
    # Always close the run (also on KeyboardInterrupt or an error) so the
    # TensorBoard writer is released and the epoch counter is reset.
    runManger.end_run()

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,num_workers,device,lr,batch_size
0,1,1,0.571034,0.783567,7.464984,18.439404,0,cuda,0.01,100
1,2,1,0.547129,0.79285,12.35348,12.613362,0,cpu,0.01,100
2,3,1,0.554538,0.792117,8.819749,9.022031,1,cuda,0.01,100
3,4,1,0.559689,0.78625,11.664173,11.872422,1,cpu,0.01,100
4,5,1,0.576286,0.778333,7.389254,7.661213,2,cuda,0.01,100
5,6,1,0.60605,0.76435,11.630561,11.90846,2,cpu,0.01,100
6,7,1,0.574524,0.783883,7.566058,7.978428,4,cuda,0.01,100
7,8,1,0.605739,0.769783,11.990325,12.439025,4,cpu,0.01,100
8,9,1,0.986737,0.621283,5.409128,5.938713,0,cuda,0.01,1000
9,10,1,1.012079,0.611533,12.219434,12.959972,0,cpu,0.01,1000


Run(lr=0.001, batch_size=10000, num_workers=1, device='cpu')


KeyboardInterrupt: ignored

In [10]:
# Persist every recorded epoch row to results.csv and results.json.
runManger.save("results")

In [12]:
# Show all recorded epochs ordered from the shortest to the longest epoch duration.
(
    pd.DataFrame
    .from_dict(runManger.run_data, orient='columns')
    .sort_values(by='epoch duration')
)

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,num_workers,device,lr,batch_size
44,45,1,1.665622,0.444833,5.229367,6.203323,2,cuda,0.001,1000
40,41,1,1.524989,0.477717,5.286013,5.959027,0,cuda,0.001,1000
8,9,1,0.986737,0.621283,5.409128,5.938713,0,cuda,0.01,1000
12,13,1,1.008485,0.6186,5.411087,6.640848,2,cuda,0.01,1000
46,47,1,1.609782,0.435633,5.462487,6.732724,4,cuda,0.001,1000
42,43,1,1.553685,0.497367,5.502408,6.252602,1,cuda,0.001,1000
10,11,1,0.970796,0.62735,5.567976,6.514298,1,cuda,0.01,1000
14,15,1,1.11674,0.578467,5.871184,7.022339,4,cuda,0.01,1000
20,21,1,2.107855,0.229017,5.920285,12.171511,2,cuda,0.01,10000
18,19,1,2.103876,0.32585,6.115611,10.930315,1,cuda,0.01,10000
