In [1]:
import torch
import torchvision
from torch.utils.data import DataLoader
from torchvision import datasets
import torch.nn as nn
import torch.optim as optim

from net import Net
from dataset import create_data_loader

import os
import string

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Training hyperparameters.
numEpochs, batchSize = 10, 1
learningRate = 1e-3

# Filler value used to pad short sequences. It is negative on purpose: token ids
# are positions in the vocab list and therefore never below zero, so -1 cannot
# collide with a real token. A positive value would need a reserved vocab slot.
paddingValue = -1

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [4]:
all_characters = string.printable

# Global vocabulary: a token's position in this list is its integer id.
vocab = []

# Reverse lookup (token -> id) mirroring `vocab`, so tokenising does not have to
# scan the list for every token.
_vocabIndex = {}


def tensorConvert(string):
    """Turn a whitespace-separated string into a 1-D tensor of vocabulary ids.

    Unseen tokens are appended to the global `vocab` list, so ids are assigned
    in order of first appearance and stay stable across calls.

    Args:
        string: text to tokenise with ``str.split()``. The parameter name
            shadows the imported ``string`` module inside this function only.

    Returns:
        ``torch.long`` tensor with one id per token; empty input gives an empty
        ``torch.long`` tensor rather than torch's default float tensor.
    """
    # Rebuild the lookup if `vocab` was changed from outside (detected by a
    # length mismatch). setdefault keeps the first position of a duplicated
    # token, matching list.index().
    if len(_vocabIndex) != len(vocab):
        _vocabIndex.clear()
        for position, known in enumerate(vocab):
            _vocabIndex.setdefault(known, position)

    output = []
    for token in string.split():
        tokenId = _vocabIndex.get(token)
        if tokenId is None:
            # A new token takes the next free list position.
            tokenId = len(vocab)
            vocab.append(token)
            _vocabIndex[token] = tokenId
        output.append(tokenId)

    return torch.tensor(output, dtype=torch.long)
        


In [5]:
# Author folders are named FirstnameLastname; these helpers put a space between the names.
def getUpperIndices(s):
    """Return the positions of every uppercase character in `s`, in order."""
    return [i for i, c in enumerate(s) if c.isupper()]

def addASpace(s):
    """Return `s` with a space inserted before its second uppercase letter.

    Example: 'AaronPressman' -> 'Aaron Pressman'. Only the second capital is
    split on, so 'BenjaminKangLim' becomes 'Benjamin KangLim'.

    Strings with fewer than two uppercase letters are returned unchanged.
    """
    upperPositions = getUpperIndices(s)
    if len(upperPositions) < 2:
        # Nothing to split on; previously this raised IndexError.
        return s

    splitAt = upperPositions[1]
    # The result must be returned: strings are immutable, so rebinding `s`
    # locally has no effect on the caller.
    return s[:splitAt] + ' ' + s[splitAt:]

In [6]:
# get list of authors: every sub-directory of the training folder is one class
rootdir = 'Data/C50train'

# DirEntry.name is the bare folder name, so no path prefix has to be stripped
# (the previous `rootdir + "\\"` replacement only worked with Windows separators).
# os.scandir yields entries in arbitrary order; sorting case-insensitively makes
# the class indices reproducible and matches the listing recorded below.
with os.scandir(rootdir) as entries:
    classes = sorted(
        (entry.name for entry in entries if entry.is_dir()),
        key=str.lower,
    )

print('classes:',classes)

classes: ['AaronPressman', 'AlanCrosby', 'AlexanderSmith', 'BenjaminKangLim', 'BernardHickey', 'BradDorfman', 'DarrenSchuettler', 'DavidLawder', 'EdnaFernandes', 'EricAuchard', 'FumikoFujisaki', 'GrahamEarnshaw', 'HeatherScoffield', 'JaneMacartney', 'JanLopatka', 'JimGilchrist', 'JoeOrtiz', 'JohnMastrini', 'JonathanBirt', 'JoWinterbottom', 'KarlPenhaul', 'KeithWeir', 'KevinDrawbaugh', 'KevinMorrison', 'KirstinRidley', 'KouroshKarimkhany', 'LydiaZajc', "LynneO'Donnell", 'LynnleyBrowning', 'MarcelMichelson', 'MarkBendeich', 'MartinWolk', 'MatthewBunce', 'MichaelConnor', 'MureDickie', 'NickLouth', 'PatriciaCommins', 'PeterHumphrey', 'PierreTran', 'RobinSidel', 'RogerFillion', 'SamuelPerry', 'SarahDavison', 'ScottHillis', 'SimonCowell', 'TanEeLyn', 'TheresePoletti', 'TimFarrand', 'ToddNissen', 'WilliamKazer']


In [7]:
# data: count the files found anywhere under each split directory
testLen = sum(len(names) for _, _, names in os.walk("Data/C50test"))
print('test:', testLen)

trainLen = sum(len(names) for _, _, names in os.walk("Data/C50train"))
print('train:', trainLen)

# Filled by gatherData with [authorIdx, tokenTensor] items.
trainData, testData = [], []

def gatherData(path, trainOrTest, stopAtAuthor="AlanCrosby", verbose=True):
    """Tokenise every article of one split and append it to trainData/testData.

    Walks ``path + "/C50train"`` (trainOrTest == 0) or ``path + "/C50test"``
    (trainOrTest == 1). Each file becomes ``[authorIdx, tokenTensor]``, where
    ``authorIdx`` is the author's position in the global ``classes`` list and
    ``tokenTensor`` comes from ``tensorConvert``.

    Args:
        path: folder containing the C50train / C50test directories.
        trainOrTest: 0 = train split, 1 = test split.
        stopAtAuthor: development shortcut; walking stops as soon as this
            author's folder is reached (it is NOT loaded). Pass None to load
            every author. The default keeps the previous hard-coded behaviour.
        verbose: print the per-directory progress lines.

    Raises:
        ValueError: if trainOrTest is neither 0 nor 1, or (from
            ``classes.index``) if a folder with files is not a known author.
    """
    if trainOrTest not in (0, 1):
        # Other values used to read the train folder but append to testData.
        raise ValueError(f"trainOrTest must be 0 (train) or 1 (test), got {trainOrTest!r}")

    #0 = train, 1 = test
    address = path + ("/C50test" if trainOrTest == 1 else "/C50train")
    prelude = len(address)

    for r, d, files in os.walk(address):
        # os.walk honours in-place edits of the directory list when walking
        # top-down; sorting it makes the visiting order (and therefore the
        # early stop below) deterministic instead of filesystem-dependent.
        d.sort(key=str.lower)

        # Everything after "<address><separator>" is the author folder name;
        # for the split root itself this is the empty string.
        author = r[(prelude+1):]

        if verbose:
            #print here to show progress, warning: this is a lot of print statements
            print('r:',r)
            print('test:',prelude)
            print('\nAuthor: ',author)

        if author == "":
            continue

        #EARLY STOP FOR DEVELOPMENT PURPOSES (going through every author takes a long time)
        if stopAtAuthor is not None and author == stopAtAuthor:
            break

        for file in sorted(files):
            filePath = os.path.join(r, file)

            # NOTE(review): no explicit encoding, so the platform default is
            # used - confirm the corpus encoding and pin it here.
            with open(filePath, 'r') as f:
                content = f.read()

            item = [classes.index(author), tensorConvert(content)]

            if trainOrTest == 0:
                trainData.append(item)
            else:
                testData.append(item)
                

test: 2500
train: 2500


In [8]:
# Fill trainData first (selector 0), then testData (selector 1), from the 'Data' folder.
gatherData(path='Data', trainOrTest=0)
gatherData(path='Data', trainOrTest=1)

r: Data/C50train
test: 13

Author:  
r: Data/C50train\AaronPressman
test: 13

Author:  AaronPressman
r: Data/C50train\AlanCrosby
test: 13

Author:  AlanCrosby
r: Data/C50test
test: 12

Author:  
r: Data/C50test\AaronPressman
test: 12

Author:  AaronPressman
r: Data/C50test\AlanCrosby
test: 12

Author:  AlanCrosby


In [9]:
# Sanity check: show the first training item, an [authorIdx, tokenTensor] pair.
print(trainData[0])

[0, tensor([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,   0,  18,  19,  20,  21,  22,  23,  15,  24,  25,
         26,  27,  15,   1,  28,  15,  29,  30,  10,  31,  32,  33,  10,  34,
         35,  36,  37,  38,  39,   5,  40,  14,  41,  42,   0,  43,  34,  44,
         45,  46,  47,  48,  49,  50,  51,  34,  52,  53,  54,  55,  27,  15,
         56,  57,  49,  10,  58,  59,   1,  60,  61,  62,  63,   0,  64,  31,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  34,  26,  27,  15,   1,  81,  82,  83,  21,  84,  85,  86,
         14,  15,  87,  88,  89,  90,  14,  91,  92,   1,  60,  61,  93,  71,
         34,  94,  95,  96,  15,  97,  10,  98,  99,  50,  10, 100, 101, 102,
        103, 104, 105,  15, 106, 107,   1,  66,  96, 108, 109,  34,  52, 110,
         74, 111, 112,  23, 113, 114,  10, 115, 116, 117, 118,  33, 119, 120,
        121, 122,  34, 123,  68, 124, 125, 126,  14, 127, 12

In [10]:
#find maxLength for padding purposes
# Longest token sequence across both splits (item[1] is the token tensor).
# `default=0` keeps the previous result of 0 when both lists are empty.
maxLength = max((len(item[1]) for item in trainData + testData), default=0)

print(maxLength)
print(len(trainData[0][1]))

1048
319


In [11]:
def padTensor(tensor, targetLength=None, value=None):
    """Right-pad a 1-D tensor to a fixed length.

    Args:
        tensor: 1-D tensor of token ids.
        targetLength: length of the result; defaults to the global `maxLength`.
        value: fill value for the added positions; defaults to the global
            `paddingValue` (previously a hard-coded -1).

    Returns:
        New tensor of length `targetLength`. The padding is a float tensor, so
        concatenating it with integer ids yields a float result, as before.

    Raises:
        ValueError: if `tensor` is already longer than `targetLength`.
    """
    if targetLength is None:
        targetLength = maxLength
    if value is None:
        value = paddingValue

    missing = targetLength - len(tensor)
    if missing < 0:
        raise ValueError(
            f"tensor of length {len(tensor)} is longer than the target length {targetLength}"
        )

    # float(value) makes torch.full use the default float dtype, matching the
    # previous torch.ones(...) * -1; the pad is created on the input's device.
    pad = torch.full((missing,), float(value), device=tensor.device)

    return torch.cat((tensor, pad), 0)
    
# Pad every sequence in both splits, replacing each item's tensor in place.
for split in (trainData, testData):
    for item in split:
        item[1] = padTensor(item[1])

# Show the first training item after padding.
item = trainData[0]
print(item)


[0, tensor([ 0.,  1.,  2.,  ..., -1., -1., -1.])]


In [14]:
# Wrap both splits in loaders built by the project helper create_data_loader.
trainLoader = create_data_loader(trainData,
                                 batchSize,
                                 shuffle=True)
valLoader = create_data_loader(testData,
                               batchSize,
                               shuffle=True)

# Peek at one batch. NOTE: per the output recorded below, a batch is packaged as
# a dict with 'input' and 'target' entries, not an (input, target) tuple.
# (This previously referenced the undefined name `train_dataloader`.)
next(iter(trainLoader))

#TODO:
    # - shuffle the test and train data
    # - allocate half of the test data as validation data

{'input': tensor([[489., 680., 321.,  ...,  -1.,  -1.,  -1.]]),
 'target': tensor([0])}

In [11]:
#insert model here, need to get numbers for these
inputSize = 1
hiddenSize = 1
outputSize = len(classes)  # one output per author

# Move the model to the same device the training loop sends its batches to,
# before the optimizer is built from its parameters.
# NOTE(review): assumes Net is an nn.Module (it is used via .parameters(),
# .train() and .state_dict() elsewhere in this notebook) - confirm.
net = Net(inputSize, hiddenSize, outputSize).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(net.parameters(), lr=learningRate)

In [None]:
#start training + print accuracy
#i see no reason to split any of this up into multiple cells but feel free to do so if there is one

# Batches are moved to `device` below, so the model must live there too.
# NOTE(review): assumes Net is an nn.Module, whose .to() moves the parameters
# in place so the existing optimizer keeps pointing at them - confirm.
net.to(device)

# Best (lowest) average validation loss seen so far; inf so that the first
# epoch always saves a checkpoint.
min_valid_loss = float('inf')

for e in range(numEpochs):
    #training loop
    train_loss = 0.0
    net.train()
    for batch in trainLoader:
        # The loader yields dicts with 'input' and 'target' entries (see the
        # batch printed when the loaders were created), not tuples.
        data, labels = batch['input'].to(device), batch['target'].to(device)

        optimizer.zero_grad()
        target = net(data)

        loss = criterion(target, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    #validation loop
    valid_loss = 0.0
    net.eval()  # Optional when not using Model Specific layer
    with torch.no_grad():  # no gradients are needed for validation
        for batch in valLoader:
            data, labels = batch['input'].to(device), batch['target'].to(device)

            target = net(data)
            # Accumulate (previously each batch overwrote the running value).
            valid_loss += criterion(target, labels).item()

    # Average per batch for both losses so the two numbers are comparable;
    # max(..., 1) avoids dividing by zero for an empty loader.
    avg_train_loss = train_loss / max(len(trainLoader), 1)
    avg_valid_loss = valid_loss / max(len(valLoader), 1)

    print(
        f'Epoch {e + 1} \t\t Training Loss: {avg_train_loss} \t\t Validation Loss: {avg_valid_loss}')

    #save model if validation loss decreases
    if min_valid_loss > avg_valid_loss:
        print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{avg_valid_loss:.6f}) \t Saving The Model')
        min_valid_loss = avg_valid_loss
        # Saving State Dict (create the target folder on first use)
        os.makedirs('Models', exist_ok=True)
        torch.save(net.state_dict(), 'Models/saved_model.pth')

#test loop
# NOTE: no `testLoader` is created anywhere in this notebook, so per-class
# accuracy is measured on valLoader (built from testData) until a separate
# test split exists.
correct_pred = {classname: 0 for classname in classes}
total_pred = {classname: 0 for classname in classes}

# Evaluate on whatever device the model currently lives on.
evalDevice = next(net.parameters()).device

net.eval()
with torch.no_grad():
    for batch in valLoader:
        # Batches are dicts with 'input' and 'target' entries, not tuples.
        inputs = batch['input'].to(evalDevice)
        labels = batch['target'].to(evalDevice)
        outputs = net(inputs)
        _, predictions = torch.max(outputs, 1)

        # tolist() yields plain ints, usable as list indices.
        for label, prediction in zip(labels.tolist(), predictions.tolist()):
            classname = classes[label]
            if label == prediction:
                correct_pred[classname] += 1
            total_pred[classname] += 1

for classname, correct_count in correct_pred.items():
    if total_pred[classname] == 0:
        # Happens for every author that was not loaded (e.g. with the
        # development early stop); avoid dividing by zero.
        print("Accuracy for class {:5s} is: n/a (no samples)".format(classname))
        continue
    accuracy = 100 * float(correct_count) / total_pred[classname]
    print("Accuracy for class {:5s} is: {:.1f} %".format(classname, accuracy))