In [1]:
from torchnlp.datasets import iwslt_dataset
from torchnlp.word_to_vector import GloVe
from tqdm.auto import tqdm
from utils import *
from models import *

In [2]:
vectors = GloVe()

In [3]:
train = [sentence['en'] for sentence in iwslt_dataset(train=True)]
dev = [sentence['en'] for sentence in iwslt_dataset(dev=True)]
test = [sentence['en'] for sentence in iwslt_dataset(test=True)]
print("Train:", len(train))
print("Dev:", len(dev))
print("Test:", len(test))

Train: 196884
Dev: 993
Test: 1305


In [4]:
print("Preprocessing the train data")
train_x, train_y = preprocess_data(train[:15000], WINDOW_SIZE, CLASSES, vectors)
print("Preprocessing the dev data")
dev_x, dev_y = preprocess_data(dev, WINDOW_SIZE, CLASSES, vectors)
print("Preprocessing the test data")
test_x, test_y = preprocess_data(test, WINDOW_SIZE, CLASSES, vectors)

Preprocessing the train data
Tokenizing:


HBox(children=(FloatProgress(value=0.0, max=15000.0), HTML(value='')))


Padding:


HBox(children=(FloatProgress(value=0.0, max=15000.0), HTML(value='')))


Creating labels:


HBox(children=(FloatProgress(value=0.0, max=15000.0), HTML(value='')))


Get word vector weights


HBox(children=(FloatProgress(value=0.0, max=268729.0), HTML(value='')))


Converting labels to tensor


HBox(children=(FloatProgress(value=0.0, max=268729.0), HTML(value='')))


Preprocessing the dev data
Tokenizing:


HBox(children=(FloatProgress(value=0.0, max=993.0), HTML(value='')))


Padding:


HBox(children=(FloatProgress(value=0.0, max=993.0), HTML(value='')))


Creating labels:


HBox(children=(FloatProgress(value=0.0, max=993.0), HTML(value='')))


Get word vector weights


HBox(children=(FloatProgress(value=0.0, max=18302.0), HTML(value='')))


Converting labels to tensor


HBox(children=(FloatProgress(value=0.0, max=18302.0), HTML(value='')))


Preprocessing the test data
Tokenizing:


HBox(children=(FloatProgress(value=0.0, max=1305.0), HTML(value='')))


Padding:


HBox(children=(FloatProgress(value=0.0, max=1305.0), HTML(value='')))


Creating labels:


HBox(children=(FloatProgress(value=0.0, max=1305.0), HTML(value='')))


Get word vector weights


HBox(children=(FloatProgress(value=0.0, max=21793.0), HTML(value='')))


Converting labels to tensor


HBox(children=(FloatProgress(value=0.0, max=21793.0), HTML(value='')))




In [5]:
train_loader = DataLoader(PuncDataset(train_x, train_y), batch_size=BATCH_SIZE,
                          shuffle=True, num_workers=0)
dev_loader = DataLoader(PuncDataset(dev_x, dev_y), batch_size=BATCH_SIZE,
                        shuffle=True, num_workers=0)
test_loader = DataLoader(PuncDataset(test_x, test_y), batch_size=BATCH_SIZE,
                        shuffle=True, num_workers=0)

In [6]:
import torch.nn as nn
import torch.nn.functional as F


class LinearModel(nn.Module):
    def __init__(self):
        super(LinearModel, self).__init__()
        
        self.linear = nn.Sequential(
            nn.Linear(300 * 5, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, len(CLASSES)),
            nn.ReLU(),
            nn.Dropout(0.5)
        )

    def forward(self, data):
        flatten_data = data.reshape(data.shape[0], 1500)
        prediction = self.linear(flatten_data)
        
        return prediction

model = LinearModel()

In [7]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [8]:
import pandas as pd

def train(train_loader, dev_loader, model, criterion, optimizer):

    for epoch in range(2):  # loop over the dataset multiple times
        print("Epoch:", epoch)
        
        model.train()
        t = tqdm(iter(dataloader), leave=False, total=len(dataloader))
        epoch_loss = 0.0
        
        for _, data in enumerate(t, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
        validate(dev_loader, model)
                
def validate(dataloader, model):
    correct = 0
    total = 0
    predicted_total = [1] * len(CLASSES)
    predicted_correct = [0] * len(CLASSES)
    predicted_expected = [1] * len(CLASSES)
    
    model.eval()
    with torch.no_grad():
        for data in dataloader:
            inputs, labels = data
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            
            # Gathering information for f score
            for i in range(predicted.shape[0]):
                predicted_class = predicted[i]
                correct_class = labels[i]
                predicted_total[predicted_class] += 1
                predicted_expected[correct_class] += 1
                if predicted_class == correct_class:
                    predicted_correct[predicted_class] += 1
            
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print('Accuracy: %d %%' % (100 * correct / total))
    
    f_scores = []
    for i in range(len(CLASSES)):
        precision = predicted_correct[i] / predicted_total[i]
        recall = predicted_correct[i] / predicted_expected[i]
        f_score = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
        f_scores.append([CLASSES[i], predicted_total[i], predicted_correct[i], predicted_expected[i], precision, recall, f_score])
        
    df = pd.DataFrame(f_scores, columns=["punctuation", "predicted", "predicted correctly", "predicted expectation", "precision", "recall", "f_score"])
    df = df.set_index("punctuation")
    return df

In [9]:
if __name__=='__main__':
    train(train_loader, model, criterion, optimizer)

    print('Finished Training')

Epoch: 0


HBox(children=(FloatProgress(value=0.0, max=16796.0), HTML(value='')))

Accuracy: 92 %
Epoch: 1


HBox(children=(FloatProgress(value=0.0, max=16796.0), HTML(value='')))

Accuracy: 93 %
Finished Training


In [10]:
validate(test_loader, model)

Accuracy: 93 %


Unnamed: 0_level_0,predicted,predicted correctly,predicted expectation,precision,recall,f_score
punctuation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
o,19713,18765,19072,0.95191,0.983903,0.967642
",",763,462,1355,0.605505,0.340959,0.436261
.,1320,1171,1228,0.887121,0.953583,0.919152
?,1,0,129,0.0,0.0,0.0
!,1,0,14,0.0,0.0,0.0


In [19]:
#from datetime import datetime
#now = datetime.now()

#PATH = './checkpoints/linear_' + now.strftime("%m-%d-%Y-%H-%M-%S") + '.pth'
#torch.save(model.state_dict(), PATH)