In [1]:
%run ../../src/config.py

In [78]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from sklearn.metrics import precision_score, recall_score, f1_score

In [3]:
transcript_train      = pd.read_csv('../../data/prepared_data/transcript_train.csv')
transcript_test       = pd.read_csv('../../data/prepared_data/transcript_test.csv')
transcript_validation = pd.read_csv('../../data/prepared_data/transcript_validation.csv')
transcript_train

Unnamed: 0,campaign_no,arc_no,episode_no,episode_index,episode_label,section_no,line_no,speaker,line,nwords
0,2,5,18,223.0,2-5-18,2,1054,LIAM,"Fjord, do you want to carry these, or do you w...",16
1,3,1,19,274.0,3-1-19,2,1943,LAURA,"Oh, it's my glasses. Hold on. (laughter)",7
2,3,2,17,295.0,3-2-17,4,710,MATT,"I'll allow it for the time being, yeah.",8
3,3,2,25,303.0,3-2-25,4,468,MARISHA,You're more just visiting another landscape.,6
4,2,6,15,241.0,2-6-15,4,13,TRAVIS,As a point of clarification--,5
...,...,...,...,...,...,...,...,...,...,...
371415,3,3,19,325.0,3-3-19,2,1258,MATT,"All right, finishing FCG's go.",5
371416,3,1,11,266.0,3-1-11,2,415,MATT,22 points of lightning damage.,5
371417,2,6,28,254.0,2-6-28,4,395,LIAM,But I'm still pretty low.,5
371418,3,3,20,326.0,3-3-20,2,1058,MATT,"All right, who's keeping watch?",5


In [4]:
transcript_train['speakerno']      = [cast[x]['speakerno'] for x in transcript_train['speaker']]
transcript_test['speakerno']       = [cast[x]['speakerno'] for x in transcript_test['speaker']]
transcript_validation['speakerno'] = [cast[x]['speakerno'] for x in transcript_validation['speaker']]

In [5]:
tokenizer = get_tokenizer("basic_english")

def yield_tokens(data):
    for text in data['line']:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(transcript_train), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [26]:
text_pipeline = lambda x: vocab(tokenizer(x))[:100]
label_pipeline = lambda x: int(x) - 1

print(text_pipeline('how do you want to do this williamchadyoung, habcldiekso?'))
print(text_pipeline('<unk>'))
[vocab[token] for token in tokenizer("I'll allow it for the time being, yeah.")]
# [text_pipeline(x) for x in ['how want kill', 'do to']]

[97, 26, 5, 86, 7, 26, 18, 0, 2, 0, 14]
[0]


[6, 3, 64, 1265, 8, 37, 4, 105, 213, 2, 36, 1]

In [88]:
class LinesDataset(Dataset):
    def __init__(self, dataset, vocab, line_length):
        self.line_length = line_length
        self.data   = [self.tokenize_line(x) for x in dataset['line']]
        self.labels = [cast[x]['speakerno'] for x in dataset['speaker']]
        self.vocab  = vocab

    def tokenize_line(self, line):
        tokens = tokenizer(line)[:self.line_length]
        return np.pad(tokens, (self.line_length - len(tokens), 0), mode='constant', constant_values='<unk>').tolist()

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.vocab(self.data[idx]), self.labels[idx]
        # example = self.data[idx]
        # Convert the example into numerical representations using the vocabulary
        # numerical_tokens = [self.vocab[token] for token in example]
        # return numerical_tokens

In [95]:
train_dataset = LinesDataset(transcript_train[:10], vocab, 15)
train_dataset[2]

([0, 0, 0, 6, 3, 64, 1265, 8, 37, 4, 105, 213, 2, 36, 1], 0)

In [94]:
train_dataset = LinesDataset(transcript_train[:10], vocab, 15)

train_loader = DataLoader(train_dataset, batch_size=10, shuffle=False)

next(iter(train_loader))

[[tensor([286,   0,   0,   0,   0,   0,   0,   0,   5,   0]),
  tensor([  2,  56,   0,   0,   0,  26,   0,   0, 100,   0]),
  tensor([ 26,   2,   0,   0,   0,   5,   0,   0, 140,   0]),
  tensor([    5,     8,     6,     0,     0,    86,     0,     0, 35485,     0]),
  tensor([86,  3,  3,  0,  0,  7,  0,  0, 89,  0]),
  tensor([    7,    11,    64,     0,     0,    49,     0,     0,    10, 38864]),
  tensor([  923,    61,  1265,     5,     0,    35,     0,     0, 22559,    18]),
  tensor([ 128, 2174,    8,    3,    0,   51,    0,   62,  452,    1]),
  tensor([ 2,  1, 37, 24,  0,  3,  0, 16,  2, 15]),
  tensor([ 62, 244,   4, 111,   0,  24,   0,  13,   4,  20]),
  tensor([  26,   23,  105,   22,   28,  934,    0,  284, 1825,   49]),
  tensor([   5,    1,  213, 7466,   10,   17,   50,   12, 1819,   97]),
  tensor([ 86,  32,   2, 201, 170,   4,   2,  38,  13, 219]),
  tensor([  68,   95,   36, 1830,   12, 6363,    9, 1464, 1181,   89]),
  tensor([    7,    31,     1,     1, 23724,    14, 

In [29]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
        
    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        last_hidden = output[:, -1, :]
        logits = self.fc(last_hidden)
        return logits

In [108]:
# Training parameters
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 128
num_classes = 8
batch_size = 5
num_epochs = 1
learning_rate = 0.001

# Create the model
model = TextClassifier(vocab_size, embedding_dim, hidden_dim, num_classes)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Create data loaders for the training and validation sets
train_dataset = LinesDataset(transcript_train[:10], vocab, 5)
valid_dataset = LinesDataset(transcript_validation[:10], vocab, 5)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

# Iterate over the training data for the specified number of epochs
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    total_samples = 0
    for inputs in train_loader:
        print(inputs[0])
        print('Classes')
        print(inputs[1])
        # optimizer.zero_grad()
        # # inputs = torch.LongTensor(inputs[0])
        # targets = inputs[1]
        # outputs = model(inputs[0])
        # loss = criterion(outputs.view(-1, num_classes), targets.view(-1))
        # loss.backward()
        # optimizer.step()

    #     total_loss += loss.item() * len(inputs)
    #     total_samples += len(inputs)

    # # Evaluate on the validation set after every epoch
    # model.eval()
    # total_val_loss = 0.0
    # total_val_samples = 0
    # with torch.no_grad():
    #     for inputs in valid_loader:
    #         inputs = torch.LongTensor(inputs)
    #         targets = inputs.clone()
    #         outputs = model(inputs)
    #         val_loss = criterion(outputs.view(-1, num_classes), targets.view(-1))

    #         total_val_loss += val_loss.item() * len(inputs)
    #         total_val_samples += len(inputs)

    # avg_loss = total_loss / total_samples
    # avg_val_loss = total_val_loss / total_val_samples

    # print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

[tensor([   26,   286, 38864,     0,    28]), tensor([ 5,  2, 18, 50, 10]), tensor([ 86,  26,   1,   2, 170]), tensor([ 7,  5, 15,  9, 12]), tensor([   49,    86,    20,  1803, 23724])]
Classes
tensor([5, 6, 1, 3, 4])
[tensor([ 5,  5,  6, 56, 62]), tensor([100,   3,   3,   2,  16]), tensor([140,  24,  64,   8,  13]), tensor([35485,   111,  1265,     3,   284]), tensor([89, 22,  8, 11, 12])]
Classes
tensor([0, 3, 0, 1, 4])


In [8]:
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

def yield_data(data):
    for _, row in data.iterrows():
        yield(row['speakerno'], row['line'])

dataloader = DataLoader(
    yield_data(transcript_train),
    batch_size = 8,
    shuffle    = False,
    collate_fn = collate_batch
)

In [9]:
from torch import nn

class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [10]:
train_iter = yield_data(transcript_train)
num_class  = len(set([label for (label, text) in train_iter]))
vocab_size = len(vocab)
emsize     = 64
model      = TextClassificationModel(vocab_size, emsize, num_class).to(device)
print(num_class)
print(vocab_size)

8
48971


In [11]:
import time

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(dataloader), total_acc / total_count
                )
            )
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc / total_count

In [12]:
from torchtext.data.functional import to_map_style_dataset

# Hyperparameters
EPOCHS     = 10  # epoch
LR         = 5  # learning rate
BATCH_SIZE = 64  # batch size for training

criterion  = torch.nn.CrossEntropyLoss()
optimizer  = torch.optim.SGD(model.parameters(), lr=LR)
scheduler  = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

train_iter = yield_data(transcript_train[:100])
valid_iter = yield_data(transcript_validation[:100])
test_iter  = yield_data(transcript_test[:100])

train_dataset = to_map_style_dataset(train_iter)
valid_dataset = to_map_style_dataset(valid_iter)
test_dataset  = to_map_style_dataset(test_iter)

num_train = len(train_dataset)

train_dataloader = DataLoader(
    train_dataset,
    batch_size = BATCH_SIZE,
    shuffle    = True,
    collate_fn = collate_batch
)
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size = BATCH_SIZE,
    shuffle    = True,
    collate_fn = collate_batch
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size = BATCH_SIZE,
    shuffle    = True,
    collate_fn = collate_batch
)

#train(train_dataloader)

# for epoch in range(1, EPOCHS + 1):
#     epoch_start_time = time.time()
#     train(train_dataloader)
#     accu_val = evaluate(valid_dataloader)
#     if total_accu is not None and total_accu > accu_val:
#         scheduler.step()
#     else:
#         total_accu = accu_val
#     print("-" * 59)
#     print(
#         "| end of epoch {:3d} | time: {:5.2f}s | "
#         "valid accuracy {:8.3f} ".format(
#             epoch, time.time() - epoch_start_time, accu_val
#         )
#     )
#     print("-" * 59)

In [None]:
train(train_dataloader)