In [1]:
import pandas as pd
import numpy as np
import torch
from torchtext import vocab
# from torchtext.data import  
from torchtext.legacy import data, datasets
from torchtext.legacy.data import BucketIterator, TabularDataset , Dataset
from torch.utils.data import Sampler, Subset#, Dataset
from typing import Sequence, Optional
from torch import nn
from sklearn.model_selection import KFold
from tqdm import tqdm 

In [2]:
# glove = vocab.GloVe(name="6B", dim=100) # smaller for ruddit
# glove.vectors.size()
# del glove

In [3]:
# glove.itos[glove.stoi[glove.unk_init]]
# glove.stoi['<pad>']

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
## Constants ##
comment_key = "txt"
target_key = "score"
train_data_path = "./train/ruddit_with_text.csv" #"./train/clean_ruddit_with_text.csv"
embedding_name = "glove.6B.100d"
embedding_dim = 100
hidden_dim = 256
output_dim = 1
n_layers = 2
k_folds = 5
n_epochs = 7
batch_size = 256
dropout_rate = 0.5
output_model_path = "./output/bilstm_not_clean_ruddit_only/model_%s_%s" #"./output/bilstm_civil_only/model_%s_%s" # loss, more info
vocab_path = "./output/bilstm_not_clean_ruddit_only/ruddit_vocab"

In [6]:
# ruddit = pd.read_csv(train_data_path)
# ruddit.shape, ruddit.head()

In [7]:
# import spacy
# spacy.cli.download("en_core_web_sm")

In [8]:
# text_field = data.Field(tokenize='spacy', include_lengths=True, use_vocab=True, batch_first=True, lower=True, tokenizer_language='en_core_web_sm') # some preprocessing before as well
text_field = data.Field(tokenize='spacy', include_lengths=True, use_vocab=True, batch_first=True, lower=True, tokenizer_language='en_core_web_sm', )
label_field = data.Field(dtype=torch.float32, batch_first=True, sequential=False, use_vocab=False, preprocessing=float)
fields = [(None, None), (None, None), ('text', text_field), (None, None), ('label', label_field)]# fields = [('text', text_field), ('label', label_field)] # (None, None)]
# text_field.build_vocab(ruddit[comment_key], vectors='glove.6B.100d') # we are fitting on dataset, not using all of glove
# text_field.vocab.vectors.size()

In [9]:
ruddit = TabularDataset(
    path=train_data_path,
    format='csv',
    fields=fields,
    skip_header=True,
    
)
len(ruddit.examples)

5838

In [10]:
text_field.build_vocab(ruddit, vectors=embedding_name) # discards embedding which are not used in dataset and aligns the indices
text_field.vocab.vectors.size()
# text_field.vocab = glove
# text_field.vocab.stoi[text_field.pad_token] = torch.zeros(embedding_dim)

torch.Size([15118, 100])

In [11]:
ts = len(ruddit.examples) * (k_folds - 1)/ k_folds
print("trains size in k folds", ts)
print("batch size", batch_size)
print("number of batches (in 1 fold training)", ts/batch_size)

trains size in k folds 4670.4
batch size 256
number of batches (in 1 fold training) 18.24375


In [12]:
# label_field.build_vocab(ruddit) # not sure if needed
# label_field.vocab.freqs

In [13]:
class BiLSTM(nn.Module):
    def __init__(self, embedding_vocab: vocab, hidden_dim: int, output_dim: int, n_layers: int, # vocab_size: int, embedding_dim: int
        bidirectional: bool, dropout: float, pad_idx: Optional[int]):
        super().__init__()
        vocab_size, embedding_dim = embedding_vocab.vectors.size()
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_vocab.vectors, freeze=True, padding_idx=pad_idx) # not training embeddding
        # self.embedding_layer =  nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           batch_first=True, # imp
                           dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim) # bcos birectional
        self.dropout_emb = nn.Dropout(dropout) # not sure if same layer object can be used
        self.dropout_fc = nn.Dropout(dropout)
 
    def forward(self, text, examples_lengths): # text is already padded
        embedded = self.dropout_emb(self.embedding_layer(text))
        pack_out = nn.utils.rnn.pack_padded_sequence(embedded, examples_lengths.cpu(), batch_first=True)#.to(device)
        out_lstm, (hidden, cell) = self.lstm(embedded) # hidden -> (D∗num_layers, batch , hidden_dim) # D = 2 if bidirectional=True otherwise 1
        h1, h2 = hidden[-2, :, :], hidden[-1, :, :]# -2, -1 is taking last hidden state (twice bcos bidirectional) # so h1,h2 -> (batch, hidden_dim)
        x = self.dropout_fc(torch.cat((h1, h2), dim=1)) # concatenate along hidden_dim # x -> (batch, hidden_dim*2)
        return self.fc(x) # feel like too many dropouts


In [14]:
# 1. write train and test function 
# 2. use k fold train using BucketIterator (CHECK padding) and save them
def test(test_iter: data.iterator.BucketIterator, model: BiLSTM, criterion: nn.MSELoss):
    epoch_loss = []
    model.eval()

    with torch.no_grad():
        for batch in test_iter:
            texts, examples_lengths = batch.text
            predictions = model(texts, examples_lengths)
            predictions = torch.tanh(predictions)
            loss = criterion(predictions.squeeze(1), batch.label)
            epoch_loss.append(loss.item())
    
    return np.mean(epoch_loss)


def train(train_iter: data.iterator.BucketIterator, val_iter: data.iterator.BucketIterator, model: BiLSTM, criterion: nn.MSELoss, optimizer: torch.optim.Optimizer):
    epoch_loss_train = []
    epoch_loss_test = []

    for i, batch in tqdm(enumerate(train_iter)):
        optimizer.zero_grad()
        texts, examples_lengths = batch.text
        # print(texts)
        model.train()
        predictions = model(texts, examples_lengths)
        # predictions = torch.tanh(predictions) # not for civil
        loss = criterion(predictions.squeeze(1), batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss_train.append(loss.item())
        predictions.detach()
        torch.cuda.empty_cache()

        # print("train loss", loss.item())
        # print(i, torch.cuda.memory_allocated(device=device), torch.cuda.max_memory_allocated(device=device))
        # if val_iter != None and i%10 == 0 and i!=0:
        #     epoch_loss_test.append(test(val_iter, model, criterion)) # could eval after certain number of batches/updates
        #     print("batch index", i, "loss_train", epoch_loss_train[-1], "loss_test", epoch_loss_test[-1])
            # break
        
    
    if val_iter != None:
        epoch_loss_test.append(test(val_iter, model, criterion)) # could eval after certain number of batches/updates
        # print("loss_train", epoch_loss_train[-1], "loss_test", epoch_loss_test[-1])
    
    
    return np.mean(epoch_loss_train), np.mean(epoch_loss_test) if val_iter != None else None

In [15]:
# print(next(iter(ruddit)), next(iter(ruddit)).text, next(iter(ruddit)).label)
# ruddit_train_subset = Subset(ruddit, [0,1,2,3])
# ruddit_test_subset = Subset(ruddit, [5,4,7])


# ruddit_train_subset, ruddit_test_subset = ruddit.split(split_ratio=0.7)

# print(next(iter(ruddit_test_subset)).label)
# (train_iter,)= BucketIterator.splits((ruddit,),
#                                 sort_key=lambda x: len(x.text),  # sort by s attribute (quote)
#                                 sort_within_batch=True,
#                                 batch_size=batch_size,
#                                 device=device) 
# train_iter= BucketIterator(ruddit,
#                                 sort_key=lambda x: len(x.text),  # sort by s attribute (quote)
#                                 sort_within_batch=True,
#                                 batch_size=batch_size,
                                # device=device) 

# indices = [5,4,7]
# train_iter, test_iter = BucketIterator.splits((Dataset([ruddit.examples[i] for i in indices], fields=fields), Dataset([ruddit.examples[i] for i in indices], fields=fields)),
#                                 sort_key=lambda x: len(x.text),  # sort by s attribute (quote)
#                                 sort_within_batch=True,
#                                 batch_size=batch_size,
#                                 device=device)
# ruddit_train_subset = Dataset(ruddit_df.loc[[1,2,3]].values, fields=fields)
# ruddit_test_subset = Dataset(ruddit_df.loc[[4,6,7]].values, fields=fields)
# train_iter,test_iter = BucketIterator.splits((ruddit_train_subset, ruddit_test_subset),
#                                 sort_key=lambda x: len(x.text),  # sort by s attribute (quote)
#                                 sort_within_batch=True,
#                                 batch_size=batch_size,
#                                 device=device)
# print(len(train_iter))
# train_iter.create_batches()
# print(train_iter.batches)
# print(next(iter(train_iter)))

# for batch in train_iter:
#     text, batch_len = batch.text
#     print(text.get_device())
#     print(torch.cuda.memory_allocated(device=device), torch.cuda.max_memory_allocated(device=device))
    # print(text)
    # break
#     print(len(text))
    # print(text, batch_len)
    # print("batch.label", batch.label)
    
# for ex in ruddit:
#     print(ex.text) # ruddit is a Dataset # it wont convert text to tensor but DataLoader (not checked) and BucketIterator will via text_field
#     break

# for batch in train_iter.batches:
#     print('Batch size: %d\n'% len(batch))

In [16]:
kf = KFold(n_splits=k_folds, random_state=111, shuffle=True)

for f, (train_index, test_index) in enumerate(kf.split(ruddit.examples)): # ruddit is a generator and ruddit.examples is list
    train_subset = Dataset([ruddit.examples[i] for i in train_index], fields=fields) # subsets dont work!!
    test_subset = Dataset([ruddit.examples[i] for i in test_index], fields=fields)
    train_iter, test_iter = BucketIterator.splits((train_subset, test_subset),
                                sort_key=lambda x: len(x.text),  # sort by s attribute (quote)
                                sort_within_batch=True,
                                batch_size=batch_size,
                                device=device)
                            
    train_iter.create_batches()
    test_iter.create_batches()
    model = BiLSTM(text_field.vocab, hidden_dim, output_dim=1, n_layers=n_layers, bidirectional=True, dropout=dropout_rate, pad_idx=text_field.vocab.stoi[text_field.pad_token])
    model.to(device)
    
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.MSELoss()
    train_losses, test_losses = [], []
    for e in range(n_epochs): # should make it early stopping
        train_loss, test_loss = train(train_iter, test_iter, model, criterion, optimizer) # train for one epoch but updates = no. of batches
        print("Fold", f, "epoch", e, "avg train_loss", train_loss, "test_loss", test_loss)
        train_losses.append(train_loss)
        test_losses.append(test_loss)
        
    
    model_name = output_model_path%(str(test_losses[-1]), "fold_"+str(f))# saving after every epoch # can save after certain batches
    torch.save(model.state_dict(), model_name)
    print("Fold", f, "avg train_loss", np.mean(train_losses), "avg test_loss", np.mean(test_losses))
    # break # using 4:1 train: test ( ie test is 20%)

torch.save(text_field, vocab_path)
# k_folds = 2
# n_epochs = 1
# without val at each batch update took 1.40 min on gpu
# with no val took 5 sec on gpu
# with val at end of epoch took 7 sec on gpu

19it [00:01, 11.07it/s]


Fold 0 epoch 0 avg train_loss 0.11404155998637802 test_loss 0.1013382226228714


19it [00:01, 11.99it/s]


Fold 0 epoch 1 avg train_loss 0.10475546886262141 test_loss 0.09432916566729546


19it [00:01, 12.11it/s]


Fold 0 epoch 2 avg train_loss 0.0940287803348742 test_loss 0.09207454100251197


19it [00:01, 11.70it/s]


Fold 0 epoch 3 avg train_loss 0.09135029288498979 test_loss 0.0910108059644699


19it [00:01, 11.89it/s]


Fold 0 epoch 4 avg train_loss 0.09063581141986345 test_loss 0.08881203383207321


19it [00:01, 11.88it/s]


Fold 0 epoch 5 avg train_loss 0.08180172819840281 test_loss 0.07854141741991043


19it [00:01, 11.78it/s]


Fold 0 epoch 6 avg train_loss 0.07949320344548476 test_loss 0.08217549547553063
Fold 0 avg train_loss 0.0937295493046592 avg test_loss 0.08975452599780899


19it [00:01, 11.82it/s]


Fold 1 epoch 0 avg train_loss 0.11246485027827714 test_loss 0.10435306131839753


19it [00:01, 11.56it/s]


Fold 1 epoch 1 avg train_loss 0.10213008013210799 test_loss 0.09683376029133797


19it [00:01, 11.43it/s]


Fold 1 epoch 2 avg train_loss 0.09734898942865823 test_loss 0.09822099879384041


19it [00:01, 11.54it/s]


Fold 1 epoch 3 avg train_loss 0.09367733115428373 test_loss 0.08862912729382515


19it [00:01, 11.40it/s]


Fold 1 epoch 4 avg train_loss 0.10402490394680124 test_loss 0.09560944736003876


19it [00:01, 11.33it/s]


Fold 1 epoch 5 avg train_loss 0.09322057036977065 test_loss 0.08987650796771049


19it [00:01, 11.48it/s]


Fold 1 epoch 6 avg train_loss 0.08420576743389431 test_loss 0.08099494650959968
Fold 1 avg train_loss 0.09815321324911332 avg test_loss 0.09350254993353573


19it [00:01, 11.31it/s]


Fold 2 epoch 0 avg train_loss 0.11221652968149436 test_loss 0.10163026377558708


19it [00:01, 11.31it/s]


Fold 2 epoch 1 avg train_loss 0.10426381857771623 test_loss 0.09750190973281861


19it [00:01, 11.36it/s]


Fold 2 epoch 2 avg train_loss 0.0986201741585606 test_loss 0.09366491585969924


19it [00:01, 11.53it/s]


Fold 2 epoch 3 avg train_loss 0.09685381443092697 test_loss 0.08584127873182297


19it [00:01, 12.53it/s]


Fold 2 epoch 4 avg train_loss 0.086281694667904 test_loss 0.14199174642562867


19it [00:01, 11.86it/s]


Fold 2 epoch 5 avg train_loss 0.08943756689366542 test_loss 0.082682416588068


19it [00:01, 12.39it/s]


Fold 2 epoch 6 avg train_loss 0.08879519979420461 test_loss 0.09249704629182816
Fold 2 avg train_loss 0.09663839974349604 avg test_loss 0.09940136820077895


19it [00:01, 11.84it/s]


Fold 3 epoch 0 avg train_loss 0.10958172733846464 test_loss 0.10939157754182816


19it [00:01, 11.62it/s]


Fold 3 epoch 1 avg train_loss 0.10458877800326598 test_loss 0.11295296996831894


19it [00:01, 11.32it/s]


Fold 3 epoch 2 avg train_loss 0.10587693397936068 test_loss 0.11129003465175628


19it [00:01, 11.38it/s]


Fold 3 epoch 3 avg train_loss 0.09768615094454665 test_loss 0.10330906882882118


19it [00:01, 11.52it/s]


Fold 3 epoch 4 avg train_loss 0.09553959985312663 test_loss 0.10510232597589493


19it [00:01, 11.31it/s]


Fold 3 epoch 5 avg train_loss 0.09157659505542956 test_loss 0.09089349955320358


19it [00:01, 11.43it/s]


Fold 3 epoch 6 avg train_loss 0.08137310551185357 test_loss 0.08438036367297172
Fold 3 avg train_loss 0.09803184152657825 avg test_loss 0.10247426288468497


19it [00:01, 11.39it/s]


Fold 4 epoch 0 avg train_loss 0.11527502223065025 test_loss 0.1038052998483181


19it [00:01, 11.41it/s]


Fold 4 epoch 1 avg train_loss 0.10684407071063393 test_loss 0.10476048514246941


19it [00:01, 12.14it/s]


Fold 4 epoch 2 avg train_loss 0.10350463500148371 test_loss 0.10963433906435967


19it [00:01, 11.26it/s]


Fold 4 epoch 3 avg train_loss 0.09288895463472918 test_loss 0.09181951582431794


19it [00:01, 11.62it/s]


Fold 4 epoch 4 avg train_loss 0.08398079970165302 test_loss 0.08659720271825791


19it [00:01, 11.49it/s]


Fold 4 epoch 5 avg train_loss 0.0793655251986102 test_loss 0.0806125432252884


19it [00:01, 11.38it/s]


Fold 4 epoch 6 avg train_loss 0.07719997316598892 test_loss 0.07617800012230873
Fold 4 avg train_loss 0.09415128294910703 avg test_loss 0.09334391227790287
