In [1]:
import torch
print(torch.cuda.is_available())
print("Done!")

True
Done!


In [3]:
import numpy as np
from tqdm import tqdm

UNK_LABEL = "unk_label"
PAD_LABEL = "pad_label"
added_words = [PAD_LABEL, UNK_LABEL]
class Embedded_Words:
    def __init__(self, model_file: str, added_pads: list, norm: bool) -> None:
        self.vectors, self.w2i, self.i2w = self.read_model(model_file, added_pads, norm)

    def read_model(self, model_file: str, added_pads: list, norm: bool) -> tuple:
        with open(model_file, "r", encoding="utf-8") as f:
            lines = [x.strip() for x in f.readlines()]

        print(model_file)
        print(len(lines))
        print(lines[0])

        num_word, dim = [int(x) for x in lines[0].split()]
        vectors = np.zeros((num_word + len(added_pads), dim))
        w2i = {}
        i2w = {}
        for line in tqdm(lines[1:]):
            tokens = line.split()
            word = tokens[0]
            word_index = len(w2i)
            v = np.array([float(x) for x in tokens[1:]])
            if norm:
                v = v / np.linalg.norm(v)
            vectors[word_index] = v
            w2i[word] = word_index
            i2w[word_index] = word

        for word in added_pads:
            word_index = len(w2i)
            w2i[word] = word_index
            i2w[word_index] = word
        
        return vectors, w2i, i2w

model_file = "drive/MyDrive/ColabData/embed.model"
embedded_words = Embedded_Words(model_file, added_words, True)
print("")
print(embedded_words.vectors.shape)
print(embedded_words.vectors[embedded_words.vectors.shape[0]-5:,:10])

drive/MyDrive/ColabData/embed.model
85899
85898 256


100%|██████████| 85898/85898 [00:08<00:00, 9595.20it/s]



(85900, 256)
[[ 0.06298662 -0.03220133 -0.02866318  0.12042718  0.18225189 -0.06114065
   0.05848268 -0.04147663  0.04435284 -0.00816747]
 [ 0.05723497 -0.02765993  0.0106675   0.13086651  0.09357034 -0.08705442
  -0.00878455 -0.0667807  -0.00667503 -0.00585762]
 [-0.05147354 -0.00427562  0.09155177  0.13254647  0.11083994 -0.00598478
   0.02139013 -0.03059529 -0.08231317  0.02991129]
 [ 0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.        ]]


In [4]:
def get_active_device():
    """Picking GPU if available or else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')
active_device = get_active_device()
print(active_device)

cuda


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

BATCH_FIRST = True
FREEZE_EMBEDDING = True
NORM_EMBED_VECS = True
HIDDEN_DIM = 512
BIDIRECTIONAL = True
NUM_LAYERS = 1
DROPOUT = 0.5
class LSTM(nn.Module):
    def __init__(self,
                 num_classes: int):
        # Constructor.
        super(LSTM, self).__init__()
        
        print('Freeze embedding matrix = ' + str(FREEZE_EMBEDDING))
        print('load embedding model')
        added_words = [PAD_LABEL, UNK_LABEL]
        self.w2v_model = Embedded_Words(model_file, added_words, NORM_EMBED_VECS)
        print('\tw2v after padding: ' + str(self.w2v_model.vectors.shape))

        print('generate embedding tensor')
        # Set the embedding module.
        self.embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(self.w2v_model.vectors).to(active_device),
            padding_idx = self.w2v_model.w2i[PAD_LABEL],
            freeze=True) #config[FREEZE_EMBEDDING])
        
        # LSTM layer.
        hidden_dim = HIDDEN_DIM
        self.lstm = nn.LSTM(self.w2v_model.vectors.shape[1],
                            hidden_dim,
                            num_layers=NUM_LAYERS,
                            bidirectional=BIDIRECTIONAL,
                            dropout=0,
                            batch_first=BATCH_FIRST).to(active_device)
        
        # Set the dropout for the embedding layer and the lstm's output layer.
        self.dropout = nn.Dropout(DROPOUT)

        # Set the last layer, fully connected.
        self.fc = nn.Linear(hidden_dim * (2 if BIDIRECTIONAL else 1), num_classes).to(active_device)

    def embed_text(self, texts):
      return self.dropout(self.embedding(texts))
        
    # The forward function.
    def forward(self, texts, lengths):
        # Get the embedding of the given text.
        # texts = [#batch size, sentence length, embed dim]
        #lengths = [#batch size]
        #embed_text = self.dropout(self.embedding(texts))
        # embed_text = [#batch size, sentence length, embed dim]
        
        # Get the packed sentences.
        packed_text = pack_padded_sequence(texts, lengths, batch_first=BATCH_FIRST)
        # packed_text = [[sum lengths, embed dim], [#sequence length (#active batch items for ech length)]]

        # Call the LSTM layer.
        packed_output, (hidden, cell) = self.lstm(packed_text)
        # packed output = [[sum lengths, hidden dim], [#sequence length]]
        # hidden = [1 (2 if bidirectional), #batch size, hidden dim]
        # cell   = [1 (2 if bidirectional), #batch size, hidden dim]
        
        # unpack the output.
        pad_packed_output, _ = pad_packed_sequence(packed_output, batch_first=BATCH_FIRST)
        # pad_packed_output = [batch size, sentence length, hidden dim * 2]
        
        # Prmute the output before pooling.
        permuted_output = self.dropout(pad_packed_output.permute(0, 2, 1))
        # permuted_output = [batch size, hidden dim * 2, sentence length]

        # Max pooling layer.        
        pooled_output = F.max_pool1d(permuted_output, kernel_size=permuted_output.shape[2])
        # pooled_output = [batch size, hidden dim * 2, 1]
        
        # Call the linear full connected layer, after droping out.
        logits = self.fc(torch.squeeze(pooled_output, dim=2))
        # logits = [batch size, #classes]
        
        return logits

In [14]:
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import RandomSampler
from torch.utils.data import SequentialSampler
from torch.utils.data import TensorDataset
import pandas as pd
import copy
import random

LENGTH_COL = "lengths"
TOKENS_COL = "tokens"
LABEL_COL = "sentiment"
MINI_BATCH_SIZE = 3000
MAX_TRAIN_LENGTH = 512
def break_by_batch_size(df: pd.DataFrame) -> list:
    sorted_df = df.sort_values(by=LENGTH_COL, axis=0, ascending=False, ignore_index=True)
    tokens = sorted_df[TOKENS_COL].to_list()
    labels = sorted_df[LABEL_COL].to_list()
    lengths = sorted_df[LENGTH_COL].to_list()
    
    df_list = []
    header = {TOKENS_COL:[], LABEL_COL:[], LENGTH_COL:[]}
    row_index = 0
    num_rows = len(labels)
    while row_index < num_rows:
        num_words = 0
        curr_df = copy.deepcopy(header)
        while num_words < MINI_BATCH_SIZE and row_index < num_rows:
            actual_length = min(MAX_TRAIN_LENGTH, lengths[row_index])
            num_words += actual_length
            curr_df[TOKENS_COL].append(tokens[row_index][:MAX_TRAIN_LENGTH])
            curr_df[LABEL_COL].append(labels[row_index])
            curr_df[LENGTH_COL].append(actual_length)
            row_index += 1
        
        df_list.append(pd.DataFrame(curr_df))
        
    return df_list

DROP_WORD_PROB = -1
RANDOM_SAMPLING = "random_sampling"
SEQUENTIAL_SAMPLING = "sequential_sampling"
def df_to_dataloader(df: pd.DataFrame, w2v_model: Embedded_Words, sampling_type: str, is_labeled: int) -> DataLoader:
    sorted_df = df.sort_values(by=LENGTH_COL, axis=0, ascending=False, ignore_index=True)
    texts = sorted_df[TOKENS_COL].values.tolist()
    labels = sorted_df[LABEL_COL].values.tolist()
    lengths = sorted_df[LENGTH_COL].to_list()
    max_len = sorted_df[TOKENS_COL].map(len).max()
    labeled = [is_labeled] * len(df)

    indexed_texts = []
    for sentence in texts:
        sentence += [PAD_LABEL] * (max_len - len(sentence))
        ids = []
        for word in sentence:
            if word not in w2v_model.w2i:
                ids.append(w2v_model.w2i[UNK_LABEL])
            elif np.random.random() < DROP_WORD_PROB:
                ids.append(w2v_model.w2i[UNK_LABEL])
            else:
                ids.append(w2v_model.w2i[word])
        
        indexed_texts.append(ids)
        
    inputs, labels, lengths, labeled = tuple(torch.tensor(data) for data in [indexed_texts, labels, lengths, labeled])

    data = TensorDataset(inputs, labels, lengths, labeled)
    
    if sampling_type == RANDOM_SAMPLING:
        sampler = RandomSampler(data)
    elif sampling_type == SEQUENTIAL_SAMPLING:
        sampler = SequentialSampler(data)
    else:
        print('Wrong Sampling Type: ' + sampling_type)
        return None
        
    dataloader = DataLoader(data, sampler=sampler, batch_size=MINI_BATCH_SIZE)
    return dataloader

TEXT_COL = "review"
def get_data_loaders(input_df: pd.DataFrame,
                     w2v_model: Embedded_Words,
                     sampling_type: str,
                     is_labeled: int,
                     break_df_func) -> list:
    input_df[TOKENS_COL] = input_df[TEXT_COL].apply(lambda x: x.split(' '))
    input_df[LENGTH_COL] = input_df[TOKENS_COL].map(len)
    df_list = break_df_func(input_df)
    dataloaders = []
    for df in df_list:
        dataloader = df_to_dataloader(df, w2v_model, sampling_type, is_labeled)
        dataloaders.append(dataloader)
        
    return dataloaders

RHO = 0.95
LEARNING_RATE = 0.001
OPT_NAME = "adam"
BETA_ONE = 0
BETA_TWO = 0.98
ADAM_EPS = 0.00000001
ADADELATA_OPT = "adadelta"
SGD_OPT = "sgd"
ADAM_OPT = "adam"
def get_optimizer(parameters):
    optimizer = None
    if OPT_NAME == ADADELATA_OPT:
        optimizer = optim.Adadelta(parameters,
                                   lr=LEARNING_RATE,
                                   rho=RHO)
    elif OPT_NAME == SGD_OPT:
        optimizer = optim.SGD(parameters, LEARNING_RATE)
    elif OPT_NAME == ADAM_OPT:
        optimizer = optim.Adam(parameters,
                               lr=LEARNING_RATE,
                               betas=(BETA_ONE,BETA_TWO,),
                               eps=ADAM_EPS)
    else:
        print('Wrong optimizer name: ' + OPT_NAME)
        
    return optimizer
    

CROSS_ENTROP_LOSS = "cross_entropy_loss"
BCE_LOSS = "bce_loss"
def get_loss_function(func_name: str):
    loss_func = None
    if func_name == CROSS_ENTROP_LOSS:
        loss_func = nn.CrossEntropyLoss()
    elif func_name == BCE_LOSS:
        loss_func = nn.BCELoss()
    else:
        print('Wrong loss function name: ' + func_name)
        
    return loss_func


def set_seed(seed_value: int):
    if seed_value >= 0:
        random.seed(seed_value)
        np.random.seed(seed_value)
        torch.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)


In [30]:
import time
import torch
import torch.nn as nn

def test(model: nn.Module, dataloaders: list):
    corrects = 0
    evaluated = 0
    start_time = time.time()
    model.eval()
    for dl in dataloaders:
        for texts, labels, lengths, _ in dl:
            texts = texts.to(active_device)
            labels = labels.to(active_device)
            lengths = lengths.to(torch.device("cpu"))

            with torch.no_grad():
                embed_text = model.embed_text(texts)
                logits = model(embed_text, lengths)
            preds = torch.argmax(logits, dim=1)
            corrects += (preds == labels).sum().item()
            evaluated += texts.shape[0]
        
    accuracy = corrects / evaluated
    run_time = time.time() - start_time
    return accuracy, run_time

In [31]:
def end_train(last_model: nn.Module, opt_model: nn.Module, test_dl: list, val_dl: list, log_file):
    accuracy, run_time = test(last_model, test_dl)
    str_acc = "{:.5f}".format(accuracy)
    str_time = "{:.1f}".format(run_time)
    log_file.write('Last Model\t' + str_acc + '\t' + str_time + '\n')
    log_file.flush()
    print('Last Model\t' + str_acc + '\t' + str_time)
        
    # Print optimal
    opt_acc, run_time = test(opt_model, test_dl)
    val_acc, run_time = test(opt_model, val_dl)

    test_acc = "test: {:.5f}".format(opt_acc)
    val_acc = "val: {:.5f}".format(val_acc)
    log_file.write('Optimal Model\tTest=' + test_acc + '\tVal=' + val_acc + '\n')
    log_file.flush()
    print('Optimal Model\tTest=' + test_acc + '\tVal=' + val_acc)
    log_file.write("\nConfiguraton:\n")


In [47]:
EPS_ADV = 0.1
VAT_EPS = 5

class EMLoss(nn.Module):
    def __init__(self):
        super(EMLoss, self).__init__()

    def forward(self, x):
        b = F.softmax(x, dim=1) * F.log_softmax(x, dim=1)
        b = -1.0 * b.sum()
        b = b / x.shape[0]
        return b

def generate_vat_texts(embed_texts: torch.Tensor) -> torch.Tensor:
    vat_vecs = torch.normal(0, 1, size=embed_texts.size())
    square = torch.sum(vat_vecs ** 2, [2], keepdim=True)
    vat_texts = vat_vecs / torch.sqrt(square)
    vat_texts = embed_texts + EPS_ADV*vat_texts
    return vat_texts

def generate_vat_loss(embed_texts: torch.Tensor, lengths: torch.Tensor, model: LSTM) -> torch.Tensor:
    x = embed_texts.clone().detach().to(torch.float)
    x = x.to(torch.device('cpu'))
    vat_texts = generate_vat_texts(x)

    vat_texts = vat_texts.to(active_device)
    vat_texts.requires_grad = True
    x = x.to(active_device)
    vat_logits = model(vat_texts, lengths)
    reg_logits = model(x, lengths)

    reg_log_logits = F.log_softmax(reg_logits, dim=1)
    vat_log_logits = F.log_softmax(vat_logits, dim=1)

    loss = F.kl_div(vat_log_logits, reg_log_logits, reduction="batchmean", log_target=True)
    loss.backward()

    grad = vat_texts.grad
    grad = grad.clone().detach().to(torch.float)
    square = torch.sum(grad ** 2, [2], keepdim=True)
    norm_grad = grad / torch.sqrt(square)
    star_vecs = embed_texts.clone().detach().to(torch.float) + VAT_EPS*norm_grad
    star_vecs = star_vecs.to(active_device)
    star_logits = model(star_vecs, lengths)
    star_log_logits = F.log_softmax(star_logits, dim=1)

    x = x.clone().detach().to(torch.float)
    x = x.to(active_device)
    reg_logits = model(x, lengths)
    reg_log_logits = F.log_softmax(reg_logits, dim=1)

    return F.kl_div(star_log_logits, reg_log_logits, reduction="batchmean", log_target=True)


In [52]:
import time
import copy
import torch.nn as nn
import random
import pandas as pd
NUM_EPOCHS = 200
MIN_EPOCHS_TO_STOP = 75
MAX_NO_IMP = 3
MAX_VALID_LOSS = 0.35
LOG_FILE_NAME = "drive/MyDrive/ColabLogs/lstm_imdb_ml_em_unlab.txt"
SEED_VALUE = -1
TRAIN_SET = "drive/MyDrive/ColabData/imdb_train_set.csv"
VALIDATION_SET = "drive/MyDrive/ColabData/imdb_val_set.csv"
TEST_SET = "drive/MyDrive/ColabData/imdb_test.csv"
UNLABELED_SET = "drive/MyDrive/ColabData/imdb_unlabeled_set.csv"
MIN_VALID_EPOCHS = 10
class Trainer:
    def __init__(self):
        pass

    def train(self) -> tuple:
        print('lstm trainer - start')
        log_file = open(LOG_FILE_NAME, "w", encoding="utf-8")
        set_seed(SEED_VALUE)

        unlabeled_df = pd.read_csv(UNLABELED_SET) 
        train_df = pd.read_csv(TRAIN_SET)
        val_df = pd.read_csv(VALIDATION_SET)
        test_df = pd.read_csv(TEST_SET)

        pending_model = LSTM(num_classes=train_df[LABEL_COL].unique().shape[0])
        optimal_model = None
        
        optimizer =  get_optimizer(pending_model.parameters())
        loss_func = nn.CrossEntropyLoss()
        
        print('start training loops. #epochs = ' + str(NUM_EPOCHS))
        print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Train Acc':^11} | {'Test Acc':^9} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*50)  
        
        log_file.write(f"{'Epoch':^7} | {'Train Loss':^12} | {'Train Acc':^11} | {'Test Acc':^9} | {'Val Acc':^9} | {'Elapsed':^9}\n")
        log_file.write("-"*50 + "\n")
            
        best_val_acc = 0
        best_test_acc = 0
        best_val_epoch = -1
        best_test_epoch = -1
        min_loss = 100
        num_no_imp = 0
        em_loss_func = EMLoss()
        ml_loss_func = nn.CrossEntropyLoss()
        validation_dl = get_data_loaders(val_df, pending_model.w2v_model, sampling_type=SEQUENTIAL_SAMPLING, is_labeled=1, break_df_func=break_by_batch_size)
        test_dl = get_data_loaders(test_df, pending_model.w2v_model, sampling_type=SEQUENTIAL_SAMPLING,is_labeled=1, break_df_func=break_by_batch_size)
        train_dl = get_data_loaders(train_df, pending_model.w2v_model, sampling_type=SEQUENTIAL_SAMPLING, is_labeled=1, break_df_func=break_by_batch_size)
        non_labeled_dl = get_data_loaders(unlabeled_df, pending_model.w2v_model, sampling_type=SEQUENTIAL_SAMPLING, is_labeled=0, break_df_func=break_by_batch_size)
        train_set_dl = train_dl + non_labeled_dl
        print("type train set: " + str(type(train_set_dl)))
        print("len train set: " + str(len(train_set_dl)))
        print("len labeled train: " + str(len(train_dl)))
        print("len non labeled: " + str(len(non_labeled_dl)))
        for i in range(NUM_EPOCHS):
            epoch = i + 1
            epoch_start_time = time.time()
            total_loss = 0
            num_batches = 0

            random.shuffle(train_set_dl)
            pending_model.train()
            for dl in train_set_dl:
                for texts, labels, lengths, is_labeled_indicators in dl:
                    texts = texts.to(active_device)
                    labels = labels.to(active_device)
                    lengths = lengths.to(torch.device("cpu"))
                    embed_texts = pending_model.embed_text(texts)

                    optimizer.zero_grad()
                    ml_logits = pending_model(embed_texts, lengths)
                    if is_labeled_indicators[0] > 0:
                        ml_loss = ml_loss_func(ml_logits, labels)
                    else:
                        ml_loss = None

                    x = embed_texts.clone().detach().to(torch.float)
                    x = x.to(active_device)
                    em_logits = pending_model(x, lengths)
                    em_loss = em_loss_func(em_logits)

                    #vat_loss = generate_vat_loss(embed_texts=embed_texts, lengths=lengths, model=pending_model)

                    if ml_loss is not None:
                        ml_loss.backward()
                    em_loss.backward()
                    #vat_loss.backward()
                    optimizer.step()

                    total_loss += em_loss.item()
                    #total_loss += vat_loss.item()
                    if ml_loss is not None:
                        total_loss += ml_loss.item()
                    num_batches += 1

            avg_loss = total_loss / num_batches
            epoch_time = time.time() - epoch_start_time
            
            # Validation test.
            val_acc, _ = test(pending_model, validation_dl)
            train_acc, _ = test(pending_model, train_dl)
            test_acc, _ = test(pending_model, test_dl)
            val_acc *= 100
            train_acc *= 100
            test_acc *= 100
            print(f"{epoch:^7} | {avg_loss:^12.6f} | {train_acc:^9.2f} | {test_acc:^9.2f} |  {val_acc:^9.4f} | {epoch_time:^9.2f}")
            log_file.write(f"{epoch:^7} | {avg_loss:^12.6f}  {train_acc:^9.2f} | {test_acc:^9.2f} |  {val_acc:^9.4f} | {epoch_time:^9.2f}\n")
            log_file.flush()
                  
            if avg_loss < min_loss:
                min_loss = avg_loss
                num_no_imp = 0
            else:
                num_no_imp += 1
                
            if num_no_imp > MAX_NO_IMP and epoch > MIN_EPOCHS_TO_STOP:
                print('early stop exit')
                log_file.write('\tEarly Stop exit\n')
                log_file.flush()
                break
            
            if epoch < MIN_VALID_EPOCHS:
                continue
            
            if avg_loss > MAX_VALID_LOSS:
                continue
            
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                optimal_model = copy.deepcopy(pending_model)
                best_val_epoch = epoch

            if test_acc > best_test_acc:
              best_test_acc = test_acc
              best_test_epoch = epoch
        
        print('train_lstm_nlp - end')
        print("Best test results: acc={:.3f}".format(best_test_acc) + ", epoch=" + str(best_test_epoch))
        print("Best val results: acc={:.3f}".format(best_val_acc) + ", epoch=" + str(best_val_epoch))
        log_file.write("Best test results: acc={:.3f}".format(best_test_acc) + ", epoch=" + str(best_test_epoch) + "\n")
        log_file.write("Best val results: acc={:.3f}".format(best_val_acc) + ", epoch=" + str(best_val_epoch) + "\n")
        log_file.flush()
        end_train(pending_model, optimal_model, test_dl, validation_dl, log_file)
        return pending_model, optimal_model, best_val_epoch


In [None]:
trainer = Trainer()
last_model, opt_model, best_epoch = trainer.train()
print(best_epoch)

lstm trainer - start
Freeze embedding matrix = True
load embedding model
drive/MyDrive/ColabData/embed.model
85899
85898 256


100%|██████████| 85898/85898 [00:08<00:00, 10013.40it/s]


	w2v after padding: (85900, 256)
generate embedding tensor
start training loops. #epochs = 200
 Epoch  |  Train Loss  |  Train Acc  | Test Acc  |  Val Acc  |  Elapsed 
--------------------------------------------------
type train set: <class 'list'>
len train set: 3802
len labeled train: 1869
len non labeled: 1933
   1    |   0.604605   |   85.38   |   85.12   |   83.3727  |  340.49  
   2    |   0.388632   |   88.23   |   88.15   |   87.9433  |  339.92  
   3    |   0.345055   |   88.86   |   88.27   |   87.1552  |  340.41  
   4    |   0.322034   |   90.09   |   89.34   |   88.0221  |  340.18  
   5    |   0.305352   |   91.06   |   90.04   |   88.3373  |  340.30  
   6    |   0.292830   |   90.81   |   89.94   |   89.1253  |  340.52  
   7    |   0.269239   |   91.60   |   90.33   |   90.3861  |  340.58  
   8    |   0.260764   |   91.43   |   90.12   |   88.9677  |  339.87  
   9    |   0.250274   |   93.30   |   91.48   |   90.7801  |  340.16  
  10    |   0.242584   |   93.65   |