In [None]:
# import
from pathlib import Path
from tqdm import tqdm 
from mosestokenizer import MosesTokenizer
import logging as log
from typing import List, Iterator, Set, Dict, Optional, Tuple
from collections import Counter
from pathlib import Path
import torch
import copy
import torch.nn as nn
import time
from tqdm import tqdm
import torch.optim as optim

In [None]:
from pathlib import Path
from tqdm import tqdm 
from mosestokenizer import MosesTokenizer
import logging as log

log.basicConfig(level=log.INFO)
tokr = MosesTokenizer()
## split a sentence in string into list whose element is a word

def read_tokenized(dir):
  """Tokenization wrapper"""
  ## call MosesTokenizer for each sentence
  inputfile = open(dir)
  for sent in inputfile:
     yield tokr(sent.strip())

## added function
def data_proessing(toks):
    # toks is list whose element is str
    # use list comprehension
    # str.lower() change str into lower case letter
    # number => <num>, str.isdecimal() check if str is number
    out = ['<num>' if s.isdecimal() else s.lower() for s in toks]
    # insert <bos> to head of out and <eos> to tail of out
    out.insert(0,'<bos>')
    out.append('<eos>')
    return out  

## create train and dev text
## call read_tokenized then write it to file 
train_file = Path('train.txt')
with train_file.open('w') as w:
  for toks in tqdm(read_tokenized(Path('CSCI-544/hw2/train.txt'))):
    #w.write(" ".join(toks) + '\n')
    w.write(" ".join(data_proessing(toks)) + '\n')
    
dev_file = Path('dev.txt')
with dev_file.open('w') as w:
  for toks in tqdm(read_tokenized(Path('CSCI-544/hw2/dev.txt'))):
    #w.write(" ".join(toks) + '\n')
    w.write(" ".join(data_proessing(toks)) + '\n')

In [None]:
! ls -l

In [None]:
! head train.txt

In [2]:
from typing import List, Iterator, Set, Dict, Optional, Tuple
from collections import Counter
from pathlib import Path
import torch

## padding and unknow ?
RESERVED = ['<pad>', '<unk>']

## fixed parameters
PAD_IDX = 0 
UNK_IDX = 1
MAX_TYPES = 10_000
BATCH_SIZE = 256
MIN_FREQ = 5


class Vocab:
  """ Mapper of words <--> index """

  def __init__(self, types):
    # types is list of strings
    assert isinstance(types, list)
    assert isinstance(types[0], str)

    self.idx2word = types
    self.word2idx = {word: idx for idx, word in enumerate(types)}
    assert len(self.idx2word) == len(self.word2idx)  # One-to-One

  def __len__(self):
    return len(self.idx2word)
  
  def save(self, path: Path):
    log.info(f'Saving vocab to {path}')
    with path.open('w') as wr:
      for word in self.idx2word:
        wr.write(f'{word}\n')
 
  @staticmethod
  def load(path):
    log.info(f'loading vocab from {path}')
    types = [line.strip() for line in path.open()]
    for idx, tok in enumerate(RESERVED): # check reserved
      assert types[idx] == tok
    return Vocab(types)

  @staticmethod
  def from_text(corpus: Iterator[str], max_types: int,
                             min_freq: int = 5):
    """
    corpus: text corpus; iterator of strings
    max_types: max size of vocabulary
    min_freq: ignore word types that have fewer ferquency than this number
    """
    log.info("building vocabulary; this might take some time")
    term_freqs = Counter(tok for line in corpus for tok in line.split())
    for r in RESERVED:
      if r in term_freqs:
        log.warning(f'Found reserved word {r} in corpus')
        del term_freqs[r]
    term_freqs = list(term_freqs.items())
    log.info(f"Found {len(term_freqs)} types; given max_types={max_types}")
    term_freqs = {(t, f) for t, f in term_freqs if f >= min_freq}
    log.info(f"Found {len(term_freqs)} after dropping freq < {min_freq} terms")
    term_freqs = sorted(term_freqs, key=lambda x: x[1], reverse=True)
    term_freqs = term_freqs[:max_types]
    types = [t for t, f in term_freqs]
    types = RESERVED + types   # prepend reserved words
    return Vocab(types)


train_file = Path('train.txt')
vocab_file = Path('vocab.txt')

if not vocab_file.exists():
  train_corpus = (line.strip() for line in train_file.open())
  vocab = Vocab.from_text(train_corpus, max_types=MAX_TYPES, min_freq=MIN_FREQ)
  vocab.save(vocab_file)
else:
  vocab = Vocab.load(vocab_file)

log.info(f'Vocab has {len(vocab)} types')

In [None]:
! head vocab.txt

In [3]:
import copy

class TextDataset:

  def __init__(self, vocab: Vocab, path: Path):
    self.vocab = vocab
    log.info(f'loading data from {path}')
    # for simplicity, loading everything to memory; on large datasets this will cause OOM

    text = [line.strip().split() for line in path.open()]

    # words to index; out-of-vocab words are replaced with UNK
    xs = [[self.vocab.word2idx.get(tok, UNK_IDX) for tok in tokss] 
                 for tokss in text]
    
    self.data = xs
    
    log.info(f"Found {len(self.data)} records in {path}")

  def as_batches(self, batch_size, shuffle=False): # data already shuffled
    data = self.data
    if shuffle:
      random.shuffle(data)
    for i in range(0, len(data), batch_size): # i incrememt by batch_size
      batch = data[i: i + batch_size]  # slice
      yield self.batch_as_tensors(batch)
  
  @staticmethod
  def batch_as_tensors(batch):
    
    n_ex = len(batch)
    max_len = max(len(seq) for seq in batch)
    seqs_tensor = torch.full(size=(n_ex, max_len), fill_value=PAD_IDX,
                             dtype=torch.long)
    
    for i, seq in enumerate(batch):
      seqs_tensor[i, 0:len(seq)] = torch.tensor(seq)
      
    return seqs_tensor

train_data = TextDataset(vocab=vocab, path=train_file)
dev_data = TextDataset(vocab=vocab, path=Path('dev.txt'))

In [4]:
import torch.nn as nn
class FNN_LM(nn.Module):

  def __init__(self, vocab_size, n_class, emb_dim=50, hid=100, dropout=0.2):
    super(FNN_LM, self).__init__()
    self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                  embedding_dim=emb_dim, 
                                  padding_idx=PAD_IDX)
    self.linear1 = nn.Linear(emb_dim, hid)
    self.linear2 = nn.Linear(hid, n_class)
    self.dropout = nn.Dropout(p=dropout)

  def forward(self, seqs, log_probs=True):
    """Return log Probabilities"""
    batch_size, max_len = seqs.shape
    embs = self.embedding(seqs)  # embs[Batch x SeqLen x EmbDim]
    embs = self.dropout(embs)
    embs = embs.sum(dim=1)   # sum over all all steps in seq    
    
    hid_activated = torch.relu(self.linear1(embs)) # Non linear
    scores = self.linear2(hid_activated)

    if log_probs:
      return torch.log_softmax(scores, dim=1)
    else:
      return torch.softmax(scores, dim=1)

In [5]:
def save_model_object(model, name):
  torch.save({'state_dict': model.state_dict()}, name +".pt")
  return

In [6]:
# Trainer Optimizer 
import time
from tqdm import tqdm
import torch.optim as optim

def train(model, n_epochs, batch_size, train_data, valid_data, device=torch.device('cuda')):
  log.info(f"Moving model to {device}")
  model = model.to(device)   # move model to desired device 
  optimizer = optim.Adam(params=model.parameters())
  log.info(f"Device for training {device}")
  losses = []
  for epoch in range(n_epochs):
    start = time.time()
    num_toks = 0
    train_loss = 0.
    n_train_batches = 0

    model.train() # switch to train mode 
    with tqdm(train_data.as_batches(batch_size=BATCH_SIZE), leave=False) as data_bar:
      for seqs in data_bar:
          
        seq_loss = torch.zeros(1).to(device)
        for i in range(1, seqs.size()[1]-1):
          # Move input to desired device
          cur_seqs = seqs[:, :i].to(device) # take w0...w_(i-1) python indexing
          cur_tars = seqs[:, i].to(device)  # predict w_i
          #print("seqs shape", cur_seqs.shape)
          #print("tasrs shape", cur_tars.shape)

          log_probs = model(cur_seqs)
          #print("cur_tars shape", cur_tars.shape) cur_tars torch.Size([256])
          #print("loss func shape", log_probs.shape) loss func torch.Size([256, 10002])
          seq_loss += loss_func(log_probs, cur_tars).sum() / len(seqs)
        
        seq_loss /= (seqs.shape[1] - 1) # only n-1 toks are predicted
        train_loss += seq_loss.item()
        n_train_batches += 1

        optimizer.zero_grad()         # clear grads
        seq_loss.backward()
        optimizer.step()

        pbar_msg = f'Loss:{seq_loss.item():.4f}'
        data_bar.set_postfix_str(pbar_msg)

    # Run validation
    with torch.no_grad():
      model.eval() # switch to inference mode -- no grads, dropouts inactive
      val_loss = 0
      n_val_batches = 0
      for seqs in valid_data.as_batches(batch_size=batch_size, shuffle=False):
        # Move input to desired device
        seq_loss = torch.zeros(1).to(device)
        for i in range(1, seqs.size()[1]-1):
          # Move input to desired device
          cur_seqs = seqs[:, :i].to(device)
          cur_tars = seqs[:, i].to(device)

          log_probs = model(cur_seqs)
          seq_loss += loss_func(log_probs, cur_tars).sum() / len(seqs)
        seq_loss /= (seqs.shape[1] - 1)
        val_loss += seq_loss.item() 
        n_val_batches += 1
        
    save_model_object(model, "fnn")  
    
    avg_train_loss = train_loss / n_train_batches
    avg_val_loss = val_loss / n_val_batches
    losses.append((epoch, avg_train_loss, avg_val_loss))
    log.info(f"Epoch {epoch} complete; Losses: Train={avg_train_loss:G} Valid={avg_val_loss:G}")
  return losses

model = FNN_LM(vocab_size=len(vocab), n_class=len(vocab))
loss_func = nn.NLLLoss(reduction='none')
losses = train(model, n_epochs=5, batch_size=BATCH_SIZE, train_data=train_data,
                valid_data=dev_data)

                                    

RuntimeError: CUDA error: out of memory