<a href="https://colab.research.google.com/github/therishabhmittal-05/NLP/blob/main/CharacterRNN_LanguagePredict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn
import numpy as np
import random
import string
import unicodedata
from torch.utils.data import Dataset, random_split
import glob
import os
import time

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [4]:
allowed_chars = string.ascii_letters + " .,;'"+ "_"
n_letters = len(allowed_chars)

def unicode_ascii(s):
  return "".join(
      c for c in unicodedata.normalize("NFD", s)
      if unicodedata.category(c) != 'Mn'
      and c in allowed_chars
  )

In [5]:
print(unicode_ascii('Ślusàrski'))

Slusarski


In [6]:
# Letter to index
def letter_index(letter):
  if letter not in allowed_chars:
    return allowed_chars.find('_')
  else:
     return allowed_chars.find(letter)

# Sentence to tensor
def sent_tensor(sent):
  tensor = torch.zeros(len(sent), 1, n_letters)
  for li, letter in enumerate(sent):
    tensor[li][0][letter_index(letter)] = 1
  return tensor

In [7]:
sent_tensor('Rishabh Mittal')
sent_tensor('$')

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 1.]]])

In [8]:
class LangDs(Dataset):

  def __init__(self, data_dir):
    self.data_dir = data_dir
    labels_set = set()

    self.data = []
    self.data_tensors = []
    self.labels = []
    self.label_tensors=[]

    text_files = glob.glob(os.path.join(data_dir, '*txt'))
    for filename in text_files:
      label = os.path.splitext(os.path.basename(filename))[0]
      labels_set.add(label)
      lines = open(filename, encoding='utf-8').read().strip().split('\n')
      for name in lines:
        self.data.append(name)
        self.data_tensors.append(sent_tensor(name))
        self.labels.append(label)

      self.labels_uniq = list(labels_set)
      for idx in range(len(self.labels)):
        temp_tensor = torch.tensor([self.labels_uniq.index(self.labels[idx])], dtype=torch.long)
        self.label_tensors.append(temp_tensor)

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    data_item = self.data[idx]
    data_label = self.labels[idx]
    data_tensor = self.data_tensors[idx]
    label_tensor = self.label_tensors[idx]

    return data_item, data_label, data_tensor, label_tensor

In [9]:
alldata = LangDs('/content/drive/MyDrive/Colab Notebooks/data/data/names')

In [10]:
train_set, val_set = random_split(alldata, [(int)(len(alldata)*0.85), len(alldata) - (int)(len(alldata)*0.85)], generator=torch.Generator().manual_seed(42))

In [32]:
print(len(train_set))

17062


In [61]:
indices = list(range(len(train_set)))
print(len(indices))
random.shuffle(indices)
batches = np.array_split(indices, len(indices) // 64)
print(len(batches[37]))

17062
65


In [12]:
class LangModel(nn.Module):
  def __init__(self, input_size, hidden_size, output_size):
    super(LangModel, self).__init__()

    self.rnn = nn.RNN(input_size, hidden_size)
    self.fc = nn.Linear(hidden_size, output_size)
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, senttensor):
    rnn_out, ahat = self.rnn(senttensor)
    ahat = self.fc(ahat[0])
    yhat = self.softmax(ahat)
    return yhat


In [21]:
n_hidden = 128
model = LangModel(n_letters, n_hidden, len(alldata.labels_uniq)).to(device)
print("Device: cuda" if next(model.parameters()).is_cuda else "Device: cpu")
print(model)

Device : cuda
LangModel(
  (rnn): RNN(58, 128)
  (fc): Linear(in_features=128, out_features=18, bias=True)
  (softmax): LogSoftmax(dim=1)
)


In [22]:
input = sent_tensor('Alexandra').to(device)
output = model(input)
def get_label_from_output(output, output_labels):
  _, top_i = output.topk(1)
  label_i = top_i[0].item()
  return output_labels[label_i], label_i
print(get_label_from_output(output, alldata.labels_uniq))

('Polish', 9)


In [87]:
def train(model, train_data, n_epochs=10, batch_size=64, report=50, lr = 0.2, criterion = nn.NLLLoss()):
  losses=[] # to store loss after each epoch for plotting
  model.train() # train mode
  optimizer = torch.optim.SGD(model.parameters(), lr=lr)

  print(f"length of train data: {len(train_data)}")
  for iter in range(1, n_epochs+1):
    running_loss = 0.0 # One epoch loss

    # create some minibatches
    # we cannot use dataloaders because each of our names is a different length
    indices = list(range(len(train_data)))
    random.shuffle(indices)
    batches = np.array_split(indices, max(1, len(indices) // batch_size))

    for idx, batch in enumerate(batches):
      batch_loss = 0.0 # per batch loss, added to running loss after each batch run
      optimizer.zero_grad()

      # For each word in the batch
      for i in batch:
        (data_item, data_label, data_tensor, label_tensor) = train_data[i]
        data_tensor = data_tensor.to(device)
        label_tensor = label_tensor.to(device)
        output = model(data_tensor)
        loss = criterion(output, label_tensor)
        batch_loss += loss
      batch_loss /= len(batch)
      batch_loss.backward()
      nn.utils.clip_grad_norm_(model.parameters(), max_norm=3)
      optimizer.step()
      running_loss += batch_loss.item()

    losses.append(running_loss/len(batches))
    if(iter%report==0):
      print(f"Epoch: {iter} | Epoch Loss: {losses[-1]} ")
    running_loss = 0 # set to zero again, to calculate loss for next epoch
  return losses # list of losses at each epoch

In [88]:
start = time.time()
losses = train(model, train_set, n_epochs=30, report=5)
end = time.time()

print(f"training took {end-start}s")

length of train data: 17062
Epoch: 5 | Epoch Loss: 0.7993450484105519 
Epoch: 10 | Epoch Loss: 0.7255371973702782 
Epoch: 15 | Epoch Loss: 0.6636278130730292 
Epoch: 20 | Epoch Loss: 0.6248615726940614 
Epoch: 25 | Epoch Loss: 0.57053168308466 
Epoch: 30 | Epoch Loss: 0.5407787313810865 
training took 519.369300365448s


In [90]:
input = sent_tensor('Jing').to(device)
output = model(input)
def get_label_from_output(output, output_labels):
  _, top_i = output.topk(1)
  label_i = top_i[0].item()
  return output_labels[label_i], label_i
print(get_label_from_output(output, alldata.labels_uniq))

('German', 1)


In [None]:
def eval()