# CS6910 Assignment 3 (RNN Frameworks for transliteration)

In [1]:
import torch
from torch import nn
import tqdm
import wandb
import unicodedata

In [2]:
# Scratch check of nn.LSTM tensor shapes (default sequence-first layout).
rnn = nn.LSTM(input_size=10, hidden_size=20, num_layers=2)
input = torch.randn(5, 3, 10)   # (seq_len, batch, input_size)
h0 = torch.randn(2, 3, 20)      # (num_layers, batch, hidden_size)
c0 = torch.randn(2, 3, 20)      # same shape as h0
output, (hn, cn) = rnn(input, hx=(h0, c0))

In [3]:
# Shape of the LSTM output: (seq_len, batch, hidden_size) -> (5, 3, 20).
print(output.shape)

torch.Size([5, 3, 20])


In [5]:
# Random 4x4 tensor; the next cell reshapes it with .view().
a = torch.randn(4,4)
print(a)

tensor([[-2.0834, -0.5957, -0.7612, -0.2037],
        [ 0.0218, -0.9049, -0.3834,  1.2116],
        [ 0.0391,  0.0981,  0.6162, -1.0046],
        [-2.0050, -0.7412,  0.7707,  0.1779]])


In [7]:
# Reinterpret the 16 elements of `a` as 8 rows; -1 lets PyTorch infer the
# remaining dimension (2 here).
c = a.view(8,-1)
print(c)
# Write through the view: the final print shows the same element of `a`
# changing too, i.e. `c` and `a` share the same underlying data.
c[0,0] = 5
print(c)
print(a)

tensor([[-2.0834, -0.5957],
        [-0.7612, -0.2037],
        [ 0.0218, -0.9049],
        [-0.3834,  1.2116],
        [ 0.0391,  0.0981],
        [ 0.6162, -1.0046],
        [-2.0050, -0.7412],
        [ 0.7707,  0.1779]])
tensor([[ 5.0000, -0.5957],
        [-0.7612, -0.2037],
        [ 0.0218, -0.9049],
        [-0.3834,  1.2116],
        [ 0.0391,  0.0981],
        [ 0.6162, -1.0046],
        [-2.0050, -0.7412],
        [ 0.7707,  0.1779]])
tensor([[ 5.0000, -0.5957, -0.7612, -0.2037],
        [ 0.0218, -0.9049, -0.3834,  1.2116],
        [ 0.0391,  0.0981,  0.6162, -1.0046],
        [-2.0050, -0.7412,  0.7707,  0.1779]])


In [2]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = ('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
# Language code passed to load_data / create_vocabulary_range further down.
TARGET = 'tam'
# NOTE(review): 'english' is not a key of unicode_ranges below (the key there
# is 'eng') -- confirm which spelling the code that consumes SOURCE expects.
SOURCE = 'english'

# Inclusive [first, last] Unicode code points per language code; read by
# create_vocabulary_range.
unicode_ranges = {'tam' : [0x0B80, 0x0BFF],  # Tamil block
                  'eng' : [0x0061, 0x007A],  # 'a'..'z' (lowercase only)
                  'hin' : [0x0900, 0x097F]}  # Devanagari block

cuda


## Preprocessing Functions and Helpers

In [8]:
def load_data(lang, cat):
    """Load one split of the transliteration data for a language.

    Reads ``aksharantar_sampled/{lang}/{lang}_{cat}.csv`` as UTF-8, splits
    every non-blank line on commas and returns the fields column-wise.

    Args:
        lang: Language code used for both the directory and the file name
            (e.g. 'tam').
        cat: Split name used in the file name (e.g. 'train', 'valid', 'test').

    Returns:
        A pair ``(x_data, y_data)`` of lists holding the first and second
        field of each line. Both lists are empty when the file has no
        non-blank lines.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the lines do not yield exactly two columns.
    """
    path = f'aksharantar_sampled/{lang}/{lang}_{cat}.csv'
    # The context manager closes the handle even if parsing fails.
    with open(path, 'r', encoding='utf-8') as handle:
        stripped = (line.strip() for line in handle)
        # Blank lines (e.g. a trailing newline at end of file) would become
        # 1-tuples and break the two-column unpacking below, so skip them.
        pairs = [tuple(line.split(',')) for line in stripped if line]
    if not pairs:
        return [], []
    x_data, y_data = list(map(list, zip(*pairs)))
    return x_data, y_data

def create_vocabulary(*data):
    """Return the set of distinct elements found across all arguments.

    Every positional argument is iterated and each element it yields is
    collected; when the arguments are words (strings) the result is the set
    of characters they contain. With no arguments the result is an empty set.
    """
    return set().union(*data)

def create_vocabulary_range(lang):
    """Build the character set for ``lang`` from its Unicode code-point range.

    Looks up the inclusive ``[begin, end]`` pair for ``lang`` in the
    module-level ``unicode_ranges`` and keeps every character in that range
    whose Unicode general category is not 'Cn' (unassigned).

    Raises:
        KeyError: If ``lang`` is not a key of ``unicode_ranges``.
    """
    first, last = unicode_ranges[lang]
    candidates = map(chr, range(first, last + 1))
    return {ch for ch in candidates if unicodedata.category(ch) != 'Cn'}

In [21]:
# Dropout demo: with a fixed seed, draw two random 5x5 tensors and pass each
# through a freshly built Dropout(p=0.2) layer, printing the result next to
# its input. Surviving entries are scaled by 1 / (1 - p) = 1.25.
torch.manual_seed(0)
for _trial in range(2):
    m = nn.Dropout(p=0.2)
    input = torch.randn(5, 5)
    output = m(input)
    print(output, input)

tensor([[-0.0000, -1.4405, -0.3132, -0.5423,  1.0609],
        [ 0.8650, -0.3950, -2.6440,  0.4028, -0.1971],
        [ 1.8046,  0.0000,  0.2081,  0.0000, -0.0000],
        [-0.1395, -0.7670,  1.5738,  2.5062,  0.0672],
        [ 0.0000, -0.0000, -1.0513, -2.8951, -0.1279]]) tensor([[-1.1258, -1.1524, -0.2506, -0.4339,  0.8487],
        [ 0.6920, -0.3160, -2.1152,  0.3223, -0.1577],
        [ 1.4437,  0.2660,  0.1665,  0.8744, -0.1435],
        [-0.1116, -0.6136,  1.2590,  2.0050,  0.0537],
        [ 0.6181, -0.4128, -0.8411, -2.3160, -0.1023]])
tensor([[ 0.0000,  0.3245, -0.2175, -0.0000,  1.1728],
        [ 0.6111, -0.8414,  1.0910, -1.5002,  1.6123],
        [-1.8477,  0.0000, -0.5914,  0.4194,  1.8864],
        [ 2.6024,  2.1334, -0.6246, -1.3337,  1.3937],
        [-0.0000,  1.0072,  0.0000, -0.0000, -1.9989]]) tensor([[ 0.5627,  0.2596, -0.1740, -0.6787,  0.9383],
        [ 0.4889, -0.6731,  0.8728, -1.2001,  1.2899],
        [-1.4782,  2.5672, -0.4731,  0.3356,  1.5091],
       

In [4]:
# Load the train / valid / test splits for the target language.
x_train, y_train = load_data(lang=TARGET, cat='train')
x_valid, y_valid = load_data(lang=TARGET, cat='valid')
x_test, y_test = load_data(lang=TARGET, cat='test')

# Report the size of each split, one per line.
print(
    f'Number of train samples = {len(x_train)}',
    f'Number of valid samples = {len(x_valid)}',
    f'Number of test samples = {len(x_test)}',
    sep='\n',
)

Number of train samples = 51200
Number of valid samples = 4096
Number of test samples = 4096


In [9]:
# Build the Tamil character vocabulary from its Unicode range.
tam_vocab = create_vocabulary_range(TARGET)

# tam_vocab is derived from TARGET, so it is reported as the target-side
# vocabulary (it was previously labelled "Source").
print(f'Target Vocabulary Size = {len(tam_vocab)}')
print(f'Target Vocabulary = {tam_vocab}')

Source Vocabulary Size = 72
Source Vocabulary = {'ண', 'ா', 'அ', 'ொ', '௺', 'ஸ', '௬', 'ோ', 'ம', 'ௐ', '௭', 'ஔ', 'ீ', 'ஜ', 'ட', 'ஹ', 'ஶ', 'க', 'ன', 'வ', '௸', 'ெ', '௱', 'ை', '௩', 'ஷ', 'ப', 'ழ', 'ஒ', 'ௗ', 'ே', 'ி', 'ஞ', 'ஏ', 'ய', 'ற', 'எ', '௳', 'ஓ', '௹', '௴', '௲', 'ஐ', '௮', 'இ', 'ந', '௧', '௪', 'ஂ', 'த', '௯', '௷', '௶', 'உ', 'ு', 'ங', 'ள', '்', 'ௌ', 'ஆ', '௫', '௰', '௵', 'ர', 'ல', 'ூ', 'ச', 'ஃ', '௨', 'ஊ', '௦', 'ஈ'}
