In [15]:
!nvidia-smi

Sun Mar 31 15:45:12 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.67                 Driver Version: 550.67         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 2070 ...    Off |   00000000:0A:00.0  On |                  N/A |
|  0%   47C    P5             24W /  215W |     765MiB /   8192MiB |     32%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import opendatasets as od
import string
import random
import os

from collections import Counter
import nltk
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import Normalize, ToTensor

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/joy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
if torch.cuda.is_available():
    device=torch.device(type="cuda", index=0)
else:
    device=torch.device(type="cpu", index=0)
print(device)

cuda:0


In [5]:
dataset_path = "Datasets/final.csv"
dataset = pd.read_csv(dataset_path)
dataset.head()

Unnamed: 0.1,Unnamed: 0,id,og,t
0,0,42928-1500614319216-63344,You do not meet a man but frowns:,Every man you meet these days is frowning.
1,1,42928-1500614326583-89821,our bloods No more obey the heavens than our...,Our bodies are in agreement with the planetar...
2,2,A-63849,But what's the matter?,What's wrong?
3,3,42930-1500614347266-80123,"His daughter, and the heir of's kingdom, whom...","The king wanted his daughter, the only heir to..."
4,4,42930-1500614355280-38326,she's wedded; Her husband banish'd; she impr...,"She's married, her husband is banished, she's..."


In [6]:
# dataset['og_tokens'] = dataset['og'].apply(word_tokenize)
# dataset['t_tokens'] = dataset['t'].apply(word_tokenize)

# dataset.head()

In [7]:
# Tokenize sentences into words, here split for only whitespace.
# IMPROVEMENT: can have custom split.
dataset['og_words'] = dataset['og'].str.split()
dataset['t_words'] = dataset['t'].str.split()

dataset.head()

Unnamed: 0.1,Unnamed: 0,id,og,t,og_words,t_words
0,0,42928-1500614319216-63344,You do not meet a man but frowns:,Every man you meet these days is frowning.,"[You, do, not, meet, a, man, but, frowns:]","[Every, man, you, meet, these, days, is, frown..."
1,1,42928-1500614326583-89821,our bloods No more obey the heavens than our...,Our bodies are in agreement with the planetar...,"[our, bloods, No, more, obey, the, heavens, th...","[Our, bodies, are, in, agreement, with, the, p..."
2,2,A-63849,But what's the matter?,What's wrong?,"[But, what's, the, matter?]","[What's, wrong?]"
3,3,42930-1500614347266-80123,"His daughter, and the heir of's kingdom, whom...","The king wanted his daughter, the only heir to...","[His, daughter,, and, the, heir, of's, kingdom...","[The, king, wanted, his, daughter,, the, only,..."
4,4,42930-1500614355280-38326,she's wedded; Her husband banish'd; she impr...,"She's married, her husband is banished, she's...","[she's, wedded;, Her, husband, banish'd;, she,...","[She's, married,, her, husband, is, banished,,..."


In [9]:
# creating a vocabulary/corpus for both types for texts
shakespeare_vocab = set()
modernEnglis_vocab = set()

for word in dataset['og_words']:
    shakespeare_vocab.update(word)
    
for word in dataset['t_words']:
    modernEnglis_vocab.update(word)
    
shakespeare_vocab.add('<PAD>')
shakespeare_vocab.add('<UNK>')

modernEnglis_vocab.add('<PAD>')
modernEnglis_vocab.add('<UNK>')

print(len(shakespeare_vocab))
print(len(modernEnglis_vocab))

73560
49281


In [11]:
# Word to index
shakespeare_word_to_idx = {word: idx for idx, word in enumerate(shakespeare_vocab)}
modernEnglis_word_to_idx = {word: idx for idx, word in enumerate(modernEnglis_vocab)}

print(shakespeare_word_to_idx)



In [13]:
# Converting words to the encoded indices.

dataset['og_indices'] = dataset['og_words'].apply(lambda words: [shakespeare_word_to_idx.get(word, shakespeare_word_to_idx['<UNK>']) for word in words])
dataset['t_indices'] = dataset['t_words'].apply(lambda words: [modernEnglis_word_to_idx.get(word, modernEnglis_word_to_idx['<UNK>']) for word in words])

dataset.head()

Unnamed: 0.1,Unnamed: 0,id,og,t,og_words,t_words,og_indices,t_indices
0,0,42928-1500614319216-63344,You do not meet a man but frowns:,Every man you meet these days is frowning.,"[You, do, not, meet, a, man, but, frowns:]","[Every, man, you, meet, these, days, is, frown...","[43525, 37492, 3073, 41869, 58861, 6935, 41152...","[3216, 4687, 45015, 28240, 4522, 38200, 38243,..."
1,1,42928-1500614326583-89821,our bloods No more obey the heavens than our...,Our bodies are in agreement with the planetar...,"[our, bloods, No, more, obey, the, heavens, th...","[Our, bodies, are, in, agreement, with, the, p...","[19207, 45623, 69451, 17208, 60335, 21625, 529...","[14967, 8289, 13640, 6442, 48937, 36966, 14685..."
2,2,A-63849,But what's the matter?,What's wrong?,"[But, what's, the, matter?]","[What's, wrong?]","[22162, 15007, 21625, 68726]","[22474, 15792]"
3,3,42930-1500614347266-80123,"His daughter, and the heir of's kingdom, whom...","The king wanted his daughter, the only heir to...","[His, daughter,, and, the, heir, of's, kingdom...","[The, king, wanted, his, daughter,, the, only,...","[58605, 43914, 1661, 21625, 1179, 69883, 35097...","[2819, 233, 39499, 48493, 29618, 14685, 31856,..."
4,4,42930-1500614355280-38326,she's wedded; Her husband banish'd; she impr...,"She's married, her husband is banished, she's...","[she's, wedded;, Her, husband, banish'd;, she,...","[She's, married,, her, husband, is, banished,,...","[8595, 60050, 28568, 748, 52080, 46541, 61853,...","[800, 22385, 33545, 495, 38243, 19969, 5850, 3..."


In [14]:
#  PADDING

max_len = max(dataset['og_indices'].apply(len).max(), dataset['t_indices'].apply(len).max())
print(max_len)

161


In [None]:
class ShakespeareDataset(Dataset):
    def __init__(self, file_path) -> None:
        super().__init__()
        self.data = self.loadData(file_path)
        self.vocab = self.getVocab()
        
    def loadData(self, file_path):
        # Preprocess as well
        with open(file_path, 'r', encoding='utf-8') as file:
            data = file.read()
        lines = data.split('\n')
        lines = [line.strip(r'\"') for line in lines]
        words = ' '.join(lines).split()
        punct = set(string.punctuation)
        words = [''.join([char for char in list(word) if char not in punct]) for word in words]
        return words
    
    def getVocab(self):
        char_counts = Counter(self.data)
        vocab = {char: idx for idx, (char, _) in enumerate(sorted(char_counts.items()))}
        vocab_size = len(vocab)
        return vocab, vocab_size
    
    def __len__(self):
        return len(self.data)
    
    # def __getitem__(self, index):
    #     return 

In [1]:
from models.models import Encoder, Decoder, EncDec

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
e = Encoder(3,2,2,'lstm')
print(e)

Encoder(
  (rnn): LSTM(3, 2, num_layers=2)
)
