In [1]:
import random
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 


import torch 
import torch.nn as nn 
from torch.utils.data import DataLoader, Dataset

In [2]:

random.seed(42)
pytorch_seed = torch.manual_seed(seed=42)

In [3]:

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [4]:


df = pd.read_csv(filepath_or_buffer="1661-0.txt",sep="\t",names=["data"],header=None)
df


Unnamed: 0,data
0,Project Gutenberg's The Adventures of Sherlock...
1,This eBook is for the use of anyone anywhere a...
2,almost no restrictions whatsoever. You may co...
3,re-use it under the terms of the Project Guten...
4,with this eBook or online at www.gutenberg.net
...,...
9628,facility: www.gutenberg.org
9629,This Web site includes information about Proje...
9630,including how to make donations to the Project...
9631,"Archive Foundation, how to help produce our ne..."


<br>

# `#01: Data preprocessing`

- Make token
- Make vocabulary 

<br>

In [5]:

# get the token:
df["data"][0].split(" ")

['Project',
 "Gutenberg's",
 'The',
 'Adventures',
 'of',
 'Sherlock',
 'Holmes,',
 'by',
 'Arthur',
 'Conan',
 'Doyle']

In [6]:

# tokenization:
token = [df["data"][i].split(" ") for i in range(0,len(df))]
token

[['Project',
  "Gutenberg's",
  'The',
  'Adventures',
  'of',
  'Sherlock',
  'Holmes,',
  'by',
  'Arthur',
  'Conan',
  'Doyle'],
 ['This',
  'eBook',
  'is',
  'for',
  'the',
  'use',
  'of',
  'anyone',
  'anywhere',
  'at',
  'no',
  'cost',
  'and',
  'with'],
 ['almost',
  'no',
  'restrictions',
  'whatsoever.',
  '',
  'You',
  'may',
  'copy',
  'it,',
  'give',
  'it',
  'away',
  'or'],
 ['re-use',
  'it',
  'under',
  'the',
  'terms',
  'of',
  'the',
  'Project',
  'Gutenberg',
  'License',
  'included'],
 ['with', 'this', 'eBook', 'or', 'online', 'at', 'www.gutenberg.net'],
 ['Title:', 'The', 'Adventures', 'of', 'Sherlock', 'Holmes'],
 ['Author:', 'Arthur', 'Conan', 'Doyle'],
 ['Release', 'Date:', 'November', '29,', '2002', '[EBook', '#1661]'],
 ['Last', 'Updated:', 'May', '20,', '2019'],
 ['Language:', 'English'],
 ['Character', 'set', 'encoding:', 'UTF-8'],
 ['***',
  'START',
  'OF',
  'THIS',
  'PROJECT',
  'GUTENBERG',
  'EBOOK',
  'THE',
  'ADVENTURES',
  'OF',


In [7]:

final_token = []
for i in range(len(token)):
    list_token = token[i]
    for tok in list_token:
        final_token.append(tok.lower().strip())
        
total_token = len(final_token)
print(f"Total token no: {total_token}")
print("-------------------------------")
final_token[:20]


Total token no: 108061
-------------------------------


['project',
 "gutenberg's",
 'the',
 'adventures',
 'of',
 'sherlock',
 'holmes,',
 'by',
 'arthur',
 'conan',
 'doyle',
 'this',
 'ebook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere']

In [8]:

# Remove the duplicated words from token:
from collections import Counter
counter = Counter(final_token)
counter

Counter({'the': 5703,
         'and': 2882,
         'of': 2758,
         'to': 2720,
         'a': 2648,
         'i': 2533,
         'in': 1760,
         'that': 1605,
         'was': 1371,
         'he': 1278,
         'it': 1267,
         'you': 1173,
         'his': 1146,
         'is': 1080,
         'my': 955,
         'have': 903,
         'with': 870,
         'as': 848,
         'had': 813,
         'at': 768,
         'which': 754,
         'for': 727,
         'be': 615,
         'not': 609,
         'but': 541,
         'we': 502,
         'from': 498,
         'this': 467,
         'upon': 461,
         '': 459,
         'said': 447,
         'me': 414,
         'there': 396,
         'she': 389,
         'been': 385,
         'your': 379,
         'her': 377,
         'very': 376,
         'on': 366,
         'by': 357,
         '“i': 349,
         'all': 339,
         'were': 337,
         'so': 336,
         'an': 335,
         'are': 322,
         'would': 317,
      

In [9]:
counter.get

<function Counter.get(key, default=None, /)>

In [10]:

# from the dictionary just get only the key: 
sorted_val = sorted(counter,key=counter.get,reverse=True)
print(sorted_val)
print(len(sorted_val))

14556


In [11]:

vocab = ["UNK","PAD"] + sorted_val[:10000]
len(vocab)

10002

In [12]:

vocab_size = len(vocab)
word_to_idx = { word:idx for idx, word in enumerate(vocab)}
word_to_idx

{'UNK': 0,
 'PAD': 1,
 'the': 2,
 'and': 3,
 'of': 4,
 'to': 5,
 'a': 6,
 'i': 7,
 'in': 8,
 'that': 9,
 'was': 10,
 'he': 11,
 'it': 12,
 'you': 13,
 'his': 14,
 'is': 15,
 'my': 16,
 'have': 17,
 'with': 18,
 'as': 19,
 'had': 20,
 'at': 21,
 'which': 22,
 'for': 23,
 'be': 24,
 'not': 25,
 'but': 26,
 'we': 27,
 'from': 28,
 'this': 29,
 'upon': 30,
 '': 31,
 'said': 32,
 'me': 33,
 'there': 34,
 'she': 35,
 'been': 36,
 'your': 37,
 'her': 38,
 'very': 39,
 'on': 40,
 'by': 41,
 '“i': 42,
 'all': 43,
 'were': 44,
 'so': 45,
 'an': 46,
 'are': 47,
 'would': 48,
 'what': 49,
 'one': 50,
 'no': 51,
 'when': 52,
 'could': 53,
 'has': 54,
 'out': 55,
 'into': 56,
 'or': 57,
 'mr.': 58,
 'who': 59,
 'little': 60,
 'if': 61,
 'him': 62,
 'will': 63,
 'up': 64,
 'some': 65,
 'do': 66,
 'our': 67,
 'should': 68,
 'may': 69,
 'down': 70,
 'holmes': 71,
 'man': 72,
 'see': 73,
 'am': 74,
 'they': 75,
 'shall': 76,
 'about': 77,
 'must': 78,
 'can': 79,
 'over': 80,
 'any': 81,
 'then': 82,
 '

In [13]:

idx_to_word = { idx:word for idx,word in enumerate(vocab)}
idx_to_word

{0: 'UNK',
 1: 'PAD',
 2: 'the',
 3: 'and',
 4: 'of',
 5: 'to',
 6: 'a',
 7: 'i',
 8: 'in',
 9: 'that',
 10: 'was',
 11: 'he',
 12: 'it',
 13: 'you',
 14: 'his',
 15: 'is',
 16: 'my',
 17: 'have',
 18: 'with',
 19: 'as',
 20: 'had',
 21: 'at',
 22: 'which',
 23: 'for',
 24: 'be',
 25: 'not',
 26: 'but',
 27: 'we',
 28: 'from',
 29: 'this',
 30: 'upon',
 31: '',
 32: 'said',
 33: 'me',
 34: 'there',
 35: 'she',
 36: 'been',
 37: 'your',
 38: 'her',
 39: 'very',
 40: 'on',
 41: 'by',
 42: '“i',
 43: 'all',
 44: 'were',
 45: 'so',
 46: 'an',
 47: 'are',
 48: 'would',
 49: 'what',
 50: 'one',
 51: 'no',
 52: 'when',
 53: 'could',
 54: 'has',
 55: 'out',
 56: 'into',
 57: 'or',
 58: 'mr.',
 59: 'who',
 60: 'little',
 61: 'if',
 62: 'him',
 63: 'will',
 64: 'up',
 65: 'some',
 66: 'do',
 67: 'our',
 68: 'should',
 69: 'may',
 70: 'down',
 71: 'holmes',
 72: 'man',
 73: 'see',
 74: 'am',
 75: 'they',
 76: 'shall',
 77: 'about',
 78: 'must',
 79: 'can',
 80: 'over',
 81: 'any',
 82: 'then',
 8

In [14]:

# we took only 10000, other token will be unkwon in final token:
unk_id = word_to_idx["UNK"]
unk_id

0

In [15]:

final_idx = [word_to_idx.get(token,unk_id)  for token in final_token]
print(f"total size of final token: {len(final_idx)}")
final_idx[-10:-1]

total size of final token: 108061


[0, 5, 67, 5980, 0, 5, 335, 77, 323]

In [16]:

# create sequence and next word:
seq_length=30
sequences = []
next_words = []
for i in range(len(final_idx)-seq_length):
    sequences.append(final_idx[i:(i+seq_length)])
    next_words.append(final_idx[i+seq_length])


# total number of sequence:
print(f"Total number of sequence : {len(sequences)}")
print(f"Total number of next_words : {len(next_words)}")


Total number of sequence : 108031
Total number of next_words : 108031


<br>
<br>


# `#02: making the dataset:`

<br>
<br>

In [17]:


class CustomDataset(Dataset):
    def __init__(self,sequences,next_words):
        super().__init__()
        self.seq = torch.tensor(data=sequences,device=device,dtype=torch.long)
        self.nex = torch.tensor(data=next_words,device=device,dtype=torch.long)
        
    def __getitem__(self,idx):
        return self.seq[idx],self.nex[idx]
    
    def __len__(self):
        return len(self.seq)
    

dataset = CustomDataset(sequences,next_words)
dataloader = DataLoader(dataset=dataset,
                        batch_size=64,
                        shuffle=True)



In [None]:




class LSTMPredictor(nn.Module):
    def __init__(self,vocab_size,embedding_dim,hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size,embedding_dim=embedding_dim)
        #batch_first == True means, we have batch:
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=hidden_size,batch_first=True,num_layers=2)
        self.fc = nn.Linear(hidden_size,vocab_size)
        
    def forward(self,x):
        embed = self.embedding(x)
        # 3 thing in output in  LSTM cell
        lstm_out, (hidden,_) = self.lstm(embed)
        out = self.fc(torch.squeeze(hidden,dim=0))
        return out 
    
    
    
model = LSTMPredictor(vocab_size=vocab_size,embedding_dim=100,hidden_size=512)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(),lr=0.01)




In [None]:

# tranning loop:
epochs = 100

for epoch in range(epochs):
    total_loss = 0
    for input,next_word in dataloader:
        input = input.to(device)
        next_word = next_word.to(device)
        
        #forward pass
        out = model(input)
        
        #calculate loss:
        loss = criterion(out,next_word)
        
        #gradient accumulation:
        optimizer.zero_grad()
        
        #backward pass 
        loss.backward()
        
        #update value 
        optimizer.step()
        
        total_loss += loss.item()
    print(f"epoch: {epoch+1}  loss: {total_loss/len(dataloader)}")
    
        

epoch: 1  loss: 9.215628981590271
epoch: 2  loss: 9.2156290352627


KeyboardInterrupt: 