In [65]:
import random
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 


import torch 
import torch.nn as nn 
from torch.utils.data import DataLoader, Dataset

In [66]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/next-word-prediction/1661-0.txt


In [67]:

random.seed(42)
pytorch_seed = torch.manual_seed(seed=42)

In [68]:

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [69]:


df = pd.read_csv(filepath_or_buffer="/kaggle/input/next-word-prediction/1661-0.txt",
                 sep="\t",names=["data"],header=None)
df


Unnamed: 0,data
0,Project Gutenberg's The Adventures of Sherlock...
1,This eBook is for the use of anyone anywhere a...
2,almost no restrictions whatsoever. You may co...
3,re-use it under the terms of the Project Guten...
4,with this eBook or online at www.gutenberg.net
...,...
9628,facility: www.gutenberg.org
9629,This Web site includes information about Proje...
9630,including how to make donations to the Project...
9631,"Archive Foundation, how to help produce our ne..."


<br>

# `#01: Data preprocessing`

- Make token
- Make vocabulary 

<br>

In [70]:

# get the token:
df["data"][0].split(" ")

['Project',
 "Gutenberg's",
 'The',
 'Adventures',
 'of',
 'Sherlock',
 'Holmes,',
 'by',
 'Arthur',
 'Conan',
 'Doyle']

In [71]:

# tokenization:
token = [df["data"][i].split(" ") for i in range(0,len(df))]
token[:5]

[['Project',
  "Gutenberg's",
  'The',
  'Adventures',
  'of',
  'Sherlock',
  'Holmes,',
  'by',
  'Arthur',
  'Conan',
  'Doyle'],
 ['This',
  'eBook',
  'is',
  'for',
  'the',
  'use',
  'of',
  'anyone',
  'anywhere',
  'at',
  'no',
  'cost',
  'and',
  'with'],
 ['almost',
  'no',
  'restrictions',
  'whatsoever.',
  '',
  'You',
  'may',
  'copy',
  'it,',
  'give',
  'it',
  'away',
  'or'],
 ['re-use',
  'it',
  'under',
  'the',
  'terms',
  'of',
  'the',
  'Project',
  'Gutenberg',
  'License',
  'included'],
 ['with', 'this', 'eBook', 'or', 'online', 'at', 'www.gutenberg.net']]

In [72]:

final_token = []
for i in range(len(token)):
    list_token = token[i]
    for tok in list_token:
        final_token.append(tok.lower().strip())
        
total_token = len(final_token)
print(f"Total token no: {total_token}")
print("-------------------------------")
final_token[:20]


Total token no: 108061
-------------------------------


['project',
 "gutenberg's",
 'the',
 'adventures',
 'of',
 'sherlock',
 'holmes,',
 'by',
 'arthur',
 'conan',
 'doyle',
 'this',
 'ebook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere']

In [73]:

# Remove the duplicated words from token:
from collections import Counter
counter = Counter(final_token)
counter

Counter({'project': 85,
         "gutenberg's": 1,
         'the': 5703,
         'adventures': 9,
         'of': 2758,
         'sherlock': 101,
         'holmes,': 127,
         'by': 357,
         'arthur': 14,
         'conan': 4,
         'doyle': 4,
         'this': 467,
         'ebook': 8,
         'is': 1080,
         'for': 727,
         'use': 39,
         'anyone': 27,
         'anywhere': 3,
         'at': 768,
         'no': 299,
         'cost': 5,
         'and': 2882,
         'with': 870,
         'almost': 20,
         'restrictions': 2,
         'whatsoever.': 2,
         '': 459,
         'you': 1173,
         'may': 205,
         'copy': 10,
         'it,': 93,
         'give': 60,
         'it': 1267,
         'away': 84,
         'or': 267,
         're-use': 2,
         'under': 46,
         'terms': 23,
         'gutenberg': 26,
         'license': 11,
         'included': 3,
         'online': 4,
         'www.gutenberg.net': 1,
         'title:': 1,
        

In [74]:
counter.get

<function Counter.get(key, default=None, /)>

In [75]:

# from the dictionary just get only the key: 
sorted_val = sorted(counter,key=counter.get,reverse=True)
print(sorted_val[:5])
print(len(sorted_val))

['the', 'and', 'of', 'to', 'a']
14556


In [76]:

vocab = ["UNK","PAD"] + sorted_val[:10000]
len(vocab)

10002

In [77]:

vocab_size = len(vocab)
word_to_idx = { word:idx for idx, word in enumerate(vocab)}
list(word_to_idx.items())[:5]

[('UNK', 0), ('PAD', 1), ('the', 2), ('and', 3), ('of', 4)]

In [78]:

idx_to_word = { idx:word for idx,word in enumerate(vocab)}
dict(list(idx_to_word.items())[:5])

{0: 'UNK', 1: 'PAD', 2: 'the', 3: 'and', 4: 'of'}

In [79]:

# we took only 10000, other token will be unkwon in final token:
unk_id = word_to_idx["UNK"]
unk_id

0

In [80]:

final_idx = [word_to_idx.get(token,unk_id)  for token in final_token]
print(f"total size of final token: {len(final_idx)}")
final_idx[-10:-1]

total size of final token: 108061


[0, 5, 67, 5980, 0, 5, 335, 77, 323]

In [81]:

# create sequence and next word:
seq_length=30
sequences = []
next_words = []
for i in range(len(final_idx)-seq_length):
    sequences.append(final_idx[i:(i+seq_length)])
    next_words.append(final_idx[i+seq_length])


# total number of sequence:
print(f"Total number of sequence : {len(sequences)}")
print(f"Total number of next_words : {len(next_words)}")


Total number of sequence : 108031
Total number of next_words : 108031


<br>
<br>


# `#02: making the dataset:`

<br>
<br>

In [82]:


class CustomDataset(Dataset):
    def __init__(self,sequences,next_words):
        super().__init__()
        self.seq = torch.tensor(data=sequences,device=device,dtype=torch.long)
        self.nex = torch.tensor(data=next_words,device=device,dtype=torch.long)
        
    def __getitem__(self,idx):
        return self.seq[idx],self.nex[idx]
    
    def __len__(self):
        return len(self.seq)
    

dataset = CustomDataset(sequences,next_words)
dataloader = DataLoader(dataset=dataset,
                        batch_size=64,
                        shuffle=True)



In [97]:




class LSTMPredictor(nn.Module):
    def __init__(self,vocab_size,embedding_dim,hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size,embedding_dim=embedding_dim)
        #batch_first == True means, we have batch:
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=hidden_size,batch_first=True)
        self.fc = nn.Linear(hidden_size,vocab_size)
        
    def forward(self,x):
        embed = self.embedding(x)
        # 3 thing in output in  LSTM cell
        lstm_out, (hidden,_) = self.lstm(embed)
        out = self.fc(torch.squeeze(hidden,dim=0))
        return out 
    
    
    
model = LSTMPredictor(vocab_size=vocab_size,embedding_dim=100,hidden_size=512,)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(),lr=0.01)




In [98]:

# tranning loop:
epochs = 100

for epoch in range(epochs):
    total_loss = 0
    for input,next_word in dataloader:
        input = input.to(device)
        next_word = next_word.to(device)
        
        #forward pass
        out = model(input)
        
        #calculate loss:
        loss = criterion(out,next_word)
        
        #gradient accumulation:
        optimizer.zero_grad()
        
        #backward pass 
        loss.backward()

        #update parameters
        optimizer.step()
        
        total_loss += loss.item()
    print(f"epoch: {epoch+1}  loss: {total_loss/len(dataloader)}")
    
        

epoch: 1  loss: 6.236257062139105
epoch: 2  loss: 5.225727141185959
epoch: 3  loss: 4.579768206412194
epoch: 4  loss: 4.1105269084892
epoch: 5  loss: 3.7528984755701362
epoch: 6  loss: 3.488329418603843
epoch: 7  loss: 3.2931196915594887
epoch: 8  loss: 3.1347986291652608
epoch: 9  loss: 3.034739357035307
epoch: 10  loss: 2.939432405323779
epoch: 11  loss: 2.852806206704316
epoch: 12  loss: 2.794746967288555
epoch: 13  loss: 2.7376766732377464
epoch: 14  loss: 2.7048472960554593
epoch: 15  loss: 2.664928324555899
epoch: 16  loss: 2.6310500129704226
epoch: 17  loss: 2.613844110643694
epoch: 18  loss: 2.5926960302755164
epoch: 19  loss: 2.559420862680928
epoch: 20  loss: 2.5230712576358805
epoch: 21  loss: 2.5091636327064433
epoch: 22  loss: 2.5215754332418125
epoch: 23  loss: 2.4853225355323456
epoch: 24  loss: 2.465808705201646
epoch: 25  loss: 2.4647261320273457
epoch: 26  loss: 2.443318507965142
epoch: 27  loss: 2.420736211806677
epoch: 28  loss: 2.411819362174278
epoch: 29  loss: 2.

In [101]:


# prediction:
def prediction(model,sentences,seq_length,word_to_idx,idx_to_word):
    model.eval()
    with torch.no_grad():
        token = sentences.lower().split()
        generated = token.copy()
        
        for _ in range(seq_length):
            # 1. seq_length
            if len(token)>=seq_length:
                current_tokens = generated[-seq_length:]
            else:
                current_tokens = generated 
                
            # 2. get unk,pad idx
            unk_idx = word_to_idx["UNK"]
            pad_idx = word_to_idx["PAD"]
    
            # 3. get idx value
            indices = [word_to_idx.get(tok,unk_idx) for tok in current_tokens]
    
            # 4. fit with seq_length
            if len(indices)<seq_length:
                indices += [pad_idx]*(seq_length-len(indices)) 
    
            # 5. convert into tensor:
            input_tensor = torch.tensor([indices], dtype=torch.long).to(device)
            output = model(input_tensor)
            probabilities = torch.softmax(output, dim=1)
            predicted_idx = torch.argmax(probabilities,dim=1).item()
            predicted_words = idx_to_word.get(predicted_idx,"<UNKNOWN>")
            generated.append(predicted_words)
        return " ".join(generated)

test_sentences = [
    "to sherlock holmes she is always",
    "i have never seen",
    "the quick brown fox",
    "my dear watson",
    "it was a dark and"]

for sentence in test_sentences:
    result = prediction(model,sentence,seq_length,word_to_idx,idx_to_word)
    print(f"sentence: {sentence}  prediction: {result}")
    print()



sentence: to sherlock holmes she is always  prediction: to sherlock holmes she is always conversation others it.” bundle surprise equally are, chair. really lie got morning, nine breath, me. set beneath. suddenly miserable bowed shortly round fallen appointment this file lady could not mine

sentence: i have never seen  prediction: i have never seen myself, locked, suddenly shut sitting-room attention. now press on, suddenly conduct resolute if returned purpose. shortly gained formerly, overtook lips on, doubt interest stair. chair. chair. the project copying, hand

sentence: the quick brown fox  prediction: the quick brown fox announced letter completed explanation suddenly think.” explanation fifty clutched belonging since upon. UNK good slammed remain massive approaching sending wander wander occasional “we keen limbs suits you,” said he, rising

sentence: my dear watson  prediction: my dear watson effort UNK sitting-room part hypothesis listen story. rather exact wants, holmes. giv