<a href="https://colab.research.google.com/github/yacinebouaouni/Text-Generation-RNN-Pytorch/blob/master/Text_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# 1-Loading the text data:

In [0]:
with open('data/anna.txt','r') as file:
  text=file.read()


In [58]:
## Encode all the caracters of the text as integers:

char=tuple(set(text)) #This will give all the unique caracters of the text
int2char=dict(enumerate(char))#dictionary of int keys
char2int={ch:ii for ii,ch in int2char.items()} #dictionary of char keys
encoded=np.array([char2int[ch] for ch in text])
print('the number of characters in the text : '+str(encoded.shape))

the number of characters in the text : (1985223,)


#2-Preprocessing the data:

---
The input of LSTM is a one hot encoded vector so we have to encode each caracter with a vector full of zeros and has a 1 in the corresponding index:



In [0]:
def one_hot_encode(arr,n_labels):

  """
  Function takes as input encoded text in integers and return it in one hot
  arr is the array of encoded text in integers
  n_labels:number of labels (size of the encoded vector)

  """

  shape=(np.multiply(*arr.shape),n_labels) #*arr.shape ==> *args of the method multiply ==> multiply dimension1*dimension2 
  one_hot=np.zeros(shape=shape,dtype=np.float32)
  one_hot[range(arr.shape[0]),arr.flatten()]=1

  return one_hot


#3-Creating Mini batches:
---

* The text will be split into Batches (K batches) and complete bacthes!!
* Each Batch contains N sequence (batch_size=N)
* Each sequence contains M caracter (step) 
* The last batch will be connected to the first one (last column )

In [0]:
def create_batches(arr,nb_seq,nb_step):

  """
       Create a generator that returns batches of size
       n_seqs x n_steps from arr.
       
       Arguments
       ---------
       arr: Array you want to make batches from
       n_seqs: Batch size, the number of sequences per batch
       n_steps: Number of sequence steps per batch
  """


  batch_size=nb_seq*nb_step

  #Generate complete batches
  K=len(arr)//batch_size    
  arr=arr[:K*batch_size]

  #Reshape it into n_seq rows n_seq is the number of sequence/batch
  arr=arr.reshape((nb_seq,-1)) 

  #Create the batch

  for n in range(0,arr.shape[1],nb_step):

    input=arr[:,n:n+nb_step]
    target=np.zeros_like(input)
    
    #The target is the next caracter of the input so we will shift it
    try:

      target[:,:-1],target[:,-1]=input[:,1:],arr[:,n+nb_step]
    
    except IndexError: #This is the case of the last batch The last column of x correspond to the last column of data==> y become the first column

      target[:,:-1],target[:,-1]=input[:,1:],arr[:,0]

    yield input,target



In [64]:
batches=create_batches(encoded,10,50)
x,y = next(batches)
print('x\n', x[:, :10])
print('\ny\n', y[:,:10])

x
 [[36 24 58 33 15 56 53  1  4  0]
 [ 1 58  6  1 32 19 15  1 77 19]
 [57  3 32 79  0  0 23 28 56  2]
 [32  1 46 22 53  3 32 77  1 24]
 [ 1  3 15  1  3  2 54  1  2  3]
 [ 1 73 15  1 66 58  2  0 19 32]
 [24 56 32  1  7 19  6 56  1 78]
 [25  1 38 22 15  1 32 19 66  1]
 [15  1  3  2 32 10 15 79  1 82]
 [ 1  2 58  3 46  1 15 19  1 24]]

y
 [[24 58 33 15 56 53  1  4  0  0]
 [58  6  1 32 19 15  1 77 19  3]
 [ 3 32 79  0  0 23 28 56  2 54]
 [ 1 46 22 53  3 32 77  1 24  3]
 [ 3 15  1  3  2 54  1  2  3 53]
 [73 15  1 66 58  2  0 19 32 12]
 [56 32  1  7 19  6 56  1 78 19]
 [ 1 38 22 15  1 32 19 66  1  2]
 [ 1  3  2 32 10 15 79  1 82 24]
 [ 2 58  3 46  1 15 19  1 24 56]]


#4-Create the structure of the Model:

* Create and store the necessary dictionaries (this has been done for you)
* Define an LSTM layer that takes as params: an input size (the number of characters), a hidden layer size `n_hidden`, a number of layers `n_layers`, a dropout probability `drop_prob`, and a batch_first boolean (True, since we are batching)
* Define a dropout layer with `dropout_prob`
* Define a fully-connected layer with params: input size `n_hidden` and output size (the number of characters)
* Finally, initialize the weights (again, this has been given)

Note that some parameters have been named and given in the `__init__` function, and we use them and store them by doing something like `self.drop_prob = drop_prob`.

In [0]:
class charRNN(nn.Module):

  def __init__(self,chars,n_steps,n_hidden,n_layers,drop_prob,batch_first=True):

    super(charRNN,self).__init__()

    self.drop_prob=drop_prob
    self.n_hidden=n_hidden
    self.n_layers=n_layers

    #Creating characters dictionaries

    self.chars=chars
    self.int2char=dict(enumerate(self.chars))#dictionary of int keys
    self.char2int={ch:ii for ii,ch in int2char.items()} #dictionary of char keys

    #Define LSTM

    self.lstm=nn.LSTM(input_size=len(int2char),hidden_size=self.n_hidden,num_layers=self.n_layers,batch_first=batch_first,dropout=drop_prob)

    #Dropout layer 

    self.dropout=nn.Dropout(drop_prob)

    #Fully connected layer predicts the probability for each char 
    
    self.fc1=nn.Linear(in_features=n_hidden,out_features=len(self.int2char))