# RNN for Language Modeling in Python

## Keras

In [None]:
# keras.models.Sequential
# keras.models.Model

# layers: LSTM/ GRU/ Dense/ Dropout/ Embedding/ Bidirectional

# keras.preprocessing.sequence.pad_sequences(texts, maxlen=3)
    #-->transforms text data into fixed-length vectors

In [None]:
# import required modules
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM

# instantiate the model class
model = Sequential()

# add the layers
# Embedding(10000, 128): 10000 vocabulary indexes, each mapped to a 128-dim vector
model.add(Embedding(10000, 128))
model.add(LSTM(128, dropout=0.2))
# model.add(Dense(64, activation='relu', input_dim=100))
# one sigmoid unit on the output layer
model.add(Dense(1, activation='sigmoid'))

# compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

# train the model
# NOTE(review): X_train / y_train / X_test / y_test / new_data are not defined
# in this cell -- presumably provided by the surrounding environment
model.fit(X_train, y_train, epochs=10, batch_size=32)

# evaluate the model
model.evaluate(X_test, y_test)

# make predictions
model.predict(new_data)

# summary shows the layers and the number of parameters
model.summary()

In [None]:
##### example
# Import relevant classes/functions
from keras.layers import Dense, SimpleRNN
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the dictionary of indexes
# NOTE(review): `texts` is not defined in this cell -- presumably a list of
# at least two raw strings provided by the environment
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Change texts into sequence of indexes
texts_numeric = tokenizer.texts_to_sequences(texts)
print("Number of words in the sample texts: ({0}, {1})".format(len(texts_numeric[0]), len(texts_numeric[1])))

# Pad the sequences (maxlen passed by keyword so the intent is explicit)
texts_pad = pad_sequences(texts_numeric, maxlen=60)
print("Now the texts have fixed length: 60. Let's see the first one: \n{0}".format(texts_pad[0]))

# Build model
model = Sequential()
model.add(SimpleRNN(units=128, input_shape=(None, 1)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Load pre-trained weights
model.load_weights('model_weights.h5')

# Method '.evaluate()' shows the loss and accuracy
# NOTE(review): x_test / y_test are expected to exist already -- confirm
loss, acc = model.evaluate(x_test, y_test, verbose=0)
print("Loss: {0} \nAccuracy: {1}".format(loss, acc))

## RNN Architecture

### exploding and vanishing gradients

In [None]:
# Gradient clipping is set on the optimizer, not on compile() itself, e.g.
#   model.compile(optimizer=keras.optimizers.SGD(clipvalue=3.0), ...)
# use LSTM or GRU cells
from keras.layers import GRU, LSTM
# add the layers to a model
# Layer names contain no spaces: TensorFlow-backed Keras uses the name as a
# scope name and rejects names such as 'GRU layer'.
model.add(GRU(units=128, return_sequences=True, name='gru_layer'))
model.add(LSTM(units=64, return_sequences=False, name='lstm_layer'))

### embedding layer

In [None]:
'''
advantages:
reduce the dimension / dense representation / transfer learning

disadvantages:
lots of parameters to train: training takes longer
'''
from keras.layers import Embedding
model = Sequential()
#use as the first layer
model.add(Embedding(input_dim=100000,    # size of the vocabulary
                    output_dim=300,      # dimension of each word vector
                    trainable=True,      # update the vectors during training
                    # 'uniform' is the layer's documented default initializer;
                    # the keyword was previously misspelled and rejected by Keras
                    embeddings_initializer='uniform',
                    input_length=120))   # length of the (padded) input sequences

### transfer model

In [None]:
## transfer learning for language models
#GloVE/ word2vec/ BERT

#In keras:
from keras.initializers import Constant
# the keyword is `output_dim` (`out_dim` is not an Embedding argument)
model.add(Embedding(input_dim=vocabulary_size,
                    output_dim=embedding_dim,
                    embeddings_initializer=Constant(pre_trained_vectors)))
###example
# Load the glove pre-trained vectors
# NOTE(review): load_glove is not defined in this file -- presumably a helper
# supplied by the environment
glove_matrix = load_glove('glove_200d.zip')

# Create a model with embeddings
model = Sequential(name="emb_model")
# trainable=False keeps the pre-trained vectors frozen during training
model.add(Embedding(input_dim=vocabulary_size + 1, output_dim=wordvec_dim,
                    embeddings_initializer=Constant(glove_matrix),
                    input_length=sentence_len, trainable=False))
model.add(GRU(128))
# binary_crossentropy expects probabilities, so the output unit needs a sigmoid
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the summaries of the model with embeddings
model.summary()

In [None]:
#https://nlp.stanford.edu/projects/glove/
####get the GloVE vectors
def get_glove_vectors(filename='glove.6B.300d.text'):
    """Read pre-trained GloVe vectors from a text file.

    Each non-empty line is split on whitespace: the first token is the
    word, the remaining tokens are its coefficients.

    Args:
        filename: Path to the GloVe text file.

    Returns:
        dict mapping each word to a 1-D ``float32`` numpy array holding
        its coefficients.
    """
    glove_vector_dict = {}
    # Explicit encoding so the result does not depend on the platform default
    # (GloVe text files are distributed as UTF-8).
    with open(filename, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # blank line: nothing to parse
                continue
            word, *coefs = values
            glove_vector_dict[word] = np.asarray(coefs, dtype='float32')
    # the mapping used to be built and then dropped; hand it back to the caller
    return glove_vector_dict
            
###filter GloVE vectors to specific task
def filter_glove(vocabulary_dict, glove_dict, wordvec_dim=300):
    """Build an embedding matrix restricted to the task vocabulary.

    Args:
        vocabulary_dict: Mapping of word -> row index in the matrix.
        glove_dict: Mapping of word -> pre-trained vector.
        wordvec_dim: Number of columns of the matrix (vector length).

    Returns:
        numpy array with ``len(vocabulary_dict) + 1`` rows and
        ``wordvec_dim`` columns. Rows of words missing from
        ``glove_dict`` stay all zeros.
    """
    # one row more than there are words -- presumably because the indexes
    # start at 1 (as with a Keras Tokenizer word_index); TODO confirm
    n_rows = len(vocabulary_dict) + 1
    embedding_matrix = np.zeros((n_rows, wordvec_dim))
    for token, row in vocabulary_dict.items():
        vector = glove_dict.get(token)
        if vector is None:
            # unknown word: leave its row at zero
            continue
        embedding_matrix[row] = vector
    return embedding_matrix

In [None]:
'''
improving RNN model
 1 add the embedding layer
 2 increase the number of layers
 3 tune the parameters
 4 increase vocabulary size
 5 accept longer sentences with more memory cells
 
avoiding overfitting
 test different batch sizes
 add dropout layers
 add dropout and recurrent_dropout parameters on RNN layers
 add convolution layer, it does feature selection on the embedding vector
'''
from keras.layers import Conv1D, Dropout, MaxPooling1D

# removes 20% of input to add noise
model.add(Dropout(rate=0.2))

# removes 10% of input and memory cells respectively
model.add(LSTM(128, dropout=0.1, recurrent_dropout=0.1))

# add convolution and pooling
# (further Embedding keyword arguments can be added as needed; the previous
# literal `...` was passed to the layer as a positional argument)
model.add(Embedding(vocabulary_size, wordvec_dim))
# Conv1D takes `filters` and `kernel_size`
model.add(Conv1D(filters=32, kernel_size=3, padding='same'))
model.add(MaxPooling1D(pool_size=2))

### example - Sentiment classification

In [None]:
# Assemble the sentiment classifier as one layer stack, then compile it
model = Sequential([
    Embedding(vocabulary_size, wordvec_dim, trainable=True, input_length=max_text_len),
    # first LSTM returns the full sequence so the second LSTM can consume it
    LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.15),
    LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.15),
    Dense(16),
    Dropout(rate=0.25),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Restore the weights saved in 'model_weights.h5'
model.load_weights('model_weights.h5')

# evaluate() returns the loss followed by the accuracy metric
print(
    "Loss: {0}\nAccuracy: {1}".format(
        *model.evaluate(X_test, y_test, verbose=0)
    )
)

## Multi-class classification

In [None]:
# changes from binary classification
## shape of the output variable y (presumably one-hot encoded, one column per class)
## number of units on the output layer: num_classes
## activation function on the output layer: softmax
# a single output layer carries both changes; adding Dense(num_classes) twice
# would stack two dense layers
model.add(Dense(num_classes, activation='softmax'))
## loss function
# optimizer given explicitly ('adam', as in the other cells): older Keras
# releases require the argument
model.compile(loss='categorical_crossentropy', optimizer='adam')

### transfer learning for language models

In [None]:
'''
available architectures
 gensim:
    Word2Vec (CBOW/ Skip-gram)
    FastText
 tensorflow_hub:
    ELMo
'''

In [None]:
from gensim.models import word2vec
#Train the model
# `window` is the number of neighbouring words used as context; the variable
# was previously written `neighbor_words-num`, which parses as a subtraction.
# NOTE(review): `size` / `iter` are the gensim 3.x keyword names (gensim 4
# renamed them to `vector_size` / `epochs`) -- confirm the installed version
w2v_model = word2vec.Word2Vec(tokenized_corpus, size=embedding_dim,
                              window=neighbor_word_num, iter=100)
#get top 3 similar words to 'captain'
w2v_model.wv.most_similar(['captain'], topn=3)

In [None]:
from gensim.models import fasttext
# Instantiate the model with the embedding size and the context window.
# NOTE(review): embedding_dim / neighbor_word_num / tokenized_corpus are not
# defined in this cell -- presumably provided by the environment
ft_model=fasttext.FastText(size=embedding_dim, window=neighbor_word_num)
# Build the vocabulary from the tokenized corpus
ft_model.build_vocab(sentences=tokenized_corpus)
# Train the model for 100 epochs over the whole corpus
ft_model.train(sentences=tokenized_corpus,
              total_examples=len(tokenized_corpus),
               epochs=100)

### Multi-class classification models

In [None]:
#import the function to load the data
from sklearn.datasets import fetch_20newsgroups
#download train and test sets
news_train = fetch_20newsgroups(subset='train')
news_test = fetch_20newsgroups(subset='test')

#import modules
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

#create and fit the tokenizer (on the training texts only)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(news_train.data)

#create the (X,Y) variables
X_train = tokenizer.texts_to_sequences(news_train.data)
X_train = pad_sequences(X_train, maxlen=400)
Y_train = to_categorical(news_train.target) #one-hot encoded

#encode the test set with the same tokenizer, length and number of classes
X_test = tokenizer.texts_to_sequences(news_test.data)
X_test = pad_sequences(X_test, maxlen=400)
Y_test = to_categorical(news_test.target, num_classes=Y_train.shape[1])

#train the model
# NOTE(review): `model` is not built in this cell -- presumably a compiled
# multi-class model defined earlier
model.fit(X_train, Y_train, batch_size=64, epochs=100)

#evaluate on test data
model.evaluate(X_test, Y_test)

### Assessing the model's performance

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score

# Get probabilities for each class
# predict() is used instead of predict_proba(), which newer Keras/TensorFlow
# releases no longer provide on Sequential models
pred_probabilities = model.predict(X_test)

# Thresholds at 0.5 and 0.8: keep the most likely class only when its
# probability reaches the threshold, otherwise fall back to DEFAULT_CLASS
# NOTE(review): DEFAULT_CLASS and y_true are not defined in this cell --
# presumably provided by the environment
y_pred_50 = [np.argmax(x) if np.max(x) >= 0.5 else DEFAULT_CLASS for x in pred_probabilities]
y_pred_80 = [np.argmax(x) if np.max(x) >= 0.8 else DEFAULT_CLASS for x in pred_probabilities]

# average=None yields one score per class, i.e. one row per class below
trade_off = pd.DataFrame({
    'Precision_50': precision_score(y_true, y_pred_50, average=None),
    'Precision_80': precision_score(y_true, y_pred_80, average=None),
    'Recall_50': recall_score(y_true, y_pred_50, average=None),
    'Recall_80': recall_score(y_true, y_pred_80, average=None)},
  index=['Class 1', 'Class 2', 'Class 3'])

## Sequence to Sequence Models

### text generation models

In [None]:
'''
#similar to a classification model
1 uses the vocabulary as classes
2 the last layer applies a softmax with vocabulary size units
3 uses categorical_crossentropy as loss function

#difference to classification
1 computes loss, but not performance metrics(accuracy)
    -humans see results and evaluate performance
    -if not good, train more epochs or add complexity to the model
    (add more memory cells, add layers, etc.)
2 used with generation rules according to task
    -generate next char/ one word/ one sentence/ one paragraph
'''

In [None]:
# Text-generation network: two stacked LSTMs followed by a softmax over the
# vocabulary (n_vocab units).
# NOTE(review): the second LSTM reads a `dropout` variable that is not defined
# in this cell, while the first uses the literal 0.15 -- confirm which is meant
model = Sequential([
    LSTM(
        units,
        input_shape=(chars_window, n_vocab),
        dropout=0.15,
        recurrent_dropout=0.15,
        return_sequences=True,
    ),
    LSTM(
        units,
        dropout=dropout,
        recurrent_dropout=0.15,
        return_sequences=False,
    ),
    Dense(n_vocab, activation='softmax'),
])
# loss only: no metrics are tracked for the generation task
model.compile(loss='categorical_crossentropy', optimizer='adam')

### Neural Machine Translation

In [None]:
################ Preparing the input text
# Longest sentence, counted in whitespace-separated tokens
pt_length = max(len(sentence.split()) for sentence in pt_sentences)

# Turn every sentence into a sequence of numerical indexes, then pad each
# sequence at the end up to pt_length
X = pad_sequences(
    input_tokenizer.texts_to_sequences(pt_sentences),
    maxlen=pt_length,
    padding='post',
)

# Show the first raw sentence ...
print(pt_sentences[0])

# ... and its padded index sequence
print(X[0])


################ Preparing the output text
# Index sequences of the English sentences
# NOTE(review): transform_text_to_sequences is defined outside this cell; the
# reshape below relies on its result exposing a 2-D `.shape` -- confirm
Y = transform_text_to_sequences(en_sentences, output_tokenizer)

# One-hot encode each sequence over the English vocabulary
ylist = [to_categorical(sequence, num_classes=en_vocab_size) for sequence in Y]

# Stack into one array: Y.shape[0] x Y.shape[1] x en_vocab_size
Y = np.array(ylist).reshape(Y.shape[0], Y.shape[1], en_vocab_size)

# Show the first raw sentence next to its encoded version
print("Raw sentence: {0}\nTransformed: {1}".format(en_sentences[0], Y[0]))


################ Translate Portuguese to English
# Function to predict many phrases
def predict_many(model, sentences, index_to_word, raw_dataset):
    """Translate each sentence and print it next to its raw reference pair.

    Args:
        model: Model handed unchanged to ``predict_one``.
        sentences: Iterable of encoded source sentences.
        index_to_word: Mapping handed unchanged to ``predict_one``.
        raw_dataset: Indexable collection; item ``i`` unpacks into the raw
            (target, source) pair belonging to ``sentences[i]``.
    """
    report = 'src=[%s], target=[%s], predicted=[%s]'
    for position, encoded in enumerate(sentences):
        # predict_one is defined outside this block
        predicted = predict_one(model, encoded, index_to_word)

        # each raw entry stores the target first, then the source
        reference, source = raw_dataset[position]

        print(report % (source, reference, predicted))

predict_many(model, X_test[:10], en_index_to_word, test)