In [42]:
! pip install sentence-transformers



In [145]:
# Importing the necessary libraries

import pandas as pd
import random
import numpy as np
import contractions,re
import tensorflow as tf

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.activations import softmax
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.metrics import categorical_crossentropy
from sentence_transformers import SentenceTransformer
from nltk.tokenize import word_tokenize as w_tokenizer
import unicodedata
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()

In [2]:
s_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [29]:
data = pd.read_csv('Web Scrapped data.csv')

In [30]:
data.head()

Unnamed: 0,Questions,Answers
0,What is Machine Learning?,Machine learning is the science of getting co...
1,Explain the basic difference between Supervise...,Supervised Learning: A model is trained on the...
2,What do you mean by Reinforcement Learning?,reinforcement learning is an area of machine ...
3,What are the different types of data used in M...,There Are Two Types of Data. Structured and U...
4,Features vs. Labels?,Features are the input information. On the ot...


In [52]:
questions = data['Questions'].to_list()
answers = data['Answers'].to_list()
print('Maximum no.of.words in a answer : ',max((questions[i].split()) for i in range(len(questions))))
print(f'The average length of the answers are : {sum(len(answers[i].split()) for i in range(0,len(answers)))// len(answers)}')

Maximum no.of.words in a answer :  ['You', 'have', 'to', 'train', 'a', '12GB', 'dataset', 'using', 'a', 'neural', 'network', 'with', 'a', 'machine', 'which', 'has', 'only', '3GB', 'RAM.', 'How', 'would', 'you', 'go', 'about', 'it?']
The average length of the answers are : 76


In [184]:
# creating a class and creating functions inside the class to perform necessary preprocessing functions

class Dataprep:
    def __init__(self):
        self.correct_cnt = 0
        self.incorrect_cnt = 0
        self.score = 0
        self.model = s_model
    
    def unicode_to_ascii(self,s):
        return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

    ## Step 1 and Step 2 
    def preprocess_sentence(self,w, answer = False):
        #preprocessing sentences
        w = self.unicode_to_ascii(w.lower().strip())

        # creating a space between a word and the punctuation following it
        # eg: "he is a boy." => "he is a boy ."
        # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
        w = re.sub(r"([?.!,¿])", r" \1 ", w)
        w = re.sub(r'[" "]+', " ", w)

        # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
        w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

        w = w.strip()

        # adding a start and an end token to the sentence
        # so that the model know when to start and stop predicting.

        if answer != False:
            w = '<start> ' + w + ' <end>'
        return w
    def tokenize(self, sentence, max_len):
        # tokenize and add padding to the tokenized sentences

        # print(len(lang), "example sentence: {}".format(lang[0]))
        sent_tokenizer = Tokenizer(filters='', oov_token='<OOV>')
        sent_tokenizer.fit_on_texts(sentence)

        ## tf.keras.preprocessing.text.Tokenizer.texts_to_sequences converts string (w1, w2, w3, ......, wn) 
        ## to a list of correspoding integer ids of words (id_w1, id_w2, id_w3, ...., id_wn)
        tensor = sent_tokenizer.texts_to_sequences(sentence) 

        ## tf.keras.preprocessing.sequence.pad_sequences takes argument a list of integer id sequences 
        ## and pads the sequences to match the longest sequences in the given input
        tensor = pad_sequences(tensor, padding='post', maxlen = max_len)

        return tensor, sent_tokenizer
    def final_data(self, questions, answers):
        q_processed = [self.preprocess_sentence(q_) for q_ in questions]
        a_processed = [self.preprocess_sentence(a_) for a_ in answers]
        
        q_tensor, q_tokenizer = self.tokenize(q_processed, 58)
        a_tensor, a_tokenizer = self.tokenize(a_processed, 150)
        
        return q_tensor, q_tokenizer, a_tensor, a_tokenizer
    def vocabsize(self, questions, answers):
        # finding vocabulary size
        q_processed = [self.preprocess_sentence(q_) for q_ in questions]
        a_processed = [self.preprocess_sentence(a_) for a_ in answers]
        target_regex = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n\'0123456789'
        # Tokenizer allows to vectorize our corpus by turning each sentence
        # into a sequence of integers where each integer is an index
        # of a token in an internal dictionary
        tokenizer = Tokenizer(filters=target_regex)
        tokenizer.fit_on_texts(q_processed + a_processed)
        VOCAB_SIZE = len(tokenizer.word_index) + 1
        return VOCAB_SIZE
        
    def call(self):
        q_tensor, self.q_tokenizer, a_tensor, self.a_tokenizer = self.final_data(questions, answers)
        
        return q_tensor, a_tensor, self.q_tokenizer, self.a_tokenizer
    
    def create_ohe(self, questions, answers):
        #creating one hot encoded answers to match with the output
        a_processed = [self.preprocess_sentence(a_, answer = True) for a_ in answers]
        q_processed = [self.preprocess_sentence(q_) for q_ in questions]
        sent_tokenizer = Tokenizer(filters = '', oov_token = '<OOV>')
        sent_tokenizer.fit_on_texts(a_processed + q_processed)
        token_ans = sent_tokenizer.texts_to_sequences(a_processed)
        for i in range(len(token_ans)):
            token_ans[i] = token_ans[i][1:]
        pad_ans = pad_sequences(token_ans, maxlen = 150, padding = 'post')
        dec_out_data = to_categorical(pad_ans,2483) 
        return dec_out_data
    def str_to_tokens(self, sentence: str):
        # convert input string to lowercase, 
        # then split it by whitespaces and then convert it into tokens
        tokenizer = Tokenizer(filters = '', oov_token = '<OOV>')
        tokenizer.fit_on_texts([self.preprocess_sentence(q_) for q_ in questions] + [self.preprocess_sentence(a_, answer = True) for a_ in answers])
        words = sentence.lower().split()
        # and then convert to a sequence 
        # of integers padded with zeros
        tokens_list = list()
        for current_word in words:
            result = tokenizer.word_index.get(current_word, '')
            if result != '':
                tokens_list.append(result)
        return pad_sequences([tokens_list],
                             maxlen=58,
                             padding='post')

In [185]:
data_creator = Dataprep()

In [186]:

enc_input_data, dec_input_data, q_tokenizer, a_tokenizer = data_creator.call()
enc_input_data.shape, dec_input_data.shape

((226, 58), (226, 150))

In [187]:
dec_out_data = data_creator.create_ohe(questions, answers)
dec_out_data.shape

(226, 150, 2483)

In [188]:
vocab_size = data_creator.vocabsize(questions, answers)
vocab_size

2483

### Defining the model

In [223]:
# encoder will be used to capture space-dependent 
# relations between words from the questions
enc_inputs = Input(shape=(None,))
enc_embedding = Embedding(vocab_size, 200, mask_zero=True)(enc_inputs)
enc_outputs, state_h, state_c = LSTM(200, return_state=True)(enc_embedding)
enc_states = [state_h, state_c]
# decoder will be used to capture space-dependent relations 
# between words from the answers using encoder's 
# internal state as a context
dec_inputs = Input(shape=(None,))
dec_embedding = Embedding(vocab_size, 200, mask_zero=True)(dec_inputs)
dec_lstm = LSTM(200, return_state=True, return_sequences=True)
dec_outputs, _, _ = dec_lstm(dec_embedding,  
                             initial_state=enc_states)
# decoder is connected to the output Dense layer
dec_dense = Dense(vocab_size, activation=softmax)
output = dec_dense(dec_outputs)
model = Model([enc_inputs, dec_inputs], output)

In [224]:
model.compile(optimizer = RMSprop(), loss = 'categorical_crossentropy')
model.summary()

Model: "model_17"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_31 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 input_32 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 embedding_16 (Embedding)       (None, None, 200)    496600      ['input_31[0][0]']               
                                                                                                  
 embedding_17 (Embedding)       (None, None, 200)    496600      ['input_32[0][0]']               
                                                                                           

In [232]:
model.fit([enc_input_data, dec_input_data],dec_out_data, epochs = 200)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x298664c3e80>

In [233]:
model.save('seq2seq_mod2.h5')

In [234]:
def str_to_tokens(sentence: str):
    # convert input string to lowercase, 
    # then split it by whitespaces
    tokenizer = Tokenizer(filters = '', oov_token = '<OOV>')
    tokenizer.fit_on_texts([self.preprocess_sentence(q_) for q_ in questions] + [self.preprocess_sentence(a_, answer = True) for a_ in answers])
    words = sentence.lower().split()
    # and then convert to a sequence 
    # of integers padded with zeros
    tokens_list = list()
    for current_word in words:
        result = tokenizer.word_index.get(current_word, '')
        if result != '':
            tokens_list.append(result)
    return pad_sequences([tokens_list],
                         maxlen=maxlen_q,
                         padding='post')

In [235]:
def make_inference_models():
    # two inputs for the state vectors returned by encoder
    dec_state_input_h = Input(shape=(200,))
    dec_state_input_c = Input(shape=(200,))
    dec_states_inputs = [dec_state_input_h, dec_state_input_c]
    # these state vectors are used as an initial state 
    # for LSTM layer in the inference decoder
    # third input is the Embedding layer as explained above   
    dec_outputs, state_h, state_c = dec_lstm(dec_embedding,
                                    initial_state=dec_states_inputs)
    dec_states = [state_h, state_c]
    # Dense layer is used to return OHE predicted word
    dec_outputs = dec_dense(dec_outputs)
    dec_model = Model(
        inputs=[dec_inputs] + dec_states_inputs,
        outputs=[dec_outputs] + dec_states)
   
    # single encoder input is a question, represented as a sequence 
    # of integers padded with zeros
    enc_model = Model(inputs=enc_inputs, outputs=enc_states)
   
    return enc_model,  dec_model
enc_model, dec_model = make_inference_models()

In [236]:
q_processed = [data_creator.preprocess_sentence(q_) for q_ in questions]
a_processed = [data_creator.preprocess_sentence(a_, answer = True) for a_ in answers]
tokenizer_s = Tokenizer(filters = '', oov_token = '<OOV>')
tokenizer_s.fit_on_texts(q_processed + a_processed)

## Main interface

In [237]:
#print('***---Welcome to the Data Science and Machine Learning interview---***')    
#print('Are you ready for the interview? (Y/N)')
#res = input()
#if res == "Y" or "y":
 #   print('Lets start the interview... \n')
  #  num = 5  # Defining the number of questions to be asked in the interview
   # q_list = []
    #count = 0
    #if count <= num:
    
    
ques = random.choice(data.Questions)
print(ques)
   # if ques not in q_list:
    #    q_list.append(ques)
     #   print(f'{count +1 } ', ques)

        # encode the input sequence into state vectors
states_values = enc_model.predict(
    data_creator.str_to_tokens(ques))
# start with a target sequence of size 1 - word 'start'   
empty_target_seq = np.zeros((1, 1))
empty_target_seq[0, 0] = tokenizer_s.word_index['start']
stop_condition = False
decoded_translation = ''
while not stop_condition:
    # feed the state vectors and 1-word target sequence 
    # to the decoder to produce predictions for the next word
    dec_outputs, h, c = dec_model.predict([empty_target_seq] 
                                          + states_values)         
    # sample the next word using these predictions
    sampled_word_index = np.argmax(dec_outputs[0, -1, :])
    sampled_word = None
    # append the sampled word to the target sequence
    for word, index in tokenizer_s.word_index.items():
        if sampled_word_index == index:
            if word != 'end':
                decoded_translation += ' {}'.format(word)
            sampled_word = word
    # repeat until we generate the end-of-sequence word 'end' 
    # or we hit the length of answer limit
    if sampled_word == 'end' \
            or len(decoded_translation.split()) \
            > 150:
        stop_condition = True
            # prepare next iteration
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = sampled_word_index
    states_values = [h, c]
print(decoded_translation)


When does regularization come into play in Machine Learning?
 parameters use variance performance between value test then dataset s bayes function but distribution how when will so based accuracy time mean validation element supervised case true also well very parameters class algorithms some different random would most b about a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a there test model be is is is is discard observed function but distribution how when will so based accuracy on you used set one error given bias type features there using do classifier classification number better find difference decision method list has them these no any just called split could
