In [3]:
#download this dependencies
!pip install tensorflow
!pip install keras
!pip install numpy
!pip install pandas
!pip install -U sentence-transformers

Requirement already up-to-date: sentence-transformers in /home/tauseefnawaz/anaconda3/lib/python3.7/site-packages (0.3.9)




In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [5]:
import numpy as np
import pandas as pd
import string

import scipy
from sentence_transformers import SentenceTransformer

In [6]:
# Load the RFP question/description spreadsheet and preview its first rows.
rfp_workbook = "Dirty data from RFP COE.xlsx"
df = pd.read_excel(rfp_workbook)
df.head()

Unnamed: 0,Non-Functional Requirement,Description
0,Is system based on a multi-tiers architecture?...,Depending on the current state and future sele...
1,How is the system architected? (i.e. is it bas...,For more information on the current architectu...
2,Is it possible to add further modules in the f...,Extensibility covers a broad spectrum of topic...
3,Please provide a detailed interface descriptio...,SAP API Management provides enterprises a comp...
4,Is it possible to make the system data availab...,SAP supports all standard integration capabili...


In [7]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [8]:
# Summarise the cleaned frame: columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 716 entries, 0 to 716
Data columns (total 2 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Non-Functional Requirement  716 non-null    object
 1   Description                 716 non-null    object
dtypes: object(2)
memory usage: 16.8+ KB


In [9]:
def clean_sentence(sentence):
    """Return ``sentence`` lower-cased with all punctuation removed.

    Punctuation means every character in ``string.punctuation`` plus the
    inverted marks "¡" and "¿". Whitespace and all other characters are
    left untouched.

    Args:
        sentence: The text to normalise (a ``str``).

    Returns:
        The normalised string.
    """
    # Characters to delete: ASCII punctuation plus the two inverted marks.
    unwanted_chars = string.punctuation + "¡" + '¿'
    # A translation table whose third argument lists characters to drop.
    removal_table = str.maketrans('', '', unwanted_chars)
    return sentence.lower().translate(removal_table)

In [10]:
def tokenize(sentences):
    """Fit a new Keras ``Tokenizer`` on ``sentences`` and encode them with it.

    Args:
        sentences: The texts to fit on and to encode.

    Returns:
        A ``(sequences, tokenizer)`` tuple: the result of
        ``texts_to_sequences(sentences)`` and the fitted tokenizer itself.
    """
    fitted_tokenizer = Tokenizer()
    # Build the vocabulary from the very texts that are encoded below.
    fitted_tokenizer.fit_on_texts(sentences)
    encoded_sentences = fitted_tokenizer.texts_to_sequences(sentences)
    return encoded_sentences, fitted_tokenizer

In [11]:
#Text preprocessing
question_sentences = [clean_sentence(sentence) for sentence in df['Non-Functional Requirement']]
descrition_sentences = [clean_sentence(sentence) for sentence in df['Description']]
# Tokenize words
question_text_tokenized, question_text_tokenizer = tokenize(question_sentences)
descrition_text_tokenized, descrition_text_tokenizer = tokenize(descrition_sentences)

# Check length
question_length = len(question_text_tokenizer.word_index) + 1
description_length = len(descrition_text_tokenizer.word_index) + 1

In [12]:
#get Maximum length
max_question_len = int(len(max(question_text_tokenized,key=len)))
max_descrition_len = int(len(max(descrition_text_tokenized,key=len)))

#get padding
question_pad_sentence = pad_sequences(question_text_tokenized, max_question_len, padding = "post")
decription_pad_sentence = pad_sequences(descrition_text_tokenized, max_descrition_len, padding = "post")

# Reshape data
question_pad_sentence = question_pad_sentence.reshape(*question_pad_sentence.shape, 1)
decription_pad_sentence = decription_pad_sentence.reshape(*decription_pad_sentence.shape, 1)


In [13]:
#building Model
input_sequence = Input(shape=(max_question_len,))
embedding = Embedding(input_dim=question_length, output_dim=128,)(input_sequence)
encoder = LSTM(64, return_sequences=False)(embedding)
r_vec = RepeatVector(max_descrition_len)(encoder)
decoder = LSTM(64, return_sequences=True, dropout=0.2)(r_vec)
logits = TimeDistributed(Dense(description_length))(decoder)

In [14]:
#Compiling
enc_dec_model = Model(input_sequence, Activation('softmax')(logits))
enc_dec_model.compile(loss=sparse_categorical_crossentropy,
              optimizer=Adam(1e-3),
              metrics=['accuracy'])
enc_dec_model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 202)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 202, 128)          343040    
_________________________________________________________________
lstm (LSTM)                  (None, 64)                49408     
_________________________________________________________________
repeat_vector (RepeatVector) (None, 1520, 64)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 1520, 64)          33024     
_________________________________________________________________
time_distributed (TimeDistri (None, 1520, 5472)        355680    
_________________________________________________________________
activation_4 (Activation)    (None, 1520, 5472)       

In [None]:
#training model
model_results = enc_dec_model.fit(
    x=question_pad_sentence,
    y=decription_pad_sentence,
    batch_size=20,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

In [None]:
def logits_to_sentence(logits, tokenizer):
    """Decode a 2-D score array into a space-separated string of words.

    For every row of ``logits`` the column with the highest score is taken
    (``np.argmax`` along axis 1) and mapped back to a word through the
    inverse of ``tokenizer.word_index``. Index 0 is rendered as ``<empty>``.

    Args:
        logits: 2-D array-like of scores, one row per output position.
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The decoded words joined by single spaces.
    """
    # Invert the word -> index mapping.
    vocabulary = {}
    for word, idx in tokenizer.word_index.items():
        vocabulary[idx] = word
    # Index 0 has no word of its own; show it with a placeholder.
    vocabulary[0] = '<empty>'

    best_indices = np.argmax(logits, 1)
    decoded_words = []
    for token_id in best_indices:
        decoded_words.append(vocabulary[token_id])
    return ' '.join(decoded_words)


# Show one example: the cleaned question, its reference description and the
# description the model generates for it.
index = 0
print("\nThe question sentence is: {}".format(question_sentences[index]))
print("\nThe description sentence is: {}".format(descrition_sentences[index]))
print('\nThe predicted sentence is :\n')
# Slice with index:index + 1 rather than indexing with [index] so the leading
# batch axis is kept. predict() treats the first axis as the batch, so
# question_pad_sentence[index] would be fed as max_question_len separate
# one-token samples and [0] would only decode the first token's prediction.
predicted_logits = enc_dec_model.predict(question_pad_sentence[index:index + 1])[0]
print(logits_to_sentence(predicted_logits, descrition_text_tokenizer))