In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pickle
import tensorflow as tf
import numpy as np
import json
import re
import pandas as pd

**Parameters**

In [2]:
# --- Text window sizes (in words / tokens) ---
# max_len_desc: cap applied to the description text and the model's input length.
# max_len_comm: cap applied to the comment text and the number of sliding
#               windows generated per padded sequence.
max_len_desc, max_len_comm = 100, 20

# --- Tokenizer settings ---
num_words = 12000      # passed to Tokenizer(num_words=...) and used as the softmax width
oov_token = '<oov>'    # token the Tokenizer substitutes for out-of-vocabulary words

# --- pad_sequences settings ---
padding = truncating = 'pre'   # pad and truncate at the start of each sequence

# --- Model settings ---
embedding_dim = 100    # output_dim of the Embedding layer

**Utils**

In [3]:
def cleaning_coments(x):
    """Normalise a raw comment value into plain text.

    The value is first converted with ``str()`` (so a float NaN becomes the
    string ``'nan'``). Then:

    1. On each line, everything from the first ``[`` to the last ``]`` is
       removed (the match is greedy and ``.`` does not cross newlines).
    2. The characters ``? ! @ # $ . : -``, digits, CR, LF and TAB are removed.
    3. Leading and trailing whitespace is stripped.

    Returns the cleaned string.
    """
    text = str(x)
    without_brackets = re.sub(r'\[.+\]', '', text)
    without_noise = re.sub("[?!@#$\r\n.:0123456789\t-]", '', without_brackets)
    return without_noise.strip()

stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
              "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do",
              "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having",
              "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's",
              "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my",
              "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same",
              "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them",
              "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through",
              "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when",
              "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll",
              "you're", "you've", "your", "yours", "yourself", "yourselves"]

# Hash-based copy of `stopwords` so each membership test is O(1) instead of a
# scan over the whole list for every word of every description.
_STOPWORD_SET = frozenset(stopwords)


def clean_description(x):
    """Strip noise characters and stop words from an issue description.

    Steps:
      1. Remove ``! @ # $ . : -``, digits, CR, LF and TAB.
      2. Replace ``*`` and ``,`` with a space; delete ``[``, ``]`` and ``|``.
      3. Split on whitespace and drop every word found in ``stopwords``
         (exact, case-sensitive match, so ``"The"`` is kept).

    Args:
        x: The description. ``None`` or a float NaN (the placeholder for a
           missing field) yields ``''`` instead of raising ``TypeError``;
           any other non-string value is converted with ``str()``.

    Returns:
        The remaining words joined by single spaces.
    """
    # NaN is the only float that is not equal to itself.
    if x is None or (isinstance(x, float) and x != x):
        return ''
    x = re.sub("[!@#$\r\n.:0123456789\t-]", '', str(x))
    x = x.replace("*", ' ').replace(",", ' ').replace("[", '').replace("]", '').replace("|", '')
    return ' '.join(word for word in x.split() if word not in _STOPWORD_SET)

FEATURE_COLUMNS = ['Status', 'Description', 'Comments', 'Priority', 'RootCause', 'Reporter', 
           'IssueKey', 'Summary', 'Sprint', 'Component', 'Application', 'Assigne']
def process_json_file(file_name, FEATURE_COLUMNS):
    """Load a JSON list of issue records into a 12-column DataFrame.

    Args:
        file_name: Path of a JSON file whose top level is an iterable of
            dict-like issue records.
        FEATURE_COLUMNS: Sequence of at least 12 keys to read from each
            record. Keys are matched to output columns by position, and the
            key at index 2 is treated as the comments field.

    Returns:
        A DataFrame with one row per record and the columns ``Status``,
        ``Description``, ``Comments``, ``Priority``, ``RootCause``,
        ``Reporter``, ``IssueKey``, ``Summary``, ``Sprint``, ``Component``,
        ``Application``, ``Assigne`` (in that order). A key missing from a
        record becomes ``NaN``. ``Comments`` holds the ``'body'`` of the
        first comment entry, or ``NaN`` when the field is missing, null or
        empty.
    """
    # Output column names are fixed and positional, independent of the key
    # names passed in.
    output_columns = ('Status', 'Description', 'Comments', 'Priority', 'RootCause', 'Reporter',
                      'IssueKey', 'Summary', 'Sprint', 'Component', 'Application', 'Assigne')
    comments_position = 2

    # The context manager closes the file even when json.load raises.
    with open(file_name) as f:
        data = json.load(f)

    collected = {name: [] for name in output_columns}
    for issue in data:
        for position, column in enumerate(output_columns):
            key = FEATURE_COLUMNS[position]
            if position == comments_position:
                # `or []` also covers an explicit null, which would otherwise
                # fail on len(None).
                comment_entries = issue.get(key) or []
                value = comment_entries[0]['body'] if comment_entries else np.nan
            else:
                value = issue.get(key, np.nan)
            collected[column].append(value)

    return pd.DataFrame(collected, columns=list(output_columns))

def _first_n_words(x, limit):
    """Return the first `limit` whitespace-separated words of `x`, joined by single spaces."""
    # Slicing already returns the whole list when it is shorter than `limit`,
    # so no separate short-input branch is needed.
    return ' '.join(x.split()[:limit])


def get_first_n_words_desc(x, max_words=None):
    """Truncate a description to its first words.

    Args:
        x: Text to truncate (must be a string).
        max_words: Maximum number of words to keep. Defaults to the
            module-level ``max_len_desc`` when omitted.

    Returns:
        The kept words joined by single spaces (runs of whitespace collapse).
    """
    if max_words is None:
        max_words = max_len_desc
    return _first_n_words(x, max_words)


def get_first_n_words_comments(x, max_words=None):
    """Truncate a comment to its first words.

    Args:
        x: Text to truncate (must be a string).
        max_words: Maximum number of words to keep. Defaults to the
            module-level ``max_len_comm`` when omitted.

    Returns:
        The kept words joined by single spaces (runs of whitespace collapse).
    """
    if max_words is None:
        max_words = max_len_comm
    return _first_n_words(x, max_words)

**Data Prep**

In [4]:
# Load the cancelled issues and keep only the two free-text fields.
# Each subset is taken with .copy() so the column assignments below write to
# an independent frame rather than to a slice of the parent (this avoids
# pandas' SettingWithCopyWarning on versions without copy-on-write).
df_cancelled = process_json_file('cancelled.json', FEATURE_COLUMNS)
df_cancelled = df_cancelled[['Description', 'Comments']].copy()
df_cancelled['Description'] = df_cancelled['Description'].apply(clean_description)
df_cancelled['Comments'] = df_cancelled['Comments'].apply(cleaning_coments)
# cleaning_coments() stringifies its input, so a missing comment (NaN) shows
# up as the literal 'nan'; drop those rows.
df_cancelled = df_cancelled[df_cancelled['Comments'] != 'nan'].copy()
# Cap both texts at their configured word counts, then join them into the
# single training text: description first, comment last.
df_cancelled['Description'] = df_cancelled['Description'].apply(get_first_n_words_desc)
df_cancelled['Comments'] = df_cancelled['Comments'].apply(get_first_n_words_comments)
df_cancelled['all'] = df_cancelled['Description'] + ' ' + df_cancelled['Comments']

# Fit the tokenizer on the combined description + comment texts.
training_texts = df_cancelled['all'].values
tokenizer_cancelled = Tokenizer(num_words=num_words, oov_token=oov_token)
tokenizer_cancelled.fit_on_texts(training_texts)

# Encode every text as integer ids, then pad/truncate each to one fixed
# length: room for a full description plus a full comment.
padded_length = max_len_desc + max_len_comm
sequences = pad_sequences(
    tokenizer_cancelled.texts_to_sequences(training_texts),
    maxlen=padded_length,
    padding=padding,
    truncating=truncating,
)

# Build next-word training windows.
# Each padded sequence yields max_len_comm overlapping slices of
# max_len_desc + 1 tokens, each shifted one position to the right of the
# previous one. A slice whose tokens are all the same value (e.g. nothing but
# padding) is discarded.
window_size = max_len_desc + 1
train_data = []
for seq in sequences:
    for offset in range(max_len_comm):
        dat = seq[offset:offset + window_size]
        if len(set(dat)) != 1:
            train_data.append(dat)
            
# Split every window into model input (all tokens but the last) and target
# (the last token).
sequences_x = np.array([window[:-1] for window in train_data])
labels = np.array([window[-1:] for window in train_data])
# Flatten the targets to 1-D, then one-hot encode them over num_words classes.
labels = labels.reshape(-1)
labels = tf.keras.utils.to_categorical(labels, num_words)

**Model**

In [11]:
# Next-word classifier: token embedding -> SimpleRNN -> dense head with a
# softmax over num_words classes. Layers are added one at a time.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=num_words+1, output_dim=embedding_dim, input_length=max_len_desc))
model.add(tf.keras.layers.SimpleRNN(64))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(num_words, activation='softmax'))

# Labels are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Trains on the full dataset; no validation split is passed.
history = model.fit(sequences_x, labels, epochs=30, batch_size=64)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


**Save Model**

In [15]:
# Persist both artefacts: the trained network and the fitted tokenizer.
model.save("reason_generator.h5")
tokenizer_bytes = pickle.dumps(tokenizer_cancelled, protocol=pickle.HIGHEST_PROTOCOL)
with open('tokenizer_cancelled.pickle', 'wb') as tokenizer_file:
    tokenizer_file.write(tokenizer_bytes)

In [17]:
# google.colab is only importable inside a Colab runtime. Guard the import so
# a Restart & Run All outside Colab does not stop here with ImportError.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    # Triggers a browser download of the pickled tokenizer.
    files.download('tokenizer_cancelled.pickle')
else:
    print("Not running in Colab; tokenizer_cancelled.pickle was left on local disk.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Next Word**

In [19]:
# Greedy next-word generation: repeatedly feed the most recent max_len_desc
# tokens of the running text to the model and append its top-scoring word.
seed_text = "Steps reproduce Launch url https//qavirginvoyagescom/booking User done voyage selection User \
clicked Choose Cabin User entered access key details Access Key First Name Last Name Email Address click \
continue buttonUser summary pageuser changed currency USD GBP url stringObserve currency Summary page navigate \
confirmation page Expected Result Currency Should not Display GBP currency user not able complete Booking Actual \
Result Currency Displaying GBP currency user able complete booking details reflected saleforce refer attached \
recording"
next_words = 20
# The tokenizer already maintains the index -> word mapping. Index 0 is the
# padding id and has no entry.
word_dict = tokenizer_cancelled.index_word
generated_words = []
for _ in range(next_words):
    seed_sequence = tokenizer_cancelled.texts_to_sequences([seed_text])
    # Keep only the most recent max_len_desc tokens. The slice returns the
    # whole list when it is already short enough.
    seed_sequence = [seed_sequence[0][-max_len_desc:]]
    padded_seed_sequence = pad_sequences(seed_sequence, truncating=truncating, padding=padding, maxlen=max_len_desc)
    # verbose=0 keeps Keras from logging a progress bar on every step.
    predicted_index = int(np.argmax(model.predict(padded_seed_sequence, verbose=0)[0]))
    predicted_word = word_dict.get(predicted_index)
    if predicted_word is None:
        # The model chose an id with no word (e.g. padding). Stop here instead
        # of raising KeyError; appending nothing would repeat the same
        # prediction on every remaining step.
        break
    generated_words.append(predicted_word)
    seed_text = seed_text + " " + predicted_word
# Show only the generated continuation, not the seed.
print(' '.join(generated_words))

we have a duplicate asap ticket please find the correct notes and blocked to be the issue of metas can
