# ReqCRF

by Tooraj Helmi (thelmi@usc.edu)

## Introduction
ReqCRF is a model based on BiLSTM and CRF that tags requirements according to the grammar explained in the paper. 

Note: I ran this code locally on Windows. 

## Configurations

In [None]:
import os

# When True, GloVe vectors are loaded and encode_with_embedding is used
# instead of encode_no_embedding.
use_embedding = False
# When True, the parsing cell regenerates the token/POS CSV from the raw
# requirements file.
parse_data = False
# Feature fed to encode_no_embedding: 'word' or 'pos'.
encoding = 'word'
# Name of the label column read from the tagged CSV files.
tag = 'TAG7'
# Sequences are padded/truncated to this many tokens by pad_sequences.
max_len = 50
# Only training rows whose APP column is in this list are kept.
apps = ['Trading', 'TicTacToe', 'WordGuess', 'News', 'Food Delivery', 'Calendar', 'Bank']
# Data directory. The trailing separator is required because file names are
# appended with plain string concatenation. os.path.join yields '..\\data\\'
# on Windows (the original value) and '../data/' elsewhere.
data_path = os.path.join('..', 'data', '')

## Parsing

If parse_data is set to True, the following code parses a plain-text requirements file, breaks it down into words and POS tags, and prepares it for NER tagging. 
Note: Make sure to rename the input and output files to match your needs.

In [None]:
from nltk import word_tokenize 
from nltk import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import stopwords 
import string
import pandas as pd
import numpy as np

if parse_data:
    # Raw requirements file: the header row is skipped and the three columns
    # are named explicitly.
    reqset = pd.read_csv(data_path + "Reqset-train.csv", names=["SENT", "APP", "REQ"], skiprows=range(1))

    reqs = reqset["REQ"].to_list()
    print(reqs[0])
    n_req = len(reqs)

    # Pair every '.'-separated sentence with the positional index of its
    # requirement. enumerate() replaces reqs.index(req), which is O(n) per
    # call and returns the first match, so duplicated requirement texts were
    # all attributed to the first occurrence.
    sents = [[req_idx, sent] for req_idx, req in enumerate(reqs) for sent in req.split('.')]
    n_sent = len(sents)

    lemmetizer = WordNetLemmatizer()
    no_punc_tokenizer = RegexpTokenizer(r"\w+")

    columns = ['APP', 'REQ', 'SENT', 'WORD', 'POS', 'TAG']
    frames = []
    for sent_idx, (req_idx, text) in enumerate(sents):
        tokens = word_tokenize(text)
        print(tokens)
        if not tokens:
            # Empty fragments (e.g. after a trailing '.') contribute no rows.
            continue
        df = pd.DataFrame(pos_tag(tokens), columns=['WORD', 'POS'])
        df['TAG'] = 'O'  # default label, to be replaced when tagging
        df['APP'] = reqset.at[req_idx, 'APP']
        df['REQ'] = req_idx
        df['SENT'] = sent_idx
        frames.append(df)

    # DataFrame.append was removed in pandas 2.0; collecting the frames and
    # concatenating once is also linear instead of quadratic.
    if frames:
        dataset = pd.concat(frames, ignore_index=True)[columns]
    else:
        dataset = pd.DataFrame(columns=columns)
    dataset.to_csv(data_path + "reqset-train-tagged-new.csv")


## Load Data
Loads the tagged dataset 

In [None]:
import pandas as pd
import numpy as np

# Load the tagged training tokens and keep only usable rows.
data = pd.read_csv(data_path + "reqset-train-tagged.csv", encoding="latin1")
# np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
data = data.fillna(value=np.nan)
# Keep rows that have at least two non-missing values.
data = data.dropna(thresh=2)
# Rows without a label in the selected tag column cannot be used.
data = data[data[tag].notnull()]
# Restrict to the applications listed in the configuration.
data = data[data['APP'].isin(apps)]
data = data[["SENT", "WORD", "POS", tag]]
# Call count(): printing the bare attribute shows the bound method instead of
# the per-column non-null counts.
print(data.count())

## Preprocessing
The following sections assign indexes to words and labels. Optionally, they can also apply GloVe embedding.

In [None]:
from nltk.corpus import stopwords 

# Vocabulary of lower-cased tokens. Lower-casing happens before
# de-duplication so that e.g. "User" and "user" yield a single entry, and the
# result is sorted so that word indexes are identical from run to run (set
# iteration order for strings is not stable across interpreter sessions).
words = sorted({w.lower() for w in data["WORD"].values})
n_words = len(words)
n_words

In [None]:
# Distinct labels found in the selected tag column. Sorted so that the
# label -> index mapping built from this list is reproducible across runs
# (set iteration order for strings is not stable between sessions).
tags = sorted(set(data[tag].values))
n_tags = len(tags)
n_tags
print(tags)

In [None]:
# Distinct part-of-speech labels present in the data, and how many there are.
pos = [*set(data['POS'].values)]
n_pos = len(pos)
n_pos

In [None]:
class SentenceGetter(object):
    """Group token rows into sentences of ``(word, pos, tag)`` triples.

    Args:
        data: DataFrame with ``SENT``, ``WORD``, ``POS`` columns and the
            label column named by ``tag``.
        tag: name of the label column to read.
        start_sent: ``SENT`` id returned by the first ``get_next()`` call.
            Defaults to 95, the value that used to be hard-coded.

    Attributes:
        grouped: the result of ``data.groupby("SENT").apply(...)``; one list
            of triples per ``SENT`` id.
        sentences: the same lists as a plain Python list.
    """

    def __init__(self, data, tag, start_sent=95):
        self.n_sent = start_sent
        self.data = data
        self.empty = False

        def agg_func(s):
            # One (word, pos, tag) triple per row of the group.
            return list(zip(s["WORD"].values.tolist(),
                            s["POS"].values.tolist(),
                            s[tag].values.tolist()))

        self.grouped = self.data.groupby("SENT").apply(agg_func)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the sentence with id ``n_sent`` and advance, or ``None``.

        ``None`` is returned (and the cursor is left untouched) when no
        sentence with the current id exists.
        """
        try:
            s = self.grouped[self.n_sent]
        except (KeyError, IndexError):
            # Only a missing id is expected here; other errors should surface.
            return None
        self.n_sent += 1
        return s

In [None]:
# Build the sentence view of the loaded data, keep the full list for
# encoding, and print one sentence as a sanity check.
getter = SentenceGetter(data, tag)
sentences, sent = getter.sentences, getter.get_next()
print(sent)

### Glove Embedding
If use_embedding is True and glove.6B.50d.txt is available in the data folder, tokens are embedded using GloVe vectors.

In [None]:
import numpy as np
# Maps a token to its GloVe vector; stays empty unless use_embedding is set.
embeddings_index = {}

if use_embedding:
    # The with-block closes the file; no explicit close() is needed.
    with open(data_path + 'glove.6B.50d.txt', 'r', encoding="utf-8") as f:
        for line in f:
            # First field is the token, the rest are its coefficients.
            # rstrip() keeps the trailing newline out of the last coefficient.
            word, *coefs = line.rstrip().split(' ')
            embeddings_index[word] = np.asarray(coefs, dtype='float32')
    print('GloVe data loaded')
    print(embeddings_index["'s"])

In [None]:
# Word -> index lookup. Numbering starts at 1; encode_no_embedding pads
# sequences with 0.
word2idx = dict(zip(words, range(1, len(words) + 1)))
word2idx["user"]

In [None]:
# Label -> index lookup, numbered from 0 in the order of `tags`.
tag2idx = dict(zip(tags, range(len(tags))))

In [None]:
# POS label -> index lookup, numbered from 0 in the order of `pos`.
pos2Idx = dict(zip(pos, range(len(pos))))
print(pos2Idx['NNP'])

## Statistics

In [None]:
# Sizes of the sentence list, the vocabulary and the label set, in that order.
for collection in (sentences, word2idx, tag2idx):
    print(len(collection))

## Generating X and Y
The following sections produce the data needed for training. The first one uses GloVe embedding and the second one does not.

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import tensorflow as tf

def encode_with_embedding(sentences, max_words=40):
    """Encode sentences as flat GloVe + one-hot-POS feature rows.

    Each of the first ``max_words`` tokens of a sentence occupies
    ``n_word_dim = embedding_dim + n_pos`` consecutive columns of its row in
    ``X``: the token's GloVe vector followed by the one-hot POS vector.
    Unused columns stay zero.

    Args:
        sentences: sequence of sentences, each a sequence of
            ``(word, pos, tag)`` triples.
        max_words: number of tokens kept per sentence (default 40, the value
            that used to be hard-coded).

    Returns:
        ``(X, Y)``: ``X`` has shape ``(len(sentences), n_word_dim * max_words)``.
        ``Y`` is a list with one ``(n_word_dim * max_words, n_tags)`` array
        per sentence. Rows belonging to a token carry that token's one-hot
        label; all other rows carry the one-hot label of ``"O"``.
    """
    embedding_dim = embeddings_index.get('a').shape[0]
    n_word_dim = embedding_dim + n_pos
    n_cols = n_word_dim * max_words

    # Tokens missing from GloVe used to yield None, which crashed the fill
    # below; they are now represented by an all-zero vector.
    unknown_vector = np.zeros(embedding_dim, dtype='float32')
    default_label = to_categorical(tag2idx["O"], num_classes=n_tags)

    X = np.zeros((len(sentences), n_cols))
    Y = []
    for sent_idx, sent in enumerate(sentences):
        Y_sent = np.tile(default_label, (n_cols, 1))
        for word_idx, w in enumerate(sent[:max_words]):
            embedding_vector = embeddings_index.get(w[0].lower(), unknown_vector)
            pos_vector = to_categorical(pos2Idx[w[1]], num_classes=n_pos)

            # Slice assignment replaces the former element-by-element loop.
            start = word_idx * n_word_dim
            stop = start + n_word_dim
            X[sent_idx, start:stop] = np.append(embedding_vector, pos_vector)
            Y_sent[start:stop] = to_categorical(tag2idx[w[2]], num_classes=n_tags)
        Y.append(Y_sent)

    print(X.shape)
    print(Y[0].shape)
    print(len(Y))

    return X, Y

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def encode_no_embedding(sentences):
    """Turn sentences into padded index sequences and one-hot label arrays.

    The feature used is selected by the module-level ``encoding`` setting:
    ``'word'`` looks up the lower-cased token in ``word2idx``; ``'pos'``
    looks up the POS label in ``pos2Idx``.

    Args:
        sentences: sequence of sentences, each a sequence of
            ``(word, pos, tag)`` triples.

    Returns:
        ``(X, Y)``: ``X`` is the ``pad_sequences`` result with ``max_len``
        columns, padded with 0. ``Y`` is a list with one ``to_categorical``
        array per sentence, padded with the ``"O"`` label.

    Raises:
        ValueError: if ``encoding`` is neither ``'word'`` nor ``'pos'``
            (previously this surfaced as an unbound-variable error).
    """
    if encoding == 'word':
        X = [[word2idx[w[0].lower()] for w in s] for s in sentences]
    elif encoding == 'pos':
        # pos2Idx numbers labels from 0, which is also the padding value used
        # below (and the value masked by the Embedding layer). Shift by one
        # so no real POS label is mistaken for padding.
        X = [[pos2Idx[w[1]] + 1 for w in s] for s in sentences]
    else:
        raise ValueError(
            "Unsupported encoding {!r}; expected 'word' or 'pos'".format(encoding))
    X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=0)

    Y = [[tag2idx[w[2]] for w in s] for s in sentences]
    Y = pad_sequences(maxlen=max_len, sequences=Y, padding="post", value=tag2idx["O"])
    Y = [to_categorical(i, num_classes=n_tags) for i in Y]

    print('X:', X.shape)
    print('Y:', Y[0].shape)

    return X, Y

## Define Model
Model includes 3 layers:
1. embedding
2. BILSTM
3. Linear Chain CRF

In [None]:

from keras.models import Model, Input, Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF

if use_embedding:
    # n_word_dim and max_words are locals of encode_with_embedding and are
    # not visible here, so the flattened row width is recomputed:
    # (GloVe dimension + number of POS labels) * 40 tokens per sentence.
    input_len = (embeddings_index.get('a').shape[0] + n_pos) * 40
else:
    input_len = max_len

# NOTE: `input` shadows the builtin, but the name is kept because the
# following cell builds the Model from it.
input = Input(shape=(input_len,))
# Index 0 is treated as padding (mask_zero=True); +1 leaves room for it.
model = Embedding(input_dim=n_words + 1, output_dim=20,
                  input_length=input_len, mask_zero=True, trainable=False)(input)
# Bidirectional LSTM returning one output per time step.
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)
# Per-time-step dense projection feeding the CRF.
model = TimeDistributed(Dense(50, activation="relu"))(model)
# CRF output layer with one state per label.
crf = CRF(n_tags)
out = crf(model)

  

In [None]:
# Wrap the layer graph into a trainable model that uses the CRF layer's own
# loss and accuracy, then show the layer summary.
model = Model(input, out)
model.compile(
    loss=crf.loss_function,
    metrics=[crf.accuracy],
    optimizer="rmsprop",
)
model.summary()

## Train the Model

In [None]:
from sklearn.model_selection import train_test_split
# Pick the encoder that matches the configuration and build the arrays.
X, Y = (encode_with_embedding if use_embedding else encode_no_embedding)(sentences)

# Hold out a 0.001 fraction of the sentences; the rest is used for fitting,
# of which Keras reserves 10% for validation.
X_tr, X_te, y_tr, y_te = train_test_split(X, Y, test_size=0.001)
history = model.fit(
    X_tr,
    np.array(y_tr),
    epochs=50,
    batch_size=1,
    verbose=1,
    validation_split=0.1,
)

In [None]:
# Per-epoch training history as a table.
hist = pd.DataFrame(data=history.history)
print(hist)

In [None]:
import matplotlib.pyplot as plt
# Plot the training and validation accuracy columns of the history table on
# one figure.
plt.style.use("ggplot")
plt.figure(figsize=(12, 12))
for metric in ("crf_viterbi_accuracy", "val_crf_viterbi_accuracy"):
    plt.plot(hist[metric])
plt.show()

## Testing

In [None]:
# Load the tagged test file with the same cleaning steps as the training
# file (without the APP filter).
data = pd.read_csv(data_path + "reqset-test-tagged.csv", encoding="latin1")
# np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
data = data.fillna(value=np.nan)
data = data.dropna(thresh=2)
data = data[data[tag].notnull()]
data = data[["SENT", "WORD", "POS", tag]]
# Call count(): printing the bare attribute shows the bound method instead of
# the per-column non-null counts.
print(data.count())

# NOTE(review): the frame loaded above is not encoded or passed to the model.
# The split below is redrawn from the training arrays X and Y, so the
# following cells evaluate on training sentences - confirm this is intended.
X_tr, X_te, y_tr, y_te = train_test_split(X, Y, test_size=0.001)

In [None]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
# Run the trained model on X_tr; the per-token scores are turned into label
# strings by pred2label in the Model Performance section.
test_pred = model.predict(X_tr, verbose=1)

In [None]:
%%HTML
<style type="text/css">
/* Give rendered DataFrame table cells a solid black border and black text. */
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

## Model Performance

In [None]:
# Reverse lookup: label index -> label string.
idx2tag = {index: label for label, index in tag2idx.items()}


def pred2label(pred):
    """Convert per-token score vectors into label strings.

    ``pred`` is a sequence of sentences, each a sequence of score vectors.
    For every vector the index of the largest entry is looked up in
    ``idx2tag``; any "PAD" substring in the label is replaced by "O".
    Returns one list of label strings per sentence.
    """
    return [
        [idx2tag[np.argmax(scores)].replace("PAD", "O") for scores in sentence]
        for sentence in pred
    ]

# Decode the model output and the reference labels (y_tr) into label strings.
pred_labels, test_labels = pred2label(test_pred), pred2label(y_tr)

# F1 under the three averaging modes, followed by the per-label report.
f1_micro, f1_macro, f1_weighted = (
    f1_score(test_labels, pred_labels, average=mode)
    for mode in ("micro", "macro", "weighted")
)
print("Micro F1-score: {:.2%}".format(f1_micro))
print("Macro F1-score: {:.2%}".format(f1_macro))
print("Weighted F1-score: {:.2%}".format(f1_weighted))
print(classification_report(test_labels, pred_labels))

In [None]:
# Print word / reference label / predicted label for one sentence of X_tr.
i = 2
p = np.argmax(model.predict(np.array([X_tr[i]])), axis=-1)
true = np.argmax(y_tr[i], -1)
print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
print(30 * "=")
for w, t, pred in zip(X_tr[i], true, p[0]):
    if w == 0:
        # Index 0 is padding, not a word.
        continue
    print("{:15}: {:5} {}".format(words[w-1], tags[t], tags[pred]))

## Predict Labels for a Requirement

In [None]:
# Tag a hand-written, pre-tokenised requirement. Words missing from word2idx
# are mapped to 0, the same value used for padding.
test_sentence = [
    "if", "user", "picks", "the", "first", "choice", ",", "he", "should",
    "be", "shown", "a", "white", "baloon", "below", "the", "text",
]
x_test_sent = pad_sequences(
    sequences=[[word2idx.get(w, 0) for w in test_sentence]],
    maxlen=max_len,
    padding="post",
    value=0,
)

p = np.argmax(model.predict(np.array([x_test_sent[0]])), axis=-1)
print("{:15}||{}".format("Word", "Prediction"))
print(30 * "=")
for w, pred in zip(test_sentence, p[0]):
    print("{:15}: {:5}".format(w, tags[pred]))