Extract Syntactic Info using DP

# ReqCRF

by Tooraj Helmi (thelmi@usc.edu)

## Introduction
ReqCRF is a model based on BILSTM and CRF used to tag requirments based on the grammar explained in the paper. 

Note: I ran this code locally on Windows. 

## Configurations

In [1]:
ext_embedding = False
parse_data = False
encoding = ['word', 'pos', 'dep'] # word, pos, dep
tag = 'TAG' # name of the tag column 
max_len = 50
apps = [ 'Trading', 'TicTacToe','WordGuess', 'News', 'Food Delivery', 'Calendar', 'Bank']
data_path = '../data/'
use_dp = True
embedding_dim = { 'word' : 10, 'pos': 3, 'dep': 3}

## Parsing

If parse_data is set to True, the following code will parse a plain text requirement file, break it down to words and POS tags and make it ready for applying NER tags. 
Note: Make sure to rename input and output files to match you need.

In [22]:
from nltk import word_tokenize 
from nltk import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import stopwords 
import string
import pandas as pd
import numpy as np
import spacy

if parse_data:
    reqset = pd.read_csv(data_path + 'Reqset-train.csv', names=["SENT", "APP", "REQ"], skiprows=range(1))
    reqset.head(10)

    reqs = reqset["REQ"].to_list()
    n_req = len(reqs); n_req
    
    sents = []
    for  req in reqs:
        for sent in req.split('.'):
            sents.append([reqs.index(req), sent])
    n_sent = len(sents); n_sent
    
    nlp = spacy.load("en_core_web_sm")
    dataset = pd.DataFrame(columns=['APP', 'REQ', 'SENT', 'WORD', 'LEMMA', 'POS', 'DEP', 'TAG']) 
    for sent in sents:
        doc = nlp(sent[1])
        for token in doc:
            dataset = dataset.append({'APP': reqset.at[sent[0], 'APP'], 'REQ': sent[0], 
                'SENT': sents.index(sent), 'WORD': token.text, 'LEMMA': token.lemma_, 
                'POS': token.pos_, 'DEP': token.dep_}, ignore_index = True)

    dataset.to_csv(data_path +  "reqset-train-tagged-new.csv")
    print(dataset.head(20))

     APP REQ SENT       WORD    LEMMA    POS        DEP  TAG
0   News   0    0       When     when    ADV     advmod  NaN
1   News   0    0        the      the    DET        det  NaN
2   News   0    0       user     user   NOUN      nsubj  NaN
3   News   0    0   launches   launch   VERB      advcl  NaN
4   News   0    0        app      app   NOUN       dobj  NaN
5   News   0    0        for      for    ADP       prep  NaN
6   News   0    0        the      the    DET        det  NaN
7   News   0    0      first    first    ADJ       amod  NaN
8   News   0    0       time     time   NOUN       pobj  NaN
9   News   0    0          ,        ,  PUNCT      punct  NaN
10  News   0    0         he   -PRON-   PRON  nsubjpass  NaN
11  News   0    0     should   should   VERB        aux  NaN
12  News   0    0         be       be    AUX    auxpass  NaN
13  News   0    0  presented  present   VERB       ROOT  NaN
14  News   0    0       with     with    ADP       prep  NaN
15  News   0    0       

## Load Data
Loads the tagged dataset 

In [2]:
import pandas as pd
import numpy as np

data = pd.read_csv(data_path + "reqset-train-tagged.csv", encoding="latin1")
data = data.fillna(value=np.NaN)
data = data.dropna(thresh=2)
data = data[data[tag].notnull()]
data = data[data['APP'].isin(apps)]
data = data[["SENT", "WORD", "POS", "DEP", tag]]
data.tail(10)
print(data.count)

<bound method DataFrame.count of        SENT      WORD   POS     DEP      TAG
0       0.0      When   ADV  advmod  APP-IFF
1       0.0       the   DET     det        O
2       0.0      user  NOUN   nsubj  APP-ENT
3       0.0  launches  VERB   advcl  APP-ACT
4       0.0       app  NOUN    dobj  APP-ENT
...     ...       ...   ...     ...      ...
3573  166.0       the   DET     det        O
3574  166.0  detailed   ADJ    dobj  APP-ENT
3575  166.0        of   ADP    prep        O
3576  166.0       the   DET     det        O
3577  166.0    events  NOUN    pobj  APP-ENT

[3578 rows x 5 columns]>


## Preprocessing
The following sections apply indexes to works and labels. Optionally is can also apply GloVe embbeding.

In [3]:
words = [w.lower() for w in list(set(data["WORD"].values))]
n_words = len(words); 

tags = list(set(data[tag].values))
n_tags = len(tags); 

pos = list(set(data['POS'].values))
n_pos = len(pos); 

deps = list(set(data['DEP'].values))
n_deps = len(deps); 

print('WORD count:', n_words)
print('POS count:', n_pos)
print('DEP count:', n_deps)
print('TAG count:', n_tags)

print('TAGS:', tags)

WORD count: 638
POS count: 16
DEP count: 44
TAG count: 9
TAGS: ['APP-QLF', 'O', 'APP-IFF', 'APP-ENT', 'APP-POS', 'APP-SHO', 'APP-FIX', 'APP-INP', 'APP-ACT']


In [5]:
class SentenceGetter(object):  
    def __init__(self, data, tag):
        self.n_sent = 95
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, d, t) for w, p, d, t in zip(s["WORD"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["DEP"].values.tolist(),
                                                           s[tag].values.tolist())]
        self.grouped = self.data.groupby("SENT").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped[self.n_sent]
            self.n_sent += 1
            return s
        except:
            return None

In [6]:
getter = SentenceGetter(data, tag)
sentences = getter.sentences
sent = getter.get_next()
print(sent)

[(' ', 'SPACE', nan, 'O'), ('User', 'PROPN', 'nsubj', 'APP-ENT'), ('can', 'VERB', 'aux', 'APP-IFF'), ('press', 'VERB', 'ROOT', 'APP-INP'), ('enter', 'VERB', 'dobj', 'APP-ENT'), ('to', 'PART', 'aux', 'O'), ('go', 'VERB', 'advcl', 'APP-ACT'), ('back', 'ADV', 'advmod', 'O'), ('to', 'ADP', 'prep', 'O'), ('the', 'DET', 'det', 'O'), ('main', 'ADJ', 'amod', 'APP-ENT'), ('menu', 'NOUN', 'pobj', 'APP-ENT')]


### Glove Embedding
If ext_embedding is True, and the glove.6B.50d.txt is available in the data folder will embed tokens

In [7]:
import numpy as np
embeddings_index = {}

if ext_embedding:
    with open(data_path + 'glove.6B.50d.txt', 'r', encoding="utf-8") as f:
        for line in f:
            values = line.split(' ')
            word = values[0] ## The first entry is the word
            coefs = np.asarray(values[1:], dtype='float32') ## These are the vecotrs representing the embedding for the word
            embeddings_index[word] = coefs
        f.close()
    print('GloVe data loaded')
    print(embeddings_index["'s"])

## Create Indexes

In [8]:
word2Idx = {w: i + 1 for i, w in enumerate(words)}
pos2Idx = {p: i for i, p in enumerate(pos)}
dep2Idx = {p: i for i, p in enumerate(deps)}
tag2Idx = {t: i for i, t in enumerate(tags)}

print('# Sents: ', len(sentences))
print('# Unique Words: ', len(word2Idx))
print('# Unique POS: ', len(pos2Idx))
print('# Unique DEPS: ', len(dep2Idx))
print('# Unique TAGS: ', len(tag2Idx))

# Sents:  150
# Unique Words:  560
# Unique POS:  16
# Unique DEPS:  44
# Unique TAGS:  9


## Generating X and Y
To following sections will produce necessary data for training. First one uses GloVe embedding and second one does not

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import tensorflow as tf

def encode_with_embedding(sentences):
    max_words = 40
    embedding_dim = embeddings_index.get('a').shape[0]

    n_row = len(sentences)
    n_word_dim = embedding_dim + n_pos
    X = np.zeros((n_row, n_word_dim * max_words))
    Y = []
    for sent_idx, sent in enumerate(sentences):
        Y_sent = np.tile(to_categorical(tag2idx["O"], num_classes=n_tags), (n_word_dim * max_words, 1))
        for word_idx, w in enumerate(w for w in sent[:max_words]):
            embedding_vector = embeddings_index.get(w[0].lower()) 
            pos_vector = to_categorical(pos2Idx[w[1]], num_classes=n_pos) 
            word_encoded = np.append(embedding_vector, pos_vector)

            for i in range(n_word_dim):   
                X[sent_idx, i + word_idx * n_word_dim] = word_encoded[i]
                Y_sent[i + word_idx * n_word_dim] = to_categorical(tag2idx[w[2]], num_classes=n_tags)
        Y.append(Y_sent)

    print(X.shape)
    print(Y[0].shape)
    print(len(Y))

    return X, Y

In [9]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def encode_no_embedding(sentences):
    if 'word' in encoding:
        X = X.append[[word2idx[w[0].lower()] for w in s] for s in sentences]
    if  encoding == 'pos':
        X = [[pos2Idx[w[1]] for w in s] for s in sentences]
    X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=0) 

    Y = [[tag2idx[w[2]] for w in s] for s in sentences]
    Y = pad_sequences(maxlen=max_len, sequences=Y, padding="post", value=tag2idx["O"])
    Y = [to_categorical(i, num_classes=n_tags) for i in Y]
    
    print('X:', X.shape)
    print('Y:',Y[0].shape)

    return X, Y

SyntaxError: invalid syntax (<ipython-input-9-336b4d0fbfef>, line 6)

## Define Model
Model includes 3 layers:
1. embedding
2. BILSTM
3. Linear Chain CRF

In [11]:

from keras.models import Model, Input, Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Concatenate
from keras_contrib.layers import CRF

input = Input(shape=(max_len,))

# model = Embedding(input_dim=n_words + 1, output_dim=20,
#                   input_length=input_len, mask_zero=True, trainable=False)(input) 
# model = Bidirectional(LSTM(units=50, return_sequences=True,
#                            recurrent_dropout=0.1))(model) 
# model = TimeDistributed(Dense(50, activation="relu"))(model) 
# crf = CRF(n_tags) 
# out = crf(model) 

embeddings = []
p
if 'word' in encoding:
    embeddings.append(Embedding(input_dim=n_words + 1, output_dim=embedding_dim['word'],
        input_length=input_len, mask_zero=True, trainable=True)(input))

if 'pos' in encoding:
    embeddings.append(Embedding(input_dim=n_pos + 1, output_dim=embedding_dim['pos'],
        input_length=input_len, mask_zero=True, trainable=True)(input))

if 'dep' in encoding:
    embeddings.append(Embedding(input_dim=n_deps + 1, output_dim=embedding_dim['dep'],
        input_length=input_len, mask_zero=True, trainable=True)(input)) 

model = Concatenate(axis=1)(embeddings)
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model) 
model = TimeDistributed(Dense(50, activation="relu"))(model) 
crf = CRF(n_tags) 
out = crf(model) 

  

ModuleNotFoundError: No module named 'keras'

In [36]:
embedding_dim['word']

10

In [None]:
model = Model(input, out)
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()

## Train the Model

In [None]:
from sklearn.model_selection import train_test_split
if use_embedding:
    X, Y = encode_with_embedding(sentences)
else:
    X, Y = encode_no_embedding(sentences)

X_tr, X_te, y_tr, y_te = train_test_split(X, Y, test_size=0.001)
# X_tr = X
# Y_tr = Y
# print(X_tr.shape, X_te.shape, y_tr[0].shape)
history = model.fit(X_tr, np.array(y_tr), batch_size=1, epochs=50, validation_split=0.1, verbose=1)

In [None]:
hist = pd.DataFrame(history.history)
print(hist)

In [None]:
import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.figure(figsize=(12, 12))
plt.plot(hist["crf_viterbi_accuracy"])
plt.plot(hist["val_crf_viterbi_accuracy"])
plt.show()

## Testing

In [None]:
data = pd.read_csv(data_path + "reqset-test-tagged.csv", encoding="latin1")
data = data.fillna(value=np.NaN)
data = data.dropna(thresh=2)
data = data[data[tag].notnull()]
data = data[["SENT", "WORD", "POS", tag]]
# print(data.tail(10))
print(data.count)

X_tr, X_te, y_tr, y_te = train_test_split(X, Y, test_size=0.001)

In [None]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
test_pred = model.predict(X_tr, verbose=1)

In [None]:
%%HTML
<style type="text/css">
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

## Model Performance

In [None]:
idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out   

pred_labels = pred2label(test_pred)
test_labels = pred2label(y_tr)

f1_micro = f1_score(test_labels, pred_labels, average='micro')
f1_macro = f1_score(test_labels, pred_labels, average='macro')
f1_weighted = f1_score(test_labels, pred_labels, average='weighted')
print("Micro F1-score: {:.2%}".format(f1_micro))
print("Macro F1-score: {:.2%}".format(f1_macro))
print("Weighted F1-score: {:.2%}".format(f1_weighted))
print(classification_report(test_labels, pred_labels))

In [None]:
i = 2
p = model.predict(np.array([X_tr[i]]))
p = np.argmax(p, axis=-1)
true = np.argmax(y_tr[i], -1)
print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
print(30 * "=")
for w, t, pred in zip(X_tr[i], true, p[0]):
    if w != 0:
        print("{:15}: {:5} {}".format(words[w-1], tags[t], tags[pred]))

## Predict Labels for a Requirement

In [None]:
test_sentence = ["if", "user", "picks", "the", "first", "choice", ",", "he", "should", "be", "shown", "a", "white", "baloon", "below", "the", "text"]
x_test_sent = pad_sequences(sequences=[[word2idx.get(w, 0) for w in test_sentence]],
                            padding="post", value=0, maxlen=max_len)

p = model.predict(np.array([x_test_sent[0]]))
p = np.argmax(p, axis=-1)
print("{:15}||{}".format("Word", "Prediction"))
print(30 * "=")
for w, pred in zip(test_sentence, p[0]):
    print("{:15}: {:5}".format(w, tags[pred]))