## Terminology Extraction

### 1. Data Preparation

The term-annotated dataset comes from:

**Data Repository**: https://github.com/tgem/crowdre-glossary

**Paper**: Tim Gemkow, Miro Conzelmann, Kerstin Hartig, and Andreas Vogelsang. *Automatic glossary term extraction from large-scale requirements specifications*. In 2018 IEEE 26th International Requirements Engineering Conference (RE), pages 412–417. IEEE, 2018.

import jsonlines
from utils.fileUtils import readCsvToList

lines = readCsvToList('./data/term_extraction.csv')

lines[:2]

lines[100:102]

# Convert CSV rows 1..100 (row 0 is skipped) into records and store them
# as JSON Lines. Columns as indexed below: 0 = id, 1 = ', '-separated terms,
# 2 = role, 3 = feature, 4 = benefit.
dict_list = []

for row in lines[1:101]:
    # Rebuild the user-story sentence from its three parts.
    sentence = ''.join(
        ['As a ', row[2], ' I want ', row[3], ' so that ', row[4], '.']
    )
    dict_list.append({
        'id': row[0],
        'Req': sentence,
        'Terms': row[1].strip().split(', '),
    })

with jsonlines.open('./data/term_extraction.jsonl', mode='w') as out:
    out.write_all(dict_list)

### 2. ChatGPT for Terminology Extraction

In [8]:
# ! pip install openai

In [9]:
import openai

In [10]:
# Take the key from the OPENAI_API_KEY environment variable when it is set,
# so the secret does not have to be pasted into (and saved with) the notebook.
# The placeholder string remains the fallback, as before.
import os

openai.api_key = os.environ.get('OPENAI_API_KEY', 'YOUR_OWN_API_KEY')

In [98]:
test_req = dict_list[1]
print(test_req)

{'id': '12', 'Req': 'As a home occupant I want my smart home to turn on certain lights at dusk so that I can come home to a well-lit house.', 'Terms': ['home occupant', 'smart home', 'light', 'dusk', 'home', 'house']}


In [99]:
prompt = 'Extract all single-word and multi-words noun terms from the following software requirements statement:\n\n'

In [100]:
# Send the instruction prompt followed by the sample requirement as a
# single user message.
user_message = {"role": "user", "content": prompt + test_req['Req']}
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[user_message],
    temperature=0,
    top_p=1,
)

In [101]:
content = response['choices'][0]['message']['content']

In [102]:
print(content)



- Home occupant
- Smart home
- Certain lights
- Dusk
- House


In [103]:
content.strip().strip('- ').split('\n- ')

['Home occupant', 'Smart home', 'Certain lights', 'Dusk', 'House']

In [104]:
p = content.strip().strip('- ').split('\n- ')

In [105]:
set(p) & set(test_req['Terms'])

set()

In [122]:
def query_ChatGPT(req_str):
    """Ask gpt-3.5-turbo to list the noun terms of one requirement statement.

    Args:
        req_str: The requirement text; it is appended to the fixed prompt.

    Returns:
        A list of lower-cased term strings parsed from a reply formatted as
        a ``- `` bulleted list. Each term is stripped of surrounding
        whitespace and blank entries are dropped, so an empty reply yields
        ``[]`` rather than ``['']``.
    """
    # NOTE(review): `openai.ChatCompletion` is the pre-1.0 interface of the
    # openai package -- confirm the installed version still provides it.
    prompt = 'Extract all single-word and multi-words noun terms from the following software requirements statement:\n\n'
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt + req_str}],
        temperature=0,  # 0 instead of the default of 1
        top_p=1
    )
    content = response['choices'][0]['message']['content']
    # Remove the outer bullet marker, then split on the remaining "\n- " ones.
    raw_terms = content.strip().strip('- ').lower().split('\n- ')
    # Trim stray whitespace (trailing spaces, '\r') and discard empty entries
    # so they cannot leak into the pooled set of extracted terms.
    result_list = [term.strip() for term in raw_terms if term.strip()]
    return result_list

In [123]:
import jsonlines

# Load every record of the JSON Lines file into a list of dicts.
# The file is only read here, so it is opened read-only ('r'); the previous
# 'r+' mode needlessly required write permission on the file.
with open('./data/term_extraction.jsonl', 'r', encoding="utf8") as f:
    reqs = list(jsonlines.Reader(f))

In [124]:
# Query ChatGPT once per requirement and pool the results over the corpus:
# extract_all gathers every extracted term, answer_all every ground-truth term.
extract_all = set()
answer_all = set()
# Running count of ground-truth terms over all requirements (repeats across
# requirements are counted each time). It has to exist before the loop,
# otherwise the `+=` below raises NameError on the first iteration.
answer_total = 0
for req in reqs:
    answer = req['Terms']
    answer_total += len(answer)
    extract = query_ChatGPT(req['Req'])
    extract_all.update(extract)
    answer_all.update(answer)

49.86 69.19999999999999


In [1]:
from utils.fileUtils import *

In [134]:
# Persist the extracted terms in sorted order; sorted() already returns a list.
dumpJson('./extraction_result.json', sorted(extract_all))

In [135]:
# Persist the ground-truth terms in sorted order; sorted() already returns a list.
dumpJson('./ground_truth.json', sorted(answer_all))

In [76]:
# import spacy
# nlp = spacy.load('en_core_web_sm')

# doc = nlp("team‘s goal home")

# word_list = []
# for token in doc:
#     word = token.lemma_
#     word_list.append(word)
# print(' '.join(word_list))

In [2]:
extract_all = loadJson('./extraction_result.json')

In [3]:
answer_all = loadJson('./ground_truth.json')

In [42]:
import spacy
nlp = spacy.load('en_core_web_sm')


def _lemmatize(term):
    """Return *term* with every spaCy token replaced by its lower-cased lemma,
    the lemmas joined by single spaces."""
    return ' '.join(token.lemma_.lower() for token in nlp(term))


# Lemmatized form of every extracted term.
extract_all_lemma = {_lemmatize(term) for term in extract_all}

# Lemmatized form of every ground-truth term.
answer_all_lemma = set()
for term in answer_all:
    # Ground-truth terms get the typographic quote replaced by a plain
    # apostrophe before lemmatization.
    term = term.replace('‘', "'")
    lemma = _lemmatize(term)
    # Report terms whose lemma was already produced by an earlier term,
    # i.e. distinct ground-truth entries that collapse to the same form.
    if lemma in answer_all_lemma:
        print(term, lemma)
    answer_all_lemma.add(lemma)

# doc = nlp("team‘s goal home")

# word_list = []
# for token in doc:
#     word = token.lemma_
#     word_list.append(word)
# print(' '.join(word_list))

lighting light
tv tv


In [56]:
# The two ground-truth terms *lighting* and *TV* (before lemmatization) are both extracted by ChatGPT; see extraction_result.json.
# After lemmatization, these two terms and their other variants are normalized to the same base forms, i.e. light and tv, so each pair collapses into a single set entry.
# To stay comparable with previous extraction results on this benchmark, the sizes of *answer_all_lemma* and *extract_all_lemma* are both increased by 2.
# Thus answer_all_num = len(answer_all_lemma) + 2 and extract_all_num = len(extract_all_lemma) + 2.

In [57]:
len(answer_all_lemma)

248

In [58]:
len(extract_all_lemma)

332

In [59]:
answer_all_num = len(answer_all_lemma) + 2
extract_all_num = len(extract_all_lemma) + 2

In [60]:
tp = len(answer_all_lemma & extract_all_lemma) + 2

In [61]:
# Recall in percent. Scale to percent first and round last: rounding the
# ratio and multiplying by 100 afterwards reintroduces float noise
# (e.g. 89.60000000000001 instead of 89.6).
r = round(tp / answer_all_num * 100, 2)

In [62]:
# Precision in percent. Scale to percent first and round last, so the
# multiplication cannot reintroduce float noise after rounding.
p = round(tp / extract_all_num * 100, 2)

In [63]:
f1 = round((2*p*r)/(p+r),2)

In [64]:
print(p,r,f1)

67.07 89.60000000000001 76.72


In [22]:
extract_len_l = [len(item.split()) for item in extract_all_lemma]

In [23]:
answer_len_l = [len(item.split()) for item in answer_all_lemma]

In [24]:
sum(extract_len_l)/len(extract_len_l)

1.5993975903614457

In [25]:
sum(answer_len_l)/len(answer_len_l)

1.6088709677419355