<a href="https://colab.research.google.com/github/victorrborges/thesaurus-traceability-study/blob/main/vocabulary_unifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [63]:
!pip install nltk



In [64]:
import pandas as pd
import nltk
import requests
import json

from nltk.corpus import wordnet 
from nltk.corpus import stopwords 
from nltk.tokenize import RegexpTokenizer

# NLTK resources fetched for this notebook.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# In-memory caches of ConceptNet answers, keyed by the queried word.
conceptnet_syns = {}   # word -> list of synonym labels (filled by conceptNetSynonyms)
conceptnet_check = {}  # word -> bool (filled by isRelatedToComputerScience)

# Test cases; their 'tc_desc' column is the corpus used to count word frequencies.
# `sep` is passed by keyword: every read_csv argument after the path is
# keyword-only in pandas 2.x, so the positional form raises a TypeError there.
tcs_csv = pd.read_csv('https://gist.githubusercontent.com/victorrborges/c12f4f21d3774505ec4b21976d5c29cd/raw/bea8ea328a1cf5f2ff5fcca3438b0f2cdb137055/testcases_final.csv', sep=",")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [65]:
# Each maximal run of \w characters becomes one token, so punctuation is dropped.
tokenizer = RegexpTokenizer(r'\w+')
# Kept as a set: tokenizeDoc tests every token with `in stop_words`, and a set
# answers that in constant time instead of scanning a list.
stop_words = set(stopwords.words('english'))
def tokenizeDoc(string):
  """Split a text into lower-cased word tokens with stop words removed.

  Tokens are lower-cased *before* the stop-word test. Filtering first would
  let capitalised stop words such as "The" slip through and then be emitted
  as "the".

  Args:
    string: the text to tokenize.

  Returns:
    The lower-cased tokens, in their original order, that are not in
    `stop_words`.
  """
  lowered_tokens = [token.lower() for token in tokenizer.tokenize(string)]
  return [token for token in lowered_tokens if token not in stop_words]

In [66]:
# Corpus-wide occurrence count per token; updated in place by countWordFreq.
wordFreq = {}

def countWordFreq(words_token):
  """Add the purely alphabetic tokens of `words_token` to the `wordFreq` counts.

  Tokens for which str.isalpha() is false (digits, mixed alphanumerics, ...)
  are ignored. Nothing is returned; `wordFreq` is mutated.
  """
  alphabetic_tokens = (candidate for candidate in words_token if candidate.isalpha())
  for candidate in alphabetic_tokens:
    wordFreq[candidate] = wordFreq.get(candidate, 0) + 1
    

# Feed the description of every test case into the corpus word counts.
for tc_description in tcs_csv['tc_desc']:
  countWordFreq(tokenizeDoc(tc_description))

In [67]:
# The text before the first '.' in the name of every WordNet noun synset.
nouns = {
    synset.name().partition('.')[0]
    for synset in wordnet.all_synsets('n')
}

# ConceptNet labels that mark a word as related to computer science
# (compared against RelatedTo edges in isRelatedToComputerScience).
computer_science_relations = [
    'computing',
    'computer programming',
    'computation',
    'computer',
    'software',
    'code',
    'programming',
    'programming language',
    'program',
]

def expandDoc(words_token, n, api, technical_terms_conditional, nouns_conditional):
  """Expand the eligible tokens of a document with their top synonyms.

  A token is expanded when it is purely alphabetic and shouldExpandTerm()
  accepts it; every other token is copied through unchanged.

  Args:
    words_token: the document as a list of tokens.
    n: maximum number of synonyms kept per expanded token.
    api: synonym source, forwarded to getSynonyms().
    technical_terms_conditional: forwarded to shouldExpandTerm().
    nouns_conditional: forwarded to shouldExpandTerm().

  Returns:
    A new list in which each expanded token is followed by its selected
    synonyms, without duplicates inside that group.
  """
  expandedDoc = []
  for token in words_token:
    if token.isalpha() and shouldExpandTerm(token, technical_terms_conditional, nouns_conditional):
      candidates = [token] + getTopSynsets(getSynonyms(token, api), n)
      # dict.fromkeys de-duplicates while keeping first-seen order. Going
      # through set() made the order depend on string hashing, so the
      # generated documents differed from one kernel session to the next.
      expandedDoc.extend(dict.fromkeys(candidates))
    else:
      expandedDoc.append(token)
  return expandedDoc

def shouldExpandTerm(token, technical_terms_conditional, nouns_conditional):
  """Decide whether `token` should be expanded with synonyms.

  Args:
    token: the candidate word.
    technical_terms_conditional: when true, the token must be accepted by
      isRelatedToComputerScience().
    nouns_conditional: when true, the token must be in the `nouns` set.

  Returns:
    True when every enabled condition holds, False otherwise.
  """
  # The noun test is a local set lookup, so it runs first: a token it rejects
  # never triggers the ConceptNet request behind isRelatedToComputerScience().
  if nouns_conditional and token not in nouns:
    return False
  if technical_terms_conditional and not isRelatedToComputerScience(token):
    return False
  return True

def isRelatedToComputerScience(word, timeout=10):
  """Tell whether ConceptNet relates `word` to a computer-science concept.

  Queries the ConceptNet `/r/RelatedTo` edges that start at `word` and checks
  whether any end label is listed in `computer_science_relations`. The answer
  is cached in `conceptnet_check`, so each word is requested at most once.

  Args:
    word: the English word to look up.
    timeout: seconds to wait for the HTTP response; without one a stalled
      connection would block the notebook indefinitely.

  Returns:
    True if at least one related label is in `computer_science_relations`.

  Raises:
    requests.exceptions.RequestException: on a timeout, connection failure
      or non-2xx response. Nothing is cached in that case.
  """
  if word in conceptnet_check:
    return conceptnet_check[word]

  url = "http://api.conceptnet.io/query?start=/c/en/" + word + "&rel=/r/RelatedTo&end=/c/en"
  response = requests.get(url, timeout=timeout)
  # Fail with a clear HTTP error instead of a KeyError on an error payload.
  response.raise_for_status()
  relatedLabels = {edge['end']['label'] for edge in response.json()['edges']}
  check = any(item in relatedLabels for item in computer_science_relations)
  conceptnet_check[word] = check
  return check

def getTopSynsets(words, n):
  """Return the `n` candidate words that are most frequent in the corpus.

  Only words present in `wordFreq` are considered. Duplicates are removed and
  the survivors are ranked by descending `wordFreq` count.

  Args:
    words: candidate words; may contain duplicates.
    n: maximum number of words to return.

  Returns:
    At most `n` distinct words, most frequent first. Words with equal counts
    keep the order of their first occurrence in `words`.
  """
  # dict.fromkeys removes duplicates but keeps first-occurrence order. The
  # earlier set() round-trip shuffled the candidates, so frequency ties at
  # the top-n cut-off were broken differently in every kernel session.
  knownWords = [word for word in dict.fromkeys(words) if word in wordFreq]
  # list.sort is stable (also with reverse=True), which keeps ties in input order.
  knownWords.sort(key=lambda word: wordFreq[word], reverse=True)
  return knownWords[:n]

def getSynonyms(token, api):
  """Look up the synonyms of `token` in the source selected by `api`.

  'wordnet' selects wordNetSynonyms(); any other value selects
  conceptNetSynonyms().
  """
  lookup = wordNetSynonyms if api == 'wordnet' else conceptNetSynonyms
  return lookup(token)

def conceptNetSynonyms(word, timeout=10):
  """Fetch the ConceptNet synonyms of `word`.

  Queries the ConceptNet `/r/Synonym` edges that start at `word` and collects
  the label of each edge's end node. The result is cached in
  `conceptnet_syns`, so each word is requested at most once.

  Args:
    word: the English word to look up.
    timeout: seconds to wait for the HTTP response; without one a stalled
      connection would block the notebook indefinitely.

  Returns:
    The list of synonym labels, in the order the API returned them.

  Raises:
    requests.exceptions.RequestException: on a timeout, connection failure
      or non-2xx response. Nothing is cached in that case.
  """
  if word in conceptnet_syns:
    return conceptnet_syns[word]

  url = "http://api.conceptnet.io/query?start=/c/en/" + word + "&rel=/r/Synonym&end=/c/en"
  response = requests.get(url, timeout=timeout)
  # Fail with a clear HTTP error instead of a KeyError on an error payload.
  response.raise_for_status()
  synonymList = [edge['end']['label'] for edge in response.json()['edges']]
  conceptnet_syns[word] = synonymList
  return synonymList

def wordNetSynonyms(token):
  """Collect the lemma names of every WordNet synset of `token`.

  Names are concatenated synset by synset, so a name that belongs to several
  synsets appears once per synset.
  """
  return [
      lemma_name
      for synset in wordnet.synsets(token)
      for lemma_name in synset.lemma_names()
  ]

In [71]:
# SET PARAMETERS:
# Synonym source: 'wordnet' selects WordNet; any other value (e.g. 'conceptnet')
# selects ConceptNet (see getSynonyms).
api = 'conceptnet'
# When True, only tokens accepted by isRelatedToComputerScience() are expanded.
technical_terms_conditional = True
# When True, only tokens found in the WordNet noun set `nouns` are expanded.
nouns_conditional = True

def getCSVLabel(limit):
  """Build the output CSV file name for the current parameters.

  The name encodes the module-level `api`, one marker per enabled
  conditional, and `limit` (the number of synonyms kept per token).
  """
  parts = ['{}_expanded_'.format(api)]
  if nouns_conditional:
    parts.append('nouns_')
  if technical_terms_conditional:
    parts.append('related_')
  parts.append('top_{}_selected_bugreports_final.csv'.format(limit))
  return ''.join(parts)

# The bug reports are the same for every limit, so download them once instead
# of once per iteration. `sep` is passed by keyword because pandas 2.x rejects
# positional arguments after the path.
bug_reports = pd.read_csv('https://gist.githubusercontent.com/victorrborges/d5a7b41c1fb61608b6c4e7d91be69517/raw/f4e8ec20adad82634a5d5e4be66b4ea0c5cfb69e/selected_bugreports_final.csv', sep=",")

# Write one expanded CSV per synonym limit (top 1 to top 5 synonyms per token).
for limit in range(1, 6):
  # Work on a copy so every limit starts from the unexpanded descriptions.
  bug_csv = bug_reports.copy()
  bug_csv['br_desc'] = [
      " ".join(expandDoc(tokenizeDoc(br_desc), limit, api, technical_terms_conditional, nouns_conditional))
      for br_desc in bug_csv['br_desc']
  ]
  bug_csv.to_csv(getCSVLabel(limit), index=False)