# Analise de Swagger (OpenAPI Specification) with LDA + WordNet

## 1. Importando Arquivos e Parse do Formato

Verificando se o arquivo está no formato adequado, caso não esteja é descartado

In [4]:
# 

from prance import ResolvingParser, ValidationError
import json 
dataFolder = "dados"
listSwagger = []
count = 0
countReadFiles = 0
countGenericErros = 0
countInvalidFormat = 0
for fileName in os.listdir(dataFolder):
    countReadFiles += 1
    try:
        parser = ResolvingParser(os.path.join(dataFolder,fileName),backend = 'openapi-spec-validator')
        listSwagger.append(parser.json())
        count += 1
    except ValidationError:
        countInvalidFormat += 1 
        #print(f"{fileName} droped format invalid")
    except Exception:
        countGenericErros += 1
        #print(f"{fileName} generic error")
    if count == 1000:
        break
    print("\r",end="")
    print(f"file:{count}/{countReadFiles}",end="")

print(f"CorpusSize:{count} ReadFiles:{countReadFiles} InvalidFiles:{countInvalidFormat} GenericErros:{countGenericErros}")

file:999/1392CorpusSize:1000 ReadFiles:1393 InvalidFiles:245 GenericErros:148


## 2. Gerando o Corpus - Seleção dos atributos do arquivo

In [48]:
docs = []

for i in range(len(listSwagger)):
    try:
        swagger = json.loads(listSwagger[i])
        doc = ""
        doc+= f"{i}\n" # indice do documento
        doc+= "TITLE: " + swagger['info']['title'] + '\n' # titulo
        doc+= "DESCRIPTION: " + (swagger['info']['description'] if 'description' in swagger['info'] else "")  # descricao

        docs.append(doc)
    except KeyError as e:
        print(f"{i} - KeyError:{e}")

  and should_run_async(code)


In [49]:
print(docs[30])

30
TITLE: Zeus API
DESCRIPTION: API for Poseidon frontend
  and should_run_async(code)


## 3. Tokenizando o documento

In [50]:
import re
from nltk.tokenize import RegexpTokenizer
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

# Remove distracting single quotes
docs = [[re.sub("\'", "", token) for token in doc] for doc in docs]

stopWords = stopwords.words('english')
stopWords.extend(['api','apis','www','http','com','app','use'])
# Remove StopWords
docs = [[word for word in simple_preprocess(str(doc)) if word not in stopWords] for doc in docs]


  and should_run_async(code)
  self._context.run(self._callback, *self._args)


In [51]:
print(docs[0])

['title', 'simple', 'inventory', 'description', 'simple']
  and should_run_async(code)


## 4. Lamatizatizando utilizando Wordnet

In [52]:
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

  and should_run_async(code)
  self._context.run(self._callback, *self._args)


In [53]:
print(docs[0])

['title', 'simple', 'inventory', 'description', 'simple']
  and should_run_async(code)


## 5. Computando o Bigramas

In [54]:
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

  and should_run_async(code)
2020-12-13 20:11:17,606 : INFO : collecting all words and their counts
2020-12-13 20:11:17,608 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2020-12-13 20:11:17,662 : INFO : collected 12222 word types from a corpus of 16078 words (unigram + bigrams) and 1000 sentences
2020-12-13 20:11:17,663 : INFO : using 12222 counts as vocab in Phrases<0 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000>


In [55]:
print(docs[0])

['title', 'simple', 'inventory', 'description', 'simple', 'simple_inventory']
  and should_run_async(code)


## 6. Removendo os tokens comuns e raros

In [56]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

  and should_run_async(code)
2020-12-13 20:11:18,837 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-13 20:11:18,913 : INFO : built Dictionary(3511 unique tokens: ['description', 'inventory', 'simple', 'simple_inventory', 'title']...) from 1000 documents (total 17386 corpus positions)
2020-12-13 20:11:18,936 : INFO : discarding 3448 tokens: [('description', 1000), ('title', 1000), ('ach', 1), ('central', 1), ('da', 8), ('simulação', 1), ('sistema', 4), ('以免泄露敏感信息', 1), ('智掌柜产品提供的开放api', 1), ('智掌柜开放api', 1)]...
2020-12-13 20:11:18,938 : INFO : keeping 63 tokens which were in no less than 20 and no more than 500 (=50.0%) documents
2020-12-13 20:11:18,944 : INFO : resulting dictionary: Dictionary(63 unique tokens: ['inventory', 'simple', 'simple_inventory', 'endpoint', 'io']...)


## 7. Vetorizando os documentos e computando a frequencia de cada palavra

In [57]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

  and should_run_async(code)
  self._context.run(self._callback, *self._args)


In [58]:
print(corpus)

23, 2), (24, 1), (25, 6), (26, 3), (29, 1), (30, 1), (31, 1)], [(4, 3), (10, 1), (11, 1), (13, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 2), (24, 1), (25, 6), (26, 3), (29, 1), (30, 1), (31, 1)], [], [], [(45, 2)], [(5, 1), (6, 1), (8, 1), (34, 1), (35, 1), (45, 5), (47, 1), (52, 1), (59, 2)], [], [(7, 1), (32, 1), (45, 1), (48, 1), (53, 1), (56, 1), (59, 1)], [(39, 1), (56, 1)], [(45, 2), (56, 2)], [], [(45, 2)], [(48, 1), (54, 1)], [], [], [], [], [(3, 1), (45, 2)], [], [(45, 1), (47, 1), (57, 1)], [(45, 1), (47, 1)], [(45, 1), (47, 1), (57, 1)], [(45, 1), (47, 1), (53, 1), (61, 1)], [(45, 1), (47, 1), (53, 2)], [(45, 1), (47, 1), (55, 1)], [(45, 2)], [(45, 1), (47, 1), (56, 1)], [(1, 1), (5, 2), (13, 1), (40, 1), (41, 2), (52, 1), (53, 1), (56, 2), (57, 3)], [(45, 1), (47, 1)], [(45, 1), (47, 1)], [(8, 1), (45, 1), (56, 1)], [(45, 1), (47, 1)], [(45, 1), (47, 1), (56, 1)], [(45, 1), (47, 1), (53, 1), (56, 1)], [(45, 1), (47, 1)], [], [(3, 1), (37, 1)], [(47, 1)], [(8, 2)

## 8. Verificando quantos tokens foram gerados

In [59]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 63
Number of documents: 1000
  and should_run_async(code)


## 9. Habilitando os Log

In [60]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

  and should_run_async(code)


# LDA Treining Model

## 10 - LDA

In [61]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = 1  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

ocument #1000/1000
2020-12-13 20:11:29,087 : INFO : optimized alpha [0.08804015, 0.08973661, 0.07762326, 0.09650665, 0.09440061, 0.06744907, 0.06966846, 0.06927856, 0.081678264, 0.08012235]
2020-12-13 20:11:29,090 : INFO : topic #5 (0.067): 0.172*"key" + 0.164*"token" + 0.095*"authorization" + 0.083*"use" + 0.045*"http" + 0.044*"data" + 0.030*"test" + 0.028*"swagger" + 0.028*"create" + 0.027*"request"
2020-12-13 20:11:29,093 : INFO : topic #7 (0.069): 0.738*"de" + 0.043*"rest" + 0.034*"http" + 0.020*"com" + 0.016*"information" + 0.015*"token" + 0.013*"simple" + 0.013*"application" + 0.011*"status" + 0.009*"web"
2020-12-13 20:11:29,096 : INFO : topic #1 (0.090): 0.186*"service" + 0.099*"oauth" + 0.091*"specification" + 0.089*"flow" + 0.086*"project" + 0.077*"application" + 0.062*"using" + 0.061*"sample" + 0.052*"example" + 0.051*"security"
2020-12-13 20:11:29,104 : INFO : topic #4 (0.094): 0.496*"simple" + 0.223*"inventory" + 0.210*"simple_inventory" + 0.047*"document" + 0.004*"web" + 0

In [62]:
#View Top Topic
from pprint import pprint

top_topics = model.top_topics(corpus) #, num_words=20)

pprint(top_topics)

  and should_run_async(code)
2020-12-13 20:11:39,047 : INFO : CorpusAccumulator accumulated stats from 1000 documents
[([(0.14248498, 'order'),
   (0.12449872, 'app'),
   (0.11186804, 'http'),
   (0.10665513, 'com'),
   (0.095363766, 'doc'),
   (0.09269815, 'payment'),
   (0.068181396, 'business'),
   (0.061121587, 'status'),
   (0.03589227, 'operation'),
   (0.032751247, 'transaction'),
   (0.029731214, 'get'),
   (0.027047811, 'link'),
   (0.023426395, 'information'),
   (0.019572964, 'create'),
   (0.016585676, 'new'),
   (0.004366364, 'used'),
   (0.0036796355, 'authorization'),
   (0.00010294368, 'use'),
   (9.507902e-05, 'detail'),
   (9.482935e-05, 'user')],
  -1.728712285196215),
 ([(0.2301088, 'swagger'),
   (0.11608842, 'io'),
   (0.11125875, 'swagger_io'),
   (0.073894225, 'irc'),
   (0.07345442, 'petstore'),
   (0.045331195, 'server'),
   (0.04044126, 'sample'),
   (0.038483597, 'net'),
   (0.037509367, 'find'),
   (0.036969062, 'net_swagger'),
   (0.03696906, 'freenode'),


## Avaliação do Modelo

In [64]:
from gensim.models import CoherenceModel

# Compute Perplexity
print('\nPerplexity: ', model.log_perplexity(corpus))  
# a measure of how good the model is. lower the better.

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

# Compute Coherence Score
coherence_model = CoherenceModel(model=model, texts=docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda)

  and should_run_async(code)
  self._context.run(self._callback, *self._args)
2020-12-13 20:15:17,111 : INFO : -3.053 per-word bound, 8.3 perplexity estimate based on a held-out corpus of 1000 documents with 5619 words
2020-12-13 20:15:17,118 : INFO : using ParallelWordOccurrenceAccumulator(processes=3, batch_size=64) to estimate probabilities from sliding windows

Perplexity:  -3.053097612096398
Average topic coherence: -4.9772.
2020-12-13 20:15:20,440 : INFO : 3 accumulators retrieved from output queue
2020-12-13 20:15:20,473 : INFO : accumulated word occurrence stats for 3220 virtual documents

Coherence Score:  0.44694853189865336


# Visualização do Modelo

In [65]:
import pyLDAvis
import pyLDAvis.gensim
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
vis

  and should_run_async(code)
2020-12-13 20:15:26,876 : INFO : NumExpr defaulting to 4 threads.


# Salvar Modelo pre-treinado em disco

In [None]:
from gensim.test.utils import datapath
# Save model to disk.

temp_file = datapath("model")

lda.save(temp_file)
# Load a potentially pretrained model from disk.
#lda = LdaModel.load(temp_file)