In [230]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Analise de Swagger (OpenAPI Specification) with LDA + WordNet

## 1. Importando Arquivos e Parse do Formato

Verificando se o arquivo está no formato adequado, caso não esteja é descartado

In [231]:
# 

from prance import ResolvingParser, ValidationError
import json 
dataFolder = "dados"
listSwagger = []
count = 0
countReadFiles = 0
countGenericErros = 0
countInvalidFormat = 0
for fileName in os.listdir(dataFolder):
    countReadFiles += 1
    try:
        parser = ResolvingParser(os.path.join(dataFolder,fileName),backend = 'openapi-spec-validator')
        listSwagger.append(parser.json())
        count += 1
    except ValidationError:
        countInvalidFormat += 1 
        #print(f"{fileName} droped format invalid")
    except Exception:
        countGenericErros += 1
        #print(f"{fileName} generic error")
    #if count == 1000:
    #    break
    print("\r",end="")
    print(f"file:{count}/{countReadFiles}",end="")

print(f"CorpusSize:{count} ReadFiles:{countReadFiles} InvalidFiles:{countInvalidFormat} GenericErros:{countGenericErros}")

file:15800/20008CorpusSize:15800 ReadFiles:20008 InvalidFiles:2683 GenericErros:1525


## 2. Gerando o Corpus - Seleção dos atributos do arquivo

In [232]:
docs = []

for i in range(len(listSwagger)):
    try:
        swagger = json.loads(listSwagger[i])
        doc = ""
        doc+= f"{i}\n" # indice do documento
        doc+= "TITLE: " + swagger['info']['title'] + '\n' # titulo
        #doc+= "DESCRIPTION: " + (swagger['info']['description'] if 'description' in swagger['info'] else "") + "\n" # descricao
        '''
        for path in swagger['paths']:
            doc+= "OPERATION: " + path + "\n"
            doc+= "OPERATION DESCRIPTION: "
            for ope in swagger['paths'][path]:
                if 'description' in swagger['paths'][path][ope]:
                    doc+= swagger['paths'][path][ope]['description']+"; "
            doc+= "\n"
            doc+="RESPONSE DESCRIPTION: "
            for ope in swagger['paths'][path]:
                if 'responses' in swagger['paths'][path][ope]:
                    for code in swagger['paths'][path][ope]['responses']:
                        doc+= swagger['paths'][path][ope]['responses'][code]['description']+"; "
            doc+= "\n"
        '''
        docs.append(doc)
    except KeyError as e:
        print(f"{i} - KeyError:{e}")

In [233]:
print(docs[0])

0
TITLE: Simple Inventory API



## 3. Tokenizando o documento

In [234]:
import re
from nltk.tokenize import RegexpTokenizer
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

# Remove distracting single quotes
docs = [[re.sub("\'", "", token) for token in doc] for doc in docs]

stopWords = stopwords.words('english')
stopWords.extend(['api','apis','www','http','com','app','use','swagger'])
# Remove StopWords
docs = [[word for word in simple_preprocess(str(doc)) if word not in stopWords] for doc in docs]


In [235]:
print(docs[0])

['title', 'simple', 'inventory']


## 4. Lamatizatizando utilizando Wordnet

In [236]:
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

In [237]:
print(docs[0])

['title', 'simple', 'inventory']


## 5. Computando o Bigramas

In [238]:
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

2020-12-14 00:21:00,104 : INFO : collecting all words and their counts
2020-12-14 00:21:00,109 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2020-12-14 00:21:00,228 : INFO : PROGRESS: at sentence #10000, processed 28701 words and 16839 word types
2020-12-14 00:21:00,309 : INFO : collected 25033 word types from a corpus of 45328 words (unigram + bigrams) and 15800 sentences
2020-12-14 00:21:00,312 : INFO : using 25033 counts as vocab in Phrases<0 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000>


In [239]:
print(docs[0])

['title', 'simple', 'inventory', 'simple_inventory']


## 6. Removendo os tokens comuns e raros

In [240]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

2020-12-14 00:21:00,882 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-14 00:21:01,051 : INFO : adding document #10000 to Dictionary(6311 unique tokens: ['inventory', 'simple', 'simple_inventory', 'title', 'ach']...)
2020-12-14 00:21:01,133 : INFO : built Dictionary(9127 unique tokens: ['inventory', 'simple', 'simple_inventory', 'title', 'ach']...) from 15800 documents (total 48131 corpus positions)
2020-12-14 00:21:01,163 : INFO : discarding 9007 tokens: [('title', 15800), ('ach', 1), ('central', 10), ('da', 17), ('simulação', 1), ('sistema', 6), ('智掌柜开放api', 1), ('zeuz', 1), ('zxpense', 1), ('bcx', 2)]...
2020-12-14 00:21:01,172 : INFO : keeping 120 tokens which were in no less than 20 and no more than 7900 (=50.0%) documents
2020-12-14 00:21:01,184 : INFO : resulting dictionary: Dictionary(120 unique tokens: ['inventory', 'simple', 'simple_inventory', 'external', 'gateway']...)


## 7. Vetorizando os documentos e computando a frequencia de cada palavra

In [241]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [242]:
print(corpus)

, [(6, 1)], [], [(6, 1)], [(6, 1)], [], [], [(6, 1)], [(1, 1)], [(113, 1)], [(12, 1)], [], [], [(1, 1), (31, 1)], [(0, 1), (1, 1), (2, 1)], [(28, 1), (53, 1)], [], [], [], [(0, 1), (1, 1), (2, 1)], [], [], [], [], [(36, 1)], [], [(28, 1)], [], [(105, 1)], [(82, 1), (97, 1)], [], [], [(96, 1)], [], [], [(0, 1), (1, 1), (2, 1)], [], [], [(47, 1)], [], [(69, 1)], [(28, 1), (73, 1)], [(69, 1)], [(28, 1), (69, 1)], [], [], [], [(28, 1)], [(9, 1), (32, 1), (76, 1)], [(6, 1)], [(51, 1), (89, 1)], [], [(0, 1), (1, 1), (2, 1)], [(42, 1)], [], [(16, 1), (17, 1), (19, 1), (20, 1), (33, 1), (34, 1), (35, 1)], [(60, 1)], [(31, 1)], [], [(0, 1), (1, 1), (2, 1)], [(0, 1), (1, 1), (2, 1)], [(0, 1), (1, 1), (2, 1)], [(6, 1)], [], [(0, 1), (1, 1), (2, 1)], [(31, 1)], [(6, 1)], [(6, 1)], [(0, 1), (1, 1), (2, 1)], [], [], [(0, 1), (1, 1), (2, 1)], [(6, 1)], [(19, 1), (31, 1)], [(6, 1)], [(6, 1)], [(6, 1)], [(31, 1)], [(31, 1)], [(6, 1)], [(6, 1)], [(6, 1)], [(6, 1)], [(87, 1)], [(0, 1), (1, 1), (2, 1)], [

## 8. Verificando quantos tokens foram gerados

In [243]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 120
Number of documents: 15800


## 9. Habilitando os Log

In [244]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

# LDA Treining Model

## 10 - LDA

In [245]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = 1  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

diff=0.036543, rho=0.192807
2020-12-14 00:23:47,048 : INFO : -3.443 per-word bound, 10.9 perplexity estimate based on a held-out corpus of 2000 documents with 1907 words
2020-12-14 00:23:47,049 : INFO : PROGRESS: pass 18, at document #10000/15800
2020-12-14 00:23:47,480 : INFO : optimized alpha [0.04159801, 0.057979144, 0.033892687, 0.07947053, 0.07265257, 0.07910896, 0.043136507, 0.20112897, 0.19131367, 0.074838944]
2020-12-14 00:23:47,483 : INFO : merging changes from 2000 documents into a model of 15800 documents
2020-12-14 00:23:47,492 : INFO : topic #2 (0.034): 0.274*"public" + 0.209*"open" + 0.151*"client" + 0.142*"hub" + 0.104*"device" + 0.076*"model" + 0.000*"data" + 0.000*"smart" + 0.000*"list" + 0.000*"portal"
2020-12-14 00:23:47,498 : INFO : topic #0 (0.042): 0.298*"customer" + 0.183*"payment" + 0.134*"gateway" + 0.120*"search" + 0.081*"digital" + 0.060*"new" + 0.059*"business" + 0.041*"authentication" + 0.000*"request" + 0.000*"engine"
2020-12-14 00:23:47,505 : INFO : topic

In [246]:
#View Top Topic
from pprint import pprint

top_topics = model.top_topics(corpus) #, num_words=20)

pprint(top_topics)

2020-12-14 00:23:58,933 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2020-12-14 00:23:58,945 : INFO : CorpusAccumulator accumulated stats from 2000 documents
2020-12-14 00:23:58,951 : INFO : CorpusAccumulator accumulated stats from 3000 documents
2020-12-14 00:23:58,960 : INFO : CorpusAccumulator accumulated stats from 4000 documents
2020-12-14 00:23:58,966 : INFO : CorpusAccumulator accumulated stats from 5000 documents
2020-12-14 00:23:58,979 : INFO : CorpusAccumulator accumulated stats from 6000 documents
2020-12-14 00:23:58,996 : INFO : CorpusAccumulator accumulated stats from 7000 documents
2020-12-14 00:23:59,013 : INFO : CorpusAccumulator accumulated stats from 8000 documents
2020-12-14 00:23:59,027 : INFO : CorpusAccumulator accumulated stats from 9000 documents
2020-12-14 00:23:59,039 : INFO : CorpusAccumulator accumulated stats from 10000 documents
2020-12-14 00:23:59,047 : INFO : CorpusAccumulator accumulated stats from 11000 documents
2020-12-14 00:23:59

## Avaliação do Modelo

In [247]:
from gensim.models import CoherenceModel

# Compute Perplexity
print('\nPerplexity: ', model.log_perplexity(corpus))  
# a measure of how good the model is. lower the better.

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

# Compute Coherence Score
coherence_model = CoherenceModel(model=model, texts=docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda)

2020-12-14 00:24:04,009 : INFO : -3.210 per-word bound, 9.3 perplexity estimate based on a held-out corpus of 15800 documents with 16677 words
2020-12-14 00:24:04,092 : INFO : using ParallelWordOccurrenceAccumulator(processes=3, batch_size=64) to estimate probabilities from sliding windows

Perplexity:  -3.210316815262698
Average topic coherence: -17.9319.
2020-12-14 00:24:17,916 : INFO : 3 accumulators retrieved from output queue
2020-12-14 00:24:17,962 : INFO : accumulated word occurrence stats for 9297 virtual documents

Coherence Score:  0.6398457768655382


# Visualização do Modelo

In [248]:
import pyLDAvis
import pyLDAvis.gensim
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
vis

# Salvar Modelo pre-treinado em disco

In [250]:
from gensim.test.utils import datapath
# Save model to disk.

temp_file = datapath("model")

model.save(temp_file)
# Load a potentially pretrained model from disk.
#lda = LdaModel.load(temp_file)

2020-12-14 00:25:35,562 : INFO : saving LdaState object under c:\Users\fialho\Desktop\py_extract_swagger\.venv\lib\site-packages\gensim\test\test_data\model.state, separately None
2020-12-14 00:25:35,579 : INFO : saved c:\Users\fialho\Desktop\py_extract_swagger\.venv\lib\site-packages\gensim\test\test_data\model.state
2020-12-14 00:25:35,585 : INFO : saving LdaModel object under c:\Users\fialho\Desktop\py_extract_swagger\.venv\lib\site-packages\gensim\test\test_data\model, separately ['expElogbeta', 'sstats']
2020-12-14 00:25:35,589 : INFO : storing np array 'expElogbeta' to c:\Users\fialho\Desktop\py_extract_swagger\.venv\lib\site-packages\gensim\test\test_data\model.expElogbeta.npy
2020-12-14 00:25:35,613 : INFO : not storing attribute dispatcher
2020-12-14 00:25:35,622 : INFO : not storing attribute id2word
2020-12-14 00:25:35,629 : INFO : not storing attribute state
2020-12-14 00:25:35,640 : INFO : saved c:\Users\fialho\Desktop\py_extract_swagger\.venv\lib\site-packages\gensim\test