## Importando bibliotecas

In [None]:
%%capture

import nltk
nltk.download('omw-1.4')

!pip install rdflib
!pip install SPARQLWrapper
!pip install wikipedia
!pip install pattern

import string
import wikipedia
from pprint import pprint
from rdflib import Graph
from SPARQLWrapper import SPARQLWrapper, JSON, N3
from pattern.text.en import singularize


## Candidate Selection

In [None]:
# Fetches the list of categories of a Wikipedia page.
def search_wikipedia_categories(search_term):
  """Return the categories of the Wikipedia page matching ``search_term``.

  The term is normalized with ``string.capwords`` and looked up with
  ``auto_suggest=False``. If the title is ambiguous, the disambiguation
  options are printed and the user is asked (via ``input``) to pick one by
  its number; the question is repeated until a valid number is entered.

  Args:
    search_term: Title of the Wikipedia page to look up.

  Returns:
    The list of category names of the page, or an empty list when the page
    does not exist or the disambiguation offers no options.
  """
  try:
    search_term = string.capwords(search_term)
    searched_categories = wikipedia.page(search_term, auto_suggest=False).categories
    print(f"Categorias encontradas: {searched_categories}")
    return searched_categories
  except wikipedia.DisambiguationError as e:
    if not e.options:
      # Nothing to choose from: behave like a page that was not found.
      return []

    for i, search_option in enumerate(e.options):
      print(f"{i}. {search_option}")

    # Keep asking until the answer is a number that indexes e.options, so a
    # typo does not abort the whole run with ValueError/IndexError.
    chosen_option = None
    while chosen_option is None:
      answer = input("Escolha a busca e insira o número correspondente: ")
      try:
        choice_number = int(answer)
      except ValueError:
        choice_number = -1
      if 0 <= choice_number < len(e.options):
        chosen_option = e.options[choice_number]
      else:
        print(f"Opção inválida; informe um número entre 0 e {len(e.options) - 1}.")

    print(f"\nOpção escolhida: {chosen_option}\n")
    # NOTE: this second lookup is not guarded; an error raised for the chosen
    # option propagates to the caller.
    searched_categories = wikipedia.page(chosen_option, auto_suggest=False).categories
    return searched_categories
  except wikipedia.PageError as e:
    print(f"Página não encontrada: {e}")
    # Explicit empty result instead of an implicit None, so callers can
    # iterate over the return value unconditionally.
    return []

# Example run: fetch the categories for the topic "la la land" and keep them
# in `searched_categories` for the grouping step further down.
searched_categories = search_wikipedia_categories("la la land")

Categorias encontradas: ['2010s English-language films', '2010s dance films', '2010s musical comedy-drama films', '2010s romantic musical films', '2016 drama films', '2016 films', '2016 romantic comedy-drama films', 'AC using state parameter: expanded', 'AC with 0 elements', 'All Wikipedia articles written in American English', 'American dance films', 'American musical comedy films', 'American musical drama films', 'American romantic comedy-drama films', 'American romantic musical films', 'Articles with short description', 'BAFTA winners (films)', 'Best Film BAFTA Award winners', 'Best Musical or Comedy Picture Golden Globe winners', 'Black Label Media films', 'CS1 German-language sources (de)', 'English-language films', 'Films about Hollywood, Los Angeles', 'Films about actors', 'Films about mass media people', 'Films about pianos and pianists', 'Films directed by Damien Chazelle', 'Films featuring a Best Actress Academy Award-winning performance', 'Films featuring a Best Musical or C

### Formando grupos de categorias que compartilham o mesmo sufixo ou prefixo

In [None]:
# Splits each category name into words, looks at its first word (prefix) and
# last word (suffix), and builds a dictionary that uses the prefixes and
# suffixes as keys and the categories sharing them as values.

def create_candidate_category_sets(categories):
    """Group category names by their lower-cased first and last word.

    Args:
        categories: Iterable of category name strings. ``None`` is treated
            as an empty collection.

    Returns:
        dict mapping each lower-cased prefix/suffix to the list of
        categories (in input order) that start or end with that word. A
        category whose first and last word are the same is listed only once
        under that key. Empty or whitespace-only names are skipped.
    """
    candidate_sets = dict()

    for category in categories or []:
        category_words = category.split()
        if not category_words:
            # A blank name has neither prefix nor suffix; indexing it would
            # raise IndexError.
            continue

        prefix = category_words[0].lower()
        sufix = category_words[-1].lower()

        candidate_sets.setdefault(prefix, []).append(category)

        # Single-word (or same first/last word) categories are added once.
        if sufix != prefix:
            candidate_sets.setdefault(sufix, []).append(category)

    return candidate_sets


# Group the categories fetched above by shared prefix/suffix and
# pretty-print the resulting dictionary.
candidates = create_candidate_category_sets(searched_categories)
pprint(candidates)


{'(de)': ['CS1 German-language sources (de)'],
 '(films)': ['BAFTA winners (films)'],
 '2010s': ['2010s English-language films',
           '2010s dance films',
           '2010s musical comedy-drama films',
           '2010s romantic musical films'],
 '2016': ['2016 drama films', '2016 films', '2016 romantic comedy-drama films'],
 '2018': ['Use mdy dates from June 2018'],
 '2022': ['Use American English from June 2022'],
 'ac': ['AC using state parameter: expanded', 'AC with 0 elements'],
 'actors': ['Films about actors'],
 'all': ['All Wikipedia articles written in American English'],
 'american': ['American dance films',
              'American musical comedy films',
              'American musical drama films',
              'American romantic comedy-drama films',
              'American romantic musical films'],
 'angeles': ['Films about Hollywood, Los Angeles',
             'Films set in Los Angeles',
             'Films shot in Los Angeles'],
 'articles': ['Articles with short d

## Pattern Mining and Axiom Application

### Converte prefixo/sufixo para singular e capitaliza primeira letra

In [None]:
def format_str_type(str_type):
  """Normalize a prefix/suffix word into the form used for type lookups.

  The word is passed through ``singularize`` and the result is returned with
  ``str.capitalize`` applied (first character upper-cased, the rest
  lower-cased).

  Args:
    str_type: The prefix/suffix word to normalize.

  Returns:
    The singularized, capitalized word.
  """
  return singularize(str_type).capitalize()


### Encontra possíveis propriedades que contêm a string com o uso de uma query SPARQL

In [None]:
def get_possible_rdf_type(str_type):
  """Look up DBpedia rdf:type IRIs whose text ends with the given word.

  The word is first normalized with ``format_str_type`` and then sent to the
  public endpoint ``http://dbpedia.org/sparql`` in a query limited to 5
  distinct results.

  Args:
    str_type: Prefix/suffix word to search for.

  Returns:
    List of the matching type IRIs as strings (empty when nothing matches).
  """
  str_type = format_str_type(str_type)

  # The word comes from Wikipedia category names, so it must not be pasted
  # into the query verbatim. First escape regex metacharacters so the word is
  # matched literally, then escape backslashes and double quotes so the
  # pattern stays inside the SPARQL string literal.
  regex_metachars = set("\\|.-^?*+{}()[]$")
  escaped_pattern = "".join(
    "\\" + ch if ch in regex_metachars else ch for ch in str_type
  )
  pattern_literal = escaped_pattern.replace("\\", "\\\\").replace('"', '\\"')

  sparql = SPARQLWrapper("http://dbpedia.org/sparql")

  # str(?type) turns the IRI into a plain string, which is what regex()
  # operates on.
  type_finder_query = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

    SELECT DISTINCT ?type
    WHERE {
     ?s a ?type .
    FILTER regex(str(?type), "%s$")
    }
    LIMIT 5
  """ % pattern_literal

  sparql.setQuery(type_finder_query)
  sparql.setReturnFormat(JSON)
  qresults = sparql.query().convert()

  # Collect the IRI of every returned binding.
  return [result['type']['value'] for result in qresults["results"]["bindings"]]


In [None]:
def is_rdf_type(str_type):
  """Tell whether at least one rdf:type matches the given word.

  Args:
    str_type: Prefix/suffix word to check.

  Returns:
    True when ``get_possible_rdf_type(str_type)`` returns a non-empty list,
    False otherwise.
  """
  # len() is never negative, so the former "something went wrong" branch
  # (which returned a truthy -1) could not be reached and was dropped.
  return len(get_possible_rdf_type(str_type)) > 0


In [None]:
def select_rdf_type(set_of_candidates):
  """Select the candidate keys for which an rdf:type exists.

  Args:
    set_of_candidates: dict whose keys are prefix/suffix words.

  Returns:
    List of the keys, in iteration order, for which ``is_rdf_type`` is true.
  """
  # The original called append() with no argument, which raises TypeError on
  # the first match; the matching key is what gets collected.
  return [key for key in set_of_candidates if is_rdf_type(key)]

### Pega o prefixo/sufixo mais frequente, seu número de ocorrências e frequência

In [None]:
# Takes the candidate sets and returns the most frequent type (prefix/suffix)
# together with the number of occurrences counted and its relative frequency.

def get_type_frequency(category_dict):
    """Find the most frequent prefix/suffix among the candidate sets.

    Keys that only reflect Wikipedia maintenance categories ('articles',
    'identifiers', 'text', 'all', 'pages', 'cs1') are ignored. Purely numeric
    keys still contribute to the total count but are never selected.

    The input dictionary is not modified.

    Args:
        category_dict: dict mapping a prefix/suffix to its list of categories.

    Returns:
        Tuple ``(most_frequent_type, count, frequency)`` where
        ``most_frequent_type`` is the winning key passed through
        ``format_str_type``, ``count`` is the total number of category
        entries over the considered keys, and ``frequency`` is the winner's
        entry count divided by ``count``. When no key qualifies the result
        is ``(None, count, 0.0)``.
    """
    ignored_keys = {'articles', 'identifiers', 'text', 'all', 'pages', 'cs1'}

    count = 0
    most_frequent_type_frequency = 0
    most_frequent_type = None

    for key, categories in category_dict.items():
        # Skip instead of deleting, so the caller's dictionary stays intact.
        if key in ignored_keys:
            continue

        count += len(categories)

        if key.isnumeric():
            continue

        if len(categories) > most_frequent_type_frequency:
            most_frequent_type_frequency = len(categories)
            most_frequent_type = key

    if most_frequent_type is None:
        # Nothing selectable (empty input, only ignored or numeric keys):
        # avoids dividing by zero and formatting None.
        return None, count, 0.0

    frequency = most_frequent_type_frequency / count

    return format_str_type(most_frequent_type), count, frequency


## Pattern Application and Post-filtering

### Calcula pontuação de confiabilidade

In [None]:
# Confidence score of a type assertion: the relative frequency of the most
# frequent prefix/suffix, scaled by (1 + lex_score), where lex_score is the
# number of rdf:types found for that word.
def type_score(category_set):
    """Compute the confidence score of the most frequent type.

    Args:
        category_set: dict mapping a prefix/suffix to its list of categories.

    Returns:
        Tuple ``(most_frequent_type, score)`` with
        ``score = frequency * (1 + lex_score)``. When no type can be
        selected the result is ``(None, 0.0)``.
    """
    # Use the argument, not the notebook-level `candidates` variable, so the
    # function scores the set it was actually given.
    most_frequent_type, _count, frequency = get_type_frequency(category_set)
    if most_frequent_type is None:
        return None, 0.0

    lex_score = len(get_possible_rdf_type(most_frequent_type))
    score = frequency * (1 + lex_score)
    return most_frequent_type, score


In [None]:
# Interactive entry point: asks for a Wikipedia topic and a minimum
# confidence, runs the whole pipeline and reports the selected type.
if __name__ == '__main__':

  wikipedia_term = input("Insira um tópico da wikipedia para ser analisado: ")
  minimum_conf = input("\nInsira a pontuação mínima de confiabilidade (preferencialmente algum valor entre 0 e 1): ")
  # Parse the threshold immediately so a malformed number fails before any
  # network request is made, not after all the work is done.
  minimum_conf_value = float(minimum_conf)
  print("\n")
  searched_categories = search_wikipedia_categories(wikipedia_term)
  if not searched_categories:
    # Page not found (or page without categories): nothing to analyse.
    print("Nenhuma categoria encontrada para o tópico informado")
  else:
    candidates = create_candidate_category_sets(searched_categories)
    pprint(candidates)
    # Kept so most_frequent_type/count/frequency stay available as
    # notebook-level variables for inspection.
    most_frequent_type, count, frequency = get_type_frequency(candidates)
    selected_property, conf_score = type_score(candidates)
    if selected_property is not None and conf_score >= minimum_conf_value:
      print("**********************")
      print("Propriedade selecionada:", selected_property)
      print("Pontuação de confiabilidade:", conf_score)
      print("**********************")
    else:
      print("Asserção de tipo não teve pontuação de confiabilidade suficiente")


Insira um tópico da wikipedia para ser analisado: sun

Insira a pontuação mínima de confiabilidade (preferencialmente algum valor entre 0 e 1): 0


Categorias encontradas: ['All Wikipedia articles written in American English', 'Articles containing Ancient Greek (to 1453)-language text', 'Articles containing Bavarian-language text', 'Articles containing Dutch-language text', 'Articles containing German-language text', 'Articles containing Gothic-language text', 'Articles containing Icelandic-language text', 'Articles containing Latin-language text', 'Articles containing Low German-language text', 'Articles containing Old English (ca. 450-1100)-language text', 'Articles containing Old Norse-language text', 'Articles containing Persian-language text', 'Articles containing Proto-Germanic-language text', 'Articles containing Russian-language text', 'Articles containing Sanskrit-language text', 'Articles containing Swedish-language text', 'Articles containing Welsh-language text', 'Articles 