In [2]:
from pprint import pprint
from rdflib import Graph
from SPARQLWrapper import SPARQLWrapper, JSON, N3
import wikipedia


## Candidate Selection

In [9]:
# Fetch the list of categories of a Wikipedia page.

pagina = "stephen hawking"

# Start from an empty list so that later cells never see an undefined name
# (fresh kernel) or a stale value from a previous run when the lookup fails.
searched_categories = []

try:
	searched_categories = wikipedia.page(pagina).categories
	print(f"Categorias encontradas: {searched_categories}")
except wikipedia.DisambiguationError as e:
	# The title is ambiguous: list the alternatives and let the user pick one by index.
	for i, search_option in enumerate(e.options):
		print(f"{i}. {search_option}")

	choice_number = int(input("Escolha a busca e insira o número correspondente: "))
	chosen_option = e.options[choice_number]
	print(f"\nOpção escolhida: {chosen_option}\n")
	searched_categories = wikipedia.page(chosen_option).categories
except wikipedia.PageError as e:
	# No page matched; searched_categories keeps its empty default.
	print(f"Página não encontrada: {e}")

Categorias encontradas: ['1942 births', '2018 deaths', '20th-century British astronomers', '20th-century English male writers', '20th-century atheists', '21st-century British astronomers', '21st-century English male writers', '21st-century atheists', 'AC with 37 elements', 'Albert Einstein Medal recipients', 'All articles lacking reliable references', 'Alumni of Trinity Hall, Cambridge', 'Alumni of University College, Oxford', 'Anti–Iraq War activists', 'Articles containing French-language text', 'Articles containing video clips', 'Articles lacking reliable references from August 2018', 'Articles with BIBSYS identifiers', 'Articles with BNE identifiers', 'Articles with BNF identifiers', 'Articles with CANTICN identifiers', 'Articles with CINII identifiers', 'Articles with DTBIO identifiers', 'Articles with FAST identifiers', 'Articles with GND identifiers', 'Articles with Google Scholar identifiers', 'Articles with ICCU identifiers', 'Articles with ISNI identifiers', 'Articles with J9U

Formando grupos de categorias que compartilham o mesmo sufixo ou prefixo

In [10]:
# Split each category name into words, look at its first word (prefix) and
# last word (suffix), and build a dictionary that uses those words as keys
# and the matching category names as values.

def create_candidate_category_sets(categories):
	"""Group category names by their first and last word.

	Args:
		categories: iterable of category name strings.

	Returns:
		dict mapping each lower-cased first/last word to the list of
		category names (in input order) that start or end with that word.
		Names that are empty or contain only whitespace are skipped.
	"""
	candidate_sets = dict()

	for category in categories:
		category_words = category.split()
		if not category_words:
			# Nothing to index: an empty name has neither prefix nor suffix.
			continue

		prefix = category_words[0].lower()
		sufix = category_words[-1].lower()

		candidate_sets.setdefault(prefix, []).append(category)

		# When first and last word coincide (e.g. one-word names) the category
		# is already listed under that key, so do not add it a second time.
		if sufix != prefix:
			candidate_sets.setdefault(sufix, []).append(category)

	return candidate_sets


# Build the prefix/suffix groups from searched_categories and pretty-print them.
candidates = create_candidate_category_sets(searched_categories)
pprint(candidates)


{'1942': ['1942 births'],
 '2018': ['2018 deaths',
          'Articles lacking reliable references from August 2018'],
 '2019': ['Use British English from August 2019'],
 '2021': ['Wikipedia external links cleanup from June 2021',
          'Wikipedia spam cleanup from June 2021'],
 '2022': ['Use dmy dates from March 2022'],
 '20th-century': ['20th-century British astronomers',
                  '20th-century English male writers',
                  '20th-century atheists'],
 '21st-century': ['21st-century British astronomers',
                  '21st-century English male writers',
                  '21st-century atheists'],
 'abbey': ['Burials at Westminster Abbey'],
 'ac': ['AC with 37 elements'],
 'activists': ['Anti–Iraq War activists'],
 'albans': ['People from St Albans'],
 'albert': ['Albert Einstein Medal recipients'],
 'all': ['All articles lacking reliable references'],
 'alumni': ['Alumni of Trinity Hall, Cambridge',
            'Alumni of University College, Oxford'],
 'ant

## Pattern Mining

### For type assertions

In [11]:
# Takes the candidate sets of categories and returns the most frequent type,
# together with the total number of occurrences and the relative frequency.

def get_type_frequency(category_dict):
	"""Find the key with the most entries in a dict of lists.

	Args:
		category_dict: dict mapping a key to a list of category names.

	Returns:
		Tuple ``(most_frequent_type, count, frequency)`` where ``count`` is
		the total number of list entries over all keys and ``frequency`` is
		the share of those entries held by ``most_frequent_type``. Ties go
		to the key that comes first in the dict. When there are no entries
		at all the result is ``(None, 0, 0.0)``.
	"""
	count = sum(len(members) for members in category_dict.values())
	if count == 0:
		# Empty input (or only empty lists): avoid dividing by zero.
		return None, 0, 0.0

	# max() returns the first maximal key, matching a strict ">" scan.
	most_frequent_type = max(category_dict, key=lambda key: len(category_dict[key]))
	frequency = len(category_dict[most_frequent_type]) / count

	return most_frequent_type, count, frequency


# Print the (most frequent key, total count, frequency) tuple for the candidate sets.
print(get_type_frequency(candidates))

('articles', 234, 0.18376068376068377)


In [12]:
# Keep the three components of the result as separate names in the notebook namespace.
most_frequent_type, count, frequency = get_type_frequency(candidates)

In [13]:
# TODO: define cfix and lex_score

# Score of a relation (p, v)
def type_score(category_set):
	"""Score the most frequent type of a candidate category set.

	Args:
		category_set: dict mapping a key to a list of category names, as
			accepted by ``get_type_frequency``.

	Returns:
		Tuple ``(most_frequent_type, score)`` where ``score`` is the type's
		relative frequency multiplied by the lexical score.
	"""
	# Placeholder: the lexical score is not implemented yet, so every score is 0.
	lex_score = 0

	# Use the argument rather than the notebook-level `candidates` variable,
	# so the function scores whatever set it is given.
	most_frequent_type, _count, frequency = get_type_frequency(category_set)

	score = frequency * lex_score
	# Return the detected type (not the builtin `type`).
	return most_frequent_type, score
	

# SPARQL Queries

In [14]:
# Query the public DBpedia endpoint for films whose name starts with
# "La La Land" and list every label of each match.
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

# dbo: and dbp: are declared explicitly so the query does not rely on
# prefixes the endpoint happens to predefine.
sparql.setQuery("""
	PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
	PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
	PREFIX foaf: <http://xmlns.com/foaf/0.1/>
	PREFIX dbo: <http://dbpedia.org/ontology/>
	PREFIX dbp: <http://dbpedia.org/property/>
	
	SELECT DISTINCT ?s ?label
	WHERE{
		?s a dbo:Film .
		?s dbp:name ?name .
		?s rdfs:label ?label .
		FILTER regex(?name, "^La La Land")
	}
	LIMIT 100
""")

sparql.setReturnFormat(JSON)
qresults = sparql.query().convert()
pprint(qresults)

# One binding per (resource, label) pair; print the label followed by the resource URI.
for result in qresults["results"]["bindings"]:
	print(result["label"]["value"])
	print(result["s"]["value"])
	print("\n")


{'head': {'link': [], 'vars': ['s', 'label']},
 'results': {'bindings': [{'label': {'type': 'literal',
                                     'value': 'La La Land',
                                     'xml:lang': 'en'},
                           's': {'type': 'uri',
                                 'value': 'http://dbpedia.org/resource/La_La_Land'}},
                          {'label': {'type': 'literal',
                                     'value': 'لا لا لاند',
                                     'xml:lang': 'ar'},
                           's': {'type': 'uri',
                                 'value': 'http://dbpedia.org/resource/La_La_Land'}},
                          {'label': {'type': 'literal',
                                     'value': 'La La Land',
                                     'xml:lang': 'ca'},
                           's': {'type': 'uri',
                                 'value': 'http://dbpedia.org/resource/La_La_Land'}},
                          {'label':