In [1]:
import re
import ast
import spacy
import wikipedia

import pandas as pd

In [2]:
from SPARQLWrapper import SPARQLWrapper, JSON

In [3]:
def flatten(array):
    """Lazily yield the leaves of an arbitrarily nested list, depth first.

    Only ``list`` instances are descended into; every other item (tuples and
    strings included) is yielded unchanged.
    """
    for element in array:
        if not isinstance(element, list):
            yield element
            continue
        for leaf in flatten(element):
            yield leaf

In [4]:
# Load the spaCy "en_core_web_md" pipeline once; the cells below call nlp(...) on
# paragraphs, single words and whole Wikipedia articles.
nlp = spacy.load("en_core_web_md")

## SPARQL data

In [5]:
# SPARQL query: every dbo:Film directed by Christopher Nolan, with the
# "<year>_films" subject category and the budget when available.
#
# The literal is a *raw* string on purpose. SPARQL string literals only accept
# a fixed set of backslash escapes, so the regex must arrive at the endpoint as
# "\\d+_films" (escaped backslash). In a normal Python string "\\d" collapses
# to "\d", which the endpoint rejects as an invalid escape.
#
# The rdf/dct/dbr prefixes are declared explicitly so the query does not rely
# on the endpoint's predefined namespaces, and str() turns the subject IRI into
# a string before it is passed to regex().
query1 = r"""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT ?film ?year ?budget
WHERE {
?film rdf:type dbo:Film .
?film dbo:director dbr:Christopher_Nolan .
OPTIONAL {?film dct:subject ?year FILTER (regex (str(?year), "\\d+_films"))} .
OPTIONAL {?film dbo:budget ?budget .}
}
"""
# query2 = """
# PREFIX dbo: <http://dbpedia.org/ontology/>

# SELECT ?film
# WHERE {
# ?film rdf:type dbo:Film .
# ?film dbo:director dbr:Christopher_Nolan .
# }
# """

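The query works in the DBpedia SPARQL web editor but fails when sent from this notebook, so the results were exported to `sparql_data.xlsx` and are loaded from that file below.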

In [6]:
# sparql = SPARQLWrapper("http://dbpedia.org/sparql")
# sparql.setQuery(query1)
# sparql.setReturnFormat(JSON)
# results = sparql.query().convert()

# sparql_data = pd.DataFrame(
#     [item["film"]["value"].rsplit("/", 1)[1] for item in results["results"]["bindings"]], columns=["Films"]
# )

In [7]:
# Load the exported query results and normalise the three columns in place.
sparql_data = pd.read_excel('sparql_data.xlsx')


def _film_title(uri):
    """Last path segment of ``uri`` without any "(...)" part, underscores as spaces."""
    slug = uri.rsplit("/", 1)[1]
    return re.sub("[(].*[)]", " ", slug).replace("_", " ").strip()


def _release_year(category):
    """Text after the last ':' of ``category`` with the "_films" suffix removed."""
    return category.rsplit(":", 1)[-1].replace("_films", "")


def _budget_value(literal):
    """Numeric value of ``literal``: the text before the first '^', quotes removed."""
    return ast.literal_eval(literal.split("^")[0].replace('"', ''))


sparql_data['film'] = sparql_data['film'].map(_film_title)
# Missing years become " " and missing budgets "0" before parsing.
sparql_data['year'] = sparql_data['year'].fillna(" ").map(_release_year)
sparql_data['budget'] = sparql_data['budget'].fillna("0").map(_budget_value)

In [8]:
# Preview the first rows of the cleaned table.
sparql_data.head()

Unnamed: 0,film,year,budget
0,The Prestige,2006,40000000.0
1,Doodlebug,1997,1000.0
2,Insomnia,2002,46000000.0
3,Interstellar,2014,165000000.0
4,The Dark Knight,2008,185000000.0


## Wiki data

In [9]:
# Use the English Wikipedia and fetch the page requested as "Christopher_Nolan".
wikipedia.set_lang("en")
wiki_page = wikipedia.page("Christopher_Nolan")

In [10]:
# Keep only the text that precedes the "== Filmography and awards ==" heading so
# the filmography list itself is not parsed as plain text.
# If the heading is absent, split() returns a single element and the whole
# article is kept.
content = wiki_page.content.split('== Filmography and awards ==')
wiki_text = content[0]
# wiki_data = content[1]

In [11]:
# Split the article on blank lines; in every chunk replace "=...=" delimited
# runs (section headings) with a space, flatten newlines and trim the ends.
paragraphs = [
    re.sub(r"[=]+.+?[=]+", " ", chunk).replace("\n", " ").strip()
    for chunk in wiki_text.split("\n\n")
]

In [12]:
# Drop the chunks that ended up empty after cleaning.
paragraphs = [chunk for chunk in paragraphs if chunk]

In [13]:
# Parse the first paragraph on its own; note that `doc` is reassigned for every
# paragraph inside the extraction loop further down.
doc = nlp(paragraphs[0])

In [14]:
def get_budget(page):
    """Extract the numbers from the first "budget" sentence of a Wikipedia article.

    The article is looked up by title. When the title is ambiguous, the first
    disambiguation option containing "film" is tried instead.

    Args:
        page: Title passed to ``wikipedia.page``.

    Returns:
        ``''`` when no article could be resolved (unknown title, no "film"
        option, or the chosen option is itself missing/ambiguous);
        ``0`` when the article has no sentence containing "budget";
        otherwise the sentence's NUM tokens joined by single spaces.
    """
    try:
        wiki_page = wikipedia.page(page)
    except wikipedia.DisambiguationError as e:
        film_options = [option for option in e.options if "film" in option]
        if not film_options:
            return ''
        try:
            wiki_page = wikipedia.page(film_options[0])
        except (wikipedia.DisambiguationError, wikipedia.PageError):
            # The fallback lookup can fail too; an exception raised inside an
            # except block is not caught by the sibling handlers, so without
            # this guard one bad title would abort the whole DataFrame.map().
            return ''
    except wikipedia.PageError:
        return ''

    res = 0
    for sentence in nlp(wiki_page.content).sents:
        if 'budget' in str(sentence):
            # Only the first matching sentence is used.
            res = " ".join([token.text for token in sentence if token.pos_ == 'NUM'])
            break
    return res

In [15]:
# Verb lemmas whose capitalised noun children are treated as film-title candidates.
core_verbs = ['release', 'work', 'premiere', 'direct', 'film', 'produce', 'announce']
# Candidates containing any of these substrings are discarded.
filter_words = ['Nolan', 'Warner Bros.', 'English', 
                'January', 'February', 'March', 'April', 'May',
                'June', 'July', 'August', 'September', 'October', 'November', 'December']
res = []

for paragraph in paragraphs:
    doc = nlp(paragraph)
    for sentence in doc.sents:
        # (lemma, children) for every core verb. A child is kept when it starts
        # with an uppercase letter and is tagged NOUN when parsed on its own.
        # `[:1]` instead of `[0]`: whitespace-only tokens strip to '' and would
        # otherwise raise IndexError.
        verbs = [(token.lemma_, [word for word in token.children
                                 if str(word).strip()[:1].isupper() and nlp(str(word))[0].pos_ == 'NOUN'])
                 for token in sentence
                 if token.pos_ == "VERB" and token.lemma_ in core_verbs]
        # Four-digit numeric tokens (candidate years) with the lemmas of their ancestors.
        nums = [(token.text, [word.lemma_ for word in token.ancestors])
                for token in sentence
                if token.pos_ == "NUM" and str(token.text).isdigit() and len(str(token.text)) == 4]
        # Only sentences with at least one core verb and exactly one year are used.
        if not verbs or len(nums) != 1:
            continue

        year, ancestor_lemmas = nums[0]
        children_by_verb = dict(verbs)  # built once per sentence, not once per ancestor
        # Ancestors that are core verbs are replaced by that verb's children;
        # the sentence's named entities are appended as further candidates.
        candidates = list(flatten([children_by_verb.get(item, item) for item in ancestor_lemmas]))
        candidates += list(sentence.ents)
        candidates = [str(word) for word in candidates if str(word).strip()[:1].isupper()]
        candidates = [word for word in candidates if nlp(word.lower())[0].pos_ in ('NOUN', 'PROPN')]
        candidates = [word for word in candidates
                      if not any(blocked in word for blocked in filter_words)]
        # TODO: filter whether it is a person name or not
        # dict.fromkeys de-duplicates while keeping first-seen order, so the row
        # order of `res` is the same on every run (iterating a set of str is not).
        for film in dict.fromkeys(candidates):
            res.append({"year": year, "film": film})

In [16]:
# Name the columns explicitly so the frame still has 'year' and 'film' when no
# candidate was extracted (an empty `res` would otherwise give a column-less
# frame and the next cell would fail on parsed_films['film']).
parsed_films = pd.DataFrame(res, columns=["year", "film"])

In [17]:
%%time

# get_budget fetches a Wikipedia article and runs the full spaCy pipeline on it,
# so look each distinct title up once and reuse the result for repeated rows.
budget_by_film = {film: get_budget(film) for film in parsed_films['film'].unique()}
parsed_films['budget'] = parsed_films['film'].map(budget_by_film)



  lis = BeautifulSoup(html).find_all('li')


CPU times: user 32 s, sys: 2.8 s, total: 34.8 s
Wall time: 1min 49s


In [18]:
# Distinct film titles: reference set from the DBpedia table, predicted set
# from the titles parsed out of the article text.
target = set(sparql_data['film'].unique())
preds = set(parsed_films['film'].unique())

In [19]:
# True positives: titles present in both the reference and the predicted set.
TP = target & preds

In [20]:
# False negatives: reference titles that were not predicted.
FN = target - preds

In [21]:
# False positives: predicted titles that are not true positives.
FP = preds - TP

In [22]:
# Share of predicted titles that are correct. Defined as 0.0 when nothing was
# predicted, instead of raising ZeroDivisionError.
predicted_total = len(TP) + len(FP)
precision = len(TP) / predicted_total if predicted_total else 0.0
precision

0.13333333333333333

In [23]:
# Recall: share of reference titles that were found. Defined as 0.0 when the
# reference set is empty, instead of raising ZeroDivisionError.
reference_total = len(TP) + len(FN)
recal = len(TP) / reference_total if reference_total else 0.0
recal

0.46153846153846156

In [24]:
# F1: harmonic mean of precision and recall. Defined as 0.0 when both are zero
# (no true positives), instead of raising ZeroDivisionError.
f_score = 2*precision*recal / (precision + recal) if (precision + recal) else 0.0
f_score

0.20689655172413796