In [1]:
import re
import ast
import spacy
import wikipedia

import pandas as pd
from tqdm import tqdm
from SPARQLWrapper import SPARQLWrapper, JSON
from bs4 import BeautifulSoup

In [2]:
nlp = spacy.load("en_core_web_md")

## Sparql data

In [3]:
query1 = """
PREFIX dbo: <http://dbpedia.org/ontology/>

SELECT ?film ?year ?budget
WHERE {
?film rdf:type dbo:Film .
?film dbo:director dbr:Christopher_Nolan .
OPTIONAL {?film dct:subject ?year FILTER (regex (?year, "\\d+_films"))} .
OPTIONAL {?film dbo:budget ?budget .}
}
"""
# query2 = """
# PREFIX dbo: <http://dbpedia.org/ontology/>

# SELECT ?film
# WHERE {
# ?film rdf:type dbo:Film .
# ?film dbo:director dbr:Christopher_Nolan .
# }
# """

query works in sparql, but fails here, so I extracted data to xlsx

In [4]:
# sparql = SPARQLWrapper("http://dbpedia.org/sparql")
# sparql.setQuery(query)
# sparql.setReturnFormat(JSON)
# results = sparql.query().convert()

# sparql_data = pd.DataFrame(
#     [item["film"]["value"].rsplit("/", 1)[1] for item in results["results"]["bindings"]], columns=["Films"]
# )

In [5]:
sparql_data = pd.read_excel('sparql_data.xlsx')
sparql_data['film'] = sparql_data['film'].map(lambda x: re.sub("[(].*[)]", " ", x.rsplit("/", 1)[1]).replace("_", " ").strip())
sparql_data['year'] = sparql_data['year'].fillna(" ").map(lambda x: list(reversed(x.rsplit(":", 1)))[0].replace("_films", ""))
sparql_data['budget'] = sparql_data['budget'].fillna("0").map(lambda x: ast.literal_eval(x.split("^")[0].replace('"', '')))
sparql_data.sort_values('year', inplace=True)

In [6]:
sparql_data

Unnamed: 0,film,year,budget
6,Inception,,160000000.0
7,The Dark Knight Rises,,230000000.0
8,The Dark Knight Trilogy,,585000000.0
9,Memento,,9000000.0
10,Batman Begins,,150000000.0
1,Doodlebug,1997.0,1000.0
5,Following,1998.0,6000.0
2,Insomnia,2002.0,46000000.0
0,The Prestige,2006.0,40000000.0
4,The Dark Knight,2008.0,185000000.0


### Wiki data

In [7]:
wikipedia.set_lang("en")
wiki_page = wikipedia.page("Christopher_Nolan")

In [8]:
# cut the list of filmography to make sure it will not appear in plain text
content = wiki_page.content.split('== Filmography and awards ==')
wiki_text = content[0]
# wiki_data = content[1]

In [9]:
paragraphs = [re.sub(r"[=]+.+?[=]+", " ", paragraph).replace("\n", " ").strip()
              for paragraph in wiki_text.split("\n\n")]

In [10]:
paragraphs = list(filter(None, paragraphs))

In [11]:
doc = nlp(paragraphs[0])

In [12]:
def flatten(array):
    for item in array:
        if isinstance(item, list):
            yield from flatten(item)
        else:
            yield item

In [13]:
core_verbs = ['release', 'work', 'premiere', 'direct', 'film', 'produce', 'announce']
filter_words = ['Nolan', 'Warner Bros.', 'English', 
                'January', 'February', 'March', 'April', 'May',
                'June', 'July', 'August', 'September', 'October', 'November', 'December']
res = []

for paragraph in paragraphs:
    doc = nlp(paragraph)
    for sentence in doc.sents:
        # Get all nouns which are directly ruled by from core_verbs list
        # core_verbs is the list of most common used words in case of directing movies
        verbs = [(token.lemma_, [word for word in token.children 
                                 if str(word).strip()[0].isupper() and nlp(str(word))[0].pos_ == 'NOUN']) 
                 for token in sentence 
                 if token.pos_ == "VERB" and token.lemma_ in core_verbs]
        
        # all posible dates in the sentence
        nums = [(token.text, [word.lemma_ for word in token.ancestors]) 
                 for token in sentence 
                 if token.pos_ == "NUM" and str(token.text).isdigit() and len(str(token.text)) == 4]
        
        # attempt to map probable movie (noun from verbs list) with probable date
        if len(verbs) > 0 and len(nums) == 1:
            nums = list(nums[0])
            nums[1] = list(flatten([dict(verbs).get(item, item) for item in nums[1]]))

            # adding to possilbe movies all entities except persons
#             nums[1] += list(sentence.ents)
            nums[1] += [token for token in sentence.ents if token.label_!='PERSON']
            
            # filtering only capitalized words
            nums[1] = [str(word) for word in nums[1] if str(word).strip()[0].isupper()]
            # filtering only NOUNs
            nums[1] = [word for word in nums[1] if nlp(str(word).lower())[0].pos_ in ('NOUN', 'PROPN')]
            # filter typical words that may be occure in text, bu definately can't be a movie
            nums[1] = set(filter(lambda x: not any([word in x for word in filter_words]), nums[1]))
            
            # TODO: filter whether it is a person name or not
            if len(nums[1]) > 0:
                for item in nums[1]:
                    res.append({"year": nums[0], "film": item})

In [14]:
parsed_films = pd.DataFrame(res)
parsed_films['year'] = parsed_films['year'].astype(int)
parsed_films['film'] = parsed_films['film'].map(lambda x: re.sub(r"[^\w\s]", '', x))
parsed_films.sort_values('year', inplace=True)
parsed_films.reset_index(inplace=True, drop=True)
parsed_films

Unnamed: 0,year,film
0,1940,France
1,1940,Dunkirk
2,1940,World War II
3,1989,Roko co
4,1989,Tarantella
5,1989,Image Union
6,1997,Doodlebug
7,2001,Mori
8,2002,Insomnia
9,2002,Academy Award


In [15]:
def get_budget(page):
    
    def convert_to_num(text_num):
        regex_res = re.findall("\d+\s+\w+", text_num)
        if len(regex_res) > 0:
            res = ast.literal_eval(regex_res[0].replace("million", "000000").replace(" ", ""))
            return res
        else:
            return 0
    try:
        wiki_page = wikipedia.page(page)
#         soup = BeautifulSoup(wiki_page, 'lxml')
        table = pd.read_html(wiki_page.html())[0].fillna("")
        budget = table.loc[table[table.columns[0]].str.contains('budget', case=False)]
        if budget.shape[0] > 0:
            budget = budget.values[0][1]
        if len(budget) > 0:
            return convert_to_num(budget)

    except wikipedia.DisambiguationError as e:
        page = list(filter(lambda x: "film" in x, e.options))
        if len(page) > 0:
            page = page[0]
            wiki_page = wikipedia.page(page)
        else:
            return 0
    except wikipedia.PageError:
        return 0

    res = ""
    for sentence in nlp(wiki_page.content).sents:
        if re.search('budget', str(sentence)):
            res = " ".join([token.text for token in sentence if token.pos_ == 'NUM'])
            break
    return convert_to_num(res)

In [16]:
lst = []

for movie in tqdm(parsed_films.film.values):
    budget = get_budget(movie)
    lst.append(budget)
    
parsed_films['budget'] = lst



  lis = BeautifulSoup(html).find_all('li')
100%|██████████| 23/23 [01:53<00:00,  4.93s/it]


In [17]:
parsed_films

Unnamed: 0,year,film,budget
0,1940,France,0
1,1940,Dunkirk,0
2,1940,World War II,20142018
3,1989,Roko co,0
4,1989,Tarantella,0
5,1989,Image Union,0
6,1997,Doodlebug,199716
7,2001,Mori,0
8,2002,Insomnia,0
9,2002,Academy Award,4000000


In [23]:
sparql_data

Unnamed: 0,film,year,budget
6,Inception,,160000000.0
7,The Dark Knight Rises,,230000000.0
8,The Dark Knight Trilogy,,585000000.0
9,Memento,,9000000.0
10,Batman Begins,,150000000.0
1,Doodlebug,1997.0,1000.0
5,Following,1998.0,6000.0
2,Insomnia,2002.0,46000000.0
0,The Prestige,2006.0,40000000.0
4,The Dark Knight,2008.0,185000000.0


### Evaluate results

#### strict merge

In [63]:
df_common = sparql_data.merge(parsed_films, on='film')
df_common

Unnamed: 0,film,year_x,budget_x,year_y,budget_y
0,Inception,,160000000.0,2010,160000000
1,Batman Begins,,150000000.0,2006,150000000
2,Doodlebug,1997.0,1000.0,1997,199716
3,Insomnia,2002.0,46000000.0,2002,0
4,Quay,2015.0,0.0,2015,0
5,Dunkirk,2017.0,0.0,1940,0


In [55]:
target = set(sparql_data.film.values)
preds = set(parsed_films.film.values)

In [56]:
TP = target.intersection(preds)
FN = target - preds
FP = preds - TP

In [57]:
FP

{'Academy Award',
 'Batman',
 'DGA',
 'France',
 'Image Union',
 'Man',
 'Man of Steel',
 'Mori',
 'NFPB',
 'Prestige',
 'Roko co',
 'Salon',
 'Tarantella',
 'Tenet',
 'Troy ',
 'Warner Bros',
 'World War II'}

Actually, Prestige is the same as The Prestige, so shouldn't be counted as FP. Mori is part of film Memento, which some time also called Memento Mori.
Also, Tenet is upcomming Nolan's film, so it's absent in sparql db, but will appear in the near future.

In [58]:
precision = len(TP) / (len(TP) + len(FP))
recal = len(TP) / (len(TP) + len(FN))
f1_score = 2 * precision * recal / (precision + recal)

print(f"precision: {precision:.2f}", f"recall: {recal:.2f}", f"f1_score: {f1_score:.2f}", sep='\n')

precision: 0.26
recall: 0.46
f1_score: 0.33


#### year extraction accuracy

We make an assumption that if data is absent in sparqldb, then the info that we found in text is the correct one

In [90]:
true = df_common.loc[(df_common.year_x.astype(str) == df_common.year_y.astype(str)) | 
                     ((df_common.year_x.str.strip() == "") & (df_common.year_y.map(str) != ""))]
true

Unnamed: 0,film,year_x,budget_x,year_y,budget_y
0,Inception,,160000000.0,2010,160000000
1,Batman Begins,,150000000.0,2006,150000000
2,Doodlebug,1997.0,1000.0,1997,199716
3,Insomnia,2002.0,46000000.0,2002,0
4,Quay,2015.0,0.0,2015,0


In [91]:
print(f"Accuracy: {true.shape[0]/ df_common.shape[0]:.2f}")

Accuracy: 0.83


#### budget extraction accuracy

In [92]:
true = df_common.loc[(df_common.budget_x == df_common.budget_y) & (df_common.budget_y != 0)]
true

Unnamed: 0,film,year_x,budget_x,year_y,budget_y
0,Inception,,160000000.0,2010,160000000
1,Batman Begins,,150000000.0,2006,150000000


In [89]:
print(f"Accuracy: {true.shape[0]/ df_common.shape[0]:.2f}")

Accuracy: 0.33
