## I. NLP Full Cycle

### 1. Domain

Let's compare information about **Christian Bale's** filmography in DBpedia vs. Wikipedia.

```
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX res:  <http://dbpedia.org/resource/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?movie ?year
WHERE {
	?uri dbo:starring res:Christian_Bale .
        OPTIONAL {?uri dct:subject ?cat . 
                  ?cat rdfs:label ?year . 
                  FILTER (regex (?year, '\\d+ films', 'i'))} .
	?uri rdfs:label ?movie .
        FILTER (lang(?movie) = 'en')
}```

**Christian Bale** in:

[🎬 DBpedia](https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query=PREFIX+dbo%3A+%3Chttp%3A%2F%2Fdbpedia.org%2Fontology%2F%3E%0D%0APREFIX+res%3A++%3Chttp%3A%2F%2Fdbpedia.org%2Fresource%2F%3E%0D%0APREFIX+rdfs%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23%3E%0D%0ASELECT+DISTINCT+%3Fmovie+%3Fyear%0D%0AWHERE+%7B%0D%0A++++%3Furi+dbo%3Astarring+res%3AChristian_Bale+.%0D%0A++++++++OPTIONAL+%7B%3Furi+dct%3Asubject+%3Fcat+.+%0D%0A++++++++++++++++++%3Fcat+rdfs%3Alabel+%3Fyear+.+%0D%0A++++++++++++++++++FILTER+%28regex+%28%3Fyear%2C+%27%5C%5Cd%2B+films%27%2C+%27i%27%29%29%7D+.%0D%0A++++%3Furi+rdfs%3Alabel+%3Fmovie+.%0D%0A++++++++FILTER+%28lang%28%3Fmovie%29+%3D+%27en%27%29%0D%0A%7D&format=text%2Fhtml&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+)

[🎬 Wikipedia](https://en.wikipedia.org/wiki/Christian_Bale)

[🎬 IMDb](https://www.imdb.com/name/nm0000288)

### 2. Information extraction

In [1]:
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urlencode

In [2]:
title_clear_re = re.compile(r'\s\([\s\S]*') # remove year in title
year_re = re.compile(r'\d{4}(?!\w)') # year in YYYY format
p_year_re = re.compile(r'\(\d{4}\)') # year in parentheses (YYYY)
sbd_re = re.compile(r'(?<=\.|\n)')   # sentence boundary detection
ner_re = re.compile(r'\s*\(\d{4}\)') # movie titles search
tokenize_re = re.compile(r'(?<!vs)[,.]?[\s.]+') # simple tokinizer
ner_vocabulary = {}

In [3]:
def get_dbpedia_data(actor: str) -> dict:
    "Returns a dictionary of movie titles and year released"
    
    sparql_request = r'''
        PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX res:  <http://dbpedia.org/resource/>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        SELECT DISTINCT ?movie ?year
        WHERE {
            ?uri dbo:starring res:''' + actor + r'''.
                OPTIONAL {?uri dct:subject ?cat . 
                          ?cat rdfs:label ?year . 
                          FILTER (regex (?year, '\\d+ films', 'i'))} .
            ?uri rdfs:label ?movie .
                FILTER (lang(?movie) = 'en')
        }
    '''
    dbpedia_api = 'https://dbpedia.org/sparql?'
    dbpedia_query = dbpedia_api + urlencode({'query': sparql_request.strip(),
                                         'format': 'application/sparql-results+json'})
    
    movies = {}
    response = requests.get(dbpedia_query)
    for movie in response.json()['results']['bindings']:
        title = title_clear_re.sub('', movie['movie']['value'])
        year = None
        if 'year' in movie:
            match = year_re.match(movie['year']['value'])
            if match:
                year = match.group(0)
        movies[title] = year
    return movies

In [4]:
def get_wiki_data(actor: str) -> (str, list):
    "Returns text of the wiki page and a list of movie titles"
    
    wikipedia_api = 'https://en.wikipedia.org/api/rest_v1/'
    wikipedia_query = wikipedia_api + 'page/html/' + actor
    
    response = requests.get(wikipedia_query)
    response_html = response.text[:response.text.find('<h2 id="References">References</h2>')]
    soup = BeautifulSoup(response_html, features="html.parser")
    
    return soup.text, sorted(set([a.text for a in soup.find_all('a', {'title': True}) if 'film)' in a.attrs['title']]))

In [5]:
def sentence_boundary_detection(text: str) -> list:
    # this bit here may generate an error if you're using python 3.6 or older.
    # older versions of re.split() did not allow empty sub-strings to be returned
    return [s.strip() for s in sbd_re.split(text) if s.strip()]

def tokenize(text: str) -> list:
    return [t for t in tokenize_re.split(text) if t]

def is_NE(text: str) -> bool:
    return text[0].isupper() and (text[-1].islower() or text[-1].isdigit() or len(text) == 1)

def is_year(text: str) -> bool:
    return year_re.match(text) is not None

def is_p_year(text: str) -> bool:
    return p_year_re.match(text) is not None

In [6]:
def build_ner_vocabulary(titles: set) -> dict:
    tree = {}
    for t in titles:
        tokens = tokenize(t)
        if tokens[0] not in tree:
            tree[tokens[0]] = [tokens]
        else:
            tree[tokens[0]].append(tokens)
    return tree

There are two basic patterns for mentioning a year of release in a sentence:
- a year metioned once in a sentence, it is applicable for all movies listed in the sentence;
- a movie title is immediately followed by a year in parentheses.


In [7]:
def get_movies_from_sentence(sentence: str) -> dict:
    "Returns a dictionary of movie titles and year released found in a sentence"
    
    movies = {}
    default_year = None
    tokens = tokenize(sentence)
    i = 0
    while i < len(tokens):
        match = year_re.match(tokens[i])
        if match:
            default_year = match.group(0)
        elif is_NE(tokens[i]) and tokens[i] in ner_vocabulary:
            for title in ner_vocabulary[tokens[i]]:
                for j, word in enumerate(title):
                    if i + j == len(tokens) or word != tokens[i + j]:
                        j -= 1
                        break
                
                if j == len(title)-1:
                    # a movie title is found
                    i += j+1
                    title_text = ' '.join(title)
                    movies[title_text] = default_year

                    # check if it's followed by year in parentheses
                    if i < len(tokens) and is_p_year(tokens[i]):
                        movies[title_text] = tokens[i][1:-1]
                    break
        i += 1
        
        # assign default sentence year to movie titles without it
        if default_year and movies:
            for title, year in movies.items():
                if not year:
                    movies[title] = default_year
    
    return movies

In [8]:
def get_movies_from_text(text: str) -> dict:
    "Returns a dictionary of movie titles and year released found in a text"
    
    all_movies = {}
    sentences = sentence_boundary_detection(text)
    for s in sentences:
        movies = get_movies_from_sentence(s)
        for title, year in movies.items():
            if title not in all_movies or not all_movies[title]:
                all_movies[title] = year
    
    return all_movies

In [15]:
def get_dbpedia_and_wiki(actor: str):
    
    dbpedia_movies = get_dbpedia_data(actor)
    text_wiki, list_wiki = get_wiki_data(actor)
    
    global ner_vocabulary
    ner_vocabulary = build_ner_vocabulary(set(dbpedia_movies.keys()) | set(list_wiki))
    wiki_movies = get_movies_from_text(text_wiki)
    
    return dbpedia_movies, wiki_movies

In [30]:
dbpedia_movies, wiki_movies = get_dbpedia_and_wiki('Christian_Bale')

In [31]:
dbpedia_movies

{'Equilibrium': '2002',
 'Laurel Canyon': '2002',
 'Reign of Fire': '2002',
 'The New World': '2005',
 'Harsh Times': '2005',
 'The Machinist': '2004',
 'Rescue Dawn': '2006',
 "I'm Not There": '2007',
 '3:10 to Yuma': '2007',
 'Terminator Salvation': '2009',
 'The Fighter': '2010',
 'Out of the Furnace': '2013',
 'American Hustle': '2013',
 'The Promise': '2016',
 'Knight of Cups': '2015',
 'The Big Short': '2015',
 'Mio in the Land of Faraway': '1987',
 'Empire of the Sun': '1987',
 'Henry V': '1989',
 'Newsies': '1992',
 'Prince of Jutland': '1993',
 'Swing Kids': '1993',
 'Little Women': '1994',
 'Pocahontas': '1995',
 'Metroland': '1997',
 'Velvet Goldmine': '1998',
 'All the Little Animals': '1998',
 "A Midsummer Night's Dream": '1999',
 'Shaft': '2000',
 'American Psycho': '2000',
 "Captain Corelli's Mandolin": '2001',
 'Exodus: Gods and Kings': '2014',
 'Jungle Book': '2018',
 'The Flowers of War': '2011',
 'Weightless': None,
 'The Dark Knight Trilogy': None,
 'Hostiles': None

In [32]:
wiki_movies

{'Empire of the Sun': '1987',
 'Little Women': '1994',
 'American Psycho': '2000',
 'The Machinist': '2004',
 'The Dark Knight': '2005',
 'The Prestige': '2006',
 'Terminator Salvation': '2009',
 'Public Enemies': '2009',
 'The Fighter': '2010',
 'American Hustle': '2013',
 'The Big Short': '2015',
 'Vice': '2018',
 'Mio in the Land of Faraway': '1986',
 'Henry V': '1989',
 'Treasure Island': '1990',
 'Newsies': None,
 'Swing Kids': None,
 'Pocahontas': '1995',
 'Velvet Goldmine': '1997',
 "A Midsummer Night's Dream": '1999',
 'The Beach': None,
 'The Rules of Attraction': None,
 'Shaft': '2000',
 "Captain Corelli's Mandolin": None,
 'All the Little Animals': None,
 'Laurel Canyon': '2002',
 'Prince of Jutland': '1994',
 'Reign of Fire': None,
 'Equilibrium': '2002',
 "Howl's Moving Castle": None,
 'Harsh Times': '2005',
 'The New World': None,
 'Rescue Dawn': None,
 "I'm Not There": None,
 'W': None,
 'The Flowers of War': '2010',
 'Out of the Furnace': '2013',
 'Knight of Cups': None

### 3. Results evaluation

Since neither DBpedia nor Wiki bio-page contains full and comprehensive data on actor's movies I decided to settle with the two metrics:
- _agreeableness_ between the two sources;
- _scrupulousness_ - a share of movie titles with release year mentioned.

In [25]:
def agreeableness(dict1: dict, dict2: dict) -> float:
    matches = 0
    for key, value in dict1.items():
        if key in dict2:
            if dict1[key] == dict2[key]:
                matches += 1
            else:
                matches += 0.9
    return matches / (len(dict1) + len(dict2)) * 2

In [26]:
def scrupulousness(dict1: dict) -> float:
    count = 0
    for key, value in dict1.items():
        if value:
            count += 1
    return count / len(dict1)

In [42]:
actors = ['Christian_Bale', 'Matthew_McConaughey', 'Edward_Norton']
for actor in actors:
    print(f"\n\u001b[1m{actor.replace('_', ' ')}\u001b[0m")
    dbpedia_movies, wiki_movies = get_dbpedia_and_wiki(actor)
    print(f'DBpedia: movie titles = {len(dbpedia_movies)}, scrupulousness = {scrupulousness(dbpedia_movies):.2}')
    print(f'Wikipedia: movie titles = {len(wiki_movies)}, scrupulousness = {scrupulousness(wiki_movies):.2}')
    print(f'agreeableness = {agreeableness(dbpedia_movies, wiki_movies):.2}')                                           


[1mChristian Bale[0m
DBpedia: movie titles = 40, scrupulousness = 0.85
Wikipedia: movie titles = 40, scrupulousness = 0.68
agreeableness = 0.76

[1mMatthew McConaughey[0m
DBpedia: movie titles = 36, scrupulousness = 0.92
Wikipedia: movie titles = 39, scrupulousness = 0.82
agreeableness = 0.83

[1mEdward Norton[0m
DBpedia: movie titles = 20, scrupulousness = 0.9
Wikipedia: movie titles = 27, scrupulousness = 0.96
agreeableness = 0.79
