In [45]:
import time
import requests
import wikipediaapi
# Shared wikipedia-api client (language 'en'); get_wiki_categories() uses it
# to look pages up by key.
# NOTE(review): what the first positional argument ('BigData') means depends on
# the installed wikipedia-api version (recent releases appear to treat it as
# the User-Agent string) — confirm against the pinned version.
wiki_wiki = wikipediaapi.Wikipedia('BigData', 'en')
from bs4 import BeautifulSoup

def search_wiki_page_key(title, year, session=None):
    """Return the Wikipedia page key of the top search hit for a film.

    Sends the query ``"<title> film <year>"`` to the Wikimedia Core REST
    search endpoint for English Wikipedia and returns the ``key`` field of
    the first result.

    Parameters
    ----------
    title : str
        Film title to search for.
    year : str or int
        Year appended to the query to disambiguate the film.
    session : object, optional
        Anything exposing a ``requests``-compatible
        ``get(url, headers=..., params=..., timeout=...)`` method, e.g. a
        ``requests.Session`` for connection reuse. Defaults to the
        ``requests`` module itself.

    Returns
    -------
    str or None
        The page key, or ``None`` when the response holds no usable result
        (no hits, an error payload, or a body that is not valid JSON).

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from the HTTP call (connection failure, timeout, ...).
    """
    http = requests if session is None else session

    language_code = 'en'
    url = f'https://api.wikimedia.org/core/v1/wikipedia/{language_code}/search/page'
    headers = {
        # 'Authorization': 'Bearer YOUR_ACCESS_TOKEN',
        'User-Agent': 'BigDataScraper (randomemail@gmail.com)'
    }
    parameters = {'q': f'{title} film {year}', 'limit': 1}

    # Without a timeout a stalled connection would block the scrape forever.
    timeout_seconds = 30
    # The quota is hourly, so allow up to an hour of 10-minute waits before
    # giving up instead of a single retry.
    max_rate_limit_waits = 6

    response = http.get(url, headers=headers, params=parameters,
                        timeout=timeout_seconds)

    # API limit of 500 per hour
    waits = 0
    while response.status_code == 429 and waits < max_rate_limit_waits:
        print ('Sleeping for 10 mins')

        # Sleep in 1-second steps so ctrl+c can interrupt the wait
        for _ in range(10 * 60):
            time.sleep(1)

        waits += 1
        response = http.get(url, headers=headers, params=parameters,
                            timeout=timeout_seconds)

    try:
        return response.json()['pages'][0]['key']
    except (IndexError, KeyError, TypeError, ValueError):
        # ValueError covers JSON decoding failures; TypeError covers payloads
        # whose shape is not the expected {'pages': [{...}]}.
        return None
    

def get_wiki_categories(wiki_page_key):
    """Return the categories of a Wikipedia page as a list.

    The page is looked up through the module-level ``wiki_wiki`` client and
    the list holds whatever iterating ``page.categories`` yields
    (presumably the category titles — verify against wikipedia-api).
    """
    page = wiki_wiki.page(wiki_page_key)
    return list(page.categories)

def get_infobox(wiki_page_key):
    """Scrape the infobox of an English Wikipedia article into a dict.

    Parameters
    ----------
    wiki_page_key : str
        Page key as returned by ``search_wiki_page_key``.

    Returns
    -------
    dict or None
        Maps the stripped text of each row's first ``<th>`` to the stripped
        text of its first ``<td>``, for rows that have both. ``None`` when
        the page does not exist (HTTP 404) or has no ``infobox`` table.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or any non-404 HTTP error status.
        Raising keeps a transient failure (rate limit, server error) from
        being mistaken for "this page has no infobox".
    """
    from urllib.parse import quote

    # Percent-encode the key: a title containing '?' would otherwise be split
    # off as a query string and fetch the wrong page.
    url = f'https://en.wikipedia.org/wiki/{quote(wiki_page_key)}'

    # Identify the scraper the same way the search request does.
    headers = {'User-Agent': 'BigDataScraper (randomemail@gmail.com)'}

    # Fetch the content from the URL
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code == 404:
        return None
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the infobox within the page
    infobox = soup.find('table', {'class': 'infobox'})
    if infobox is None:
        return None

    # Collect label/value pairs from rows that carry both a header and a data cell
    infobox_data = {}
    for tr in infobox.find_all('tr'):
        header = tr.find('th')
        cell = tr.find('td')
        if header is not None and cell is not None:
            infobox_data[header.text.strip()] = cell.text.strip()

    return infobox_data


In [46]:
# Initialise DuckDB

import duckdb

# Path to store DuckDB with wiki data
con_disk = duckdb.connect(database='wiki_data.duckdb')

# Create the results table on first use so the notebook also runs against a
# fresh database file; an existing table is left untouched.
# Column order must match the frame inserted by the scraping cell
# (`INSERT INTO raw_wiki SELECT * FROM filmT`), which inserts by position.
# NOTE(review): the column types are an assumption (text for everything except
# numVotes) — align them with the schema of any existing wiki_data.duckdb.
con_disk.execute('''
    CREATE TABLE IF NOT EXISTS raw_wiki (
        tconst          VARCHAR,
        pTitle          VARCHAR,
        Year            VARCHAR,
        runtimeMinutes  VARCHAR,
        numVotes        DOUBLE,
        wiki_key        VARCHAR,
        categories      VARCHAR,
        infobox         VARCHAR
    )
''')


In [47]:
# Load all the movie data from training, validation and test sets

# CSV sources that are unioned into one film table.
IMDB_SOURCES = [
    'big-data-course-2024-projects/imdb/train-[1-8].csv',
    'big-data-course-2024-projects/imdb/test_hidden.csv',
    'big-data-course-2024-projects/imdb/validation_hidden.csv',
]

# Character-by-character mapping used by TRANSLATE to strip accents.
ACCENTED_CHARS = 'áàãäåæßçéèêíîïñòóôöøớúûüý'
ASCII_CHARS = 'aaaaaabceeeiiinoooooouuuy'
assert len(ACCENTED_CHARS) == len(ASCII_CHARS), 'TRANSLATE mapping is misaligned'


def _normalised_title_sql(column):
    """SQL expression: trim, lowercase, de-accent, then drop every character
    that is not an ASCII letter, digit or space."""
    return (
        f"REGEXP_REPLACE(TRANSLATE(LOWER(TRIM({column})), "
        f"'{ACCENTED_CHARS}','{ASCII_CHARS}'),'[^a-zA-Z0-9 ]','','g')"
    )


def _select_films_sql(source):
    """SELECT statement that reads one CSV source into the common film schema."""
    return f'''
    SELECT tconst,
        {_normalised_title_sql('primaryTitle')} AS pTitle,
        {_normalised_title_sql('originalTitle')} AS oTitle,
        CASE
            WHEN startYear LIKE '%N%' THEN endYear
            ELSE startYear
        END AS Year,
        runtimeMinutes,numVotes
    FROM '{source}'
    '''


# One SELECT per source, de-duplicated by UNION and ordered as a whole.
film_query = (
    '\n    UNION\n'.join(_select_films_sql(source) for source in IMDB_SOURCES)
    + '\n    ORDER BY Year, tconst\n'
)

con_mem = duckdb.connect(database=':memory:')
film_df = con_mem.execute(film_query).df()

film_df

Unnamed: 0,tconst,pTitle,oTitle,Year,runtimeMinutes,numVotes
0,tt0003740,cabiria,,1914,148,3452.0
1,tt0008663,a man there was,terje vigen,1917,65,1882.0
2,tt0009369,mickey,mickey,1918,93,1119.0
3,tt0010307,jaccuse,,1919,166,1692.0
4,tt0010600,the doll,die puppe,1919,66,1898.0
...,...,...,...,...,...,...
9995,tt9766294,fauji calling,fauji calling,2021,134,2921.0
9996,tt9769668,tughlaq durbar,tughlaq durbar,2021,145,1430.0
9997,tt9784798,judas and the black messiah,judas and the black messiah,2021,126,65194.0
9998,tt9808510,vellam,vellam,2021,154,1731.0


In [None]:
import json

# NOTE(review): `data` is never populated in this cell; kept only in case a
# later cell refers to it.
data = []

# Scrape Wikipedia for every film not yet stored in raw_wiki. Each film is
# written as soon as it is scraped, so an interrupted run resumes where it
# stopped.
for idx, film in film_df.iterrows():

    tconst = film['tconst']
    title = film['pTitle']
    year = film['Year']

    # Check if entry exists in database (bound parameter instead of building
    # the SQL text from the value)
    tconst_exists = con_disk.execute(
        'SELECT EXISTS (SELECT 1 FROM raw_wiki WHERE tconst = ?)',
        [tconst],
    ).fetchone()[0]

    # Skip if it exists already
    if tconst_exists:
        continue

    # Scrape wikipedia data
    wiki_key = search_wiki_page_key(title, year)

    if wiki_key is None:
        # Nothing is stored, so this film is searched again on the next run
        print (idx, tconst, title, year, '--key could not be found--')
        continue

    infobox_data = get_infobox(wiki_key)

    # Check that info box corresponds to a film
    is_film = infobox_data is not None and 'Directed by' in infobox_data

    # Append to film
    film['wiki_key'] = wiki_key

    if is_film:
        # Categories are only fetched for pages that are kept as films
        film['categories'] = json.dumps(get_wiki_categories(wiki_key))
        film['infobox'] = json.dumps(infobox_data)
    else:
        film['categories'] = None
        film['infobox'] = None

    # One-row frame; DuckDB resolves the name `filmT` in the INSERT below to
    # this variable, so the name must not change.
    filmT = film.to_frame().T
    filmT = filmT.drop(columns='oTitle')

    # Save to database
    con_disk.execute("INSERT INTO raw_wiki SELECT * FROM filmT;")

    # Track progress
    print (idx, tconst, title, film['oTitle'], year)

In [49]:
# Close the connection to wiki_data.duckdb that was opened when DuckDB was initialised.
con_disk.close()