# Importing Libraries

In [83]:
import requests
import json
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

# Data Preprocessing

This section cleans the weekend box-office report downloaded from the BFI, extracting the information needed to look up each film's details on TMDB.

In [81]:
# Load the BFI weekend box-office report and preview its first three rows
bfi_report_path = 'bfi-weekend-box-office-report-2024-08-09-11.csv'
df = pd.read_csv(bfi_report_path)
df.head(3)

Unnamed: 0,BFI Weekend Box Office 09/08/2024 - 11/08/2024,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,#,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,Rank,Film,Country of Origin,Weekend Gross,Distributor,% change on last week,Weeks on release,Number of cinemas,Site average,Total Gross to date,,,,,,,,
1,1,It Ends With Us,USA,4516760,Sony Pictures,-,1,625,"£7,227",4516760,,,,,,,,
2,2,Deadpool & Wolverine,UK/USA,4083378,Disney,-49%,3,713,"£5,727",42986728,,,,,,,,


In [84]:
# Extracting the relevant columns: Film, Weekend Gross, and Total Gross to Date.
# .copy() makes df_cleaned an independent frame, so the column assignments
# below neither raise SettingWithCopyWarning nor depend on `df`.
df_cleaned = df[['Unnamed: 1', 'Unnamed: 3', 'Unnamed: 9']].copy()

# Renaming the columns for clarity
df_cleaned.columns = ['Film', 'Weekend_Gross', 'Total_Gross_to_Date']

# Removing the embedded header row, rows with no film name, and rows whose
# weekend gross is a bare '-' placeholder
df_cleaned = df_cleaned[df_cleaned['Film'] != 'Film']
df_cleaned = df_cleaned.dropna(subset=['Film'])
df_cleaned = df_cleaned[df_cleaned['Weekend_Gross'] != '-'].copy()

# Converting both money columns to numbers: padded dash placeholders become
# NaN, '£' and thousands separators are stripped, and anything that still
# cannot be parsed is coerced to NaN.
for money_col in ['Weekend_Gross', 'Total_Gross_to_Date']:
    stripped = (
        df_cleaned[money_col]
        .replace({' - ': np.nan, '  -  ': np.nan, ' -': np.nan})
        .replace('[£,]', '', regex=True)
    )
    df_cleaned[money_col] = pd.to_numeric(stripped, errors='coerce')

# Keep the total as float (the column is already numeric here, so no further
# string clean-up is needed)
df_cleaned['Total_Gross_to_Date'] = df_cleaned['Total_Gross_to_Date'].astype(float)

# Drop rows without a usable weekend gross
df_cleaned = df_cleaned.dropna(subset=['Weekend_Gross'])
# Drop the summary 'Total' row
df_cleaned = df_cleaned[df_cleaned['Film'] != 'Total']


# Scrape and Retrieve the weekend Gross

This improves the workflow by scraping the data directly from the Box Office Mojo UK weekend earnings pages and then passing the films on to TMDB.

It is beneficial because it allows the front end to be automated in future prototypes.

The user inputs the date for which they would like to see the film grosses, which makes it possible to retrieve historical data going back beyond January 2023.

When converting it would h

In [4]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Ask user to input date
date_input = input("Enter the date in the format YYYY-MM-DD: ")

# Convert the date to its ISO week. The year must be the ISO year too:
# around New Year the calendar year and the ISO-week year differ
# (e.g. 2024-12-30 falls in ISO week 1 of 2025).
date = pd.to_datetime(date_input)
iso_calendar = date.isocalendar()
week = iso_calendar.week
year = iso_calendar.year

# URL for the Box Office Mojo UK weekend page; the week number is zero-padded
# to two digits (e.g. 2024W05)
url = f'https://www.boxofficemojo.com/weekend/{year}W{week:02d}/?area=GB&ref_=bo_wey_table_3'

REQUEST_TIMEOUT = 30  # seconds; stops a stalled connection from hanging the notebook

response = requests.get(url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()  # fail loudly instead of parsing an error page
soup = BeautifulSoup(response.content, 'html.parser')

# Find all film links on the weekend page
film_links = {}
films_table = soup.find('table')

# Extract all films and their links from the table
if films_table:
    rows = films_table.find_all('tr')[1:16]  # Only process the first 15 rows (skip the header)
    for row in rows:
        film_cell = row.find('a')
        if film_cell and 'href' in film_cell.attrs:
            film_name = film_cell.text.strip()
            film_link = 'https://www.boxofficemojo.com' + film_cell['href']
            film_links[film_name] = film_link

# Column labels assigned to the cells scraped from each film's table
film_columns = ['Date', 'Rank', 'Weekend Gross', 'Gross Change/Week', 'Theater', 'Change', 'Average', 'To Date', 'Weekend Since Release', 'Been Preprocessed']

# Collect one DataFrame per film and concatenate once at the end
# (concatenating inside the loop re-copies the accumulated data every time)
film_frames = []

# Scrape data for each film
for film, link in film_links.items():
    try:
        film_response = requests.get(link, timeout=REQUEST_TIMEOUT)
        film_response.raise_for_status()
    except requests.RequestException as error:
        # Best effort: one unreachable film page should not abort the whole run
        print(f'Skipping {film}: {error}')
        continue
    film_soup = BeautifulSoup(film_response.content, 'html.parser')

    # Find the table containing weekend grossings
    table = film_soup.find('table')
    if not table:
        continue

    # Extract the table data, skipping the header row and any row without <td> cells
    data = [
        cells
        for cells in (
            [col.text.strip() for col in row.find_all('td')]
            for row in table.find_all('tr')[1:]
        )
        if cells
    ]

    # Convert data to DataFrame
    film_data = pd.DataFrame(data, columns=film_columns)
    film_data['Film'] = film  # Add film name to the DataFrame
    film_frames.append(film_data)

if film_frames:
    all_films_data = pd.concat(film_frames, ignore_index=True)
else:
    # Nothing scraped: keep the expected columns so the steps below still run
    all_films_data = pd.DataFrame(columns=film_columns + ['Film'])

# Drop unnecessary columns
all_films_data = all_films_data.drop(columns=['Been Preprocessed'])

# Convert to GBP by multiplying by 0.78
# NOTE(review): this assumes the scraped figures are in USD and applies one
# fixed rate regardless of the weekend's date - confirm.
exchange_rate = 0.78

# Strip everything except digits and '.', convert to numeric (unparseable
# values such as '-' become NaN), then apply the exchange rate
for col in ['Weekend Gross', 'To Date', 'Average']:
    numeric_values = pd.to_numeric(all_films_data[col].str.replace(r'[^\d.]', '', regex=True), errors='coerce')
    all_films_data[col] = numeric_values * exchange_rate

# Save to CSV
all_films_data.to_csv('gross.csv', index=False)


In [2]:
# Display the weekend-gross table built by the scraping cell
all_films_data

Unnamed: 0,Date,Rank,Weekend Gross,Gross Change/Week,Theater,Change,Average,To Date,Weekend Since Release,Film
0,Jul 26-28,1,17331928.38,-,-,-,,17331928.38,1,Deadpool & Wolverine
1,Aug 2-4,1,8011635.06,-53.8%,715,-,11204.70,33394434.06,2,Deadpool & Wolverine
2,Aug 9-11,2,4063477.08,-49.3%,713,-2,5698.68,42777228.78,3,Deadpool & Wolverine
3,Aug 16-18,3,2416078.86,-40.5%,-,-,,48211877.22,4,Deadpool & Wolverine
4,Aug 23-25,3,1833927.42,-24.1%,677,-,2708.16,53687815.74,5,Deadpool & Wolverine
...,...,...,...,...,...,...,...,...,...,...
86,Jun 21-23,43,134.94,+652.2%,4,+3,33.54,9939832.50,25,One Life
87,Jul 5-7,34,25.74,-,2,-,12.48,9940165.56,27,One Life
88,Jul 12-14,37,61.62,+139.4%,1,-1,61.62,9940486.92,28,One Life
89,Jul 19-21,34,84.24,+36.7%,1,-,84.24,9940571.16,29,One Life


# Data Retrieval from an External Source: TMDB

After retrieving the weekend gross figures from Box Office Mojo, we can enrich each film with further information from TMDB for the visualisation.

This is beneficial because it brings the web app to life: users can not only interact with the BFI website, but are also likely to spend more time on it.

It provides an accessible user interface where users can find information on films they may not have heard of before. Rather than leaving the BFI website to search for a film, they can see its rating, description and poster (so a film they have already seen is instantly recognisable) and, last but not least, its trailer, which is sourced directly from YouTube (if available).


This process is currently automated from the cell above: the films for the chosen week are passed to the API, and the information retrieved is saved to a CSV file.

Future work would include adding an actors tab and a recommendation system that suggests similar films based on the current film (both possible through the API).

In [3]:
import getpass
import os

import requests
import pandas as pd

# Define base URL and API key
base_url = "https://api.themoviedb.org/3"
# The key is read from the TMDB_API_KEY environment variable (or prompted for)
# so that the credential is never stored in the notebook.
api_key = os.environ.get("TMDB_API_KEY") or getpass.getpass("TMDB API key: ")

TMDB_TIMEOUT = 30  # seconds; stops a stalled connection from hanging the notebook


def _tmdb_get(path, strict=False, **params):
    """GET a TMDB endpoint under ``base_url`` and return the decoded JSON body.

    Query parameters are passed through ``params=`` so that values such as
    film titles containing '&' or spaces are URL-encoded correctly.

    Args:
        path: Endpoint path appended to ``base_url`` (e.g. "/search/movie").
        strict: When True, raise ``requests.HTTPError`` on a non-2xx response;
            when False, return an empty dict for a non-2xx response instead.
        **params: Extra query-string parameters sent alongside the API key.

    Returns:
        The parsed JSON response, or ``{}`` for a failed non-strict request.
    """
    response = requests.get(
        f"{base_url}{path}",
        params={"api_key": api_key, **params},
        timeout=TMDB_TIMEOUT,
    )
    if strict:
        response.raise_for_status()
    elif not response.ok:
        return {}
    return response.json()


# Initialize an empty list to store movie data
content_data = []

# Fetch genre details for movies; strict so a bad key or outage fails here,
# up front, rather than silently producing empty results below
movies_genre_data = _tmdb_get("/genre/movie/list", strict=True)

# Map genre IDs to genre names
movie_genres = {genre['id']: genre['name'] for genre in movies_genre_data['genres']}

# Films to look up (at most 15); computed once so the progress counter is stable
films_to_lookup = all_films_data['Film'].unique()[:15]
total = len(films_to_lookup)

# Iterate over each movie in the DataFrame
for position, movie in enumerate(films_to_lookup, start=1):
    # Search for the movie by name
    search_data = _tmdb_get("/search/movie", query=movie)
    results = search_data.get('results') or []

    # Skip films with no search results, but keep the progress counter accurate
    if not results:
        print('No TMDB match for:', movie)
        print("Remaining:", total - position)
        continue

    # Use the first search result for this film
    first_result = results[0]

    # Extract required details
    content_id = first_result.get('id')
    title = first_result.get('title')
    poster = first_result.get('poster_path')
    backdrop = first_result.get('backdrop_path')
    overview = first_result.get('overview')
    rating = first_result.get('vote_average')
    content_genres_ids = first_result.get('genre_ids', [])
    release_year = first_result.get('release_date', None)

    # Find genre names from genre IDs
    content_genres = [movie_genres.get(genre_id, 'Unknown') for genre_id in content_genres_ids]

    # Fetch the trailer link. Only YouTube-hosted trailers are considered,
    # because the link below is built as a YouTube watch URL.
    trailer_data = _tmdb_get(f"/movie/{content_id}/videos")
    trailers = [
        video for video in trailer_data.get('results', [])
        if video.get('type') == 'Trailer' and video.get('site', 'YouTube') == 'YouTube'
    ]
    # Prefer a trailer whose name contains "official"; otherwise fall back to any trailer
    official_trailers = [
        video for video in trailers
        if 'official' in (video.get('name') or '').lower()
    ]
    chosen_trailer = (official_trailers or trailers or [None])[0]
    trailer_key = chosen_trailer.get('key', 'N/A') if chosen_trailer else 'N/A'

    trailer_link = f"https://www.youtube.com/watch?v={trailer_key}" if trailer_key != 'N/A' else 'N/A'

    # Fetch additional movie details (runtime and origin country)
    content_infos = _tmdb_get(f"/movie/{content_id}")

    runtime = content_infos.get('runtime', None)
    # Origin is taken from the first listed production company, when there is one
    origin = 'N/A'
    production_companies = content_infos.get('production_companies') or []
    if production_companies:
        origin = production_companies[0].get('origin_country', 'N/A')

    # Create a dictionary for the movie
    content_info = {
        'Title': title,
        'Content ID': content_id,
        'Poster': poster,
        'Backdrop': backdrop,
        'Overview': overview,
        'Trailer': trailer_link,
        'Genres': ', '.join(content_genres),
        'Runtime (minutes)': runtime,
        # A result without a rating would make round() raise, so pass None through
        'Rating': round(rating, 2) if rating is not None else None,
        'Release Year': release_year if release_year else None,
        'Origin Country': origin
    }

    print('Processing:', title)
    # Append the movie data to the list
    content_data.append(content_info)

    # Report how many films are still to be processed
    print("Remaining:", total - position)

# Convert the list of movie data to a DataFrame
df_content = pd.DataFrame(content_data)



Processing: Deadpool & Wolverine
Remaining: 8
Processing: Despicable Me 4
Remaining: 7
Processing: Inside Out 2
Remaining: 6
Processing: Kinds of Kindness
Remaining: 5
Processing: The Bikeriders
Remaining: 4
Processing: MaXXXine
Remaining: 3
Processing: Kingdom of the Planet of the Apes
Remaining: 2
Processing: Treasure
Remaining: 1
Processing: One Life
Remaining: 0


In [70]:
# Display the TMDB metadata table built from content_data
df_content

Unnamed: 0,Title,Content ID,Poster,Backdrop,Overview,Trailer,Genres,Runtime (minutes),Rating,Release Year,Origin Country
0,It Ends with Us,1079091,/AjV6jFJ2YFIluYo4GQf13AA1tqu.jpg,/9BQqngPfwpeAfK7c2H3cwIFWIVR.jpg,When a woman's first love suddenly reenters he...,https://www.youtube.com/watch?v=DLET_u31M4M,"Romance, Drama",131,6.91,2024-08-07,US
1,Deadpool & Wolverine,533535,/8cdWjvZQUExUUTzyp4t6EDMubfO.jpg,/yDHYTfA3R0jFYba16jBB1ef8oIt.jpg,A listless Wade Wilson toils away in civilian ...,https://www.youtube.com/watch?v=73_1biulkYk,"Action, Comedy, Science Fiction",128,7.76,2024-07-24,US
2,Despicable Me 4,519182,/wWba3TaojhK7NdycRhoQpsG0FaH.jpg,/lgkPzcOSnTvjeMnuFzozRO5HHw1.jpg,"Gru and Lucy and their girls—Margo, Edith and ...",https://www.youtube.com/watch?v=LtNYaH61dXY,"Animation, Family, Comedy, Action",94,7.25,2024-06-20,US
3,Trap,1032823,/mWV2fNBkSTW67dIotVTXDYZhNBj.jpg,/iAlsYg6dlv1fvOBypM7SldIS1Wl.jpg,A father and teen daughter attend a pop concer...,https://www.youtube.com/watch?v=mps1HbpECIA,"Crime, Thriller, Horror, Mystery",105,6.4,2024-07-31,US
4,Borderlands,365177,/lrlSOlTiIlTQqVqJGW4HaWjJaBJ.jpg,/mKOBdgaEFguADkJhfFslY7TYxIh.jpg,"Returning to her home planet, an infamous boun...",https://www.youtube.com/watch?v=lU_NKNZljoQ,"Action, Science Fiction, Comedy",101,5.2,2024-08-07,US
5,Twisters,718821,/pjnD08FlMAIXsfOLKQbvmO0f0MD.jpg,/58D6ZAvOKxlHjyX9S8qNKSBE9Y.jpg,"As storm season intensifies, the paths of form...",https://www.youtube.com/watch?v=AZbEi95SuMg,"Action, Adventure, Drama",123,7.02,2024-07-10,US
6,Inside Out 2,1022789,/vpnVM9B6NMmQpWeZvzLvDESb2QY.jpg,/stKGOm8UyhuLPR9sZLjs5AkmncA.jpg,Teenager Riley's mind headquarters is undergoi...,https://www.youtube.com/watch?v=LEjhY15eCx0,"Animation, Family, Adventure, Comedy",97,7.68,2024-06-11,US
7,Harold and the Purple Crayon,826510,/dEsuQOZwdaFAVL26RjgjwGl9j7m.jpg,/6IrZ3C8qSZ8Tbb32s41ReJOXpI0.jpg,"Inside of his book, adventurous Harold can mak...",https://www.youtube.com/watch?v=-itXhXgatsI,"Adventure, Family, Fantasy, Comedy",90,6.56,2024-07-31,US
8,Longlegs,1226578,/5aj8vVGFwGVbQQs26ywhg4Zxk2L.jpg,/tabKOXkHRu6Nho2VOYrnyAirtY7.jpg,FBI Agent Lee Harker is assigned to an unsolve...,https://www.youtube.com/watch?v=OG7wOTE8NhE,"Crime, Horror, Thriller",101,6.6,2024-07-10,US
9,Spider-Man 2,558,/olxpyq9kJAZ2NU1siLshhhXEPR7.jpg,/8G6HCS82vNxgg5wp7oBDSk32XpF.jpg,Peter Parker is going through a major identity...,https://www.youtube.com/watch?v=3jBFwltrxJw,"Action, Adventure, Science Fiction",127,7.28,2004-06-25,US


In [73]:
# Save the TMDB metadata to movie_data.csv, omitting the DataFrame index
df_content.to_csv('movie_data.csv', index=False)

In [76]:
# Display the scraped weekend-gross table again
all_films_data

Unnamed: 0,Date,Rank,Weekend Gross,Gross Change/Week,Theater,Change,Average,To Date,Weekend Since Release,Been Preprocessed,Film
0,Aug 9-11,1,4379496.96,-,625,-,7006.44,4379496.96,1,false,It Ends with Us
1,Aug 16-18,2,2788412.64,-36.3%,678,+53,4112.36,11200528.88,2,false,It Ends with Us
2,Aug 23-25,2,1855831.08,-33.4%,693,+15,2677.48,15931765.24,3,false,It Ends with Us
3,Jul 26-28,1,16887519.96,-,-,-,0.00,16887519.96,1,false,Deadpool & Wolverine
4,Aug 2-4,1,7806208.52,-53.8%,715,-,10917.40,32538166.52,2,false,Deadpool & Wolverine
...,...,...,...,...,...,...,...,...,...,...,...
377,Jun 2-4,34,530.48,-,1,-,530.48,81941.68,34,false,Exhibition on Screen: Hopper - An American Lov...
378,Jul 28-30,37,313.88,-,1,-,313.88,84878.32,42,false,Exhibition on Screen: Hopper - An American Lov...
379,Aug 9-11,39,633.08,-,1,-,633.08,87398.48,96,false,Exhibition on Screen: Hopper - An American Lov...
380,Aug 9-11,40,507.68,-,12,-,41.80,507.68,1,false,Leech


In [77]:
# Display the TMDB metadata table again
df_content

Unnamed: 0,Title,Content ID,Poster,Backdrop,Overview,Trailer,Genres,Runtime (minutes),Rating,Release Year,Origin Country
0,It Ends with Us,1079091,/AjV6jFJ2YFIluYo4GQf13AA1tqu.jpg,/9BQqngPfwpeAfK7c2H3cwIFWIVR.jpg,When a woman's first love suddenly reenters he...,https://www.youtube.com/watch?v=DLET_u31M4M,"Romance, Drama",131,6.91,2024-08-07,US
1,Deadpool & Wolverine,533535,/8cdWjvZQUExUUTzyp4t6EDMubfO.jpg,/yDHYTfA3R0jFYba16jBB1ef8oIt.jpg,A listless Wade Wilson toils away in civilian ...,https://www.youtube.com/watch?v=73_1biulkYk,"Action, Comedy, Science Fiction",128,7.76,2024-07-24,US
2,Despicable Me 4,519182,/wWba3TaojhK7NdycRhoQpsG0FaH.jpg,/lgkPzcOSnTvjeMnuFzozRO5HHw1.jpg,"Gru and Lucy and their girls—Margo, Edith and ...",https://www.youtube.com/watch?v=LtNYaH61dXY,"Animation, Family, Comedy, Action",94,7.25,2024-06-20,US
3,Trap,1032823,/mWV2fNBkSTW67dIotVTXDYZhNBj.jpg,/iAlsYg6dlv1fvOBypM7SldIS1Wl.jpg,A father and teen daughter attend a pop concer...,https://www.youtube.com/watch?v=mps1HbpECIA,"Crime, Thriller, Horror, Mystery",105,6.4,2024-07-31,US
4,Borderlands,365177,/lrlSOlTiIlTQqVqJGW4HaWjJaBJ.jpg,/mKOBdgaEFguADkJhfFslY7TYxIh.jpg,"Returning to her home planet, an infamous boun...",https://www.youtube.com/watch?v=lU_NKNZljoQ,"Action, Science Fiction, Comedy",101,5.2,2024-08-07,US
5,Twisters,718821,/pjnD08FlMAIXsfOLKQbvmO0f0MD.jpg,/58D6ZAvOKxlHjyX9S8qNKSBE9Y.jpg,"As storm season intensifies, the paths of form...",https://www.youtube.com/watch?v=AZbEi95SuMg,"Action, Adventure, Drama",123,7.02,2024-07-10,US
6,Inside Out 2,1022789,/vpnVM9B6NMmQpWeZvzLvDESb2QY.jpg,/stKGOm8UyhuLPR9sZLjs5AkmncA.jpg,Teenager Riley's mind headquarters is undergoi...,https://www.youtube.com/watch?v=LEjhY15eCx0,"Animation, Family, Adventure, Comedy",97,7.68,2024-06-11,US
7,Harold and the Purple Crayon,826510,/dEsuQOZwdaFAVL26RjgjwGl9j7m.jpg,/6IrZ3C8qSZ8Tbb32s41ReJOXpI0.jpg,"Inside of his book, adventurous Harold can mak...",https://www.youtube.com/watch?v=-itXhXgatsI,"Adventure, Family, Fantasy, Comedy",90,6.56,2024-07-31,US
8,Longlegs,1226578,/5aj8vVGFwGVbQQs26ywhg4Zxk2L.jpg,/tabKOXkHRu6Nho2VOYrnyAirtY7.jpg,FBI Agent Lee Harker is assigned to an unsolve...,https://www.youtube.com/watch?v=OG7wOTE8NhE,"Crime, Horror, Thriller",101,6.6,2024-07-10,US
9,Spider-Man 2,558,/olxpyq9kJAZ2NU1siLshhhXEPR7.jpg,/8G6HCS82vNxgg5wp7oBDSk32XpF.jpg,Peter Parker is going through a major identity...,https://www.youtube.com/watch?v=3jBFwltrxJw,"Action, Adventure, Science Fiction",127,7.28,2004-06-25,US
