In [1]:
import requests
import re
import json
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import time


def _as_list(value):
    """Coerce a JSON-LD value (missing, a single object, or a list) into a list."""
    if value is None:
        return []
    return value if isinstance(value, list) else [value]


def _as_dict(value):
    """Return ``value`` when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def get_movie_info(movie_id, timeout=30):
    """
        Extract movie information, including AI Summary, from IMDb page.

        Args:
            movie_id: IMDb title identifier, interpolated into
                ``https://www.imdb.com/title/{movie_id}/``.
            timeout: Seconds passed to ``requests.get`` so a stalled
                connection cannot block the calling thread forever.

        Returns:
            dict: Always contains ``'id'`` and ``'summary'`` (empty string when
            the summary element is not found). When the page's JSON-LD block is
            found and parses, it also contains title, description, image, url,
            datePublished, duration, genre, keywords, aggregateRating, actors,
            directors, creators, trailer and review.

        Raises:
            requests.HTTPError: From ``raise_for_status()`` on an error status.
            requests.RequestException: On connection problems or timeout.
    """
    imdb_url = f"https://www.imdb.com/title/{movie_id}/"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    response = requests.get(imdb_url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # --- JSON-LD Extraction (Most info) ---
    # Start with the id so it is present even when the JSON-LD block is
    # missing or cannot be parsed.
    info = {'id': movie_id}

    pattern = r'<script type="application/ld\+json">\s*({.*?})\s*</script>'
    match = re.search(pattern, response.text, re.DOTALL)

    if match:
        try:
            data = json.loads(match.group(1))

            # These fields may be absent, null, a single object or a list;
            # normalise them so one odd page does not raise.
            actors = [p for p in _as_list(data.get('actor')) if isinstance(p, dict)]
            directors = [p for p in _as_list(data.get('director')) if isinstance(p, dict)]
            creators = [p for p in _as_list(data.get('creator')) if isinstance(p, dict)]
            trailer = _as_dict(data.get('trailer'))
            thumbnail = _as_dict(trailer.get('thumbnail'))

            info = {
                'id': movie_id,
                'title': data.get('name'),
                'description': data.get('description'),
                # Placeholder keeps 'summary' near the top of the saved JSON;
                # the real value is filled in below.
                'summary': '',
                'image': data.get('image'),
                'url': data.get('url'),
                'datePublished': data.get('datePublished'),
                'duration': data.get('duration'),
                'genre': data.get('genre', []),
                'keywords': data.get('keywords'),
                'aggregateRating': data.get('aggregateRating', {}),
                'actors': [{'name': actor.get('name'), 'url': actor.get('url')} for actor in actors],
                'directors': [{'name': director.get('name'), 'url': director.get('url')} for director in directors],
                'creators': [
                    {'name': creator.get('name'), 'url': creator.get('url'), 'type': creator.get('@type')}
                    for creator in creators
                ],
                'trailer': {
                    'name': trailer.get('name'),
                    'url': trailer.get('url'),
                    'embedUrl': trailer.get('embedUrl'),
                    'thumbnail': thumbnail.get('contentUrl'),
                    'duration': trailer.get('duration'),
                    'uploadDate': trailer.get('uploadDate')
                },
                'review': data.get('review', {})
            }
        except (KeyError, json.JSONDecodeError) as e:
            # Best effort: report and fall through with the minimal record.
            print(f"Error parsing JSON-LD data: {e}")

    # --- Summary Extraction ---

    soup = BeautifulSoup(response.text, 'html.parser')
    ai_summary_div = soup.find('div', {'data-testid': 'ai-review-summary-text'})

    ai_summary_content = ''
    if ai_summary_div:
        inner_content_div = ai_summary_div.find('div', class_='ipc-html-content-inner-div')
        if inner_content_div:
            ai_summary_content = inner_content_div.get_text(strip=True)

    info['summary'] = ai_summary_content

    return info


def save_movie_info(movie_info, filename):
    """Save movie info to a JSON file, creating the parent directory if needed.

    Args:
        movie_info: JSON-serialisable object to write.
        filename: Destination path. Any missing parent directories are
            created first so the write does not fail with FileNotFoundError.

    The file is written as UTF-8 with ``indent=2`` and ``ensure_ascii=False``,
    so non-ASCII characters are stored as-is rather than as ``\\uXXXX`` escapes.
    """
    import os  # local import keeps this block self-contained

    parent_dir = os.path.dirname(filename)
    if parent_dir:
        # exist_ok=True makes this a no-op when the directory already exists.
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(movie_info, f, indent=2, ensure_ascii=False)
    print(f"Movie info saved to {filename}")

In [2]:
import pandas as pd

# Load the movie list from a CSV file, using a path relative to the notebook's
# working directory. Its `tconst` column is what a later cell turns into the
# list of IDs to fetch.
df_id = pd.read_csv('../data/top10k_movies.csv')

# Preview the first rows to check the columns that were read.
display(df_id.head())

Unnamed: 0,tconst,title,startYear,genres,averageRating,numVotes
0,tt0111161,The Shawshank Redemption,1994.0,Drama,9.3,3113920
1,tt0468569,The Dark Knight,2008.0,"Action,Crime,Drama",9.1,3089698
2,tt1375666,Inception,2010.0,"Action,Adventure,Sci-Fi",8.8,2744591
3,tt0137523,Fight Club,1999.0,"Crime,Drama,Thriller",8.8,2529183
4,tt0109830,Forrest Gump,1994.0,"Drama,Romance",8.8,2431823


In [12]:
def fetch_all_movie_infos(list, start=0, end=10000, max_workers=10, output_dir="../data/all"):
    """Fetch movie info for a slice of IDs concurrently and save one JSON file per movie.

    Args:
        list: Sequence of movie IDs. The name shadows the builtin ``list``
            inside this function; it is kept so existing callers still work.
        start: Index of the first ID to process (inclusive).
        end: Index at which to stop (exclusive).
        max_workers: Number of threads in the ``ThreadPoolExecutor``.
        output_dir: Directory for the ``<movie_id>.json`` files. The default
            is the location that was previously hard-coded.

    Returns:
        None. Progress and failures are printed, and results are written to
        disk. Nothing is returned so that calling this as the last expression
        of a notebook cell does not echo every payload.
    """
    id_list = list[start:end]
    # Only the count is reported, so remember IDs rather than whole payloads.
    fetched_ids = set()
    start_time = time.time()

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_id = {executor.submit(get_movie_info, movie_id): movie_id for movie_id in id_list}

        for i, future in enumerate(as_completed(future_to_id)):
            movie_id = future_to_id[future]
            try:
                data = future.result()
                if data:
                    fetched_ids.add(movie_id)
                    save_movie_info(data, f"{output_dir}/{movie_id}.json")
            except Exception as e:
                # One failing movie (fetch or save) must not abort the batch.
                print(f"[FAILED] {movie_id}: {e}")

            if (i + 1) % 10 == 0:
                elapsed = time.time() - start_time
                # A coarse clock can report zero elapsed time; avoid dividing by it.
                rate = (i + 1) / elapsed if elapsed > 0 else float('inf')
                print(f"Processed {i+1}/{len(id_list)} in {elapsed:.1f}s "
                      f"({rate:.2f} movies/s)")

    print(f"\n✅ Done! {len(fetched_ids)} movies fetched in {time.time() - start_time:.1f}s")

In [8]:
# Turn the `tconst` column into a plain Python list so it can be sliced by
# position and handed to fetch_all_movie_infos.
# NOTE(review): this cell's execution count (8) is lower than that of the
# function-definition cell above it (12), so the cells were run out of order;
# confirm the notebook still works with Restart Kernel -> Run All.
id_list = df_id['tconst'].tolist()

In [15]:
# Process one batch: the IDs at positions 3000-3999 of id_list (`end` is
# exclusive), using 10 worker threads. Each fetched record is written to
# ../data/all/<movie_id>.json; progress is printed every 10 completed movies.
fetch_all_movie_infos(id_list, start=3000, end=4000, max_workers=10)

Movie info saved to ../data/all/tt2401878.json
Movie info saved to ../data/all/tt15318872.json
Movie info saved to ../data/all/tt0089469.json
Movie info saved to ../data/all/tt0317303.json
Movie info saved to ../data/all/tt0386032.json
Movie info saved to ../data/all/tt0374536.json
Movie info saved to ../data/all/tt0110989.json
Movie info saved to ../data/all/tt0103241.json
Movie info saved to ../data/all/tt0403702.json
Movie info saved to ../data/all/tt0243736.json
Processed 10/1000 in 2.8s (3.58 movies/s)
Movie info saved to ../data/all/tt0340377.json
Movie info saved to ../data/all/tt0116683.json
Movie info saved to ../data/all/tt0790628.json
Movie info saved to ../data/all/tt1586265.json
Movie info saved to ../data/all/tt4877122.json
Movie info saved to ../data/all/tt7766378.json
Movie info saved to ../data/all/tt0287717.json
Movie info saved to ../data/all/tt1937149.json
Movie info saved to ../data/all/tt0414055.json
Movie info saved to ../data/all/tt13186482.json
Processed 20/100