In [1]:
import requests
import re
import json
from bs4 import BeautifulSoup


def get_movie_info(movie_id):
    """
        Extract movie information, including AI Summary, from IMDb page.
    """
    imdb_url = f"https://www.imdb.com/title/{movie_id}/"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    response = requests.get(imdb_url, headers=headers)
    response.raise_for_status()
    
    # --- JSON-LD Extraction (Most info) ---
    info = {}

    pattern = r'<script type="application/ld\+json">\s*({.*?})\s*</script>'
    match = re.search(pattern, response.text, re.DOTALL)
    
    if match:
        try:
            data = json.loads(match.group(1))

            info = {
                'id': movie_id,
                'title': data.get('name'),
                'description': data.get('description'),
                'summary': '',
                'image': data.get('image'),
                'url': data.get('url'),
                'datePublished': data.get('datePublished'),
                'duration': data.get('duration'),
                'genre': data.get('genre', []),
                'keywords': data.get('keywords'),
                'aggregateRating': data.get('aggregateRating', {}),
                'actors': [{'name': actor.get('name'), 'url': actor.get('url')} for actor in data.get('actor', [])],
                'directors': [{'name': director.get('name'), 'url': director.get('url')} for director in data.get('director', [])],
                'creators': [{'name': creator.get('name'), 'url': creator.get('url'), 'type': creator.get('@type')} for creator in data.get('creator', [])],
                'trailer': {
                    'name': data.get('trailer', {}).get('name'),
                    'url': data.get('trailer', {}).get('url'),
                    'embedUrl': data.get('trailer', {}).get('embedUrl'),
                    'thumbnail': data.get('trailer', {}).get('thumbnail', {}).get('contentUrl'),
                    'duration': data.get('trailer', {}).get('duration'),
                    'uploadDate': data.get('trailer', {}).get('uploadDate')
                },
                'review': data.get('review', {})
            }
        except (KeyError, json.JSONDecodeError) as e:
            print(f"Error parsing JSON-LD data: {e}")
    else:
        info['id'] = movie_id

    # --- Summary Extraction ---
    
    soup = BeautifulSoup(response.text, 'html.parser')
    ai_summary_div = soup.find('div', {'data-testid': 'ai-review-summary-text'})
    
    ai_summary_content = ''
    if ai_summary_div:
        inner_content_div = ai_summary_div.find('div', class_='ipc-html-content-inner-div')
        if inner_content_div:
            ai_summary_content = inner_content_div.get_text(strip=True)

    info['summary'] = ai_summary_content
        
    return info


def save_movie_info(movie_info, filename):
    """Save movie info to JSON file"""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(movie_info, f, indent=2, ensure_ascii=False)
    print(f"Movie info saved to {filename}")

In [7]:
import pandas as pd

df_id = pd.read_csv('top10k_movies.csv')

display(df_id.head())

Unnamed: 0,tconst,title,startYear,genres,averageRating,numVotes
0,tt0111161,The Shawshank Redemption,1994.0,Drama,9.3,3113920
1,tt0468569,The Dark Knight,2008.0,"Action,Crime,Drama",9.1,3089698
2,tt1375666,Inception,2010.0,"Action,Adventure,Sci-Fi",8.8,2744591
3,tt0137523,Fight Club,1999.0,"Crime,Drama,Thriller",8.8,2529183
4,tt0109830,Forrest Gump,1994.0,"Drama,Romance",8.8,2431823


In [None]:
id_list = df_id['tconst'].tolist()

['tt0111161',
 'tt0468569',
 'tt1375666',
 'tt0137523',
 'tt0109830',
 'tt0816692',
 'tt0110912',
 'tt0133093',
 'tt0120737',
 'tt0167260']

In [23]:
start = 4000
end = 5000

movie_info_batch = {}

In [24]:
for i, movie_id in enumerate(id_list[start:end]):
    print(i)
    
    print(f"Fetching movie info for {movie_id}...")
    movie_info = get_movie_info(movie_id)
    
    if movie_info:
        movie_info_batch[movie_id] = movie_info
        filename = f"data/all/{movie_id}.json"
        save_movie_info(movie_info, filename)
        
        print(f"\nMovie: {movie_info.get('title')}")
        print(f"Year: {movie_info.get('year')}")
        print(f"Rating: {movie_info.get('rating')}")
        print(f"Genres: {', '.join(movie_info.get('genres', []))}")
    else:
        print("Failed to extract movie information")

save_movie_info(movie_info_batch, f"data/batches/movies_{start}_{end}.json")

0
Fetching movie info for tt1727770...
Movie info saved to data/all/tt1727770.json

Movie: Absolutely Anything
Year: None
Rating: None
Genres: 
1
Fetching movie info for tt0090022...
Movie info saved to data/all/tt0090022.json

Movie: Silverado
Year: None
Rating: None
Genres: 
2
Fetching movie info for tt8508734...
Movie info saved to data/all/tt8508734.json

Movie: His House
Year: None
Rating: None
Genres: 
3
Fetching movie info for tt0430308...
Movie info saved to data/all/tt0430308.json

Movie: Get Rich or Die Tryin&apos;
Year: None
Rating: None
Genres: 
4
Fetching movie info for tt11127680...
Movie info saved to data/all/tt11127680.json

Movie: Boiling Point
Year: None
Rating: None
Genres: 
5
Fetching movie info for tt0433386...
Movie info saved to data/all/tt0433386.json

Movie: The Grudge 2
Year: None
Rating: None
Genres: 
6
Fetching movie info for tt12680684...
Movie info saved to data/all/tt12680684.json

Movie: È stata la mano di Dio
Year: None
Rating: None
Genres: 
7
Fetching