In [None]:
import json
import os

import pandas as pd
import requests

In [None]:
# %pip install PyMovieDb  (uncomment for the first run; %pip installs into the running kernel's environment)
from PyMovieDb import IMDB

In [None]:
# Create an IMDb object
ia = IMDB()


def safe_get(d, keys, default="null"):
    """Follow ``keys`` one by one into the nested container ``d``.

    Args:
        d: Nested structure of dicts and/or lists to walk.
        keys: Sequence of dict keys / list indexes applied in order.
        default: Value returned when any step cannot be resolved.

    Returns:
        The value found at the end of the path, or ``default`` when a key is
        missing, an index is out of range, or an intermediate value is not
        subscriptable.
    """
    try:
        for key in keys:
            d = d[key]
        return d
    # LookupError covers both KeyError (missing dict key) and IndexError
    # (e.g. an empty "director" list indexed with 0); TypeError covers
    # subscripting None or a scalar.
    except (LookupError, TypeError):
        return default


# Initialize lists to store data (one entry per scraped movie, kept in step)
names = []
years = []
rating_values = []
rating_counts = []
genres = []
date_created = []
directors = []
durations = []
content_ratings = []

# Define the number of pages and the batch size
num_pages = 1  # Pages to scrape: 1 page = 50 rows, so 40 pages = 2000 rows
batch_size = 50  # Number of movies per page

# Loop through the pages
for page in range(num_pages):
    start_id = page * batch_size + 1
    print(f"Scraping page {page + 1} (start_id = {start_id})")

    # Fetch popular movies for the current page
    movies_response = ia.popular_movies(genre=None, start_id=start_id, sort_by=None)

    # The client returns a JSON string; decode it and pull out the "results" list
    movies_data = json.loads(movies_response)
    movies = movies_data.get("results", [])

    # Extract and append movie details to the lists
    for movie in movies:
        title = movie.get('name')
        year = movie.get('year')
        movie_id = movie.get('id')  # named movie_id so the builtin id() is not shadowed

        # Fetch the specific movie details (also returned as a JSON string)
        res_data = json.loads(ia.get_by_id(movie_id))

        names.append(title)
        years.append(year)
        rating_values.append(safe_get(res_data, ["rating", "ratingValue"], "null"))
        rating_counts.append(safe_get(res_data, ["rating", "ratingCount"], "null"))
        genres.append(safe_get(res_data, ["genre"], "null"))
        date_created.append(safe_get(res_data, ["review", "dateCreated"], "null"))
        # Only the first listed director is recorded
        directors.append(safe_get(res_data, ["director", 0, "name"], "null"))
        durations.append(safe_get(res_data, ["duration"], "null"))
        content_ratings.append(safe_get(res_data, ["contentRating"], "null"))

# Create a DataFrame from the lists
df = pd.DataFrame({
    'Name': names,
    'Year': years,
    'Rating Value': rating_values,
    'Rating Count': rating_counts,
    'Genre': genres,
    'Date Created': date_created,
    'Director': directors,
    'Duration': durations,
    'Content Rating': content_ratings
})

# Save the DataFrame to a CSV file
df.to_csv('popular_movies.csv', index=False)

# Print the first few rows of the DataFrame
print(df.head())


Scraping page 1 (start_id = 1)
                     Name  Year Rating Value Rating Count  \
0                 Reptile  2023          6.8        47659   
1               Fair Play  2023          6.5        18483   
2          Totally Killer  2023          6.6        23566   
3  The Exorcist: Believer  2023          5.1        14497   
4             The Creator  2023          7.1        43026   

                        Genre Date Created            Director Duration  \
0     [Crime, Drama, Mystery]   2023-09-30        Grant Singer  PT2H14M   
1  [Drama, Mystery, Thriller]   2023-10-06        Chloe Domont  PT1H53M   
2            [Comedy, Horror]   2023-10-06     Nahnatchka Khan  PT1H46M   
3                    [Horror]   2023-10-07  David Gordon Green  PT1H51M   
4  [Action, Adventure, Drama]   2023-09-29      Gareth Edwards  PT2H13M   

  Content Rating  
0              R  
1              R  
2              R  
3              R  
4          PG-13  


In [None]:
# TMDB API key. Prefer supplying it through the TMDB_API_KEY environment
# variable so the credential does not have to live in the notebook.
# NOTE(review): the literal fallback below is a key committed to source; it is
# kept only so existing runs keep working. Rotate it and drop the fallback.
API_KEY = os.environ.get('TMDB_API_KEY', '6943157d21ba443339c2b84261287d57')

def get_director_id(director_name):
    """Search TMDB for a person by name and return the id of the first hit.

    Args:
        director_name: Name to search for; sent as the ``query`` parameter.

    Returns:
        The ``id`` field of the first search result, or ``None`` when the
        search returns no results.

    Raises:
        requests.HTTPError: If TMDB answers with an error status (for example
            an invalid API key), instead of failing later on a missing key.
    """
    # Passing the values via ``params`` lets requests URL-encode them, so names
    # containing characters such as '&', '#' or '+' do not corrupt the query.
    response = requests.get(
        "https://api.themoviedb.org/3/search/person",
        params={"api_key": API_KEY, "query": director_name},
        timeout=30,  # do not hang indefinitely on a stalled connection
    )
    response.raise_for_status()

    results = response.json().get('results') or []
    return results[0]['id'] if results else None

def get_movies_by_director(director_name):
    """List the movies on which the named person is credited with the job 'Director'.

    Resolves the name with ``get_director_id`` and then reads that person's
    TMDB movie credits.

    Args:
        director_name: Name passed to ``get_director_id``.

    Returns:
        A list of dicts with keys ``director_name``, ``title`` and
        ``movie_id``; an empty list when no person id is found.
    """
    person_id = get_director_id(director_name)
    if person_id is None:
        return []

    credits_url = f"https://api.themoviedb.org/3/person/{person_id}/movie_credits?api_key={API_KEY}"
    credits = requests.get(credits_url).json()

    # Keep only crew entries whose job is exactly 'Director'.
    return [
        {
            'director_name': director_name,  # carried along so each row knows its director
            'title': credit['title'],
            'movie_id': credit['id'],
        }
        for credit in credits['crew']
        if credit['job'] == 'Director'
    ]

def get_movie_details(movie_id):
    """Request the TMDB ``/movie/{movie_id}`` endpoint and return its decoded JSON body.

    Args:
        movie_id: TMDB movie id interpolated into the URL.

    Returns:
        Whatever ``response.json()`` yields for that request.
    """
    endpoint = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={API_KEY}"
    return requests.get(endpoint).json()

def get_movie_release_details(movie_id):
    """Request the TMDB ``/movie/{movie_id}/release_dates`` endpoint and return its decoded JSON body.

    Args:
        movie_id: TMDB movie id interpolated into the URL.

    Returns:
        Whatever ``response.json()`` yields for that request.
    """
    endpoint = f"https://api.themoviedb.org/3/movie/{movie_id}/release_dates?api_key={API_KEY}"
    return requests.get(endpoint).json()

def create_movie_dataframe(director_names):
    """Build one DataFrame row per directed movie for every name in ``director_names``.

    For each director the movies come from ``get_movies_by_director``; each
    movie is then enriched with ``get_movie_details`` and
    ``get_movie_release_details``.

    Args:
        director_names: Iterable of director names.

    Returns:
        A tuple ``(df, movies_with_us_releases)`` where ``df`` is the DataFrame
        of collected rows and ``movies_with_us_releases`` is the list of movie
        ids that had a US entry with at least one release date.
    """
    movie_data = []
    movies_with_us_releases = []

    for i, director_name in enumerate(director_names):
        movies = get_movies_by_director(director_name)

        print(f'{i} {director_name} {len(movies)}')   # for testing

        for movie in movies:
            # Default for movies without a usable US release entry.
            mpaa_rating = 'NULL'
            movie_details = get_movie_details(movie['movie_id'])
            movies_release_info = get_movie_release_details(movie['movie_id'])

            # get all US releases of the movie (if any)
            # (there can be multiple releases for a movie)
            try:
                us_releases = [r for r in movies_release_info['results'] if r['iso_3166_1'] == 'US']

                # Only the first US entry and its first release date are consulted.
                if us_releases and us_releases[0].get('release_dates'):
                    movies_with_us_releases.append(movie['movie_id'])
                    # A release date without a 'certification' key keeps the 'NULL' default.
                    mpaa_rating = us_releases[0]['release_dates'][0].get('certification', 'NULL')
            # Malformed or error payloads (missing keys, wrong types) are reported
            # and skipped; KeyboardInterrupt and other non-data errors propagate.
            except (LookupError, TypeError, AttributeError):
                print(f"Error: problem with movies_release_info for movie ID {movie['movie_id']} (title: {movie['title']})")

            movie_data.append({
                'Director Name': movie['director_name'],  # Add a director's name column
                'Title': movie['title'],
                'Budget': movie_details.get('budget'),
                'Box Office': movie_details.get('revenue'),
                'Rating': movie_details.get('vote_average'),
                'Number of Ratings': movie_details.get('vote_count'),
                # Only the first listed genre / production company is kept.
                'Genre': movie_details['genres'][0]['name'] if movie_details.get('genres') else None,
                'Date of Release': movie_details.get('release_date'),
                'Production House': movie_details['production_companies'][0]['name'] if movie_details.get('production_companies') else None,
                'Runtime': movie_details.get('runtime'),
                'MPAA Rating': mpaa_rating
            })

    df = pd.DataFrame(movie_data)
    return df, movies_with_us_releases

# def create_movie_dataframe(director_names):
#     movie_data = []

#     for director_name in director_names:
#         movies = get_movies_by_director(director_name)
#         if movies:
#             for movie in movies:
#                 movie_details = get_movie_details(movie['movie_id'])
#                 movie_data.append({
#                     'Director Name': movie['director_name'],  # Add a director's name column
#                     'Title': movie['title'],
#                     'Budget': movie_details.get('budget'),
#                     'Box Office': movie_details.get('revenue'),
#                     'Rating': movie_details.get('vote_average'),
#                     'Number of Ratings': movie_details.get('vote_count'),
#                     'Genre': movie_details['genres'][0]['name'] if movie_details.get('genres') else None,
#                     'Date of Release': movie_details.get('release_date'),
#                     'Production House': movie_details['production_companies'][0]['name'] if movie_details.get('production_companies') else None,
#                     'Runtime': movie_details.get('runtime'),
#                     'MPAA Rating': movie_details.get('mpaa_rating')  # might need to have a diff api
#                 })

#     df = pd.DataFrame(movie_data)
#     return df


def get_director_names(limit=0):
    """Read ``top_250_directors.csv`` and return the column at position 1.

    Args:
        limit: When strictly between 0 and 250, only the first ``limit`` rows
            are returned; any other value returns every row.

    Returns:
        A pandas Series taken from the second column of the CSV.
    """
    directors = pd.read_csv('top_250_directors.csv')
    row_slice = slice(0, limit) if 0 < limit < 250 else slice(None)
    return directors.iloc[row_slice, 1]


In [None]:
### main function ###

# For a quick manual run, swap in a short hand-written list instead, e.g.
# director_names = ["Christopher Nolan", "Quentin Tarantino"]
director_names = get_director_names()

# Collect the per-movie rows and the ids of movies that had a US release entry.
movie_df, us_movies = create_movie_dataframe(director_names)

# Persist the raw table (the DataFrame index is written as the first column).
movie_df.to_csv('raw_data.csv')

# Persist the US-release movie ids as an indented JSON array.
with open('us_movies.json', 'w') as out_file:
    out_file.write(json.dumps(us_movies, indent=4))

0 Ertem Egilmez 44
1 Akira Kurosawa 32
2 Alfred Hitchcock 61
3 Ingmar Bergman 68
4 Steven Spielberg 43
5 Billy Wilder 27
6 Christopher Nolan 18
7 Stanley Kubrick 16
8 Martin Scorsese 58
9 William Wyler 56
10 Yasujirô Ozu 54
11 Charles Chaplin 0
12 Quentin Tarantino 17
13 Hayao Miyazaki 38
14 Francis Ford Coppola 36
15 Mani Ratnam 29
16 Peter Jackson 21
17 Sergio Leone 8
18 Luis Buñuel 33
19 David Fincher 14
20 Howard Hawks 42
21 Fritz Lang 47
22 Sidney Lumet 56
23 Frank Capra 58
24 Masaki Kobayashi 22
25 George Cukor 58
26 John Ford 141
27 David Lean 17
28 Jean-Luc Godard 137
Error: problem with movies_release_info for movie ID 109701 (title: Histoire(s) du Cinéma)
29 Andrei Tarkovsky 11
30 Elia Kazan 19
31 Woody Allen 56
32 Buster Keaton 34
33 Federico Fellini 27
34 Satyajit Ray 38
Error: problem with movies_release_info for movie ID 158391 (title: The Apu Trilogy)
35 Leonid Gaidai 23
36 Michael Curtiz 134
37 Clint Eastwood 43
38 Hrishikesh Mukherjee 43
39 John Huston 43
40 Mervyn LeR