In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:

def get_movie_data(start_number=1):
    """
    Fetch one results page of IMDb's top-1000 list, sorted by user rating.

    NOTE(review): a later cell in this notebook defines `get_movie_data`
    again with extra columns; once that cell runs it shadows this version.

    Args:
        start_number: Rank of the first movie on the requested page. A value
            of 1 requests the first page; any other value is sent to IMDb
            through the ``start`` query parameter.

    Returns:
        A list with one ``[title, year, imdb_rating, metascore, votes]`` entry
        per movie container found on the page. ``year`` is the raw text of
        the year element, ``imdb_rating`` is a float, ``metascore`` is an int
        and ``votes`` is the raw ``data-value`` attribute string. Any of
        ``year``, ``imdb_rating``, ``metascore`` or ``votes`` is ``None`` when
        its element is absent from the container.

    Raises:
        requests.HTTPError: If IMDb answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within the timeout.
    """
    # Setting the URL for the first page if start_number is 1
    if start_number == 1:
        url = "https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating"
    else:
        url = f"https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start={start_number}&ref_=adv_nxt"

    # Without a timeout a stalled connection would block the kernel forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on an error response instead of parsing it into zero rows.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all movie containers
    movie_containers = soup.find_all('div', class_='lister-item mode-advanced')

    # Extract movie details from each container
    movie_data = []
    for movie in movie_containers:
        # Movie Title
        title = movie.h3.a.text

        # Movie Year (kept as the raw element text)
        year_tag = movie.h3.find('span', class_='lister-item-year')
        year = year_tag.text if year_tag is not None else None

        # IMDb Rating
        rating_tag = movie.strong
        imdb_rating = float(rating_tag.text) if rating_tag is not None else None

        # MetaScore (not every entry has one)
        m_score = movie.find('span', class_='metascore')
        metascore = int(m_score.text) if m_score is not None else None

        # Votes: the first span named "nv" carries the count in data-value
        votes_tag = movie.find('span', attrs={'name': 'nv'})
        votes = votes_tag['data-value'] if votes_tag is not None else None

        # Append to movie data list
        movie_data.append([title, year, imdb_rating, metascore, votes])

    return movie_data

# Walk the ten result pages (start ranks 1, 101, ..., 901) and pool their rows
all_movie_data = []
for start_number in range(1, 1001, 100):
    all_movie_data += get_movie_data(start_number)

# Tabulate the pooled rows under descriptive column headers
df = pd.DataFrame(
    data=all_movie_data,
    columns=['Title', 'Year', 'IMDb Rating', 'Metascore', 'Votes'],
)

# Optional export to CSV
# df.to_csv('IMDb_Top_1000_Movies.csv', index=False)


In [4]:
# Write the scraped table to disk as CSV, leaving the row index out of the file.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df.to_csv(
    path_or_buf=r"D:\2_WorkTrack\0_Data\Datasets\IMDb_Top_1000_Movies.csv",
    index=False,
)

In [6]:
# Preview the first five rows of the scraped table
df.head(n=5)

Unnamed: 0,Title,Year,IMDb Rating,Metascore,Votes
0,The Shawshank Redemption,(1994),9.3,82.0,2773253
1,The Godfather,(1972),9.2,100.0,1930508
2,The Dark Knight,(2008),9.0,84.0,2748863
3,Schindler's List,(1993),9.0,95.0,1396195
4,12 Angry Men,(1957),9.0,97.0,822693


In [7]:


def get_movie_data(start_number=1):
    """
    Fetch one results page of IMDb's top-1000 list, sorted by user rating,
    including gross revenue and credits.

    Args:
        start_number: Rank of the first movie on the requested page. A value
            of 1 requests the first page; any other value is sent to IMDb
            through the ``start`` query parameter.

    Returns:
        A list with one
        ``[title, year, imdb_rating, metascore, votes, gross, director, stars]``
        entry per movie container found on the page. ``year`` is the raw text
        of the year element, ``imdb_rating`` is a float, ``metascore`` is an
        int, and ``votes`` / ``gross`` are raw ``data-value`` attribute
        strings. ``director`` and ``stars`` are comma-separated name strings.
        Any of ``year``, ``imdb_rating``, ``metascore``, ``votes``, ``gross``
        or ``director`` is ``None`` when its element is absent; ``stars`` is
        an empty string when no star links are found.

    Raises:
        requests.HTTPError: If IMDb answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within the timeout.
    """
    # Setting the URL for the first page if start_number is 1
    if start_number == 1:
        url = "https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating"
    else:
        url = f"https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start={start_number}&ref_=adv_nxt"

    # Without a timeout a stalled connection would block the kernel forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on an error response instead of parsing it into zero rows.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all movie containers
    movie_containers = soup.find_all('div', class_='lister-item mode-advanced')

    # Extract movie details from each container
    movie_data = []
    for movie in movie_containers:
        # Movie Title
        title = movie.h3.a.text

        # Movie Year (kept as the raw element text)
        year_tag = movie.h3.find('span', class_='lister-item-year')
        year = year_tag.text if year_tag is not None else None

        # IMDb Rating
        rating_tag = movie.strong
        imdb_rating = float(rating_tag.text) if rating_tag is not None else None

        # MetaScore (not every entry has one)
        m_score = movie.find('span', class_='metascore')
        metascore = int(m_score.text) if m_score is not None else None

        # Votes and gross revenue: both live in spans named "nv"; the first
        # is read as votes and the second, when present, as gross.
        nv_spans = movie.find_all('span', attrs={'name': 'nv'})
        votes = nv_spans[0]['data-value'] if nv_spans else None
        gross = nv_spans[1]['data-value'] if len(nv_spans) > 1 else None

        # Director and Stars
        director = None
        stars = ""
        credits_tag = movie.find('p', class_='')
        if credits_tag is not None:
            # Assumes the credits paragraph separates director link(s) from
            # star links with a <span class="ghost"> element -- verify
            # against the live page. Splitting there keeps co-directors out
            # of the stars column.
            separator = credits_tag.find('span', class_='ghost')
            if separator is not None:
                # find_previous_siblings yields nearest-first; restore page order.
                director_names = [
                    link.text
                    for link in reversed(separator.find_previous_siblings('a'))
                ]
                star_names = [link.text for link in separator.find_next_siblings('a')]
            else:
                # No separator found: treat the first link as the director
                # and the remaining links as stars.
                credit_links = credits_tag.find_all('a')
                director_names = [link.text for link in credit_links[:1]]
                star_names = [link.text for link in credit_links[1:]]
            director = ", ".join(director_names) if director_names else None
            stars = ", ".join(star_names)

        # Append to movie data list
        movie_data.append([title, year, imdb_rating, metascore, votes, gross, director, stars])

    return movie_data

# Fetch data for all 10 pages (1000 movies)
all_movie_data = []
for start_number in range(1, 1001, 100):
    all_movie_data.extend(get_movie_data(start_number))

# Convert to a DataFrame for better representation
df1 = pd.DataFrame(all_movie_data, columns=['Title', 'Year', 'IMDb Rating', 'Metascore', 'Votes', 'Gross', 'Director', 'Stars'])

# Add a 1-based serial number column. The range is sized from the rows that
# were actually scraped: a hard-coded range(1, 1001) makes insert() raise
# ValueError whenever the scrape yields anything other than exactly 1000 rows.
df1.insert(0, 'Serial Number', range(1, len(df1) + 1))

# Save to CSV or Excel if required
#path = r"D:\2_WorkTrack\0 Data\Datasets\IMDb_Top_1000_Movies.csv"
#df1.to_csv(path, index=False)

#print(f"DataFrame has been saved to {path}")


In [8]:
# Preview the first five rows of the extended table
df1.head(n=5)

Unnamed: 0,Serial Number,Title,Year,IMDb Rating,Metascore,Votes,Gross,Director,Stars
0,1,The Shawshank Redemption,(1994),9.3,82.0,2773253,28341469,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi..."
1,2,The Godfather,(1972),9.2,100.0,1930508,134966411,Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Diane Ke..."
2,3,The Dark Knight,(2008),9.0,84.0,2748863,534858444,Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M..."
3,4,Schindler's List,(1993),9.0,95.0,1396195,96898818,Steven Spielberg,"Liam Neeson, Ralph Fiennes, Ben Kingsley, Caro..."
4,5,12 Angry Men,(1957),9.0,97.0,822693,4360000,Sidney Lumet,"Henry Fonda, Lee J. Cobb, Martin Balsam, John ..."


In [9]:
# Inspect the last five rows of the extended table
df1.tail(n=5)

Unnamed: 0,Serial Number,Title,Year,IMDb Rating,Metascore,Votes,Gross,Director,Stars
995,996,Un long dimanche de fiançailles,(2004),7.6,76.0,74978,6167817.0,Jean-Pierre Jeunet,"Audrey Tautou, Gaspard Ulliel, Jodie Foster, D..."
996,997,Philomena,(2013),7.6,77.0,102298,37707719.0,Stephen Frears,"Judi Dench, Steve Coogan, Sophie Kennedy Clark..."
997,998,Shine,(1996),7.6,87.0,55558,35811509.0,Scott Hicks,"Geoffrey Rush, Armin Mueller-Stahl, Justin Bra..."
998,999,The Invisible Man,(1933),7.6,87.0,37784,,James Whale,"Claude Rains, Gloria Stuart, William Harrigan,..."
999,1000,Celda 211,(2009),7.6,,69435,,Daniel Monzón,"Luis Tosar, Alberto Ammann, Antonio Resines, M..."


In [15]:
# Count missing values in each column
df1.isna().sum()

Serial Number      0
Title              0
Year               0
IMDb Rating        0
Metascore        156
Votes              0
Gross            162
Director           0
Stars              0
dtype: int64