In [1]:
import csv
import requests
from bs4 import BeautifulSoup

In [2]:

# Function to scrape and extract movie details
def scrape_movies(url):
    # Send a GET request to the URL
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all movie containers
    movie_containers = soup.find_all('div', class_='lister-item mode-advanced')

    movies = []
    for container in movie_containers:
        movie = {}

        # Movie name
        movie['Movie Name'] = container.h3.a.text

        # Directors (multiple directors handled)
        director_tags = container.select("div.lister-item-content p:nth-of-type(3) a")
        movie['Directors'] = [director.text for director in director_tags]

        # Duration
        duration = container.find('span', class_='runtime')
        movie['Duration'] = duration.text if duration else None

        # Year
        year = container.find('span', class_='lister-item-year').text
        movie['Year'] = year.strip('()')

        # Ratings
        ratings = container.find('div', class_='inline-block ratings-imdb-rating')
        movie['Ratings'] = ratings.strong.text if ratings else None

        # Metascore
        metascore = container.find('span', class_='metascore')
        movie['Metascore'] = metascore.text.strip() if metascore else None

        # Stars (multiple stars handled)
        star_tags = container.select("div.lister-item-content p:nth-of-type(4) a")
        movie['Stars'] = [star.text for star in star_tags]

        # Votes
        votes = container.find('span', attrs={'name': 'nv'})
        movie['Votes'] = votes['data-value'] if votes else None

        # Genre (multiple genres handled)
        genre_tags = container.select("div.lister-item-content p:nth-of-type(1) span.genre")
        movie['Genre'] = [genre.text.strip() for genre in genre_tags]

        # Gross Collection
        gross = container.find('span', class_='text-muted text-small')
        movie['Gross Collection'] = gross.contents[2].strip() if gross else None

        # Popularity
        popularity = container.find('span', class_='popularityTrend')
        movie['Popularity'] = popularity['title'] if popularity else None

        # Certification
        certificate = container.find('span', class_='certificate')
        movie['Certification'] = certificate.text if certificate else None

        movies.append(movie)

    return movies


In [3]:

# Main function to scrape all pages until desired count is reached
def scrape_all_movies(url, desired_count):
    all_movies = []

    page = 1
    count = 0
    while count < desired_count:
        print(f"Scraping page {page}...")
        page_url = url + f'&start={((page-1)*50)+1}'
        movies = scrape_movies(page_url)
        all_movies.extend(movies)
        count += len(movies)
        page += 1

    return all_movies[:desired_count]

# URL for scraping
url = 'https://www.imdb.com/search/title/?genres=action&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=f11158cc-b50b-4c4d-b0a2-40b32863395b&pf_rd_r=XZ8X52H1R40B7KG5SNZ9&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_gnr_1'

# Desired count of movies
desired_count = 1743

# Scrape all movie details
all_movies = scrape_all_movies(url, desired_count)

# Define the field names for CSV files
csv_fields1 = ['Sno', 'Movie Name', 'Director1', 'Director2', 'Duration', 'Year', 'Ratings', 'Metascore']
csv_fields2 = ['Movie Name', 'Star1', 'Star2', 'Star3', 'Star4', 'Votes', 'Genre1', 'Genre2', 'Genre3', 'Gross Collection', 'Popularity', 'Certification']

# Generate CSV file 1
with open('movies_data_1.csv', mode='w', encoding='utf-8', newline='') as file1:
    writer1 = csv.DictWriter(file1, fieldnames=csv_fields1)
    writer1.writeheader()
    for index, movie in enumerate(all_movies):
        writer1.writerow({'Sno': index+1,
                          'Movie Name': movie['Movie Name'],
                          'Director1': movie['Directors'][0] if movie['Directors'] else None,
                          'Director2': movie['Directors'][1] if len(movie['Directors']) > 1 else None,
                          'Duration': movie['Duration'],
                          'Year': movie['Year'],
                          'Ratings': movie['Ratings'],
                          'Metascore': movie['Metascore']})

# Generate CSV file 2
with open('movies_data_2.csv', mode='w', encoding='utf-8', newline='') as file2:
    writer2 = csv.DictWriter(file2, fieldnames=csv_fields2)
    writer2.writeheader()
    for movie in all_movies:
        writer2.writerow({'Movie Name': movie['Movie Name'],
                          'Star1': movie['Stars'][0] if movie['Stars'] else None,
                          'Star2': movie['Stars'][1] if len(movie['Stars']) > 1 else None,
                          'Star3': movie['Stars'][2] if len(movie['Stars']) > 2 else None,
                          'Star4': movie['Stars'][3] if len(movie['Stars']) > 3 else None,
                          'Votes': movie['Votes'],
                          'Genre1': movie['Genre'][0] if movie['Genre'] else None,
                          'Genre2': movie['Genre'][1] if len(movie['Genre']) > 1 else None,
                          'Genre3': movie['Genre'][2] if len(movie['Genre']) > 2 else None,
                          'Gross Collection': movie['Gross Collection'],
                          'Popularity': movie['Popularity'],
                          'Certification': movie['Certification']})

print("Scraping and CSV generation completed successfully!")


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping and CSV generation completed successfully!


In [4]:
import csv

# Function to print data from CSV file
def print_csv_data(filename):
    with open(filename, mode='r', encoding='utf-8') as file:
        reader = csv.reader(file)
        for row in reader:
            print(row)

# Print data from movies_data_1.csv
print("Data from movies_data_1.csv:")
print_csv_data('movies_data_1.csv')

Data from movies_data_1.csv:
['Sno', 'Movie Name', 'Director1', 'Director2', 'Duration', 'Year', 'Ratings', 'Metascore']
['1', 'The Dark Knight', 'Christopher Nolan', 'Christian Bale', '152 min', '2008', '9.0', '84']
['2', 'The Lord of the Rings: The Return of the King', 'Peter Jackson', 'Elijah Wood', '201 min', '2003', '9.0', '94']
['3', 'Inception', 'Christopher Nolan', 'Leonardo DiCaprio', '148 min', '2010', '8.8', '74']
['4', 'The Lord of the Rings: The Fellowship of the Ring', 'Peter Jackson', 'Elijah Wood', '178 min', '2001', '8.8', '92']
['5', 'The Lord of the Rings: The Two Towers', 'Peter Jackson', 'Elijah Wood', '179 min', '2002', '8.8', '87']
['6', 'The Matrix', 'Lana Wachowski', 'Lilly Wachowski', '136 min', '1999', '8.7', '73']
['7', 'Star Wars: Episode V - The Empire Strikes Back', 'Irvin Kershner', 'Mark Hamill', '124 min', '1980', '8.7', '82']
['8', 'Soorarai Pottru', 'Sudha Kongara', 'Suriya', '153 min', '2020', '8.7', '']
['9', 'Star Wars', 'George Lucas', 'Mark Hami

In [5]:
import csv

# Function to print data from CSV file
def print_csv_data(filename):
    with open(filename, mode='r', encoding='utf-8') as file:
        reader = csv.reader(file)
        for row in reader:
            print(row)

# Print data from movies_data_2.csv
print("Data from movies_data_2.csv:")
print_csv_data('movies_data_2.csv')

Data from movies_data_2.csv:
['Movie Name', 'Star1', 'Star2', 'Star3', 'Star4', 'Votes', 'Genre1', 'Genre2', 'Genre3', 'Gross Collection', 'Popularity', 'Certification']
['The Dark Knight', '', '', '', '', '2716423', 'Action, Crime, Drama', '', '', '', '', 'UA']
['The Lord of the Rings: The Return of the King', '', '', '', '', '1885997', 'Action, Adventure, Drama', '', '', '', '', 'U']
['Inception', '', '', '', '', '2411352', 'Action, Adventure, Sci-Fi', '', '', '', '', 'UA']
['The Lord of the Rings: The Fellowship of the Ring', '', '', '', '', '1914722', 'Action, Adventure, Drama', '', '', '', '', 'U']
['The Lord of the Rings: The Two Towers', '', '', '', '', '1702363', 'Action, Adventure, Drama', '', '', '', '', 'UA']
['The Matrix', '', '', '', '', '1956560', 'Action, Sci-Fi', '', '', '', '', 'A']
['Star Wars: Episode V - The Empire Strikes Back', '', '', '', '', '1320305', 'Action, Adventure, Fantasy', '', '', '', '', 'UA']
['Soorarai Pottru', '', '', '', '', '119439', 'Action, Dram