In [1]:
import os
import time
import pandas as pd
import tkinter as tk
from tkinter import filedialog
from selenium import webdriver
from PIL import ImageTk, Image
from bs4 import BeautifulSoup as bs
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# Load the table of IMDb ids stored next to the notebook and echo it as the
# cell output (a bare last expression is rendered by Jupyter).
URLs = pd.read_csv(filepath_or_buffer='URLs.csv')
URLs

Unnamed: 0,imdbId
0,114709
1,113497
2,113228
3,114885
4,113041
...,...
9737,5476944
9738,5914996
9739,6397426
9740,8391976


In [3]:
# Function to perform web scraping
def scrape_data():
    """Scrape one IMDb title page and list its details in ``result_text``.

    The IMDb code is read from ``url_entry``. The title page is opened in a
    Chrome webdriver, its page source is parsed with BeautifulSoup, and one
    line per field (title, release year, age classification, runtime, score,
    genres, synopsis, director, writers, stars) is written to ``result_text``.
    A small dialog then offers to persist the values through ``save_data``.

    Nothing is returned. Validation and scraping errors are reported inside
    ``result_text``. The webdriver is always shut down and the entry box
    cleared when the function leaves, including on the "page does not exist"
    path.
    """
    def notify(message):
        # Red, bold notice used for validation and scraping errors.
        result_text.tag_configure("notification", foreground="red", font=("Arial", 12, "bold"))
        result_text.insert(tk.END, message, "notification")

    def show(label, value):
        # One output line per field; missing or empty values are spelled out.
        result_text.insert(tk.END, f"{label}: {value if value else 'Not available'}\n")

    result_text.delete('1.0', tk.END)

    # Get the code from the text box
    user_input = url_entry.get().strip()

    # Check if IMDb code is empty
    if not user_input:
        notify("Please enter an IMDb code!")
        url_entry.delete(0, tk.END)
        return

    # IMDb title ids are "tt" followed by the number zero-padded to at least
    # seven digits. A fixed "tt0" prefix is only right for six-digit codes,
    # so pad instead (and tolerate a code pasted with its "tt" prefix).
    digits = user_input[2:] if user_input.lower().startswith("tt") else user_input
    final_URL = f"https://www.imdb.com/title/tt{digits.zfill(7)}/"

    # Create a web driver
    # NOTE(review): the driver path is passed positionally; newer Selenium
    # releases may expect a Service object instead - confirm the pinned version.
    driver = webdriver.Chrome(ChromeDriverManager().install())

    try:
        # Load the URL in the Chrome webdriver
        driver.get(final_URL)
        soup = bs(driver.page_source, 'html.parser')

        # Check if the page exists
        if soup.find('div', class_='error_code') is not None:
            notify("The IMDb code is wrong or the page does not exist.")
            return

        # Every field starts out as "missing" so the save callback can pass
        # them on unconditionally.
        title = release_year = age_classification = runtime = None
        rate = movie_story = director = None
        writers, stars = [], []

        heading = soup.find('h1')
        if heading is not None:
            title = heading.text
        show("Title", title)

        # The first link of this inline list is used as the release year and
        # the second as the age classification.
        # NOTE(review): the "sc-..." class names look auto-generated and will
        # presumably change when the site is rebuilt - verify the selectors.
        meta_list = soup.find('ul', class_='ipc-inline-list ipc-inline-list--show-dividers sc-afe43def-4 kdXikI baseAlt')
        meta_links = meta_list.find_all('a') if meta_list is not None else []
        if len(meta_links) > 0:
            release_year = meta_links[0].text
        show("Release Year", release_year)
        if len(meta_links) > 1:
            age_classification = meta_links[1].text
        show("Age Classification", age_classification)

        runtime_elem = soup.find('li', {'data-testid': 'title-techspec_runtime'})
        if runtime_elem is not None:
            runtime_box = runtime_elem.find('div', {'class': 'ipc-metadata-list-item__content-container'})
            if runtime_box is not None:
                runtime = runtime_box.text.strip()
        show("Runtime", runtime)

        rate_elem = soup.find('span', class_='sc-bde20123-1 iZlgcd')
        if rate_elem is not None:
            rate = rate_elem.text
        show("IMDB score", rate)

        genres_list = [tag.text for tag in soup.find_all('a', class_='ipc-chip ipc-chip--on-baseAlt')]
        show("Genres", ', '.join(genres_list))

        story_elem = soup.find('span', class_='sc-6a7933c5-0 cUeLJx')
        if story_elem is not None:
            movie_story = story_elem.text
        show("Synopsis", movie_story)

        credit_items = soup.find_all('li', {'class': 'ipc-metadata-list__item', 'data-testid': 'title-pc-principal-credit'})

        # The first principal-credit entry is treated as the director.
        if credit_items:
            first_link = credit_items[0].find('a')
            if first_link is not None:
                director = first_link.text
        show("Director", director)

        # The next two entries are matched by their label text.
        for item in credit_items[1:3]:
            people = item.find('ul')
            names = [person.text for person in people.find_all('a')] if people is not None else []
            if 'Writer' in item.text:
                writers = names
            elif 'Stars' in item.text:
                stars = names
        show("Writers", ', '.join(writers))
        show("Stars", ', '.join(stars))

        # Prompt the user to save the information
        save_prompt = tk.Toplevel(window, bg="#858585")
        save_prompt.title("Save Information")
        save_label = tk.Label(save_prompt, text="Do you want to save the information?", bg="#858585", font=("Arial", 12), fg="white")
        save_label.pack()

        def on_save():
            # Persist first, then close the prompt.
            save_data(title, release_year, age_classification, runtime, rate,
                      genres_list, movie_story, director, writers, stars)
            save_prompt.destroy()

        save_button_yes = tk.Button(save_prompt, text="Yes", bg="green", fg="white", font=("Arial", 10), command=on_save)
        save_button_yes.pack(side=tk.LEFT)

        save_button_no = tk.Button(save_prompt, text="No", bg="red", fg="white", font=("Arial", 10), command=save_prompt.destroy)
        save_button_no.pack(side=tk.LEFT, padx=5)

    except Exception as e:
        notify(f"Error occurred while scraping data: {e}")
    finally:
        # Close the web driver on every exit path, early returns included.
        driver.quit()
        url_entry.delete(0, tk.END)


# Function to extract and save user comments
def extract_comments():
    """Collect the user reviews of one IMDb title and list them in ``reviews_text``.

    The IMDb code is read from ``url_entry``. The title page is opened in a
    Chrome webdriver, the "User reviews" link is followed and the
    "load more" trigger is clicked until it can no longer be clicked. The
    review texts are then read from the page source, written to
    ``reviews_text`` and offered for saving through ``save_comments``.

    Nothing is returned. Validation and scraping errors are reported inside
    ``reviews_text``. The webdriver is always shut down and the entry box
    cleared when the function leaves, including on the "page does not exist"
    path.
    """
    def notify(message):
        # Red, bold notice used for validation and scraping errors.
        reviews_text.tag_configure("notification", foreground="red", font=("Arial", 12, "bold"))
        reviews_text.insert(tk.END, message, "notification")

    reviews_text.delete('1.0', tk.END)

    # Get the code from the text box
    user_input = url_entry.get().strip()

    # Check if IMDb code is empty
    if not user_input:
        notify("Please enter an IMDb code!")
        url_entry.delete(0, tk.END)
        return

    # IMDb title ids are "tt" followed by the number zero-padded to at least
    # seven digits. A fixed "tt0" prefix is only right for six-digit codes,
    # so pad instead (and tolerate a code pasted with its "tt" prefix).
    digits = user_input[2:] if user_input.lower().startswith("tt") else user_input
    final_URL = f"https://www.imdb.com/title/tt{digits.zfill(7)}/"

    # Create a web driver
    driver = webdriver.Chrome(ChromeDriverManager().install())

    try:
        # Load the URL in the Chrome webdriver
        driver.get(final_URL)
        soup = bs(driver.page_source, 'html.parser')

        # Check if the page exists
        if soup.find('div', class_='error_code') is not None:
            notify("The IMDb code is wrong or the page does not exist.")
            return

        # A page without an <h1> raises here and is reported by the handler below.
        title = soup.find('h1').text
        reviews_text.insert(tk.END, f"Title: {title}\n\n")

        # Wait for the user review button to be clickable and click it
        user_review_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.LINK_TEXT, 'User reviews')))
        user_review_button.click()

        # Keep expanding the list; the loop ends when the trigger cannot be
        # found or clicked within the wait (any failure means "no more pages").
        while True:
            try:
                load_more_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.ID, 'load-more-trigger')))
                load_more_button.click()
                time.sleep(2)  # Wait for new reviews to load
            except Exception:
                break

        # Extract the reviews using BeautifulSoup from the fully expanded page
        soup = bs(driver.page_source, 'html.parser')
        comments = [review.text.strip() for review in soup.find_all('div', class_='text show-more__control')]

        # Display the comments in the result text box
        reviews_text.insert(tk.END, "User Reviews:\n")
        for i, review in enumerate(comments):
            reviews_text.insert(tk.END, f"\nReview {i+1}:\n{review}\n")

        # Prompt the user to save the comments
        save_prompt = tk.Toplevel(window, bg="#858585")
        save_prompt.title("Save Comments")

        save_label = tk.Label(save_prompt, text="Do you want to save the comments?", bg="#858585", font=("Arial", 12), fg="white")
        save_label.pack()

        def on_save():
            # Persist first, then close the prompt.
            save_comments(title, comments)
            save_prompt.destroy()

        save_button_yes = tk.Button(save_prompt, text="Yes", bg="green", fg="white", font=("Arial", 10), command=on_save)
        save_button_yes.pack(side=tk.LEFT)

        save_button_no = tk.Button(save_prompt, text="No", bg="red", fg="white", font=("Arial", 10), command=save_prompt.destroy)
        save_button_no.pack(side=tk.LEFT, padx=5)

    except Exception as e:
        notify(f"Error occurred while scraping data: {e}")
    finally:
        # Close the web driver on every exit path, early returns included.
        driver.quit()
        url_entry.delete(0, tk.END)


# Function to save the data to a CSV file
def save_data(title, release_year, age_classification, runtime, rate, genres_list, movie_story, director, writers, stars):
    """Append one scraped title as a row of a CSV file chosen by the user.

    Parameters
    ----------
    title, release_year, age_classification, runtime, rate, movie_story, director
        Scalar field values; ``None`` is written as an empty cell.
    genres_list, writers, stars
        Lists of strings, stored comma-separated; ``None`` is treated as empty.

    The target path comes from a "save as" dialog. If the chosen file already
    exists and has content, the new row is appended to its rows; otherwise a
    new file is written. Cancelling the dialog writes nothing. The outcome is
    reported in ``result_text``.
    """
    def blank_if_none(value):
        return "" if value is None else value

    def comma_joined(names):
        return ', '.join(names) if names else ""

    # Single-row frame holding the record
    df = pd.DataFrame({
        'Title': [blank_if_none(title)],
        'Production Year': [blank_if_none(release_year)],
        'Age Classification': [blank_if_none(age_classification)],
        'Runtime': [blank_if_none(runtime)],
        'IMDB Score': [blank_if_none(rate)],
        'Genres': [comma_joined(genres_list)],
        'Synopsis': [blank_if_none(movie_story)],
        'Director': [blank_if_none(director)],
        'Writers': [comma_joined(writers)],
        'Stars': [comma_joined(stars)],
    })

    # Prompt the user to select a file path using a file dialog
    save_file_path = filedialog.asksaveasfilename(defaultextension='.csv')

    # The dialog yields an empty value when it is cancelled; writing to it
    # would raise, so stop here instead.
    if not save_file_path:
        result_text.insert(tk.END, "\nSave cancelled.")
        return

    # Append to an existing file. A zero-byte file has no header to read and
    # is treated like a new file.
    if os.path.exists(save_file_path) and os.path.getsize(save_file_path) > 0:
        existing_df = pd.read_csv(save_file_path)
        df = pd.concat([existing_df, df], ignore_index=True)
    df.to_csv(save_file_path, index=False)

    # Tag the message while inserting it: relying on the "insert" mark can
    # colour a different line when the cursor is not at the end of the box.
    result_text.tag_config("save_message", foreground="green")
    result_text.insert(tk.END, "\n")
    result_text.insert(tk.END, f"Data saved successfully to: {save_file_path}", "save_message")
    

# Function to save the comments to a text file
def save_comments(title, comments):
    """Write a title and its review texts to a text file chosen by the user.

    Parameters
    ----------
    title : str
        Heading written on the first line of the file.
    comments : list of str
        Review texts, written after the title and separated by blank lines.

    The target path comes from a "save as" dialog and the file is written as
    UTF-8, replacing any previous content. Cancelling the dialog writes
    nothing. The outcome is reported in ``reviews_text``.
    """
    # Create a string with all the comments
    comments_str = f"{title}\n\n" + '\n\n'.join(comments)

    # Save the comments to a file using file dialog
    save_file_path = filedialog.asksaveasfilename(defaultextension='.txt')

    # The dialog yields an empty value when it is cancelled; opening it would
    # raise, so stop here instead.
    if not save_file_path:
        reviews_text.insert(tk.END, "\nSave cancelled.")
        return

    with open(save_file_path, 'w', encoding='utf-8') as file:
        file.write(comments_str)

    # Tag the message while inserting it: relying on the "insert" mark can
    # colour a different line when the cursor is not at the end of the box.
    reviews_text.tag_config("save_message", foreground="green")
    reviews_text.insert(tk.END, "\n")
    reviews_text.insert(tk.END, f"Comments saved successfully to: {save_file_path}", "save_message")


# Create the Tkinter application window
# The widgets bound to `window`, `url_entry`, `result_text` and `reviews_text`
# are read as globals by the scraping/saving callbacks, so those names must
# stay at module level.
window = tk.Tk()
window.title("IMDb Scrapper")

# IMDb-like color scheme
bg_color = "#f5f5f5"  # Light gray
text_color = "#333333"  # Dark gray
header_color = "#333333"  # Dark gray strip behind the logo and controls
panel_color = "#858585"  # Mid gray used by the output boxes
accent_color = "#d9b70f"  # Yellow used by the action buttons
title_font = ("Arial", 16, "bold")
button_font = ("Arial", 12, "bold")

# Set the background color
window.configure(bg=header_color)

# Create the frames (packing order defines the vertical layout)
top_frame = tk.Frame(window, bg=header_color)
top_frame.pack(side=tk.TOP, fill=tk.X, padx=10, pady=10)

result_frame = tk.Frame(window, bg=bg_color)
result_frame.pack(padx=80, pady=10)

reviews_frame = tk.Frame(window, bg=bg_color)
reviews_frame.pack(padx=80, pady=10)

# Add the IMDb logo. A missing or unreadable image must not stop the app from
# starting, so fall back to a plain text label in that case.
try:
    with Image.open("logo.png") as logo_file:
        # The PhotoImage stays referenced at module level so Tk keeps it alive.
        imdb_logo = ImageTk.PhotoImage(logo_file.resize((140, 70)))
    logo_label = tk.Label(top_frame, image=imdb_logo, bg=header_color)
except OSError:
    imdb_logo = None
    logo_label = tk.Label(top_frame, text="IMDb", font=title_font, fg="white", bg=header_color)
logo_label.pack(side=tk.LEFT, padx=10, pady=10)

# Add a label and text entry for the user input
# (`fg` and `foreground` are aliases; only the white value was ever effective.)
input_label = tk.Label(top_frame, text="Enter IMDB code:", font=title_font, fg="white", bg=header_color)
input_label.pack(side=tk.LEFT, padx=10)

url_entry = tk.Entry(top_frame, width=30, font=("Arial", 12))
url_entry.pack(side=tk.LEFT, padx=10)

# Add a button to trigger the data scraping
scrape_button = tk.Button(top_frame, text="Extract Data", bg=accent_color, fg=text_color, font=button_font, command=scrape_data)
scrape_button.pack(side=tk.LEFT, padx=10)

# Add a button to trigger the comment extraction
extract_button = tk.Button(top_frame, text="Extract Comments", bg=accent_color, fg=text_color, font=button_font, command=extract_comments)
extract_button.pack(side=tk.LEFT, padx=10)

result_label = tk.Label(result_frame, text="Scraped Data", font=title_font, fg=text_color, bg=bg_color)
result_label.pack()

# Add a text box to display the movie information
result_text = tk.Text(result_frame, height=10, width=80, bg=panel_color, fg="White", font=("Arial", 12))
result_text.pack()

reviews_label = tk.Label(reviews_frame, text="Scraped Reviews", font=title_font, fg=text_color, bg=bg_color)
reviews_label.pack()

# Add a text box to display the user reviews
reviews_text = tk.Text(reviews_frame, height=10, width=80, bg=panel_color, fg="White", font=("Arial", 12))
reviews_text.pack()

# Run the Tkinter event loop (blocks the cell until the window is closed)
window.mainloop()