In [104]:
import requests
from bs4 import BeautifulSoup
import csv
from fpdf import FPDF
import re
from datetime import datetime

API_KEY = "940c9fcf"

# ---------------------- #
# 1Ô∏è‚É£ Web Scraping Phase  #
# ---------------------- #
def fetch_movies_from_rotten_tomatoes():
    print("üçÖ Fetching Latest Movies from Rotten Tomatoes...")
    url = "https://www.rottentomatoes.com/browse/movies_in_theaters"
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, "html.parser")

        titles = [tag.get("href").split("/")[-1].replace("-", " ").title() 
                  for tag in soup.select('a.js-tile-link')]

        print(f"‚úÖ Fetched {len(titles)} movies from Rotten Tomatoes!\n")
        return titles[:30]

    except requests.exceptions.RequestException as e:
        print(f"‚ùå Error while scraping Rotten Tomatoes: {e}")
        return []

# ---------------------- #
# 2Ô∏è‚É£ Extract Phase       #
# ---------------------- #
def extract_movies(titles):
    print("üöÄ Fetching Data from OMDb API... üé¨")
    movies = []

    for title in titles:
        url = f"http://www.omdbapi.com/?t={title}&apikey=940c9fcf"
        try:
            response = requests.get(url, timeout=10)
            data = response.json()

            if data.get("Response") == "True":
                movies.append(data)
            else:
                print(f"‚ùå Movie not found: {title}")

        except requests.exceptions.RequestException as e:
            print(f"‚ùå Request Error for {title}: {e}")

    print(f"‚úÖ Fetched {len(movies)} movies successfully!\n")
    return movies

# ---------------------- #
# 3Ô∏è‚É£ Transform Phase      #
# ---------------------- #
def transform_data(movies):
    print("üîÑ Transforming Data... üßπ")
    transformed_movies = []

    for movie in movies:
        # TASK1 - Title Transformation
        title_clean = re.sub(r'[^\w\s]', '', movie.get("Title", "").strip()).title() 
        
        # TASK2 - Release Date Transformation
        release_date = movie.get("Released", "N/A")   
        formatted_date = (
            datetime.strptime(release_date, '%d %b %Y').strftime('%Y-%m-%d') 
            if release_date != "N/A" else "Unknown"
        )

        # TASK3 - Genre Transformation
        genres = list(set([genre.strip().lower() for genre in movie.get("Genre", "").split(",")]))

        # TASK4 - IMDb Rating Transformation
        imdb_rating = round(float(movie.get("imdbRating", 0)), 1) if movie.get("imdbRating") != "N/A" else None  
        imdb_rating_normalized = imdb_rating / 10 if imdb_rating else None

        # TASK5 - Actors Transformation
        actors = ", ".join(sorted([actor.strip() for actor in movie.get("Actors", "").split(",")[:3]]))

        # TASK6 - Box Office Transformation
        box_office_clean = re.sub(r'[^\d]', '', movie.get("BoxOffice", ""))  
        box_office_value = int(box_office_clean) if box_office_clean else 0

        # TASK7 - Awards Transformation
        awards_text = movie.get("Awards", "N/A")
        award_numbers = [int(num) for num in re.findall(r'(\d+)', awards_text)]
        total_awards = sum(award_numbers) if award_numbers else 0

        # TASK8 - Metascore Transformation
        metascore = int(movie.get("Metascore", 0)) if movie.get("Metascore") != "N/A" else None
        metascore_normalized = metascore / 100 if metascore else None

        # TASK9 - Language Transformation
        language = ", ".join(sorted([lang.strip().lower() for lang in movie.get("Language", "").split(",")])) if movie.get("Language") else "Unknown"

        # TASK10 - Production Transformation
        production = re.sub(r'[^\w\s]', '', movie.get("Production", "Independent").strip())

        transformed_movies.append({
            "Title": title_clean,
            "Release Date": formatted_date,
            "Genres": genres,
            "IMDb Rating": imdb_rating,
            "Normalized IMDb Rating": imdb_rating_normalized,
            "Top 3 Actors": actors,
            "Box Office": box_office_value,
            "Total Awards": total_awards,
            "Metascore": metascore_normalized,
            "Language": language,
            "Production": production
        })

    print(f"‚ú® Transformation Complete for {len(transformed_movies)} movies!\n")
    return transformed_movies

# ---------------------- #
# 4Ô∏è‚É£ Load Phase (CSV)     #
# ---------------------- #
def load_to_csv(data):
    print("üíæ Saving Data to CSV... üìä")
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f"movies_{timestamp}.csv"  # ‚úÖ Add timestamp to avoid overwrite

    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)

    print(f"‚úÖ Data successfully saved to {filename}!\n")

# ---------------------- #
# üöÄ Execute ETL Pipeline #
# ---------------------- #
if __name__ == "__main__":
    titles = fetch_movies_from_rotten_tomatoes()
    if titles:
        movies = extract_movies(titles)
        if movies:
            transformed_data = transform_data(movies)
            load_to_csv(transformed_data)
        else:
            print("‚ö†Ô∏è No movie data fetched from the API.")
    else:
        print("‚ö†Ô∏è No movie titles found during scraping.")

# ---------------------- #
# üöÄ Convert to PDF #
# ---------------------- #

from fpdf import FPDF
from datetime import datetime

def load_to_pdf(data):
    print("üìù Generating PDF Report... üìÑ")
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f"movies_{timestamp}.pdf"

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # ‚úÖ Title
    pdf.cell(200, 10, "Movie Insights Report", ln=True, align="C")
    pdf.ln(5)

    # üîπ Add Data with Bifurcation Lines
    for movie in data:
        pdf.set_font("Arial", style='B', size=12)  # Bold for Title
        pdf.cell(0, 8, f" {movie.get('Title', 'N/A')}", ln=True)
        pdf.set_font("Arial", size=11)

        for key, value in movie.items():
            if key != "Title":  # Skip repeating the title
                pdf.multi_cell(0, 8, f"{key}: {value}")

        # üöÄ Add a Bifurcation Line for Separation
        pdf.set_draw_color(0, 0, 0)  # Black color
        pdf.set_line_width(0.5)
        pdf.line(10, pdf.get_y(), 200, pdf.get_y())  # Draw horizontal line
        pdf.ln(5)  # Add space after the line

    pdf.output(filename)
    print(f"‚úÖ PDF report saved as {filename}!\n")
load_to_pdf(transformed_data)

üçÖ Fetching Latest Movies from Rotten Tomatoes...
‚úÖ Fetched 11 movies from Rotten Tomatoes!

üöÄ Fetching Data from OMDb API... üé¨
‚ùå Movie not found: Open_Your_Eyes_Jeffrey
‚ùå Movie not found: The_Visitor_2024
‚ùå Movie not found: Ufc_313_Pereira_Vs_Ankalaev
‚úÖ Fetched 8 movies successfully!

üîÑ Transforming Data... üßπ
‚ú® Transformation Complete for 8 movies!

üíæ Saving Data to CSV... üìä
‚úÖ Data successfully saved to movies_20250308_230725.csv!

üìù Generating PDF Report... üìÑ
‚úÖ PDF report saved as movies_20250308_230725.pdf!

