In [1]:
import requests
from bs4 import BeautifulSoup
import re
from moviepy.editor import VideoFileClip
import speech_recognition as sr
import os
from moviepy.editor import VideoFileClip
import whisper


### Web scraping from Wikipedia

In [4]:
def clean_text(text):
    """
    Purpose: Clean the extracted text from the Wikipedia page
    text: The text to clean

    Returns: The cleaned text, with square-bracket annotations removed, runs of
    blank lines collapsed to one blank line, single line breaks joined with a
    space, and leading/trailing whitespace stripped.
    """
    # Here we want to remove any text within square brackets, as it's often used for annotations.
    # The pattern accepts any run of characters between the brackets (on one line), so
    # multi-character markers such as [12] or [citation needed] are removed as well as [a].
    text = re.sub(r'\[[^\[\]\n]*\]', '', text)
    # Two or more consecutive newlines become exactly one paragraph break
    text = re.sub(r'\n{2,}', '\n\n', text)
    # A lone newline (not part of a paragraph break) is replaced with a space
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    # Here we return the text with leading and trailing whitespaces removed
    return text.strip()

def wiki_summary(url, output_filename, section_ids=('Episodes', 'Plot'), timeout=30):
    """
    Purpose: Download a Wikipedia page and save the text of one of its sections to a file
    url: The URL of the Wikipedia page
    output_filename: Path of the text file the cleaned section text is written to
    section_ids: Heading ids to look for, in order; the first one present on the page is used
    timeout: Number of seconds requests waits for the server before giving up

    Returns: None. The outcome (saved / nothing found / HTTP status error) is reported with print().
    """
    # Here we use the requests library to retrieve the webpage.
    # A timeout is passed so that an unresponsive server cannot hang the notebook forever.
    response = requests.get(url, timeout=timeout)
    # If the connection was not successful, we report the status code and stop
    if response.status_code != 200:
        print("Error. Status code:", response.status_code)
        return

    # We use BeautifulSoup to parse the webpage content
    soup = BeautifulSoup(response.text, 'html.parser')
    # The article content is usually within the 'mw-parser-output' div tag
    content_wrapper = soup.find('div', class_='mw-parser-output')

    # Look for the first of the requested section headings ('Episodes', then 'Plot' by default).
    # We match on the id alone instead of requiring a <span>: older pages are understood to use
    # <h2><span id="Plot">, newer MediaWiki output <div class="mw-heading"><h2 id="Plot">.
    # NOTE(review): confirm against the live page markup.
    section_heading = None
    if content_wrapper:
        for section_id in section_ids:
            section_heading = content_wrapper.find(id=section_id)
            if section_heading:
                break

    # Here we collect the text of every element after the heading until the next section heading
    section_parts = []
    if section_heading:
        for elem in section_heading.parent.find_next_siblings():
            # A following heading is either a bare <h2>/<h3> or a wrapper div with class 'mw-heading'
            is_heading = elem.name in ['h2', 'h3'] or 'mw-heading' in (elem.get('class') or [])
            if is_heading:
                break
            section_parts.append(elem.get_text(separator="\n", strip=True) + '\n\n')

    # We can call the clean_text function to clean the extracted text
    summary_text = clean_text(''.join(section_parts))

    # If nothing was extracted we say so instead of writing an empty file and claiming success
    if not summary_text.strip():
        print(f"No text found under section ids {list(section_ids)} at '{url}'. Nothing was written.")
        return

    # Finally, we create a file and write the summary to the file
    with open(output_filename, 'w', encoding='utf-8') as file:
        file.write(summary_text)

    print(f"Summary has been successfully saved to '{output_filename}'")

In [5]:
# Example: scrape the section text of the Wikipedia article for "The Marvels"
# and store it in a local text file.
marvels_page_url = 'https://en.wikipedia.org/wiki/The_Marvels'
marvels_output_file = '/Users/sveerisetti/Desktop/Duke_Spring/LLM/Assignments/Assignment2/Scripts/Marvels.txt'
wiki_summary(marvels_page_url, marvels_output_file)

Summary has been successfully saved to '/Users/sveerisetti/Desktop/Duke_Spring/LLM/Assignments/Assignment2/Scripts/Marvels.txt'


### Scraping from Marvel Cinematic Universe Fandom Wiki Page


In [20]:
def clean_text(text):
    """
    Purpose: Clean the text extracted from a wiki page
    text: The text to clean

    Returns: The cleaned text, with square-bracket annotations removed, runs of
    blank lines collapsed to one blank line, single line breaks joined with a
    space, and leading/trailing whitespace stripped.

    NOTE: this cell redefines the clean_text declared earlier in the notebook;
    once this cell has run, this is the definition in effect.
    """
    # Here we want to remove any text within square brackets, as it's often used for annotations.
    # The pattern accepts any run of characters between the brackets (on one line), so
    # multi-character markers such as [12] or [citation needed] are removed as well as [a].
    text = re.sub(r'\[[^\[\]\n]*\]', '', text)
    # Two or more consecutive newlines become exactly one paragraph break
    text = re.sub(r'\n{2,}', '\n\n', text)
    # A lone newline (not part of a paragraph break) is replaced with a space
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    # Here we return the text with leading and trailing whitespaces removed
    return text.strip()

def extract_sections(url, output_filename, class_names, timeout=30):
    """
    Purpose: Download a webpage and save the text of the first <div> found for each given class name
    url: The URL of the page to scrape
    output_filename: Path of the text file the cleaned text is written to
    class_names: Iterable of CSS class names; for each one the first matching <div> is extracted
    timeout: Number of seconds requests waits for the server before giving up

    Returns: None. The outcome is reported with print(); exceptions are caught and printed.
    """
    try:
        # Here we use the requests library to retrieve the webpage.
        # A timeout is passed so that an unresponsive server cannot hang the notebook forever.
        response = requests.get(url, timeout=timeout)
        # Raise an exception if the server answered with an error status (4xx/5xx)
        response.raise_for_status()

        # We use BeautifulSoup to parse the webpage content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect the text for each class name, remembering the ones that matched nothing
        extracted_parts = []
        missing_classes = []
        for class_name in class_names:
            # Only the first <div> carrying this class is used
            content = soup.find('div', class_=class_name)
            if content:
                extracted_parts.append(content.get_text(separator="\n", strip=True) + '\n\n')
            else:
                missing_classes.append(class_name)

        # We can use the clean_text function to clean the extracted text
        summary_text = clean_text(''.join(extracted_parts))

        # Tell the user about class names that were not present on the page
        if missing_classes:
            print(f"No <div> found for classes {missing_classes} at '{url}'")

        # If nothing was extracted we say so instead of writing an empty file and claiming success
        if not summary_text.strip():
            print(f"No content was extracted from '{url}'. Nothing was written.")
            return

        # Finally, we create a file and write the summary to the file
        with open(output_filename, 'w', encoding='utf-8') as file:
            file.write(summary_text)

        # Print a success message
        print(f"Content under classes {class_names} has been successfully saved to '{output_filename}'")
    except requests.HTTPError as e:
        # If an HTTP error occurs, then we print the error message
        print(f"Failed to retrieve the webpage. HTTP Error: {e}")
    except Exception as e:
        # Best-effort: any other failure (network, timeout, file system) is reported, not raised
        print(f"An error occurred: {e}")

Content under classes ['mw-parser-output'] has been saved to '/Users/sveerisetti/Desktop/Duke_Spring/LLM/Assignments/Assignment2/Marvel/Lore2/Avengers_Echo.txt'


In [None]:
# Example: scrape the main content div of the MCU Fandom wiki page for "Echo"
# and store it in a local text file.
echo_page_url = 'https://marvelcinematicuniverse.fandom.com/wiki/Echo'
echo_output_file = '/Users/sveerisetti/Desktop/Duke_Spring/LLM/Assignments/Assignment2/Marvel/Lore2/Avengers_Echo.txt'
echo_div_classes = ['mw-parser-output']
extract_sections(echo_page_url, echo_output_file, echo_div_classes)