In [None]:
import PyPDF2

def read_pdf_text(pdf_path, page_range=None):
    """
    Read text content from a PDF file.

    Progress and error messages are printed to stdout; errors are reported
    that way instead of being raised.

    Args:
        pdf_path: Path to the PDF file
        page_range: Optional (start, end) tuple selecting the pages to read.
                    start is a 0-based page index and end is exclusive,
                    e.g., (0, 3) reads the first three pages (pages 1-3).
                    Values outside the document are clamped. When omitted
                    (or empty) every page is read.

    Returns:
        The extracted text, where each page that yielded text is preceded by
        a 1-based "Page N:" header, or "" if the file does not exist or any
        other error occurs while reading it.
    """
    try:
        # Open the PDF file
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            # Get the total number of pages
            total_pages = len(pdf_reader.pages)
            print(f"Total pages in PDF: {total_pages}")

            # Determine which pages to read (kept in a separate name so the
            # caller-supplied page_range argument is not overwritten)
            if page_range:
                start, end = page_range
                # Clamp the requested range to the pages that actually exist
                start = max(0, start)
                end = min(total_pages, end)
                pages_to_read = range(start, end)
                print(f"Reading content from page {start+1} to {end}:")
            else:
                pages_to_read = range(total_pages)
                print("Reading all pages:")

            # Extract text from each page; collect the pieces and join once
            # instead of growing a string with += on every iteration
            page_chunks = []
            for page_num in pages_to_read:
                page_text = pdf_reader.pages[page_num].extract_text()
                if page_text:
                    page_chunks.append(f"Page {page_num+1}:\n{page_text}\n\n")
                    print(f"Successfully read page {page_num+1}")
                else:
                    print(f"No text content found on page {page_num+1}")

            return "".join(page_chunks)

    except FileNotFoundError:
        print(f"Error: File {pdf_path} does not exist")
    except Exception as e:
        # Best-effort reader: report the problem and fall through to ""
        print(f"An error occurred while reading the PDF: {e}")
    return ""

# Example usage: demonstrates read_pdf_text on a sample file, first for the
# whole document and then for a page range
if __name__ == "__main__":
    pdf_path = "example.pdf"  # Replace with your actual PDF path
    
    # Method 1: Read all pages (no page_range given)
    full_text = read_pdf_text(pdf_path)
    print("\nFull document text:\n", full_text)
    
    # Method 2: Read a specific range of pages. read_pdf_text iterates
    # range(start, end), so (0, 3) covers page indices 0, 1 and 2, which it
    # reports as pages 1 to 3
    partial_text = read_pdf_text(pdf_path, page_range=(0, 3))
    print("\nPartial document text (pages 1–3):\n", partial_text)


In [None]:
import docx
from docx.shared import Inches

def read_docx_text(docx_path):
    """
    Read the text content and basic metadata of a Word document.

    Progress and error messages are printed to stdout; errors are reported
    that way instead of being raised.

    Args:
        docx_path: Path to the Word document (.docx)

    Returns:
        A tuple (all_text, paragraphs, info):
            all_text: the non-empty paragraphs, one per line, each prefixed
                      with "Paragraph N: " where N is the paragraph's 1-based
                      position among all paragraphs (empty ones are counted,
                      so the numbers may skip).
            paragraphs: list of the stripped, non-empty paragraph texts.
            info: dict with "Title", "Author", "Created" and
                  "ParagraphCount" entries.
        If the file does not exist or cannot be read, ("", [], {}) is
        returned.
    """
    import os  # local import so this function does not depend on cell-level imports

    # python-docx signals a missing path string with its own
    # PackageNotFoundError rather than FileNotFoundError, so without this
    # check the "does not exist" message below would never be shown.
    if isinstance(docx_path, (str, os.PathLike)) and not os.path.exists(docx_path):
        print(f"Error: File {docx_path} does not exist")
        return "", [], {}

    try:
        # Open the Word document
        doc = docx.Document(docx_path)

        # Extract document metadata, substituting placeholders for unset fields
        props = doc.core_properties
        info = {
            "Title": props.title or "No title",
            "Author": props.author or "Unknown author",
            "Created": props.created.strftime("%Y-%m-%d %H:%M:%S")
                       if props.created else "No creation time",
            "ParagraphCount": len(doc.paragraphs)
        }
        print(f"Document info:\n{info}\n")

        # Extract text from each paragraph, skipping whitespace-only ones;
        # numbered lines are collected and joined once at the end
        numbered_lines = []
        paragraphs = []
        for i, para in enumerate(doc.paragraphs, 1):
            para_text = para.text.strip()
            if para_text:
                paragraphs.append(para_text)
                numbered_lines.append(f"Paragraph {i}: {para_text}\n")
        all_text = "".join(numbered_lines)

        print(f"Successfully read {len(paragraphs)} non-empty paragraphs\n")
        return all_text, paragraphs, info

    except FileNotFoundError:
        # Still reachable for inputs the check above does not cover
        print(f"Error: File {docx_path} does not exist")
    except Exception as e:
        # Best-effort reader: report the problem and fall through to empties
        print(f"An error occurred while reading the Word document: {e}")
    return "", [], {}

# Example usage: demonstrates read_docx_text on a sample file and shows a
# short preview of the result
if __name__ == "__main__":
    docx_path = "example.docx"  # Replace with your actual Word file path
    
    # read_docx_text returns three values; only para_list is used below
    full_text, para_list, doc_info = read_docx_text(docx_path)
    
    # Print the first 3 non-empty paragraphs (fewer if the list is shorter)
    print("First 3 paragraphs of the document:")
    for para in para_list[:3]:
        print("-", para)


In [1]:
!pip install PyPDF2
import PyPDF2

def read_pdf_text(pdf_path, page_range=None):
    """
    Read text content from a PDF file.

    Progress and error messages are printed to stdout; errors are reported
    that way instead of being raised.

    Args:
        pdf_path: Path to the PDF file
        page_range: Optional (start, end) tuple selecting the pages to read.
                    start is a 0-based page index and end is exclusive,
                    e.g., (0, 3) reads the first three pages (pages 1-3).
                    Values outside the document are clamped. When omitted
                    (or empty) every page is read.

    Returns:
        The extracted text, where each page that yielded text is preceded by
        a 1-based "Page N:" header, or "" if the file does not exist or any
        other error occurs while reading it.
    """
    try:
        # Open the PDF file
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            # Get the total number of pages
            total_pages = len(pdf_reader.pages)
            print(f"Total pages in PDF: {total_pages}")

            # Determine which pages to read (kept in a separate name so the
            # caller-supplied page_range argument is not overwritten)
            if page_range:
                start, end = page_range
                # Clamp the requested range to the pages that actually exist
                start = max(0, start)
                end = min(total_pages, end)
                pages_to_read = range(start, end)
                print(f"Reading content from page {start+1} to {end}:")
            else:
                pages_to_read = range(total_pages)
                print("Reading all pages:")

            # Extract text from each page; collect the pieces and join once
            # instead of growing a string with += on every iteration
            page_chunks = []
            for page_num in pages_to_read:
                page_text = pdf_reader.pages[page_num].extract_text()
                if page_text:
                    page_chunks.append(f"Page {page_num+1}:\n{page_text}\n\n")
                    print(f"Successfully read page {page_num+1}")
                else:
                    print(f"No text content found on page {page_num+1}")

            return "".join(page_chunks)

    except FileNotFoundError:
        print(f"Error: File {pdf_path} does not exist")
    except Exception as e:
        # Best-effort reader: report the problem and fall through to ""
        print(f"An error occurred while reading the PDF: {e}")
    return ""

# Example usage: demonstrates read_pdf_text on a local file, first for the
# whole document and then for a page range
if __name__ == "__main__":
    pdf_path = "C:/Users/zlzdego/Downloads/AY 2023-2024/AY 2023-2024/JbtHutpwaVcqwaYdfqWpgP3m0s0Ai8tYaTcYUUHD.pdf"  # Replace with your actual PDF path
    
    # Method 1: Read all pages (no page_range given)
    full_text = read_pdf_text(pdf_path)
    print("\nFull document text:\n", full_text)
    
    # Method 2: Read a specific range of pages. read_pdf_text iterates
    # range(start, end), so (0, 3) covers page indices 0, 1 and 2, which it
    # reports as pages 1 to 3
    partial_text = read_pdf_text(pdf_path, page_range=(0, 3))
    print("\nPartial document text (pages 1–3):\n", partial_text)


Total pages in PDF: 4
Reading all pages:
Successfully read page 1
Successfully read page 2
Successfully read page 3
Successfully read page 4

Full document text:
 Page 1:
Brown University HMAN 2400U, Spring 2024  Into the Wild: Thinking Democracy Ecologically  Monday 3:00-5:30     Pembroke 003 Mark Cladis (off hrs Thurs 3:45-5:30)  Sharon Krause (off hrs Wed 12:30-2:30)  How are we to respond to ecological crises that interweave politics, economy, religion, and culture, and that affect and position people differently based on their race, class, gender, sexuality, and other aspects of their identities? What resources do we have—or can we create—for reimagining “the human” and the more-than-human in ways that might be emancipatory for both? What kinds of cultural efforts, artistic work, social practices, and political institutions might figure in this reimagining? This collaborative humanities seminar explores a diverse range of contemporary and historical works ranging from poetry, fict

In [2]:
import docx
from docx.shared import Inches

def read_docx_text(docx_path):
    """
    Read the text content and basic metadata of a Word document.

    Progress and error messages are printed to stdout; errors are reported
    that way instead of being raised.

    Args:
        docx_path: Path to the Word document (.docx)

    Returns:
        A tuple (all_text, paragraphs, info):
            all_text: the non-empty paragraphs, one per line, each prefixed
                      with "Paragraph N: " where N is the paragraph's 1-based
                      position among all paragraphs (empty ones are counted,
                      so the numbers may skip).
            paragraphs: list of the stripped, non-empty paragraph texts.
            info: dict with "Title", "Author", "Created" and
                  "ParagraphCount" entries.
        If the file does not exist or cannot be read, ("", [], {}) is
        returned.
    """
    import os  # local import so this function does not depend on cell-level imports

    # python-docx signals a missing path string with its own
    # PackageNotFoundError rather than FileNotFoundError, so without this
    # check the "does not exist" message below would never be shown.
    if isinstance(docx_path, (str, os.PathLike)) and not os.path.exists(docx_path):
        print(f"Error: File {docx_path} does not exist")
        return "", [], {}

    try:
        # Open the Word document
        doc = docx.Document(docx_path)

        # Extract document metadata, substituting placeholders for unset fields
        props = doc.core_properties
        info = {
            "Title": props.title or "No title",
            "Author": props.author or "Unknown author",
            "Created": props.created.strftime("%Y-%m-%d %H:%M:%S")
                       if props.created else "No creation time",
            "ParagraphCount": len(doc.paragraphs)
        }
        print(f"Document info:\n{info}\n")

        # Extract text from each paragraph, skipping whitespace-only ones;
        # numbered lines are collected and joined once at the end
        numbered_lines = []
        paragraphs = []
        for i, para in enumerate(doc.paragraphs, 1):
            para_text = para.text.strip()
            if para_text:
                paragraphs.append(para_text)
                numbered_lines.append(f"Paragraph {i}: {para_text}\n")
        all_text = "".join(numbered_lines)

        print(f"Successfully read {len(paragraphs)} non-empty paragraphs\n")
        return all_text, paragraphs, info

    except FileNotFoundError:
        # Still reachable for inputs the check above does not cover
        print(f"Error: File {docx_path} does not exist")
    except Exception as e:
        # Best-effort reader: report the problem and fall through to empties
        print(f"An error occurred while reading the Word document: {e}")
    return "", [], {}

# Example usage: demonstrates read_docx_text on a local file and shows a
# short preview of the result
if __name__ == "__main__":
    docx_path = "C:/Users/zlzdego/Downloads/AY 2023-2024/AY 2023-2024/jpVWstdGlchQYBzAquRYEWOUOHfP5nDDjJmFRFUT.docx"  # Replace with your actual Word file path
    
    # read_docx_text returns three values; only para_list is used below
    full_text, para_list, doc_info = read_docx_text(docx_path)
    
    # Print the first 3 non-empty paragraphs (fewer if the list is shorter)
    print("First 3 paragraphs of the document:")
    for para in para_list[:3]:
        print("-", para)

Document info:
{'Title': 'No title', 'Author': 'Mercedes Vaquero', 'Created': '2023-08-07 17:20:00', 'ParagraphCount': 273}

Successfully read 172 non-empty paragraphs

First 3 paragraphs of the document:
- Hispanic Culture Through Cinema
- HISP0710B-S01, Fall ‘23
- Freshman Seminar
