In [9]:
from bs4 import BeautifulSoup
import os
import re

folder = "D:\\RAM\\AI\\NLP\\REP_gen\\pro_1\\pdf_analysis\\samples_pdf\\"

def extract_sections_from_html(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")
    
    text_blocks = soup.find_all(["h1", "h2", "h3", "p", "div"])
    sections = {}
    current_section = "Unknown"

    for tag in text_blocks:
        text = tag.get_text(strip=True)
        if not text:
            continue

        # Match section headers
        match = re.match(r"(abstract|introduction|methods?|results?|discussion|conclusion|references?)", text.lower())
        if match:
            current_section = match.group(1).capitalize()
            sections[current_section] = []
        else:
            sections.setdefault(current_section, []).append(text)

    return sections

# Loop through all HTML files and extract
for filename in os.listdir(folder):
    if filename.endswith(".html"):
        path = os.path.join(folder, filename)
        print(f"\n📄 {filename}")
        sections = extract_sections_from_html(path)

        for section, content in sections.items():
            print(f"\n🔹 {section} ({len(content)} blocks):")
            print("   " + content[0][:150] + "...")  # Preview first 150 chars of the section



📄 article_1.html

🔹 Unknown (35 blocks):
   Your privacy, your choiceWe use essential cookies to make sure the site can function. We also use optional cookies for advertising, personalisation of...

🔹 Abstract (121 blocks):
   Advertisement...

🔹 References (184 blocks):
   Ding Y, John NW, Smith L, Sun JA, Smith M (2015) Combination of 3D skin surface texture features and 2D ABCD features for improved melanoma diagnosis....

📄 article_2.html

🔹 Unknown (35 blocks):
   Your privacy, your choiceWe use essential cookies to make sure the site can function. We also use optional cookies for advertising, personalisation of...

🔹 Abstract (82 blocks):
   Advertisement...

🔹 References (139 blocks):
   Alam MM, Islam MT (2019) Machine learning approach of automatic identification and counting of blood cells. Healthc Technol Lett 6(4):103–108ArticleGo...

📄 article_3.html

🔹 Unknown (44 blocks):
   Your privacy, your choiceWe use essential cookies to make sure the site can function. We also us

In [None]:
import ollama

# Load the .html file directly
file = "D:\\RAM\AI\\NLP\\REP_gen\\pro_1\\pdf_analysis\\samples_pdf\\real_page.html"
with open(file, "r", encoding="utf-8") as f:
    html_content = f.read()

# Initialize a conversation
response = ollama.chat(
    model="llama3.2",  # Or "mistral", "phi", etc.
    messages=[
        {"role": "system", "content": "You are an assistant who reads and explains HTML pages."},
        {"role": "user", "content": f"Here is the content of a website:\n\n{html_content}\n\nCan you summarize what this page is about?"}
    ]
)

print("🤖", response["message"]["content"])

  file = "D:\\RAM\AI\\NLP\\REP_gen\\pro_1\\pdf_analysis\\samples_pdf\\real_page.html"


🤖 This appears to be a Google Scholar search results page, but the content of the page is not visible due to its lengthy and complex HTML structure.

However, based on the structure and layout of the page, it appears that this is a search results page for academic papers related to "convolutional neural networks" (CNNs). The page displays a list of 10 research articles or papers that match the search query, along with relevant metadata such as author names, publication dates, and titles.

If you'd like to see the actual content of the pages listed on this page, you would need to click on the individual article titles to view their full texts.
