In [2]:
import os
from bs4 import BeautifulSoup
import json
import pandas as pd

In [5]:
base_dir = '../pages/'
files = os.listdir(base_dir)
files = [base_dir + file for file in files if '.htm' in file]
files

['../pages/dezhi-yin.htm',
 '../pages/GA.htm',
 '../pages/he-zhang.htm',
 '../pages/Locations.htm',
 '../pages/Application-Process.htm',
 '../pages/ManishAgarwal.htm',
 '../pages/tim-smith.htm',
 '../pages/Wolfgang.htm',
 '../pages/Alan-Henver.htm',
 '../pages/immunization.htm',
 '../pages/Shivendu.htm',
 '../pages/GBAIS.htm',
 '../pages/Ron-satterfield.htm',
 '../pages/Joni-jones.htm',
 '../pages/DonBerndt.htm',
 '../pages/tom-stablein.htm',
 '../pages/Mathew.htm',
 '../pages/Homepage.htm',
 '../pages/bull-runner-hours.htm',
 '../pages/Grandon-Gill.htm',
 '../pages/barb-W.htm',
 '../pages/FAQ.htm',
 '../pages/Anol.htm',
 '../pages/Insurance.htm',
 '../pages/bull-runner.htm',
 '../pages/Sunil-Mithas.htm',
 '../pages/Daniel-Z.htm',
 '../pages/Faculty.htm']

In [18]:
file1 = files[-1]
with open(file1, 'r', encoding='utf-8') as file:
        content = file.read()

In [19]:
soup = BeautifulSoup(content, 'html.parser')
title = soup.title.string if soup.title else "No title"
target_div = soup.find('div', class_='mainContent_well u-flexItem--largeExtra')
paragraphs = target_div.find_all('p')
main_content = ' '.join(p.get_text().strip() for p in paragraphs)
    
metadata = {
    'keywords': soup.find('meta', attrs={'name': 'keywords'})['content'] if soup.find('meta', attrs={'name': 'keywords'}) else "",
    'description': soup.find('meta', attrs={'name': 'description'})['content'] if soup.find('meta', attrs={'name': 'description'}) else "",
    # Add more metadata extraction as needed
}
main_content


'Manish Agrawal\xa0• Professor • TampaDon Berndt\xa0• Associate Professor • TampaAnol Bhattacherjee\xa0 • Professor • TampaKaushik Dutta\xa0• Information Systems and Management Director and ProfessorGrandon Gill\xa0• Professor and Academic Director, DBA programAlan R. Hevner\xa0• Distinguished University Professor and AAAS FellowWolfgang S. Jank, Anderson Professor and advisor, Center for Analytics and Creativity Joni L. Jones\xa0• Associate Professor • Academic Director, MS in BAIS program • Faculty Liaison,\n               Florida Center for Cybersecurity and Academic Director • TampaSunil Mithas\xa0• Professor and World Class Scholar • TampaMatthew Mullarkey\xa0• Associate Professor of Instruction and Director, DBA programRonald K. Satterfield\xa0• Professor of Instruction • Academic Director, Weekend Executive MS program• Instructor\n               • Member, Board of Directors, USF Federal Credit Union • TampaShivendu Shivendu\xa0• Associate Professor • TampaTim Smith • Professor o

In [22]:
import os
import json
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

class HTMLExtractor:
    def __init__(self):
        self.extractors = {}

    def add_extractor(self, name, title_func, content_func, metadata_func):
        self.extractors[name] = {
            'title': title_func,
            'content': content_func,
            'metadata': metadata_func
        }

    def extract(self, html, extractor_name):
        extractor = self.extractors.get(extractor_name)
        if not extractor:
            return None
        
        soup = BeautifulSoup(html, 'html.parser')
        return {
            'title': extractor['title'](soup),
            'content': extractor['content'](soup),
            'metadata': extractor['metadata'](soup)
        }

    def guess_extractor(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        for name, extractor in self.extractors.items():
            if extractor['content'](soup):
                return name
        return None

def process_htm_file(file_path, extractor, output_dir):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    extractor_name = extractor.guess_extractor(content)
    if extractor_name:
        extracted_data = extractor.extract(content, extractor_name)
        if extracted_data:
            extracted_data['extractor_used'] = extractor_name
            extracted_data['source'] = file_path

            # Create a LangChain Document
            doc = Document(
                page_content=extracted_data['content'],
                metadata={
                    'source': file_path,
                    'title': extracted_data['title'],
                    **extracted_data['metadata'],
                    'extractor_used': extractor_name
                }
            )

            # Use RecursiveCharacterTextSplitter
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
            splits = text_splitter.split_documents([doc])

            # Prepare data for JSON
            doc_data = {
                'metadata': doc.metadata,
                'chunks': [
                    {
                        'content': split.page_content,
                        'metadata': split.metadata
                    } for split in splits
                ]
            }

            # Save to individual JSON file
            output_file = os.path.join(output_dir, f"{os.path.basename(file_path)}.json")
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(doc_data, f, ensure_ascii=False, indent=2)

            return output_file
    
    print(f"Failed to extract data from {file_path}")
    return None

def process_directory(input_dir, output_dir, extractor):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    processed_files = []
    for filename in os.listdir(input_dir):
        if filename.endswith('.htm') or filename.endswith('.html'):
            file_path = os.path.join(input_dir, filename)
            output_file = process_htm_file(file_path, extractor, output_dir)
            if output_file:
                processed_files.append(output_file)
    
    return processed_files

# Example usage
extractor = HTMLExtractor()

# Add extractor for pages with a specific div class
extractor.add_extractor(
    'main_content_well',
    lambda soup: soup.title.string if soup.title else "No title",
    lambda soup: ' '.join(p.get_text().strip() for p in soup.find('div', class_='mainContent_well u-flexItem--largeExtra').find_all('p')) if soup.find('div', class_='mainContent_well u-flexItem--largeExtra') else "",
    lambda soup: {
        'keywords': soup.find('meta', attrs={'name': 'keywords'})['content'] if soup.find('meta', attrs={'name': 'keywords'}) else "",
        'description': soup.find('meta', attrs={'name': 'description'})['content'] if soup.find('meta', attrs={'name': 'description'}) else "",
    }
)

# Add extractor for pages with a specific article structure
extractor.add_extractor(
    'article_content',
    lambda soup: soup.find('h1', class_='article-title').get_text() if soup.find('h1', class_='article-title') else "No title",
    lambda soup: ' '.join(p.get_text().strip() for p in soup.find('article', class_='main-content').find_all('p')) if soup.find('article', class_='main-content') else "",
    lambda soup: {
        'author': soup.find('span', class_='author-name').get_text() if soup.find('span', class_='author-name') else "Unknown",
        'date': soup.find('time', class_='publish-date')['datetime'] if soup.find('time', class_='publish-date') else "Unknown",
    }
)

# Add extractors as before
# ... [Add your extractors here]

# Process the directory
input_directory = '../pages'
output_directory = '../pages/json'
processed_files = process_directory(input_directory, output_directory, extractor)

print(f"Processed {len(processed_files)} files. JSON files saved in {output_directory}")

# Example us

Processed 28 files. JSON files saved in ../pages/json
