In [47]:
import json
import pathlib
from bs4 import BeautifulSoup

In [48]:
def _tag_text(tag) -> str:
    """Return the stripped text of a BeautifulSoup tag, or '' when it is None."""
    if tag is None:
        return ''
    # `.string` is None for tags that contain nested markup, which would make
    # a later `.strip()` raise AttributeError; `get_text` handles both cases.
    return tag.get_text(' ', strip=True)


def convert_html_to_json(html_file: pathlib.Path):
    """Parse one saved article page and write its fields to a JSON file.

    The result is written to ``./data/json/<html_file.name>.json`` (the full
    input file name, extension included, is used as the prefix) and contains
    the keys ``title``, ``author``, ``category``, ``date`` and ``content``.
    Fields whose element is missing from the page are stored as ''.

    Args:
        html_file: Path of the HTML file to convert.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        UnicodeDecodeError: If the input file is not valid UTF-8.
    """
    # Parse first and only then open the output, so a failure while reading
    # or parsing does not leave an empty JSON file behind.
    with open(html_file, 'r', encoding='utf-8') as h:
        soup = BeautifulSoup(h, 'html.parser')

    # Article content: every non-empty `p` element joined by single spaces,
    # with embedded newlines flattened to spaces.
    paragraphs = (p.text.strip() for p in soup.find_all('p'))
    content = ' '.join(text for text in paragraphs if text).replace('\n', ' ')

    article = {
        'title': _tag_text(soup.find(class_='css-om3e2')),
        'author': _tag_text(soup.find(class_='css-PH621')),
        'category': _tag_text(soup.find(class_='css-vV9lX')),
        'date': _tag_text(soup.find('time')),
        'content': content
    }

    json_path = pathlib.Path('./data/json') / (html_file.name + '.json')
    # Create the output directory on first use instead of assuming it exists.
    json_path.parent.mkdir(parents=True, exist_ok=True)
    with open(json_path, 'w', encoding='utf-8') as j:
        json.dump(article, j, indent=4)

In [None]:
def test_convert_html_to_json():
    """Round-trip a small HTML fixture through ``convert_html_to_json``.

    Writes a temporary HTML file in the working directory, converts it, and
    checks the JSON that lands in ``./data/json``. Both files are removed
    afterwards, even when the assertion fails.

    Raises:
        AssertionError: If the generated JSON differs from the expected dict.
    """
    # Create a test HTML file
    html = """
    <html>
    <head>
    <title>Test Article</title>
    </head>
    <body>
        <h1 class="css-om3e2">Test Title</h1>
        <span class="css-PH621">Test Author</span>
        <span class="css-vV9lX">Test Category</span>
        <time>2023-02-13</time>
        <p>Test content 1</p>
        <p>Test content 2</p>
    </body>
    </html>
    """
    html_file = pathlib.Path('./test_html.html')
    # convert_html_to_json appends '.json' to the *full* input file name, so
    # the output is 'test_html.html.json', not 'test_html.json'.
    json_file = pathlib.Path('./data/json') / (html_file.name + '.json')
    json_file.parent.mkdir(parents=True, exist_ok=True)

    try:
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(html)

        # Call the function to convert the HTML to JSON
        convert_html_to_json(html_file)

        # Read the generated JSON file and compare its content to the expected result
        with open(json_file, 'r', encoding='utf-8') as f:
            result = json.load(f)

        expected = {
            'title': 'Test Title',
            'author': 'Test Author',
            'category': 'Test Category',
            'date': '2023-02-13',
            'content': 'Test content 1 Test content 2'
        }

        assert result == expected, f"Expected {expected}, but got {result}"
    finally:
        # Clean up the test files; pathlib is used so no `os` import is needed.
        for path in (html_file, json_file):
            if path.exists():
                path.unlink()

In [50]:
def main():
    """Convert every file in ``./data/raw_html`` to JSON and print the count."""
    raw_dir = pathlib.Path('./data/raw_html')

    # Only regular files are converted: `iterdir()` also yields
    # sub-directories (e.g. a notebook checkpoint folder), which would make
    # `open()` fail. Sorting gives a deterministic processing order.
    html_files = sorted(path for path in raw_dir.iterdir() if path.is_file())

    for html_file in html_files:
        convert_html_to_json(html_file=html_file)

    print(f'{len(html_files)} files converted from html to json')

if __name__ == '__main__':
    main()

1920 files converted from html to json
