In [1]:
import time
from typing import Optional

import pandas as pd
from tqdm import tqdm
from trafilatura import fetch_url, extract
from trafilatura.sitemaps import sitemap_search

In [2]:
def get_urls_from_sitemap(resource_url: str) -> list:
    """
    Look up the URLs advertised for a website via trafilatura.

    Parameters
    ----------
    resource_url : str
        Address handed unchanged to trafilatura's ``sitemap_search``.

    Returns
    -------
    list
        Whatever ``sitemap_search`` returns for ``resource_url``.
    """
    return sitemap_search(resource_url)

In [1]:
def extract_article(url: str) -> Optional[str]:
    """
    Download a page and extract its main text content with trafilatura.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    Optional[str]
        The value returned by trafilatura's ``extract`` (called with
        ``favor_precision=True``), or ``None`` when the download yielded
        nothing to parse.
    """
    downloaded = fetch_url(url)

    # fetch_url signals a failed download with None; do not hand an empty
    # document to the extractor, just report "no article" to the caller.
    if not downloaded:
        return None

    return extract(downloaded, favor_precision=True)

In [4]:
def create_dataset(list_of_websites: list, delay: float = 0.5) -> pd.DataFrame:
    """
    Create a dataframe of article texts from a list of websites.

    Each website is passed to get_urls_from_sitemap, and every URL it returns
    is passed to extract_article.

    Parameters
    ----------
    list_of_websites : list
        Website URLs to process. A single URL given as a plain string is
        treated as a one-element list instead of being iterated character
        by character.
    delay : float, default 0.5
        Seconds to sleep after each URL has been processed. A value of 0
        or less disables the pause.

    Returns
    -------
    pd.DataFrame
        Columns ``url`` and ``article``, with duplicate rows and rows that
        contain missing values removed. Both columns are present even when
        no rows were collected.
    """
    if isinstance(list_of_websites, str):
        list_of_websites = [list_of_websites]

    data = []
    for website in tqdm(list_of_websites, desc="Websites"):
        for url in tqdm(get_urls_from_sitemap(website), desc="URLs"):
            data.append({"url": url, "article": extract_article(url)})
            if delay > 0:
                time.sleep(delay)

    # Naming the columns keeps the schema stable when `data` is empty;
    # otherwise the resulting frame would have no columns at all.
    df = pd.DataFrame(data, columns=["url", "article"])

    return df.drop_duplicates().dropna()

### Broken URLs:
These sites either have an empty HTML tree somewhere, which kills my kernel and forces a restart, or they just return an empty dataframe:


- https://www.fortheculturegames.us/
- https://www.kilcommonslaw.com/
- https://tourguidegames.com/
- https://www.balancebound.co/
- https://mindfulmamamentor.com/
- https://hosthelpr.com/
- https://coreptpilates.com/
- https://pinotspalette.com/danville/

In [8]:
if __name__ == "__main__":
    # Websites to scrape. Add more URLs to this list to process several
    # sites in one run; the list is handed to create_dataset below.
    # NOTE(review): inside a notebook kernel __name__ is normally "__main__",
    # so this guard is expected to run every time the cell is executed.
    list_of_websites = ["https://drinkyoro.com/"]

    # `df` is kept as a global because a later cell writes it to CSV.
    df = create_dataset(list_of_websites)

Websites:   0%|                                           | 0/1 [00:00<?, ?it/s]
URLs:   0%|                                              | 0/32 [00:00<?, ?it/s][A
URLs:   3%|█▏                                    | 1/32 [00:00<00:25,  1.23it/s][A
URLs:   6%|██▍                                   | 2/32 [00:01<00:23,  1.28it/s][A
URLs:   9%|███▌                                  | 3/32 [00:02<00:23,  1.21it/s][A
URLs:  12%|████▊                                 | 4/32 [00:03<00:23,  1.20it/s][A
URLs:  16%|█████▉                                | 5/32 [00:04<00:23,  1.17it/s][A
URLs:  19%|███████▏                              | 6/32 [00:05<00:23,  1.13it/s][A
URLs:  22%|████████▎                             | 7/32 [00:06<00:22,  1.12it/s][A
URLs:  25%|█████████▌                            | 8/32 [00:06<00:21,  1.12it/s][A
URLs:  28%|██████████▋                           | 9/32 [00:07<00:19,  1.17it/s][A
URLs:  31%|███████████▌                         | 10/32 [00:08<00:19,  1.15it/s

In [10]:
# Save the scraped dataset to 'yoro.csv' (path is relative to the working directory).
# NOTE(review): with pandas' defaults the DataFrame index is written as an
# unnamed first column; pass index=False if that column is not wanted.
df.to_csv('yoro.csv')