In [None]:
# Uncomment to install html2text, which SimpleWebPageReader(html_to_text=True) needs:
# %pip install html2text

In [2]:
from pathlib import Path
import os
from llama_index import (
    download_loader,
    SimpleWebPageReader
)

import pandas as pd

In [3]:
# Params
# Folder that holds the datasets; the metadata workbooks below are read from it.
data_folder = "SolarBot Master Datasets"
# File names of the metadata workbooks, resolved relative to data_folder.
websites_metadata_path = "websites_metadata.xlsx"
pdf_metadata_path = "pdf_metadata.xlsx"

In [4]:
# Read the two metadata workbooks that live inside the data folder
websites_metadata = pd.read_excel(Path(data_folder) / websites_metadata_path)
pdf_metadata = pd.read_excel(Path(data_folder) / pdf_metadata_path)

# Stack both tables into one lookup table (row labels of each sheet are kept as-is)
full_metadata = pd.concat((websites_metadata, pdf_metadata))

In [5]:
# Function to return metadata
def get_file_metadata(file_name: str, mapping: pd.DataFrame) -> dict:
    """Look up the date and category recorded for one source.

    Args:
        file_name: Value to match against the ``file_path`` column.
        mapping: Table with ``file_path``, ``date`` and ``category`` columns.

    Returns:
        ``{'date': ..., 'category': ...}`` taken from the first row whose
        ``file_path`` equals ``file_name``.

    Raises:
        IndexError: If no row of ``mapping`` matches ``file_name``.
    """
    # Filter once and reuse the result for both columns.
    matches = mapping[mapping['file_path'] == file_name]
    if matches.empty:
        # Same exception type as indexing an empty array, but the message names the culprit.
        raise IndexError(f"no metadata row with file_path == {file_name!r}")

    return {
        'date': matches['date'].values[0],
        'category': matches['category'].values[0],
    }

In [7]:
# Obtain the full list of document sources listed in the metadata table
doc_list = full_metadata['file_path']

# Instantiate each loader once, up front
YoutubeTranscriptReader = download_loader("YoutubeTranscriptReader")
yt_loader = YoutubeTranscriptReader()

UnstructuredReader = download_loader("UnstructuredReader", refresh_cache=True)
unstructured_loader = UnstructuredReader()

# Created once here instead of once per web page inside the loop
web_loader = SimpleWebPageReader(html_to_text=True)

# Alternative pdf reader
# PDFReader = download_loader("PDFReader")
# pdf_loader = PDFReader()

# Load in all documents sequentially
doc_set = {}   # source (file_path value) -> list of documents loaded from it
all_docs = []  # flat list of every loaded document, in doc_list order

for file in doc_list:

    if "youtube" in file:
        doc = yt_loader.load_data(ytlinks=[file])
    elif "https://" in file or "http://" in file:
        # Plain-http pages are web pages too; before, they fell through to the
        # local-file branch and were looked up inside data_folder.
        doc = web_loader.load_data([file])
    else:
        # Anything else is treated as a file name relative to data_folder
        doc = unstructured_loader.load_data(file=Path(data_folder, file), split_documents=False)

    # Look the metadata up once per source, then give every document its own copy
    file_metadata = get_file_metadata(file, full_metadata)
    for d in doc:
        d.extra_info = dict(file_metadata)

    doc_set[file] = doc
    all_docs.extend(doc)

[nltk_data] Downloading package punkt to C:\Users\Zhong
[nltk_data]     Xuean\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Zhong Xuean\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
