pip3 install chromadb

pip3 install sentence-transformers

pip3 install google-generativeai

pip3 install bs4

In [1]:
import os
import uuid
import chromadb
from bs4 import BeautifulSoup
import google.generativeai as genai
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the Google Generative AI (Gemini) API key from the environment so the
# secret is never written into the notebook. A missing variable raises KeyError here.
GEMINI_AI_API_KEY = os.environ['GEMINI_AI_API_KEY']

In [3]:
# Create the Chroma client and name the collection that holds the ESG documents.
client = chromadb.Client()
collection_name = 'esg_collection'

# Reuse the collection when the client already has it; otherwise create it.
# NOTE(review): `except Exception` treats *any* failure of get_collection as
# "collection missing". Consider client.get_or_create_collection(), or catch the
# specific not-found error raised by the installed chromadb version.
try:
    collection = client.get_collection(collection_name)
    print(f"Collection '{collection_name}' already exists. Using the existing collection.")
except Exception:
    collection = client.create_collection(collection_name)
    print(f"Collection '{collection_name}' created.")

# Initialise the sentence-transformers embedding model used by embed_text()
# (TODO from the author: change to an OpenAI embedding model later).
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Register the API key with the google.generativeai client library.
genai.configure(api_key = GEMINI_AI_API_KEY)

Collection 'esg_collection' created.


In [4]:
def extract_text_from_html(html_file):
    '''Extract the paragraph text from an HTML file.

    Args:
        html_file: Path of the HTML document to read.

    Returns:
        The text of every ``<p>`` element joined by single spaces, or an
        empty string when the document contains no ``<p>`` elements.
    '''
    # Open in binary mode so BeautifulSoup detects the document's encoding
    # itself; text mode would decode with the platform's default encoding and
    # can fail or garble characters on files saved in a different charset.
    with open(html_file, 'rb') as file:
        soup = BeautifulSoup(file, 'html.parser')
    return ' '.join(p.get_text() for p in soup.find_all('p'))


def embed_text(esg_text):
    '''Encode a piece of text with the module-level embedding model.

    Args:
        esg_text: Text to embed (a document body or a user query).

    Returns:
        Whatever ``embedding_model.encode`` returns for ``esg_text``.
    '''
    embedding = embedding_model.encode(esg_text)
    return embedding


def store_embedding(embedding, esg_text):
    '''Add one document and its embedding to the module-level collection.

    Args:
        embedding: Embedding vector for ``esg_text``.
        esg_text: The document text stored alongside the embedding.

    The record id is a freshly generated random UUID4 string.
    '''
    record_id = str(uuid.uuid4())
    collection.add(
        ids=[record_id],
        embeddings=[embedding],
        documents=[esg_text],
    )

def process_html_files(html_files):
    '''Index HTML files: extract their text, embed it and store it.

    Args:
        html_files: Iterable of paths to HTML documents.

    Files whose extracted text is empty are skipped.
    '''
    for path in html_files:
        text = extract_text_from_html(path)
        if not text:
            # Nothing extracted from this file, so there is nothing to embed.
            continue
        store_embedding(embed_text(text), text)


def retrieve_esg_info(query):
    '''Look up the stored document closest to a user query.

    Args:
        query: The user's question as plain text.

    Returns:
        The documents returned for the query (at most one is requested),
        joined by single spaces.
    '''
    matches = collection.query(
        query_embeddings=[embed_text(query)],
        n_results=1,
    )
    # Only one query embedding is sent, so its documents are at index 0.
    top_documents = matches['documents'][0]
    return ' '.join(top_documents)


def generate_query_response(query, context_text):
    '''Generate a query-specific response using Google Generative AI.

    Args:
        query: The user's question.
        context_text: Retrieved text the model should base its answer on.

    Returns:
        The generated answer text, or ``None`` when the model produced no
        usable text (empty answer, or a response without a text part).
    '''
    model = genai.GenerativeModel(model_name='gemini-1.5-pro')
    prompt = f'Based on the following context, answer this question: {query}\n\nContext:\n{context_text}'
    response = model.generate_content(prompt)
    try:
        answer = response.text
    except ValueError:
        # The `.text` accessor raises ValueError when the response carries no
        # valid text part (for example a blocked prompt); report "no answer"
        # the same way an empty answer is reported.
        return None
    return answer or None


def display_results(query):
    '''Answer a user query and print the query followed by the answer.

    Args:
        query: The user's question.

    The query is printed first; the context is then retrieved, a response is
    generated from it, and the response is printed followed by blank lines.
    '''
    print(f'Query: {query}')
    answer = generate_query_response(query, retrieve_esg_info(query))

    # Trailing newlines reproduce the blank-line spacing after each result.
    print(f'ESG Initiatives: {answer}\n\n')


# Example Usage
# Index one local HTML report, then ask a single question about it.
if __name__ == '__main__':
    # NOTE(review): absolute path into one user's Downloads folder; this only
    # works on that machine. Consider a configurable data directory instead.
    html_files = ['/Users/yxiao/Downloads/Prod224_6666_04624360_20230331.html']
    process_html_files(html_files)

    # Retrieves context for the question and prints the generated answer.
    user_query = 'What sustainability initiatives does Sybron Limited have?'
    display_results(user_query)

Query: What sustainability initiatives does Sybron Limited have?
ESG Initiatives: Sybron Limited has achieved carbon neutrality and obtained NCZ Silver certification.  This certification signifies that all organizational emissions have been independently assessed and reported on in compliance with ISO 14064 and the GHG protocol corporate standard.  They also embrace diversity in all its forms as part of their social responsibility initiatives.





In [5]:
# Clean-up: remove the collection named by `collection_name` from the client.
client.delete_collection(collection_name)