In [1]:
import google.generativeai as genai
import os
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By 
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
def scrapingData(url, driver_path="C:/edgedriver_win32/msedgedriver.exe"):
    """Scrape heading and body text from a web page using headless Edge.

    Every ``<h2>``, ``<p>``, ``<ul>`` and ``<i>`` element on the page is
    visited. Elements whose stripped text is empty are ignored.

    Args:
        url: Address of the page to load.
        driver_path: Location of the ``msedgedriver`` executable. Defaults to
            the path that was previously hard-coded in this function.

    Returns:
        A ``(content, headers)`` tuple of lists of strings:
        * ``content`` - one entry per non-empty element; a ``<ul>``
          contributes a single entry made of its non-empty ``<li>`` texts
          joined by spaces.
        * ``headers`` - the text of the non-empty ``<h2>`` and ``<i>``
          elements (these strings also appear in ``content``).
    """
    options = Options()
    options.add_argument('--headless')
    service = Service(executable_path=driver_path)
    # Options and Service come from selenium.webdriver.edge, so the matching
    # driver class is Edge rather than Chrome.
    driver = webdriver.Edge(service=service, options=options)

    content = []
    headers = []
    try:
        driver.get(url)
        elements = driver.find_elements(By.XPATH, "//h2 | //p | //ul | //i")

        for element in elements:
            text = element.text.strip()
            # Skip blank elements before recording anything, so that empty
            # strings never end up in `headers`.
            if not text:
                continue

            tag = element.tag_name
            if tag in ('h2', 'i'):
                headers.append(text)

            if tag == 'ul':
                list_items = []
                for li in element.find_elements(By.TAG_NAME, 'li'):
                    li_text = li.text.strip()
                    if li_text:
                        list_items.append(li_text)
                if list_items:
                    content.append(" ".join(list_items))
            else:
                content.append(text)
    finally:
        # Always shut the browser down, even if loading or parsing failed.
        driver.quit()

    return content, headers
        

In [3]:
# Scrape the Presight privacy-policy page. `content` receives the scraped text
# blocks and `headers` the subset of them that came from <h2>/<i> elements.
url = 'https://www.presight.io/privacy-policy.html'
content, headers = scrapingData(url)
# Display the scraped text blocks as the cell output.
content

['PRIVACY POLICY',
 'Last updated 15 Sep 2023',
 'At Presight, we are committed to protecting the privacy of our customers and visitors to our website. This Privacy Policy explains how we collect, use, and disclose information about our customers and visitors.',
 'Information Collection and Use',
 'We collect several different types of information for various purposes to provide and improve our Service to you.',
 'Types of Data Collected',
 'Personal Data',
 'While using our Service, we may ask you to provide us with certain personally identifiable information that can be used to contact or identify you ("Personal Data"). Personally identifiable information may include, but is not limited to:',
 'Email address First name and last name Phone number Address, State, Province, ZIP/Postal code, City Cookies and Usage Data',
 'Usage Data',
 'We may also collect information that your browser sends whenever you visit our Service or when you access the Service by or through a mobile device ("Us

In [4]:
# Display the heading strings returned by scrapingData.
headers

['PRIVACY POLICY',
 'Last updated 15 Sep 2023',
 'Information Collection and Use',
 'Types of Data Collected',
 'Personal Data',
 'Usage Data',
 'Use of Data',
 'Consent',
 'Access to Personal Information',
 'Accessing Your Personal Information',
 'Automated Edit Checks',
 'Disclosure of Information',
 'Sharing of Personal Data',
 'Google User Data and Google Workspace APIs',
 'Data Security',
 'Data Retention & Disposal',
 "Quality, Including Data Subjects' Responsibilities for Quality",
 'Monitoring and Enforcement',
 'Cookies',
 'Third-Party Websites',
 'Changes to Privacy Policy',
 'Contact Us',
 'Purposeful Use Only']

In [5]:
def index_content(content, headers):
    """Group scraped text items under the heading that precedes them.

    Args:
        content: Sequence of text items in page order (headings included).
        headers: Collection of heading strings; an item of ``content`` is
            treated as a heading when ``item in headers`` is true.

    Returns:
        A dict keyed by consecutive integers starting at 0, one entry per
        heading encountered, each of the form
        ``{'title': <heading>, 'content': [<items up to the next heading>]}``.
        Items that occur before the first heading are not included. An empty
        dict is returned when no item matches a heading.
    """
    sections = []
    for entry in content:
        if entry in headers:
            # A heading opens a new section with an empty body.
            sections.append({'title': entry, 'content': []})
        elif sections:
            # Body text belongs to the most recently opened section.
            sections[-1]['content'].append(entry)
    # Number the sections in the order in which they were opened.
    return dict(enumerate(sections))

# Group the scraped text blocks under their headings.
indexed_content = index_content(content, headers)

# Print the result
for index, data in indexed_content.items():
    print(f"Index {index}:")
    print(f"\tTitle: {data['title']}")
    print(f"\tContent: {data['content']}")

Index 0:
	Title: PRIVACY POLICY
	Content: []
Index 1:
	Title: Last updated 15 Sep 2023
	Content: ['At Presight, we are committed to protecting the privacy of our customers and visitors to our website. This Privacy Policy explains how we collect, use, and disclose information about our customers and visitors.']
Index 2:
	Title: Information Collection and Use
	Content: ['We collect several different types of information for various purposes to provide and improve our Service to you.']
Index 3:
	Title: Types of Data Collected
	Content: []
Index 4:
	Title: Personal Data
	Content: ['While using our Service, we may ask you to provide us with certain personally identifiable information that can be used to contact or identify you ("Personal Data"). Personally identifiable information may include, but is not limited to:', 'Email address First name and last name Phone number Address, State, Province, ZIP/Postal code, City Cookies and Usage Data']
Index 5:
	Title: Usage Data
	Content: ['We may al

In [6]:
# Manual fix-ups for sections of this particular page.
# NOTE(review): the numeric indexes are hard-coded, so they presumably only
# match the page as it was scraped for this run - confirm after re-scraping.
# Entry 0 is replaced entirely and entry 1 only gets a new title.
indexed_content[0] = {'title': 'PRIVACY POLICY - Last update', 'content': ['Last updated 15 Sep 2023']}
indexed_content[1]['title'] = 'PRIVACY POLICY - Description'
# NOTE(review): these two assignments store a plain string, whereas
# index_content stores a list of strings under 'content' - consider wrapping
# them in a list for consistency.
indexed_content[3]['content'] = 'Personal Data, Usage Data'
indexed_content[8]['content'] = 'Accessing Your Personal Information, Automated Edit Checks'
# Display the patched index as the cell output.
indexed_content

{0: {'title': 'PRIVACY POLICY - Last update',
  'content': ['Last updated 15 Sep 2023']},
 1: {'title': 'PRIVACY POLICY - Description',
  'content': ['At Presight, we are committed to protecting the privacy of our customers and visitors to our website. This Privacy Policy explains how we collect, use, and disclose information about our customers and visitors.']},
 2: {'title': 'Information Collection and Use',
  'content': ['We collect several different types of information for various purposes to provide and improve our Service to you.']},
 3: {'title': 'Types of Data Collected',
  'content': 'Personal Data, Usage Data'},
 4: {'title': 'Personal Data',
  'content': ['While using our Service, we may ask you to provide us with certain personally identifiable information that can be used to contact or identify you ("Personal Data"). Personally identifiable information may include, but is not limited to:',
   'Email address First name and last name Phone number Address, State, Province, Z

In [11]:
class Chatbot():
    """Answer a question from the section whose title best matches it.

    The query and all section titles are embedded with ``encoder``; the
    section with the highest cosine similarity is handed to ``model`` as
    context for generating the answer.
    """

    def __init__(self, encoder, model):
        """Store the collaborators and read the API key from the environment.

        Args:
            encoder: Object with an ``encode(list_of_str)`` method returning
                one embedding vector per input string.
            model: Object with a ``generate_content(prompt)`` method.
        """
        # Read from the API_KEY environment variable; None when it is unset.
        self.__api_key = os.getenv("API_KEY")
        self.encoder = encoder
        self.model = model

    def separate(self, indexed_content):
        """Split the index into two dicts that share the original keys.

        Args:
            indexed_content: Mapping of key -> ``{'title': ..., 'content': ...}``.

        Returns:
            ``(titles, content)`` where ``titles[key]`` is the section title
            and ``content[key]`` is the section content.
        """
        titles = {key: value['title'] for key, value in indexed_content.items()}
        content = {key: value['content'] for key, value in indexed_content.items()}
        return titles, content

    def chat_bot(self, indexed_content, query):
        """Generate an answer to ``query`` from the best-matching section.

        Args:
            indexed_content: Mapping of key -> ``{'title': ..., 'content': ...}``.
            query: The user's question.

        Returns:
            Whatever ``self.model.generate_content`` returns for the prompt.

        Raises:
            ValueError: If ``indexed_content`` is empty.
        """
        if not indexed_content:
            raise ValueError("indexed_content is empty; there is no section to answer from")

        titles, content = self.separate(indexed_content)
        # Keep the keys in the same order as the embedded titles so that the
        # argmax position can be mapped back to the real dict key.
        keys = list(titles)
        titles_embedding = self.encoder.encode([titles[key] for key in keys])
        query_embedding = self.encoder.encode([query])
        similarities = cosine_similarity(query_embedding, titles_embedding)
        # argmax yields a position in `keys`, not a key of `content`.
        best_key = keys[int(np.argmax(similarities))]

        genai.configure(api_key=self.__api_key)

        response = self.model.generate_content(
            f"Answer this query: {query}, based on the information: {content[best_key]}"
        )
        return response

In [13]:
# Embedding model used by Chatbot to compare the query with section titles.
encoder = SentenceTransformer('all-MiniLM-L6-v2')
# Generative model that Chatbot asks to write the answer.
model = genai.GenerativeModel("gemini-1.5-flash")

chatbot = Chatbot(encoder=encoder, model=model)
query = "Data security policy of this website?"
response = chatbot.chat_bot(indexed_content, query)
# Show the question together with the text of the generated answer.
print(f"Q: {query}")
print(f"A: {response.text}")

Q: Data security policy of this website?
A: This website employs a data security policy that includes:

* **Data Encryption:**  All data is encrypted both while being transmitted (in transit) and while stored (at rest) using industry-standard encryption methods.

* **Regular Security Audits and Assessments:**  The platform undergoes regular security audits and vulnerability assessments to identify and address potential weaknesses.

* **Employee Training:**  Employees receive training on best practices for data security.

* **Restricted Access:** Access to customer data is controlled and limited to only those who need it for their job ("need-to-know" basis).

