In [66]:
import google.generativeai as genai
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By 
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [47]:
def scrapingData(url):
    DRIVER_PATH = 'C:/path/to/chromedriver/chromedriver-win64/chromedriver.exe'  
    options = Options()
    options.add_argument('--headless')
    service = Service(executable_path=DRIVER_PATH)  
    driver = webdriver.Chrome(service=service, options=options)
    driver.get(url)
    
    elements = driver.find_elements(By.XPATH, "//h2 | //p | //ul | //i")
    content = []
    headers=[]

    for element in elements:
        text = element.text.strip() 
        if element.tag_name == 'h2' or element.tag_name == 'i':
            headers.append(text)
        if not text: 
            continue
        if element.tag_name == 'ul':
            li_elements = element.find_elements(By.TAG_NAME, 'li')
            list_items = [li.text.strip() for li in li_elements if li.text.strip()]
            if list_items:
                content.append(" ".join(list_items))  
        else:
            content.append(text)  
    driver.quit()
    return content,headers
        

In [50]:
url = 'https://www.presight.io/privacy-policy.html'
content,headers = scrapingData(url)
headers

['PRIVACY POLICY',
 'Last updated 15 Sep 2023',
 'Information Collection and Use',
 'Types of Data Collected',
 'Personal Data',
 'Usage Data',
 'Use of Data',
 'Consent',
 'Access to Personal Information',
 'Accessing Your Personal Information',
 'Automated Edit Checks',
 'Disclosure of Information',
 'Sharing of Personal Data',
 'Google User Data and Google Workspace APIs',
 'Data Security',
 'Data Retention & Disposal',
 "Quality, Including Data Subjects' Responsibilities for Quality",
 'Monitoring and Enforcement',
 'Cookies',
 'Third-Party Websites',
 'Changes to Privacy Policy',
 'Contact Us',
 'Purposeful Use Only']

In [43]:
def index_content(content, headers):
    indexed_content = {}
    current_index = None
    title = None
    content_list = []
    for item in content:
        # Kiểm tra nếu item có trong headers
        if item in headers:
            # Nếu có title mới, lưu lại title và content cũ
            if current_index is not None:
                indexed_content[current_index] = {
                    'title': title,
                    'content': content_list
                }   
            # Thiết lập title mới và làm mới content
            title = item
            content_list = []
            current_index = len(indexed_content)  
        else:
            # Nếu không trùng, thêm item vào content của title hiện tại
            content_list.append(item)
    
    # Lưu phần cuối cùng
    if current_index is not None:
        indexed_content[current_index] = {
            'title': title,
            'content': content_list
        }
    return indexed_content

indexed_content = index_content(content, headers)

# In kết quả
for index, data in indexed_content.items():
    print(f"Index {index}:")
    print(f"  Title: {data['title']}")
    print(f"  Content: {data['content']}")

Index 0:
  Title: PRIVACY POLICY
  Content: []
Index 1:
  Title: Last updated 15 Sep 2023
  Content: ['At Presight, we are committed to protecting the privacy of our customers and visitors to our website. This Privacy Policy explains how we collect, use, and disclose information about our customers and visitors.']
Index 2:
  Title: Information Collection and Use
  Content: ['We collect several different types of information for various purposes to provide and improve our Service to you.']
Index 3:
  Title: Types of Data Collected
  Content: []
Index 4:
  Title: Personal Data
  Content: ['While using our Service, we may ask you to provide us with certain personally identifiable information that can be used to contact or identify you ("Personal Data"). Personally identifiable information may include, but is not limited to:', 'Email address First name and last name Phone number Address, State, Province, ZIP/Postal code, City Cookies and Usage Data']
Index 5:
  Title: Usage Data
  Content:

In [52]:
indexed_content[0] = {'title': 'PRIVACY POLICY, LAST UPDATE', 'content': ['Last updated 15 Sep 2023']}
indexed_content[1]['title'] = 'PRIVACY POLICY, description'
indexed_content[3]['content'] = 'Personal Data, Usage Data'
indexed_content[5]['content'] = 'Accessing Your Personal Information, Automated Edit Checks'
indexed_content

{0: {'title': 'PRIVACY POLICY, LAST UPDATE',
  'content': ['Last updated 15 Sep 2023']},
 1: {'title': 'PRIVACY POLICY, description',
  'content': ['At Presight, we are committed to protecting the privacy of our customers and visitors to our website. This Privacy Policy explains how we collect, use, and disclose information about our customers and visitors.']},
 2: {'title': 'Information Collection and Use',
  'content': ['We collect several different types of information for various purposes to provide and improve our Service to you.']},
 3: {'title': 'Types of Data Collected',
  'content': 'Personal Data, Usage Data'},
 4: {'title': 'Personal Data',
  'content': ['While using our Service, we may ask you to provide us with certain personally identifiable information that can be used to contact or identify you ("Personal Data"). Personally identifiable information may include, but is not limited to:',
   'Email address First name and last name Phone number Address, State, Province, ZIP

In [69]:
class Model():
    def separate(self, indexed_content):
        titles = {key: value['title'] for key, value in indexed_content.items()}
        content = {key: value['content'] for key, value in indexed_content.items()}
        return titles, content
    def chat_bot(self, indexed_content, query):
        titles, content = self.separate(indexed_content)
        model = SentenceTransformer('all-MiniLM-L6-v2')
        titles_embedding = list(titles.values())
        titles_embedding = model.encode(titles_embedding)
        query_embedding = model.encode([query])
        similarities = cosine_similarity(query_embedding, titles_embedding)
        most_similar_index = np.argmax(similarities)
        my_api_key = os.getenv("GEMINI_API_KEY")
        genai.configure(api_key=my_api_key)
        model = genai.GenerativeModel("gemini-1.5-flash")
        response = model.generate_content(f"Hãy trả lời {query}, dựa trên thông tin{content[most_similar_index]}")
        return response

In [72]:
model = Model()
query = input()
response = model.chat_bot(indexed_content, query)
print(f"Q: {query}\nA:{response}")



Q: what is the last updated day
A:response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "The last updated day is September 15th, 2023.\n"
              }
            ],
            "role": "model"
          },
          "finish_reason": "STOP",
          "avg_logprobs": -0.11557210816277398
        }
      ],
      "usage_metadata": {
        "prompt_token_count": 28,
        "candidates_token_count": 18,
        "total_token_count": 46
      }
    }),
)
