In [7]:
import os

import pinecone
from langchain_community.vectorstores import Pinecone as PineconeStore

# SECURITY: never hard-code the Pinecone API key in the notebook. Any key that
# has been saved or committed in a notebook must be treated as leaked and
# rotated. The key is read from the PINECONE_API_KEY environment variable; if
# the variable is missing this fails immediately with a KeyError naming it.
pc = pinecone.Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index('chat-agh')
index

<pinecone.data.index.Index at 0x110eb4350>

In [8]:
# Bare attribute access: this only displays the bound `upsert` method of
# `index`; the method is not called here.
index.upsert

<bound method Index.upsert of <pinecone.data.index.Index object at 0x110eb4350>>

In [9]:
from usp.tree import sitemap_tree_for_homepage
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from src.utils import config

# Splitter used by load_html(); chunk size and overlap are read from the
# "sources_loader" section of the project config.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=config["sources_loader"]["chunk_size"],
    chunk_overlap=config["sources_loader"]["chunk_overlap"]
)


# Site roots that are passed one by one to get_pages_from_sitemap() in the
# collection loop further down.
domains = [
    'https://www.miasteczko.agh.edu.pl'
]

def get_pages_from_sitemap(domain):
    """Return the URL of every page listed in the sitemap tree of ``domain``.

    Any exception raised while fetching or walking the sitemap is printed and
    an empty list is returned instead, so partial results are never handed
    back to the caller.
    """
    try:
        sitemap = sitemap_tree_for_homepage(domain)
        urls = [entry.url for entry in sitemap.all_pages()]
    except Exception as e:
        print(f"Error while fetching url's for domain {domain}: {e}")
        return []
    return urls

def format_link(link: str):
    """Return ``link`` cut off at its first ``#`` (the fragment is dropped)."""
    without_fragment, _, _ = link.partition('#')
    return without_fragment

def filter(page):
    """Return True when the URL ``page`` should be skipped.

    A URL is skipped if it contains any of the year strings 2016 through 2021
    or if it is longer than 110 characters.

    Note: this name shadows the built-in ``filter`` in the notebook namespace.
    """
    mentions_old_year = any(f'20{year}' in page for year in range(16, 22))
    return mentions_old_year or len(page) > 110

# For each domain, keep the sitemap URLs that filter() does not reject and
# strip their fragments before adding them to the overall page list.
pages = []
for domain in domains:
    new_links = [
        format_link(url)
        for url in get_pages_from_sitemap(domain)
        if not filter(url)
    ]
    pages += new_links
    print(f"{len(new_links)} url's found in domain {domain}")

def format_html(doc):
    """Collapse each run of four newlines in ``doc.page_content`` to two.

    The document is modified in place and also returned. Only a single
    left-to-right pass is made, so longer runs are shortened but not
    necessarily reduced to two newlines.
    """
    pieces = doc.page_content.split('\n\n\n\n')
    doc.page_content = '\n\n'.join(pieces)
    return doc

def load_html(link):
    """Download ``link``, split it into chunks and return the usable ones.

    The page is loaded with ``WebBaseLoader``, split with the notebook's
    ``text_splitter``, each chunk is passed through ``format_html``, and
    chunks whose content is 10 characters or shorter are discarded.
    """
    raw_pages = WebBaseLoader(link).load()
    kept = []
    for chunk in text_splitter.split_documents(raw_pages):
        chunk = format_html(chunk)
        if len(chunk.page_content) > 10:
            kept.append(chunk)
    return kept

# Load and chunk every collected page. A page that fails to load is reported
# and skipped so the remaining pages are still processed.
documents = []
print('Loading htmls')
for i, source_url in enumerate(pages):
    print(f'{i} out of {len(pages)} documents loaded')
    try:
        documents += load_html(source_url)
    except Exception as e:
        print(f"Error while fetching file {source_url}, error message: {e}")

documents

2024-03-24 18:56:43,004 INFO usp.fetch_parse [94174/MainThread]: Fetching level 0 sitemap from https://www.miasteczko.agh.edu.pl/robots.txt...
2024-03-24 18:56:43,004 INFO usp.helpers [94174/MainThread]: Fetching URL https://www.miasteczko.agh.edu.pl/robots.txt...
2024-03-24 18:56:43,353 INFO usp.fetch_parse [94174/MainThread]: Parsing sitemap from URL https://www.miasteczko.agh.edu.pl/robots.txt...
2024-03-24 18:56:43,353 INFO usp.fetch_parse [94174/MainThread]: Fetching level 0 sitemap from https://www.miasteczko.agh.edu.pl/sitemap.xml...
2024-03-24 18:56:43,354 INFO usp.helpers [94174/MainThread]: Fetching URL https://www.miasteczko.agh.edu.pl/sitemap.xml...
2024-03-24 18:56:43,652 INFO usp.fetch_parse [94174/MainThread]: Parsing sitemap from URL https://www.miasteczko.agh.edu.pl/sitemap.xml...
2024-03-24 18:56:43,657 INFO usp.fetch_parse [94174/MainThread]: Fetching level 0 sitemap from https://www.miasteczko.agh.edu.pl/sitemap...
2024-03-24 18:56:43,657 INFO usp.helpers [94174/Mai

23 url's found in domain https://www.miasteczko.agh.edu.pl
Loading htmls
0 out of 23 documents loaded
1 out of 23 documents loaded
2 out of 23 documents loaded
3 out of 23 documents loaded
4 out of 23 documents loaded
5 out of 23 documents loaded
6 out of 23 documents loaded
7 out of 23 documents loaded
8 out of 23 documents loaded
9 out of 23 documents loaded
10 out of 23 documents loaded
11 out of 23 documents loaded
12 out of 23 documents loaded
13 out of 23 documents loaded
14 out of 23 documents loaded
15 out of 23 documents loaded
16 out of 23 documents loaded
17 out of 23 documents loaded
18 out of 23 documents loaded
19 out of 23 documents loaded
20 out of 23 documents loaded
21 out of 23 documents loaded
22 out of 23 documents loaded


[Document(page_content='Miasteczko Studenckie AGH - Strona Główna\n\n\n\n\n\n\n\nPomiń nawigację\n\n\n\n\n\nMenu\nDla mieszkańców\n\n\n\n\nWyszukiwanie\n\n\nMENU\n\nO nas\n\nGaleria zdjęć\n\nCiekawostki\n\nBezpieczeństwo\n\nKontakt\n\nFAQ\n\nDeklaracja dostępności\n\nFilmy PJM\n\n\n\n\n\n\n\n23.112023r.\n\xa0\nCzwartek\n\nWolna sprzedaż miejsc w trakcie roku akademickiego 2023/2024\nMożliwość zakwaterowania w domach studenckich Miasteczka Studenckiego AGH.\n\n\n\n\n\n\nZimowa akcja krwiodawstwa w AGH \n\nZimowa akcja krwiodastwa w AGH \n\n\n\r\n\t\t01/15/24\r\n\t\n\nWięcej\n\n\n\n\nWolna sprzedaż miejsc w trakcie roku akademickiego 2023/2024\n\nMożliwość zakwaterowania w domach studenckich Miasteczka Studenckiego AGH. \n\n\n\r\n\t\t11/23/23\r\n\t\n\nWięcej\n\n\n\n\nDyżur psychologiczny\n\nAdministracja MS AGH informuje o bezpłatnym dyżurze psychologicznym\xa0dla mieszkańców Miasteczka Studenckiego. \n\n\n\r\n\t\t10/15/23\r\n\t\n\nWięcej\n\n\n\n\nInformacje na temat zakwaterowania w MS 

In [11]:
import os
from langchain_openai import OpenAIEmbeddings

# No credentials are passed explicitly to the objects created below, so the
# values kept in the project config are exported to the environment first.
os.environ['PINECONE_API_KEY'] = config['pinecone']['api_key']
os.environ['OPENAI_API_KEY'] = config['openai']['api_key']

pinecone_env = config['pinecone']['environment']
index_name = config['pinecone']['index_name']

embeddings = OpenAIEmbeddings()

# `pc` is rebound here to the LangChain vector store wrapping the existing
# index `index_name`; the following cell calls pc.add_documents() on it.
pc = PineconeStore.from_existing_index(index_name, embeddings)
pc

<langchain_community.vectorstores.pinecone.Pinecone at 0x1216a9910>

In [12]:
# Add the loaded chunks to the vector store `pc`; the cell output is the list
# of ids returned by the call.
pc.add_documents(documents)

['694bffe6-9501-47cc-a289-3c5330157bd7',
 '040623cf-2786-4cf5-a6ec-826a6d6733e7',
 '8fcbab02-8d36-4eec-a8ba-f36263828e04',
 '8bcf1cf5-ed64-40a8-ba1d-6a680ad30f0a',
 '593a81ab-158a-43a2-955e-8d79103864d0',
 '57a1a5c3-e8de-4143-96eb-1bf9f3b2d82f',
 'b0d10fa6-c36e-4bd6-8eff-35fbea3fd7a3',
 '7e80ce45-a5ef-4a66-a477-4c26d0a47788',
 '4392d91e-a61d-43bb-b5ad-b02638f0d0c8',
 '2c405292-aee9-40f3-a2d6-9fc90e530bd4',
 '1662bbb9-f0b6-4aaf-ad25-ee21a5e1bfd2',
 'e8b3cbef-1aae-4c6b-96d5-aa3870b99249',
 '287c9dbc-0268-40c5-8a63-e9af286b95b5',
 '136ddd2f-2302-4206-a0f8-adf01d47fcda',
 'b007edce-c8f3-4872-8921-099459ff7f27',
 'b87442b0-d5f3-468e-973e-95c50a502d85',
 '29d0ead1-323b-4990-9c7e-3672864b7d04',
 'b2356615-6545-461e-ba92-582a4f76fd57',
 '5dd55737-f6ca-4f02-98ff-04fd0c2a1904',
 'e20fee78-a4c3-4c9a-a690-a5524143ae9c',
 '5a5b1736-f820-4e03-b5e9-5386803f2416',
 '6db7f082-0708-49fa-87c2-f7a942202dcb',
 'ca8178d5-9830-4fdd-9b61-5d6d2953d9aa',
 'f0c54a64-f6e4-4dfc-8ef1-3d92ed44a38d',
 'a597a9fd-4256-

In [2]:
# Rebinds get_pages_from_sitemap to the version from the urls_finder module,
# replacing the function of the same name defined earlier in this notebook.
# NOTE(review): this cell's execution count (2) is lower than that of the
# cells above it, so it was run in a different order/session — confirm it
# still belongs at this position.
from urls_finder import get_pages_from_sitemap

get_pages_from_sitemap('https://skn.agh.edu.pl')

2024-03-26 13:05:20,088 INFO usp.fetch_parse [10222/MainThread]: Fetching level 0 sitemap from https://skn.agh.edu.pl/robots.txt...
2024-03-26 13:05:20,089 INFO usp.helpers [10222/MainThread]: Fetching URL https://skn.agh.edu.pl/robots.txt...
2024-03-26 13:05:20,375 INFO usp.helpers [10222/MainThread]: Not retrying for URL https://skn.agh.edu.pl/robots.txt
2024-03-26 13:05:20,375 INFO usp.fetch_parse [10222/MainThread]: Fetching level 0 sitemap from https://skn.agh.edu.pl/admin/config/search/xmlsitemap...
2024-03-26 13:05:20,375 INFO usp.helpers [10222/MainThread]: Fetching URL https://skn.agh.edu.pl/admin/config/search/xmlsitemap...
2024-03-26 13:05:20,663 INFO usp.helpers [10222/MainThread]: Not retrying for URL https://skn.agh.edu.pl/admin/config/search/xmlsitemap
2024-03-26 13:05:20,663 INFO usp.fetch_parse [10222/MainThread]: Fetching level 0 sitemap from https://skn.agh.edu.pl/sitemap_index.xml.gz...
2024-03-26 13:05:20,663 INFO usp.helpers [10222/MainThread]: Fetching URL https:

[]

In [None]:



import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
 
def get_domain(url):
    """Return the ``scheme://netloc/`` root of ``url``, ending in a slash."""
    parts = urlparse(url)
    return f'{parts.scheme}://{parts.netloc}/'
 
def get_links(url, domain=None, timeout=10):
    """Fetch ``url`` and return the set of absolute links on it that stay on-site.

    Args:
        url: Page to download and scan for ``<a>`` elements with an ``href``.
        domain: URL prefix a link must start with to be kept. When omitted it
            defaults to ``get_domain(url)`` (the root of the page being
            scanned), so the function does not rely on any module-level
            variable being set beforehand.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        A set of absolute URLs; relative hrefs are resolved against ``url``.

    Raises:
        requests.RequestException: if the page cannot be fetched (including
            when the timeout expires).
    """
    if domain is None:
        domain = get_domain(url)

    # Without a timeout a single unresponsive server would block forever.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    links = set()
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if not href:
            # <a> without href (or with an empty one) carries no target.
            continue
        absolute_link = urljoin(url, href)
        if absolute_link.startswith(domain):
            links.add(absolute_link)
    return links
 
if __name__ == '__main__':
    from collections import deque

    url = 'https://sylabusy.agh.edu.pl'
    # Site root that crawled links are matched against (module-level name).
    domain = get_domain(url)

    # Breadth-first crawl. `enqueued` holds every URL that has ever been put
    # on the queue, which is exactly "visited or still queued", so the
    # membership test is O(1) instead of a scan over the pending queue.
    queue = deque([url])
    enqueued = {url}
    visited = set()

    while queue:
        url = queue.popleft()
        visited.add(url)
        print(url)
        try:
            links = get_links(url)
        except requests.RequestException as e:
            # One unreachable page should not abort the whole crawl.
            print(f"Error while fetching {url}: {e}")
            continue
        for link in links:
            if link not in enqueued:
                enqueued.add(link)
                queue.append(link)

https://sylabusy.agh.edu.pl
https://sylabusy.agh.edu.pl/pl/1/2/19/1/4/13
https://sylabusy.agh.edu.pl/en/1/2/19/1/4
https://sylabusy.agh.edu.pl/pl/1/2/19/1/4/63
https://sylabusy.agh.edu.pl/pl/1/2/19/1/4?department=
https://sylabusy.agh.edu.pl/pl/1/2/19/1/4/4
https://sylabusy.agh.edu.pl/admin
https://sylabusy.agh.edu.pl/pl/1/2/19/1/2?department=
https://sylabusy.agh.edu.pl/pl/1/2/19/1/4/9
https://sylabusy.agh.edu.pl/pl/1/2/19/1/4
https://sylabusy.agh.edu.pl/pl/2
https://sylabusy.agh.edu.pl/
https://sylabusy.agh.edu.pl/pl/1/2/19/1/4/10
https://sylabusy.agh.edu.pl/pl/1/1/19/1/4
https://sylabusy.agh.edu.pl/pl/1/2/19/1/4/11
https://sylabusy.agh.edu.pl/pl/1/2/19/1/4/2
https://sylabusy.agh.edu.pl/pl/1/2/19/1/6?department=
https://sylabusy.agh.edu.pl/pl/1/2/19/1/4/3
https://sylabusy.agh.edu.pl/pl/1/2/17/1/4
https://sylabusy.agh.edu.pl/pl/1/2/16/1/4
https://sylabusy.agh.edu.pl/pl/1/2/19/1/1?department=
https://sylabusy.agh.edu.pl/pl/1/2/19/1/4/17
https://sylabusy.agh.edu.pl/pl/1/2/19/1/4/6
https