In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

#### URL 크롤링

In [2]:
def crawling(list_url='https://www.dongguk.edu/article/GENERALNOTICES/list'):
    """Collect the detail-page URL of every numbered post on a notice list page.

    A Chrome browser is opened, each list row that carries a ``span.num``
    marker is clicked, the URL of the page that opens is recorded, and the
    browser navigates back to the list.

    Args:
        list_url: Address of the list page to crawl. Defaults to the general
            notices board.

    Returns:
        list[str]: Detail-page URLs in the order the rows appear on the list.
    """
    # Function-scope import so the cell stays runnable without touching the
    # notebook's import cell.
    from selenium.common.exceptions import NoSuchElementException

    row_selector = 'div.board_list li'
    url_list = []

    driver = webdriver.Chrome()
    try:
        driver.get(list_url)
        time.sleep(2)

        row_count = len(driver.find_elements(By.CSS_SELECTOR, row_selector))

        for i in range(row_count):
            # Re-query the rows on every pass; without this only the first
            # item gets crawled (handles from before the navigation are
            # presumably stale).
            items = driver.find_elements(By.CSS_SELECTOR, row_selector)
            try:
                num_span = items[i].find_element(
                    By.XPATH, './/a/div[@class="mark"]/span[@class="num"]'
                )
                num_span.click()
                time.sleep(3)

                url_list.append(driver.current_url)
                driver.back()
                time.sleep(2)
            except NoSuchElementException:
                # Rows without a numeric marker are skipped on purpose
                # (presumably pinned or header rows -- confirm on the site).
                continue
            except Exception as exc:
                # Still best-effort, but no longer silent: report what went
                # wrong and move on to the next row.
                print(f"Skipping list item {i}: {exc!r}")
    finally:
        # Always close the browser, even when the crawl fails midway.
        driver.quit()

    return url_list

# Run the crawler once and keep the collected detail-page URLs for the cells below.
url_list = crawling()

In [3]:
url_list

['https://www.dongguk.edu/article/GENERALNOTICES/detail/26758365',
 'https://www.dongguk.edu/article/GENERALNOTICES/detail/26758350',
 'https://www.dongguk.edu/article/GENERALNOTICES/detail/26758345',
 'https://www.dongguk.edu/article/GENERALNOTICES/detail/26758340',
 'https://www.dongguk.edu/article/GENERALNOTICES/detail/26758339',
 'https://www.dongguk.edu/article/GENERALNOTICES/detail/26758338',
 'https://www.dongguk.edu/article/GENERALNOTICES/detail/26758337',
 'https://www.dongguk.edu/article/GENERALNOTICES/detail/26758332',
 'https://www.dongguk.edu/article/GENERALNOTICES/detail/26758330',
 'https://www.dongguk.edu/article/GENERALNOTICES/detail/26758320']

#### 본문에 이미지 파일이 있다면 로컬에 이미지 저장

In [4]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Folder where downloaded images are stored (./image under the current working directory).
image_folder = os.path.join(os.getcwd(), "image")
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(image_folder, exist_ok=True)

def download_image(image_url, save_path, timeout=10):
    """Download one image and write it to disk, reporting the outcome on stdout.

    Args:
        image_url: Absolute URL of the image to fetch.
        save_path: File path the response body is written to (binary mode).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        bool: True when the image was saved, False on a non-200 response or
        when any error occurred (the error is printed, not raised).
    """
    try:
        response = requests.get(image_url, timeout=timeout)
        if response.status_code == 200:
            with open(save_path, "wb") as img_file:
                img_file.write(response.content)
            print(f"Image saved at {save_path}")
            return True
        print(f"Failed to download image from {image_url}")
    except Exception as e:
        # Best-effort download: report the problem and let the caller continue.
        print(f"Error downloading image from {image_url}: {e}")
    return False

# Walk the crawled URLs and save the first image found in each post body.
for url in url_list:
    # Fetch the page; a timeout keeps one unresponsive request from hanging
    # the cell, and a network error skips this URL instead of aborting the run.
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as e:
        print(f"Failed to retrieve the page. Error: {e}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")

    # The post body lives in <div class="view_cont">.
    div_section = soup.find('div', {'class': 'view_cont'})
    if not div_section:
        print("No view_cont section found.")
        continue

    # Only the first <img> inside the body is considered.
    img_tag = div_section.find('img')
    if not (img_tag and 'src' in img_tag.attrs):
        print("No image found in the view_cont section.")
        continue

    # Resolve a relative src against the page URL.
    image_url = urljoin(url, img_tag.attrs['src'])

    # The last URL segment (the post number) becomes the file name. The file is
    # always named "<number>.jpeg", whatever the source image's real format is.
    image_number = url.split("/")[-1]
    image_path = os.path.join(image_folder, f"{image_number}.jpeg")

    download_image(image_url, image_path)


Image saved at /Users/lee/Desktop/chatDPT/image/26758365.jpeg
Image saved at /Users/lee/Desktop/chatDPT/image/26758350.jpeg
Image saved at /Users/lee/Desktop/chatDPT/image/26758345.jpeg
No image found in the view_cont section.
No image found in the view_cont section.
Image saved at /Users/lee/Desktop/chatDPT/image/26758338.jpeg
Image saved at /Users/lee/Desktop/chatDPT/image/26758337.jpeg
Image saved at /Users/lee/Desktop/chatDPT/image/26758332.jpeg
Image saved at /Users/lee/Desktop/chatDPT/image/26758330.jpeg
No image found in the view_cont section.


#### 이미지 있으면 이미지 내용과 같이 저장

In [12]:
import os
import requests
from bs4 import BeautifulSoup
import base64
from openai import OpenAI

# OpenAI API client built with default constructor arguments
# (presumably picks up the API key from the environment -- confirm).
client = OpenAI()

# Folder that holds the downloaded images (./image under the current working directory).
image_folder = os.path.join(os.getcwd(), "image")

# Image-to-text conversion via the OpenAI chat API (model "gpt-4o").
def analyze_data_with_gpt4(image_path):
    """Ask the chat model to turn a saved image into text.

    Args:
        image_path: File name of the image inside the ``image`` folder under
            the current working directory. The path is built with
            ``os.path.join``, so an absolute path is used as given.

    Returns:
        str: The text of the model's reply (the system prompt asks for a
        Korean answer), or an empty string when the reply carries no text
        content, so callers can always concatenate the result.

    Raises:
        OSError: If the image file cannot be opened.
    """
    # Keep the argument untouched; build the full path under a separate name.
    full_path = os.path.join(os.getcwd(), "image", image_path)

    with open(full_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode("utf-8")

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "Turn an image into text with as much content as possible. Don't use special characters, write in lines. and Answer in Korean."},
            # The image is sent inline as a base64 data URL.
            {"role": "user", "content": [{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}}]}
        ]
    )
    # message.content may be None; normalise that to "".
    return response.choices[0].message.content or ""

# Text of each post (page text plus, when an image was saved, its transcription).
# An entry is appended only for pages that yield text, so this list can end up
# shorter than url_list.
content = []

for url in url_list:
    # Fetch the page; a timeout keeps one unresponsive request from hanging
    # the cell, and a network error skips this URL instead of aborting the run.
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as e:
        print(f"Failed to retrieve the page {url}. Error: {e}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve the page {url}. Status code: {response.status_code}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")

    # The post body lives in <div class="view_cont">.
    div_section = soup.find('div', {'class': 'view_cont'})
    if not div_section:
        print(f"view_cont section not found for {url}.")
        continue

    content_text = div_section.get_text(separator='\n', strip=True)

    # Images are stored as "<post number>.jpeg" in image_folder.
    image_name = url.split("/")[-1] + ".jpeg"
    image_path = os.path.join(image_folder, image_name)

    # When an image was saved for this post, add its transcription.
    if os.path.exists(image_path):
        explanation = analyze_data_with_gpt4(image_name)
        if explanation:
            # Put the transcription on its own line so it does not run into
            # the last sentence of the page text.
            content_text = f"{content_text}\n{explanation}" if content_text else explanation

    if content_text:
        content.append(content_text)
    else:
        print(f"No text found in view_cont for {url}.")


In [14]:
# zip() pairs items by position and stops at the shorter input, so if any page
# was skipped while collecting `content`, every later text would be attached to
# the wrong URL. Fail loudly instead of building a misaligned mapping.
assert len(url_list) == len(content), (
    f"{len(url_list)} URLs but {len(content)} content entries; "
    "url_list and content are not aligned"
)
url_content_mapping = dict(zip(url_list, content))
url_content_mapping

{'https://www.dongguk.edu/article/GENERALNOTICES/detail/26758365': '오케스트라 공연을 아래와 같이 개최하오니, 교내 구성원들의 많은 관심과 참여 부탁드립니다.20 December 12:00 pm  \n동국대학교 중강당, 본관 3층  \nDongguk Symphony Orchestra  \n\nW.A. Mozart, Violin Concerto No. 3  \nL.v. Beethoven, Symphony No. 5  \nA. Dvorák, Symphony No. 9  \n지휘&협연 전강호[다니엘] 악장 성동하  \n\n2024. 12. 20 Friday  \n주관: 동국대학교 주최: 다르마 갤러지  ',
 'https://www.dongguk.edu/article/GENERALNOTICES/detail/26758350': '병역판정검사 일자 및 장소 본인선택 안내\n\n병무청에서는 병역판정검사 대상자의 편의를 고려하고 자율적으로 병역의무를 이행할 수 있도록, 병역판정검사 일자 및 장소를 직접 선택하여 검사를 받을 수 있는 제도를 운용하고 있습니다. 희망자는 다음 사항을 참고하여 인터넷으로 접수하시기 바랍니다.\n\n- 2025년도 병역판정검사 대상\n  - 2006년생, 병역판정검사 연기중인 2005년 이전생\n- 병역판정검사 기간: 2025. 1. 13. ~ 12. 17. (토요일, 공휴일 등 제외)\n- 접수 시작일자: 2024. 12. 27.(금) 10:00부터\n- ‘20세 병역판정검사 후 입영’ 본인선택 제도 신설: 2026년도 입영희망 시\n- 2026년 병역판정검사 희망일 사전신청가능(신청기간): ’24. 12. 27. ~ ’25. 9. 30.\n- 접수방법: 병무청 누리집(www.mma.go.kr) 또는 모바일 앱\n  - PC: 병무청 누리집 → (좌측) 민원신청 → 병역판정검사 → 병역판정검사 일자 및 장소 본인선택 → 신청\n  - 모바일: ‘병무청’ 앱 다운로드 → (좌측상단) 메뉴 → 

#### 기존의 pinecone DB에 내용 추가 저장 (안됨)

In [17]:
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_upstage import UpstageEmbeddings

# Embedding model used to vectorize text for the index.
embedding = UpstageEmbeddings(model="solar-embedding-1-large-passage")

# Name of the existing Pinecone index to attach to.
index_name = "dongguk"  # Replace with your index name

# Wrap the existing Pinecone index as a LangChain vector store.
database = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embedding)

In [28]:
#from pinecone.grpc import PineconeGRPC as Pinecone

# Initialize the Pinecone client with the API key taken from the environment.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index(host="https://dongguk-73v7xl6.svc.aped-4627-b74a.pinecone.io")

# Build one record per URL: the URL is the record id, the embedded page text is
# the vector, and the raw text is kept as metadata.
# The loop variable is deliberately NOT named `content`: that would overwrite
# the module-level `content` list, and re-running the cell that zips
# url_list with content would then pair URLs with single characters.
vectors = []
for url, page_text in url_content_mapping.items():
    # NOTE(review): embed_query is the query-side call of the embeddings
    # interface; stored passages are usually embedded with embed_documents.
    # Confirm which one matches how the rest of the index was built.
    vector = embedding.embed_query(page_text)

    vectors.append({
        "id": url,
        "values": vector,
        # NOTE(review): a LangChain PineconeVectorStore reads the document text
        # from a configurable metadata key (default "text", to be confirmed).
        # Verify that "content" matches the key used by the existing records.
        "metadata": {"content": page_text},
    })

# Upsert the vectors into the Pinecone index.
index.upsert(vectors=vectors)

{'upserted_count': 10}