In [None]:
from langchain_community.utilities import GoogleSerperAPIWrapper
from dotenv import load_dotenv
import os
import pprint

# Load environment variables from a local .env file
# (presumably where the Serper API key is kept -- confirm).
load_dotenv()

# Run a keyword search on Google and fetch the results.
# "Obtaining results with metadata" section of:
# https://python.langchain.com/v0.2/docs/integrations/tools/google_serper/

# NOTE(review): gl/hl look like the country and language of the search and
# k the number of results requested -- confirm against the Serper docs.
search = GoogleSerperAPIWrapper(
    gl="kr",
    hl="ko",
    k=20,
)
results = search.results(
    "갑자(甲子) 일주론",
)
pprint.pp(results)



In [48]:
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI

def website_scrape(link, timeout=10):
    """Download a web page and return the text of all its ``<p>`` elements.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled host.

    Returns:
        The text of every ``<p>`` tag on the page joined with single spaces,
        or an empty string when the page contains no ``<p>`` tags.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(link, timeout=timeout)

    # When the server sends no charset, requests falls back to ISO-8859-1,
    # which garbles non-Latin (e.g. Korean) pages. Use the encoding detected
    # from the response body instead.
    declared = response.encoding
    if declared is None or declared.lower() == "iso-8859-1":
        response.encoding = response.apparent_encoding

    soup = BeautifulSoup(response.text, "html.parser")
    # Extract and join the text of every <p> tag.
    return " ".join(p.text for p in soup.find_all("p"))


def scrape_and_combine_content(inputs):
    """Scrape the page behind each entry's link and merge the texts.

    Args:
        inputs: Iterable of mappings, each with a ``"link"`` key holding the
            URL to scrape.

    Returns:
        A single string made of one section per successfully fetched link:
        a ``URL: <link>`` header line, a blank line, the scraped text and a
        trailing blank line. Returns an empty string when ``inputs`` is
        empty or no link could be fetched.
    """
    # Function-scope import so this block does not depend on a file-level
    # ``import logging``.
    import logging

    sections = []
    for item in inputs:
        link = item["link"]
        try:
            content = website_scrape(link)
        except requests.RequestException as exc:
            # One unreachable site must not throw away everything that was
            # already scraped; report it and move on to the next link.
            logging.warning("Skipping %s: %s", link, exc)
            continue
        sections.append(f"URL: {link}\n\n{content}\n\n")
    return "".join(sections)


def remove_duplicates(text, model_name="gpt-4o-mini", temperature=0.0):
    """Ask a chat model to strip duplicated passages from ``text``.

    The prompt instructs the model to remove only repeated content and not
    to summarize or drop anything else.

    Args:
        text: The text to de-duplicate.
        model_name: Name of the chat model to call.
        temperature: Sampling temperature. Defaults to 0 because the task is
            to keep the text verbatim apart from duplicates; a high
            temperature invites paraphrasing and non-reproducible output.

    Returns:
        The model's reply as a string.
    """
    llm = ChatOpenAI(model_name=model_name, temperature=temperature)

    prompt_template = """
    다음 텍스트에서 중복되는 내용이 있다면 제거해 주세요. 텍스트의 내용을 요약하거나 제거하지 마세요. 오직 중복되는 내용만 제거해 주세요.
    {text}
    """
    prompt = PromptTemplate(template=prompt_template, input_variables=["text"])

    # Compose prompt and model as a runnable pipeline; this replaces the
    # deprecated LLMChain / Chain.run API.
    chain = prompt | llm
    reply = chain.invoke({"text": text})
    return reply.content

In [50]:
# Scrape every organic search hit and merge the page texts into one string.
combined_content = scrape_and_combine_content(
    results["organic"],
)

# Optional LLM-based de-duplication pass (currently disabled):
# unique_content = remove_duplicates(combined_content)

# Save the merged text to disk as UTF-8.
with open("combined_content.txt", mode="w", encoding="utf-8") as file:
    file.write(combined_content)