In [1]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.retrievers import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain.chains import RetrievalQA


txt_split = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)

txt_documents = DirectoryLoader("./saved_articles", glob="*.txt").load()
txt_docs = txt_split.split_documents(txt_documents)

documents = PyPDFLoader('./RAG/med_data_all.pdf').load()
pdf_docs = txt_split.split_documents(documents)

docs = pdf_docs# + txt_docs
print(len(docs))

embedding = OpenAIEmbeddings(model='text-embedding-ada-002')
vdb = FAISS.from_documents(docs, embedding)
base_retriever = vdb.as_retriever(search_kwargs={"k": 10})

reranker = CohereRerank(
    model="rerank-multilingual-v3.0",  
    top_n=5  # 최종 문서 5개만 선택 (원하는 수로 조정 가능)
)

retriever = ContextualCompressionRetriever(
    base_compressor=reranker,
    base_retriever=base_retriever,
)

gpt = ChatOpenAI(model='gpt-3.5-turbo', temperature=0)

qa = RetrievalQA.from_chain_type(
    llm=gpt,
    chain_type='stuff',
    retriever=retriever,
)

libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.


789


In [2]:
from generator import NewsletterGenerator
from RAG import reranker_qa

generator = NewsletterGenerator(reranker_qa)

with open('./saved_articles/cnn_Hypertension_1_20250521.txt', 'r', encoding='utf-8') as f:
    article = f.read()

# article = """Cutting 1 teaspoon of salt from your diet each day can lower your top blood pressure reading just as much as a typical hypertension medication, even if you don't have high blood pressure, a new study found.

# A teaspoon of salt is 2,300 milligrams — that's the top daily limit for people over 14 recommended by the latest US nutritional guidelines. However, the American Heart Association recommends a diet with less than 1,500 milligrams of sodium a day.

# Woman At Home Adding Salt To Meal
# 'Massive efforts' are needed to reduce salt intake and protect lives, World Health Organization says

# 'This is the first study to show that people who are already on blood pressure medication can lower their blood pressure even more by limiting sodium,” said coprincipal investigator Norrina Allen, professor of preventive medicine at Northwestern University's Feinberg School of Medicine.

# 'And regardless of medication, we found 70% to 75% of people are likely to see a reduction in their blood pressure if they lower the sodium in their diet,” Allen said.

# High blood pressure is often called the 'silent killer,' because there are no symptoms — the only way to know if you have it is to test for it. Yet hypertension affects 1 in 3 adults worldwide and can lead to heart attack, heart failure, kidney damage and stroke, according to a 2023 report by the World Health Organization.

# (add caption)
# One way to cut a teaspoon of salt from your diet is by reading labels since many foods have added salt, experts say. Wirestock/iStockphoto/Getty Images
# Nearly half of all Americans live with high blood pressure, according to the American Heart Association. About a third of those have “resistant” hypertension, high blood pressure that has not responded despite the concurrent use of three types of medications. A 2021 study found men ages 20 to 49 are up to 70% more likely to have uncontrolled hypertension than women of the same age."""

result = generator.generate_newsletter_from_articles(article)

789
content='chronic medical conditions, high blood pressure, hypertension, diabetes, early detection, treatment, medications, lifestyle changes, cardiovascular disease, kidney disease, stroke, dementia, blood pressure cuff, A1C test, insulin, obesity, physical activity, healthy diet, preventive measures.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 55, 'prompt_tokens': 1592, 'total_tokens': 1647, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-BZuRuI19jRsz5L6JuDOF7R5OvRBxQ', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--3d7bf402-a9c8-46aa-a48f-6a306e3fe533-0' usage_metadata={'input_tokens': 1592, 'output_tokens': 55, 'total_tokens': 1647, 'input_token_details': {'audio': 

In [3]:
result

{'articles': 'It’s a new year and time for many people to make their health-related resolutions. According to my go-to doctor expert, one of your resolutions should be to find out if you have any chronic medical conditions and address them before symptoms begin.\n\nThis approach, of course, is the case for diagnosing cancer, and early detection can help save lives. But people often let other conditions such as high blood pressure and diabetes go for a long time without adequate treatment.\n\nMore than 77% of the 119.9 million Americans with hypertension do not have their blood pressure under control, according to the US government’s Million Hearts initiative.\n\nMore than 50% of people 30 and older living with diabetes did not take medications for their diabetes in 2022, according to the World Health Organization. Many may not even be aware that they have diabetes; in the United States, the American Diabetes Association estimated in 2021 that of the 38.4 million people who have diabete

In [5]:
# 유사도 확인을 위한 쿼리 예시
query = result['newsletter_summary']

# 기본 검색 결과
base_results = base_retriever.get_relevant_documents(query)
print("기본 검색 결과 수:", len(base_results))

# 리랭킹된 결과
reranked_results = retriever.get_relevant_documents(query)
print("리랭킹된 결과 수:", len(reranked_results))

# 결과 비교
# print("\n=== 기본 검색 결과 ===")
# for i, doc in enumerate(base_results):
#     print(f"\n문서 {i+1}:")
#     print(f"내용: {doc.page_content[:200]}...")
#     print(f"메타데이터: {doc.metadata}")

# print("\n=== 리랭킹된 결과 ===")
# for i, doc in enumerate(reranked_results):
#     print(f"\n문서 {i+1}:")
#     print(f"내용: {doc.page_content[:200]}...")
#     print(f"메타데이터: {doc.metadata}")

# 유사도 점수 확인
print("\n=== 유사도 점수 ===")
for i, doc in enumerate(reranked_results):
    print(doc.page_content)
    print(doc.metadata)
    print('-'*100)
    if hasattr(doc, 'score'):
        print(f"문서 {i+1} 유사도 점수: {doc.score}")

기본 검색 결과 수: 10
리랭킹된 결과 수: 5

=== 유사도 점수 ===
changes that can help eat a healthy diet, limiting salt and sugar, plus eating lots of fruits, 
vegetables, and whole grains, get vaccinations for hepatitis A and B, the flu and 
pneumococcal disease. If you get hepatitis A or B along with fatty liver, it is more likely 
to lead to liver failure. People with chronic liver disease are more likely to get infections, 
so the other two vaccinations are also important. get regular exercise, which can help
{'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2025-05-14T15:44:39+09:00', 'author': 'Nitrosoft', 'moddate': '2025-05-14T15:44:39+09:00', 'source': './RAG/med_data_all.pdf', 'total_pages': 126, 'page': 79, 'page_label': '80', 'relevance_score': 0.8677098}
----------------------------------------------------------------------------------------------------
prevent atherosclerosis choose heart-healthy foods, such fruits, vegetables, and whole 
grains. Limit 

OpenAI의 text-embedding-ada-002 모델 특성
-----------------------------------------------------
text-embedding-ada-002는 단순한 영어 전용이 아니라,
한국어, 영어, 일본어, 중국어 등 다양한 언어를
"공통된 의미 공간(shared semantic space)" 안에서 임베딩한다


"The Adventures of Tom Sawyer"라는 영어 텍스트를 임베딩하면,
그것이 가진 의미(semantics) 가 벡터로 표현됨.
그리고 한글 질문 "줄거리 요약해줘"도 벡터로 임베딩할 때
그 의미가 비슷한 벡터로 변환됨.

결국
"의미가 비슷하면 언어가 달라도 벡터가 가까워진다" 는 것.
