In [1]:
from abc import ABC, abstractmethod
from typing import Optional, Any, Dict, List
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_ollama import OllamaLLM
'''
'''
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
from langchain_core.output_parsers import JsonOutputParser
from bs4 import BeautifulSoup

In [2]:
import logging
import re

In [3]:
class ContentProcessor:
    """Split "Page:/Summary:" formatted text into pages and HTML-escape the summaries.

    The expected input is a series of blocks separated by blank lines, where each
    page block starts with ``Page: <title>`` followed on the next line by
    ``Summary: <text>``.
    """

    # Markers that delimit the pieces of a page block in the raw text.
    _PAGE_MARKER = 'Page:'
    _SUMMARY_MARKER = '\nSummary: '
    _BLOCK_SEPARATOR = '\n\n'

    def __init__(self):
        # Symbol -> HTML replacement map, applied in insertion order.
        # None of the replacement values contains a later key, so the order
        # cannot cause double-escaping with the current entries.
        self.html_replacements = {
            '\n': '<br>',
            '-': '&ndash;',
            '"': '&quot;',
            "'": '&apos;',
            '(': '&#40;',
            ')': '&#41;',
        }

    def split_pages(self, content: str) -> List[Dict[str, str]]:
        """
        Split the raw text into pages and format each page's summary.

        Args:
            content: Raw text made of blank-line separated ``Page:`` blocks.

        Returns:
            A list of dicts with the keys ``'title'`` and ``'content'``, one per
            well-formed page, in input order. ``'content'`` has been passed
            through ``replace_symbols``. On any error a message is printed and
            an empty list is returned.
        """
        try:
            # Each entry is (title, [raw summary chunks]); chunks are joined and
            # escaped only once the whole input has been scanned.
            collected = []
            open_page = None

            for block in content.split(self._BLOCK_SEPARATOR):
                if block.startswith(self._PAGE_MARKER):
                    header, found, summary = block.partition(self._SUMMARY_MARKER)
                    if not found:
                        # Page block without a summary: skip it and make sure
                        # later text is not attached to an earlier page.
                        open_page = None
                        continue
                    # Drop only the leading marker; a title that itself contains
                    # "Page: " must keep it.
                    title = header[len(self._PAGE_MARKER):].strip()
                    open_page = (title, [summary])
                    collected.append(open_page)
                elif open_page is not None and block.strip():
                    # A summary that contains a blank line is split into several
                    # blocks; re-attach the continuation instead of dropping it.
                    open_page[1].append(block)

            return [
                {
                    'title': title,
                    'content': self.replace_symbols(self._BLOCK_SEPARATOR.join(chunks)),
                }
                for title, chunks in collected
            ]
        except Exception as e:
            # Best-effort: report the problem and hand back an empty result.
            print(f"分割頁面時發生錯誤: {str(e)}")
            return []

    def replace_symbols(self, text: str) -> str:
        """
        Replace selected symbols in the text with their HTML equivalents.

        Args:
            text: The text to process.

        Returns:
            The text with every key of ``html_replacements`` substituted and
            U+2060 (word joiner) characters removed. If processing fails, a
            message is printed and the text is returned as it was at that point.
        """
        try:
            # Apply the predefined replacement rules.
            for symbol, html_tag in self.html_replacements.items():
                text = text.replace(symbol, html_tag)

            # Remove the zero-width word joiner (U+2060).
            text = text.replace('\u2060', '')

            return text
        except Exception as e:
            print(f"替換符號時發生錯誤: {str(e)}")
            return text

In [4]:
"""
def main():
    # 使用示例
    processor = ContentProcessor()
    
    # 測試內容
    test_content = '''Page: BIOS
Summary: In computing, BIOS (, BY-oss, -\u2060ohss; Basic Input/Output System...

Page: BIOS interrupt call
Summary: BIOS implementations provide interrupts...'''
    
    # 處理內容
    pages = processor.split_pages(test_content)
    
    # 輸出結果
    for page in pages:
        print(f"\n標題: {page['title']}")
        print(f"內容: {page['content'][:100]}...")  # 只顯示前100個字符
"""

'\ndef main():\n    # 使用示例\n    processor = ContentProcessor()\n\n    # 測試內容\n    test_content = \'\'\'Page: BIOS\nSummary: In computing, BIOS (, BY-oss, -\u2060ohss; Basic Input/Output System...\n\nPage: BIOS interrupt call\nSummary: BIOS implementations provide interrupts...\'\'\'\n\n    # 處理內容\n    pages = processor.split_pages(test_content)\n\n    # 輸出結果\n    for page in pages:\n        print(f"\n標題: {page[\'title\']}")\n        print(f"內容: {page[\'content\'][:100]}...")  # 只顯示前100個字符\n'

In [5]:
# wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper(), )

In [6]:
# raw_content = wikipedia.run("Bios programming")
# processor = ContentProcessor()
# pages = processor.split_pages(raw_content)
# for page in pages:
#         print(f"\n標題: {page['title']}")
#         print(f"內容: {page['content'][:100]}...")  # 只顯示前100個字符

In [7]:
class CustomWikiAPIWrapper(WikipediaAPIWrapper):
    """Wikipedia API wrapper that adds an lxml-based HTML-to-text helper."""

    def _clean_html(self, html: str) -> str:
        """Return the text content of ``html`` with the markup removed.

        The markup is parsed by BeautifulSoup with the ``lxml`` parser.
        NOTE(review): whether the base wrapper ever calls this method is not
        visible from this file — confirm.
        """
        return BeautifulSoup(html, features="lxml").get_text()

In [8]:
class LLMInitializer:
    """Factory for LLM model instances."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def init_ollama_model(
        self,
        model: str = "deepseek-r1:7b",
        base_url: str = "http://localhost:11434",
        **kwargs
    ) -> OllamaLLM:
        """Create an OllamaLLM instance.

        Args:
            model: Name of the Ollama model.
            base_url: URL of the Ollama service.
            **kwargs: Extra keyword arguments forwarded to ``OllamaLLM``.

        Returns:
            The constructed OllamaLLM instance.

        Raises:
            Exception: Whatever ``OllamaLLM`` raises; it is logged and re-raised.
        """
        # Gather every constructor argument in one mapping before the call.
        settings = {"model": model, "base_url": base_url, **kwargs}
        try:
            llm = OllamaLLM(**settings)
        except Exception as exc:
            self.logger.error(f"OllamaLLM模型初始化失敗: {str(exc)}")
            raise
        return llm

In [32]:
class ContentSummarizer(ABC):
    """Abstract base for summarizers: subclasses fetch content, this class summarizes it."""

    def __init__(self, llm: Any):
        """Store the model and build the prompt -> model chain.

        Args:
            llm: The model object placed on the right-hand side of the
                ``prompt | llm`` pipeline.
        """
        self.logger = logging.getLogger(__name__)
        self.llm = llm
        self.content_parser = ContentProcessor()

        # Prompt text; ``{context}`` is filled with the content to summarize.
        self.template = """
        Role: You are an experienced and well-skilled text summarizer.
        Task:
        Please summarize the following context:
        
        {context}
        
        Please provide:
        1. Abstract: A very short overview
        2. Summarization Content (100-500 words):
           a. Most important points
           b. Extended content
        
        3. Use technical and formal style.
        
        """

        self.prompt = PromptTemplate(
            template=self.template,
            input_variables=["context"],
        )
        self.chain = self.prompt | self.llm

    @abstractmethod
    def get_content(self, query: str) -> Optional[str]:
        """Fetch the content that should be summarized for ``query``."""
        pass

    def generate_summary(self, content: str) -> Optional[str]:
        """Run the chain on ``content`` and return its response.

        Args:
            content: Text substituted for ``{context}`` in the prompt.

        Returns:
            Whatever the chain's ``invoke`` returns, or ``None`` if it raised
            (the failure is logged).
        """
        payload = {"context": content}
        try:
            return self.chain.invoke(payload)
        except Exception as exc:
            self.logger.error(f"摘要生成失敗: {str(exc)}")
            return None

class WikiSummarizer(ContentSummarizer):
    """Summarizer whose content comes from Wikipedia."""

    def __init__(self, llm: Any):
        """Set up the base summarizer and the Wikipedia wrapper."""
        super().__init__(llm)
        self.wiki = CustomWikiAPIWrapper()

    def get_content(self, query: str) -> Optional[str]:
        """Fetch content for ``query`` from Wikipedia.

        Returns:
            The result of the wrapper's ``run`` call, or ``None`` if it raised
            (the failure is logged).
        """
        try:
            page_text = self.wiki.run(query)
        except Exception as exc:
            self.logger.error(f"Wikipedia檢索失敗: {str(exc)}")
            return None
        return page_text

    def summarize(self, query: str) -> Optional[str]:
        """Fetch the Wikipedia content for ``query`` and summarize it.

        Returns:
            The generated summary, or ``None`` when no content was retrieved.
        """
        fetched = self.get_content(query)
        return self.generate_summary(fetched) if fetched else None

In [41]:
def _strip_reasoning(text: str, marker: str = "</think>") -> str:
    """Return the part of ``text`` after the first ``marker``; ``text`` itself if absent."""
    _before, found, after = text.partition(marker)
    return after if found else text


def test_main(query: Optional[str] = None):
    """Summarize the Wikipedia content for ``query`` and print the result.

    Builds the ``deepseek-r1:7b`` Ollama model, runs ``WikiSummarizer`` on the
    query and prints the summary with everything up to and including the first
    ``</think>`` marker removed. Any failure is reported on stdout instead of
    being raised.

    Args:
        query: The search phrase. An empty or missing query is reported as an
            error without contacting any service.
    """
    if not query:
        print("Error: a non-empty query is required")
        return

    llm_init = LLMInitializer()

    try:
        # Initialize the deepseek-r1:7b model.
        llm = llm_init.init_ollama_model(
            model="deepseek-r1:7b",
            temperature=0.7
        )

        # Create the summarizer and run it.
        wiki_summarizer = WikiSummarizer(llm)
        summary = wiki_summarizer.summarize(query)
        if summary is None:
            # summarize() returns None when retrieval or generation failed.
            print(f"Error: no summary was produced for query {query!r}")
            return

        # Slicing with find()+8 would cut 7 characters off a response that has
        # no marker; partition leaves such a response intact.
        filtered_summary = _strip_reasoning(summary)
        print(f"查詢內容：{query}\n回傳結果:\n{filtered_summary}")

    except Exception as e:
        print(f"Error: {str(e)}")

In [42]:
# End-to-end demo: test_main builds the deepseek-r1:7b Ollama model, summarizes the
# Wikipedia result for the query "AI", and prints the summary.
test_main("AI")



  lis = BeautifulSoup(html).find_all('li')


查詢內容：AI
回傳結果:


### Summary of "Artificial Intelligence" Page:

#### Abstract:
The field of artificial intelligence (AI) involves the development of computational systems that mimic human intelligence to perform tasks such as learning, reasoning, problem-solving, perception, and decision-making. AI draws from various disciplines like computer science, mathematics, psychology, and linguistics. Its applications include web search engines, recommendation systems, virtual assistants, autonomous vehicles, generative tools, and strategy games. AI research has faced challenges such as "AI winters" due to funding issues but has recently seen growth with advancements in deep learning and transformer architectures. The future of AI is expected to be marked by rapid progress and increased investment.

#### Summarization Content:

1. **Most Important Points**:
   - AI refers to computational systems designed to perform intelligent tasks.
   - Applications include web search, recommendation systems