In [1]:
import os
import json
import requests
from requests.exceptions import Timeout
from bs4 import BeautifulSoup
from tqdm import tqdm
import time
import concurrent
from concurrent.futures import ThreadPoolExecutor
import pdfplumber
from io import BytesIO
import re
import string
from typing import Optional, Tuple
from nltk.tokenize import sent_tokenize

# 自定义请求头
# headers = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
#     'Referer': 'https://www.google.com/'  # Referer有时也会影响访问权限
# }
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36',
    'Referer': 'https://www.google.com/',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}

jina_headers = {
    'Authorization': 'Bearer jina_c3839dcd54ad44b29a2922aec781cd88b7bjgVl0avG0q4yJ2MUWblSXEVlP',
    'X-Return-Format': 'markdown',
    # 'X-With-Links-Summary': 'true'
}

# 初始化会话
session = requests.Session()
session.headers.update(headers)

# 从PDF中提取文本
def extract_pdf_text(url):
    try:
        response = session.get(url, timeout=20)  # 设置超时时间为20秒
        if response.status_code != 200:
            return f"Error: Unable to retrieve the PDF (status code {response.status_code})"
        
        # 使用pdfplumber打开PDF文件
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            full_text = ""
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    full_text += text
        
        # 限制文本长度
        cleaned_text = ' '.join(full_text.split()[:1000])
        return cleaned_text
    except requests.exceptions.Timeout:
        return "Error: Request timed out after 10 seconds"
    except Exception as e:
        return f"Error: {str(e)}"

def extract_text_from_url(url, use_jina=False, snippet: Optional[str] = None):
    """
    从 URL 中提取文本。如果提供了 snippet，则提取与之相关的上下文。

    Args:
        url (str): 网页或 PDF 的 URL。
        use_jina (bool): 是否使用 Jina 进行提取。
        snippet (Optional[str]): 要查找的片段。

    Returns:
        str: 提取的文本或上下文。
    """
    try:
        if use_jina:
            response = requests.get(f'https://r.jina.ai/{url}', headers=jina_headers).text
            # 去除 URL
            pattern = r"\(https?:.*?\)|\[https?:.*?\]"
            text = re.sub(pattern, "", response).replace('---','-').replace('===','=').replace('   ',' ').replace('   ',' ')
        else:
            response = session.get(url, timeout=20)  # 设置超时时间为20秒
            response.raise_for_status()  # 如果请求失败，抛出 HTTPError
            # 判断返回的内容类型
            content_type = response.headers.get('Content-Type', '')
            if 'pdf' in content_type:
                # 如果是 PDF 文件，提取 PDF 文本
                return extract_pdf_text(url)
            # 尝试使用 lxml 解析，如果不可用则使用 html.parser
            try:
                soup = BeautifulSoup(response.text, 'lxml')
            except Exception:
                print("lxml parser not found or failed, falling back to html.parser")
                soup = BeautifulSoup(response.text, 'html.parser')
            text = soup.get_text(separator=' ', strip=True)

        return text
    except requests.exceptions.HTTPError as http_err:
        return f"HTTP error occurred: {http_err}"
    except requests.exceptions.ConnectionError:
        return "Error: Connection error occurred"
    except requests.exceptions.Timeout:
        return "Error: Request timed out after 20 seconds"
    except Exception as e:
        return f"Unexpected error: {str(e)}"

def fetch_page_content(urls, max_workers=32, use_jina=False, snippets: Optional[dict] = None):
    """
    并发地从多个 URL 中获取内容。

    Args:
        urls (list): 要抓取的 URL 列表。
        max_workers (int): 最大并发线程数。
        use_jina (bool): 是否使用 Jina 进行提取。
        snippets (Optional[dict]): 一个字典，将 URL 映射到相应的片段。

    Returns:
        dict: 一个字典，将 URL 映射到提取的内容或上下文。
    """
    results = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # 使用 tqdm 显示进度条
        futures = {
            executor.submit(extract_text_from_url, url, use_jina, snippets.get(url) if snippets else None): url
            for url in urls
        }
        for future in tqdm(concurrent.futures.as_completed(futures), desc="Fetching URLs", total=len(urls)):
            url = futures[future]
            try:
                data = future.result()
                results[url] = data
            except Exception as exc:
                results[url] = f"Error fetching {url}: {exc}"
            # time.sleep(0.1)  # 简单的速率限制
    return results

In [None]:
if __name__ == "__main__":
    # 定义输入和输出路径
    url_cache_path = "/fs/archive/share/u2023000153/Search-o1/cache/url_cache.json"
    url_cache_new1_path = "/fs/archive/share/u2023000153/Search-o1/cache/url_cache_new1.json"
    
    # 读取url_cache文件
    with open(url_cache_path, 'r', encoding='utf-8') as f:
        url_cache = json.load(f)
    
    # 提取所有错误的URL
    urls = []
    error_indicators = [
        'limit exceeded',
        'Error fetching URL',
        'Account balance not enough to run this query, please recharge.',
        'Invalid bearer token',
        'HTTP error occurred', 
        'Error: Connection error occurred',
        'Error: Request timed out',
        'Unexpected error',
        'Please turn on Javascript'
    ]
    
    for url, page in url_cache.items():
        if isinstance(page, str) and (any(indicator.lower() in page.lower() for indicator in error_indicators) or page.startswith('http') or len(page) < 10):
            urls.append(url)
            
    print(f"Total URLs to fetch: {len(urls)}")
    
    # 重新获取错误URL的内容
    if urls:
        new_cache = fetch_page_content(urls, use_jina=False)
        
        # 将新获取的内容合并到原始缓存中
        for url, content in new_cache.items():
            url_cache[url] = content
        
    # 保存完整的url_cache到新文件
    with open(url_cache_new1_path, 'w', encoding='utf-8') as f:
        json.dump(url_cache, f, ensure_ascii=False, indent=4)
    print(f"Updated complete cache saved to {url_cache_new1_path}")


Total URLs to fetch: 47125


  soup = BeautifulSoup(response.text, 'lxml')
Fetching URLs:  33%|███▎      | 15734/47125 [2:52:35<10:12:05,  1.17s/it]

In [3]:
import json

# Load the two cache files
cache_path = '/fs/archive/share/u2023000153/Search-o1/cache/url_cache.json'
cache_path_new1 = '/fs/archive/share/u2023000153/Search-o1/cache/url_cache_new1.json'
output_path_new2 = '/fs/archive/share/u2023000153/Search-o1/cache/url_cache_new2.json'

with open(cache_path, 'r', encoding='utf-8') as f:
    url_cache = json.load(f)
    
with open(cache_path_new1, 'r', encoding='utf-8') as f:
    url_cache_new1 = json.load(f)

# Define error indicators
error_indicators = [
    'limit exceeded',
    'Error fetching URL',
    'Account balance not enough to run this query, please recharge.',
    'Invalid bearer token', 
    'HTTP error occurred',
    'Error: Connection error occurred',
    'Error: Request timed out',
    'Unexpected error',
    'Please turn on Javascript'
]

# Merge caches, preferring valid content
merged_cache = {}
for url in set(url_cache.keys()) | set(url_cache_new1.keys()):
    content1 = url_cache.get(url, '')
    content2 = url_cache_new1.get(url, '')
    
    # Check if contents are error messages
    is_error1 = isinstance(content1, str) and (any(indicator in content1 for indicator in error_indicators) or content1.startswith('http'))
    is_error2 = isinstance(content2, str) and (any(indicator in content2 for indicator in error_indicators) or content2.startswith('http'))
    
    # Take valid content if available
    if not is_error1:
        merged_cache[url] = content1
    elif not is_error2:
        merged_cache[url] = content2
    else:
        # If both are errors, take the first one
        merged_cache[url] = content1

# Save merged cache
with open(output_path_new2, 'w', encoding='utf-8') as f:
    json.dump(merged_cache, f, ensure_ascii=False, indent=4)

print(f"Original cache size: {len(url_cache)}")
print(f"New cache 1 size: {len(url_cache_new1)}")
print(f"Merged cache size: {len(merged_cache)}")
print(f"Merged cache saved to {output_path_new2}")


Original cache size: 170617
New cache 1 size: 10311
Merged cache size: 170617
Merged cache saved to /fs/archive/share/u2023000153/Search-o1/cache/url_cache_new2.json


In [1]:
import os
import json
import requests
from requests.exceptions import Timeout
from bs4 import BeautifulSoup
from tqdm import tqdm
import time
import concurrent
from concurrent.futures import ThreadPoolExecutor
import pdfplumber
from io import BytesIO
import re
import string
from typing import Optional, Tuple
from nltk.tokenize import sent_tokenize
from typing import List, Dict, Union
from urllib.parse import urljoin


# ----------------------- Custom Headers -----------------------
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36',
    'Referer': 'https://www.google.com/',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}

# Initialize session
session = requests.Session()
session.headers.update(headers)


class WebParserClient:
    def __init__(self, base_url: str = "http://localhost:8000"):
        """
        初始化Web解析器客户端
        
        Args:
            base_url: API服务器的基础URL，默认为本地测试服务器
        """
        self.base_url = base_url.rstrip('/')
        
    def parse_urls(self, urls: List[str], timeout: int = 200) -> List[Dict[str, Union[str, bool]]]:
        """
        发送URL列表到解析服务器并获取解析结果
        
        Args:
            urls: 需要解析的URL列表
            timeout: 请求超时时间，默认20秒
            
        Returns:
            解析结果列表
            
        Raises:
            requests.exceptions.RequestException: 当API请求失败时
            requests.exceptions.Timeout: 当请求超时时
        """
        endpoint = urljoin(self.base_url, "/parse_urls")
        response = requests.post(endpoint, json={"urls": urls}, timeout=timeout)
        response.raise_for_status()  # 如果响应状态码不是200，抛出异常
        
        return response.json()["results"]


def remove_punctuation(text: str) -> str:
    """Remove punctuation from the text."""
    return text.translate(str.maketrans("", "", string.punctuation))

def f1_score(true_set: set, pred_set: set) -> float:
    """Calculate the F1 score between two sets of words."""
    intersection = len(true_set.intersection(pred_set))
    if not intersection:
        return 0.0
    precision = intersection / float(len(pred_set))
    recall = intersection / float(len(true_set))
    return 2 * (precision * recall) / (precision + recall)

def extract_snippet_with_context(full_text: str, snippet: str, context_chars: int = 2500) -> Tuple[bool, str]:
    """
    Extract the sentence that best matches the snippet and its context from the full text.

    Args:
        full_text (str): The full text extracted from the webpage.
        snippet (str): The snippet to match.
        context_chars (int): Number of characters to include before and after the snippet.

    Returns:
        Tuple[bool, str]: The first element indicates whether extraction was successful, the second element is the extracted context.
    """
    try:
        full_text = full_text[:50000]

        snippet = snippet.lower()
        snippet = remove_punctuation(snippet)
        snippet_words = set(snippet.split())

        best_sentence = None
        best_f1 = 0.2

        # sentences = re.split(r'(?<=[.!?]) +', full_text)  # Split sentences using regex, supporting ., !, ? endings
        sentences = sent_tokenize(full_text)  # Split sentences using nltk's sent_tokenize

        for sentence in sentences:
            key_sentence = sentence.lower()
            key_sentence = remove_punctuation(key_sentence)
            sentence_words = set(key_sentence.split())
            f1 = f1_score(snippet_words, sentence_words)
            if f1 > best_f1:
                best_f1 = f1
                best_sentence = sentence

        if best_sentence:
            para_start = full_text.find(best_sentence)
            para_end = para_start + len(best_sentence)
            start_index = max(0, para_start - context_chars)
            end_index = min(len(full_text), para_end + context_chars)
            context = full_text[start_index:end_index]
            return True, context
        else:
            # If no matching sentence is found, return the first context_chars*2 characters of the full text
            return False, full_text[:context_chars * 2]
    except Exception as e:
        return False, f"Failed to extract snippet context due to {str(e)}"

def extract_text_from_url(url, use_jina=False, jina_api_key=None, snippet: Optional[str] = None, keep_links=False):
    """
    Extract text from a URL. If a snippet is provided, extract the context related to it.

    Args:
        url (str): URL of a webpage or PDF.
        use_jina (bool): Whether to use Jina for extraction.
        jina_api_key (str): API key for Jina.
        snippet (Optional[str]): The snippet to search for.
        keep_links (bool): Whether to keep links in the extracted text.

    Returns:
        str: Extracted text or context.
    """
    try:
        if use_jina:
            jina_headers = {
                'Authorization': f'Bearer {jina_api_key}',
                'X-Return-Format': 'markdown',
            }
            response = requests.get(f'https://r.jina.ai/{url}', headers=jina_headers).text
            # Remove URLs
            pattern = r"\(https?:.*?\)|\[https?:.*?\]"
            text = re.sub(pattern, "", response).replace('---','-').replace('===','=').replace('   ',' ').replace('   ',' ')
        else:
            if 'pdf' in url:
                # If it's a PDF file, extract PDF text
                return extract_pdf_text(url)

            try:
                response = session.get(url, timeout=30)  # Set timeout to 20 seconds
                response.raise_for_status()  # Raise HTTPError if the request failed
                # Determine the content type
                content_type = response.headers.get('Content-Type', '')
                
                # Try using lxml parser, fallback to html.parser if unavailable
                try:
                    soup = BeautifulSoup(response.text, 'lxml')
                except Exception:
                    print("lxml parser not found or failed, falling back to html.parser")
                    soup = BeautifulSoup(response.text, 'html.parser')

                # Check if content has error indicators
                error_indicators = [
                    'limit exceeded',
                    'Error fetching URL',
                    'Account balance not enough to run this query, please recharge.',
                    'Invalid bearer token',
                    'HTTP error occurred', 
                    'Error: Connection error occurred',
                    'Error: Request timed out',
                    'Unexpected error',
                    'Please turn on Javascript'
                ]

                has_error = any(indicator.lower() in response.text.lower() for indicator in error_indicators)
                if has_error:
                    # If content has error, use WebParserClient as fallback
                    client = WebParserClient("http://183.174.229.164:1241")
                    results = client.parse_urls([url])
                    if results and results[0]["success"]:
                        text = results[0]["content"]
                    else:
                        error_msg = results[0].get("error", "Unknown error") if results else "No results returned"
                        return f"WebParserClient error: {error_msg}"
                else:
                    if keep_links:
                        # Clean and extract main content
                        # Remove script, style tags etc
                        for element in soup.find_all(['script', 'style', 'meta', 'link']):
                            element.decompose()

                        # Extract text and links
                        text_parts = []
                        for element in soup.body.descendants if soup.body else soup.descendants:
                            if isinstance(element, str) and element.strip():
                                # Clean extra whitespace
                                cleaned_text = ' '.join(element.strip().split())
                                if cleaned_text:
                                    text_parts.append(cleaned_text)
                            elif element.name == 'a' and element.get('href'):
                                href = element.get('href')
                                link_text = element.get_text(strip=True)
                                if href and link_text:  # Only process a tags with both text and href
                                    # Handle relative URLs
                                    if href.startswith('/'):
                                        base_url = '/'.join(url.split('/')[:3])
                                        href = base_url + href
                                    elif not href.startswith(('http://', 'https://')):
                                        href = url.rstrip('/') + '/' + href
                                    text_parts.append(f"[{link_text}]({href})")

                        # Merge text with reasonable spacing
                        text = ' '.join(text_parts)
                        # Clean extra spaces
                        text = ' '.join(text.split())
                    else:
                        text = soup.get_text(separator=' ', strip=True)
            except Exception as e:
                # If normal extraction fails, try using WebParserClient
                client = WebParserClient("http://183.174.229.164:1241")
                results = client.parse_urls([url])
                if results and results[0]["success"]:
                    text = results[0]["content"]
                else:
                    error_msg = results[0].get("error", "Unknown error") if results else "No results returned"
                    return f"WebParserClient error: {error_msg}"

        if snippet:
            success, context = extract_snippet_with_context(text, snippet)
            if success:
                return context
            else:
                return text
        else:
            # If no snippet is provided, return directly
            return text[:30000]
    except requests.exceptions.HTTPError as http_err:
        return f"HTTP error occurred: {http_err}"
    except requests.exceptions.ConnectionError:
        return "Error: Connection error occurred"
    except requests.exceptions.Timeout:
        return "Error: Request timed out after 20 seconds"
    except Exception as e:
        return f"Unexpected error: {str(e)}"

def fetch_page_content(urls, max_workers=32, use_jina=False, jina_api_key=None, snippets: Optional[dict] = None, show_progress=False, keep_links=False):
    """
    Concurrently fetch content from multiple URLs.

    Args:
        urls (list): List of URLs to scrape.
        max_workers (int): Maximum number of concurrent threads.
        use_jina (bool): Whether to use Jina for extraction.
        jina_api_key (str): API key for Jina.
        snippets (Optional[dict]): A dictionary mapping URLs to their respective snippets.
        show_progress (bool): Whether to show progress bar with tqdm.
        keep_links (bool): Whether to keep links in the extracted text.

    Returns:
        dict: A dictionary mapping URLs to the extracted content or context.
    """
    results = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(extract_text_from_url, url, use_jina, jina_api_key, snippets.get(url) if snippets else None, keep_links): url
            for url in urls
        }
        completed_futures = concurrent.futures.as_completed(futures)
        if show_progress:
            completed_futures = tqdm(completed_futures, desc="Fetching URLs", total=len(urls))
            
        for future in completed_futures:
            url = futures[future]
            try:
                data = future.result()
                results[url] = data
            except Exception as exc:
                results[url] = f"Error fetching {url}: {exc}"
            # time.sleep(0.1)  # Simple rate limiting
    return results

def bing_web_search(query, subscription_key, endpoint, market='en-US', language='en', timeout=20):
    """
    Perform a search using the Bing Web Search API with a set timeout.

    Args:
        query (str): Search query.
        subscription_key (str): Subscription key for the Bing Search API.
        endpoint (str): Endpoint for the Bing Search API.
        market (str): Market, e.g., "en-US" or "zh-CN".
        language (str): Language of the results, e.g., "en".
        timeout (int or float or tuple): Request timeout in seconds.
                                         Can be a float representing the total timeout,
                                         or a tuple (connect timeout, read timeout).

    Returns:
        dict: JSON response of the search results. Returns empty dict if all retries fail.
    """
    headers = {
        "Ocp-Apim-Subscription-Key": subscription_key
    }
    params = {
        "q": query,
        "mkt": market,
        "setLang": language,
        "textDecorations": True,
        "textFormat": "HTML"
    }

    max_retries = 3
    retry_count = 0

    while retry_count < max_retries:
        try:
            response = requests.get(endpoint, headers=headers, params=params, timeout=timeout)
            response.raise_for_status()  # Raise exception if the request failed
            search_results = response.json()
            return search_results
        except Timeout:
            retry_count += 1
            if retry_count == max_retries:
                print(f"Bing Web Search request timed out ({timeout} seconds) for query: {query} after {max_retries} retries")
                return {}
            print(f"Bing Web Search Timeout occurred, retrying ({retry_count}/{max_retries})...")
        except requests.exceptions.RequestException as e:
            retry_count += 1
            if retry_count == max_retries:
                print(f"Bing Web Search Request Error occurred: {e} after {max_retries} retries")
                return {}
            print(f"Bing Web Search Request Error occurred, retrying ({retry_count}/{max_retries})...")
        time.sleep(1)  # Wait 1 second between retries
    
    return {}  # Should never reach here but added for completeness


def extract_pdf_text(url):
    """
    Extract text from a PDF.

    Args:
        url (str): URL of the PDF file.

    Returns:
        str: Extracted text content or error message.
    """
    try:
        response = session.get(url, timeout=20)  # Set timeout to 20 seconds
        if response.status_code != 200:
            return f"Error: Unable to retrieve the PDF (status code {response.status_code})"
        
        # Open the PDF file using pdfplumber
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            full_text = ""
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    full_text += text
        
        # Limit the text length
        cleaned_text = full_text[:30000]
        return cleaned_text
    except requests.exceptions.Timeout:
        return "Error: Request timed out after 20 seconds"
    except Exception as e:
        return f"Error: {str(e)}"


In [2]:
if __name__ == "__main__":
    # 定义输入和输出路径
    url_cache_path = "/fs/archive/share/u2023000153/Search-o1/cache/url_cache_with_links.json"
    url_cache_new1_path = "/fs/archive/share/u2023000153/Search-o1/cache/url_cache_with_links_1.json"
    
    # 读取url_cache文件
    with open(url_cache_path, 'r', encoding='utf-8') as f:
        url_cache = json.load(f)
    
    # 提取所有错误的URL
    urls = []
    error_indicators = [
        'limit exceeded',
        'Error fetching URL',
        'Account balance not enough to run this query, please recharge.',
        'Invalid bearer token',
        'HTTP error occurred', 
        'Error: Connection error occurred',
        'Error: Request timed out',
        'Unexpected error',
        'Please turn on Javascript'
    ]
    
    for url, page in url_cache.items():
        if isinstance(page, str) and (any(indicator.lower() in page.lower() for indicator in error_indicators) or page.startswith('http') or len(page) < 10):
            urls.append(url)
            
    print(f"Total URLs to fetch: {len(urls)}")
    
    # 重新获取错误URL的内容
    if urls:
        new_cache = fetch_page_content(urls, use_jina=False, show_progress=True, keep_links=True)
        
        # 将新获取的内容合并到原始缓存中
        for url, content in new_cache.items():
            url_cache[url] = content
        
    # 保存完整的url_cache到新文件
    with open(url_cache_new1_path, 'w', encoding='utf-8') as f:
        json.dump(url_cache, f, ensure_ascii=False, indent=4)
    print(f"Updated complete cache saved to {url_cache_new1_path}")


Total URLs to fetch: 1388


  soup = BeautifulSoup(response.text, 'lxml')
Fetching URLs: 100%|██████████| 1388/1388 [44:33<00:00,  1.93s/it]  

Updated complete cache saved to /fs/archive/share/u2023000153/Search-o1/cache/url_cache_with_links_1.json





In [None]:
# 读取url_cache_new1_path并计算平均page字符数
import matplotlib.pyplot as plt

with open(url_cache_new1_path, 'r', encoding='utf-8') as f:
    url_cache = json.load(f)

# 收集所有页面的字符长度
page_lengths = []
for page in url_cache.values():
    if isinstance(page, str):
        page_lengths.append(len(page))

# 计算统计信息
total_chars = sum(page_lengths)
valid_pages = len(page_lengths)
avg_chars = total_chars / valid_pages if valid_pages > 0 else 0

# 绘制直方图
plt.figure(figsize=(10, 6))
plt.hist(page_lengths, bins=10, edgecolor='black') # Changed bins from 50 to 10
plt.title('Distribution of Page Lengths')
plt.xlabel('Number of Characters')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)

# 添加平均值线
plt.axvline(avg_chars, color='red', linestyle='dashed', linewidth=2, label=f'Mean: {avg_chars:.0f}')
plt.legend()

plt.show()

print(f"Average characters per page: {avg_chars:.2f}")
print(f"Total valid pages: {valid_pages}")

In [9]:
# Define URLs to fetch content from
urls = [
    "https://en.wikipedia.org/wiki/Unlambda",
    "https://www.researchgate.net/scientific-contributions/Iram-Khan-2100479969",
    # "https://onlinelibrary.wiley.com/doi/full/10.1111/jph.12451", 
    # "https://journals.le.ac.uk/ojs1/index.php/jist/article/view/733",
    # "https://oda.oslomet.no/oda-xmlui/handle/10642/3162",
    # "https://www.journalijar.com/article/26843/a-simple-model-for-analyzing-the-customer-retention-comparing-rural-and-urban-store",
    # "https://www.mdpi.com/2076-2607/11/1/123?type=check_update&version=2",
    # "https://arxiv.org/",
    # "https://www.virtuerestaurant.com",
    # "https://replit.com",
    # "https://github.com/sunnynexus/Search-o1",
    # "https://www.base-search.net",
    # "https://www.virtuerestaurant.com/menus",
]

# Fetch content from URLs with progress bar
# contents = fetch_page_content(urls, keep_links=True)
contents = fetch_page_content(urls, use_web_parser=True)

# Print URL and content for each result
print("\nFetched contents:")
for url, content in contents.items():
    print(f"---\nURL: {url}")
    print(f"{content[:30000]}...")  # Print first 500 chars



Fetched contents:
---
URL: https://en.wikipedia.org/wiki/Unlambda
[Jump to content](https://en.wikipedia.org/wiki/<#bodyContent>)
[ ![](https://en.wikipedia.org/static/images/icons/wikipedia.png) ![Wikipedia](https://en.wikipedia.org/static/images/mobile/copyright/wikipedia-wordmark-en.svg) ![The Free Encyclopedia](https://en.wikipedia.org/static/images/mobile/copyright/wikipedia-tagline-en.svg) ](https://en.wikipedia.org/wiki/</wiki/Main_Page>)
[ Search ](https://en.wikipedia.org/wiki/</wiki/Special:Search> "Search Wikipedia \[alt-shift-f\]")
Search
[ ![Banner logo](https://upload.wikimedia.org/wikipedia/commons/f/f9/Wikimania_logo.svg) **Wikimania 2025 Program Proposals are now open!** Click here to Apply NowThis application is open until Monday 31st March, 2025 end of day ](https://en.wikipedia.org/wiki/</wikimania.wikimedia.org/wiki/Special:MyLanguage/2025:Program>)
[ ](https://en.wikipedia.org/wiki/</wikimania.wikimedia.org/wiki/Special:MyLanguage/2025:Program>)[ [ Help with tran