From eac640fa33e3d64a8215ee869c407fda154bd3fc Mon Sep 17 00:00:00 2001 From: Hidetaka Okamoto Date: Tue, 11 Mar 2025 15:27:15 +0900 Subject: [PATCH 1/2] Fix: Handle HTTP 404 responses in StripeDocsDocumentLoader and improve HTML content extraction --- .../src/StripeDocsLoader.ts | 30 +++-- .../langchain-stripe-loader/src/index.spec.ts | 125 +++++++++++++++++- packages/langchain-stripe-loader/src/utils.ts | 67 +++++++++- 3 files changed, 210 insertions(+), 12 deletions(-) diff --git a/packages/langchain-stripe-loader/src/StripeDocsLoader.ts b/packages/langchain-stripe-loader/src/StripeDocsLoader.ts index f9833b9..a12e7b9 100644 --- a/packages/langchain-stripe-loader/src/StripeDocsLoader.ts +++ b/packages/langchain-stripe-loader/src/StripeDocsLoader.ts @@ -41,15 +41,29 @@ export class StripeDocsDocumentLoader extends BaseDocumentLoader { const documentUrls = await this.fetchURLsFromSitemap(); const arcitles: StripeDocsArticle[] = []; for await (const docsUrl of documentUrls) { - const response = await fetch(`${docsUrl}?locale=${locale}`); - const html = await response.text(); - const articles = extractArticleFromHTML(html); - const titleMatch = html.match(/]*>([^<]+)<\/title>/); - const descMatch = html.match(/]*name="description"[^>]*content="([^"]+)"[^>]*>/); + try { + console.log(`Fetching ${docsUrl}?locale=${locale}`); + const response = await fetch(`${docsUrl}?locale=${locale}`); + + // HTTPステータスコードが400以上の場合はスキップ + if (response.status >= 400) { + console.log(`Skipping ${docsUrl} - HTTP status: ${response.status}`); + continue; + } + + const html = await response.text(); + const articles = extractArticleFromHTML(html); + const titleMatch = html.match(/]*>([^<]+)<\/title>/); + const descMatch = html.match(/]*name="description"[^>]*content="([^"]+)"[^>]*>/); - const title = titleMatch ? titleMatch[1].trim() : 'Unknown'; - const description = descMatch ? descMatch[1].trim() : 'No description'; - arcitles.push(...articles.map(content => ({ url: docsUrl, content, title, description }))); + const title = titleMatch ? titleMatch[1].trim() : 'Unknown'; + const description = descMatch ? descMatch[1].trim() : 'No description'; + arcitles.push(...articles.map(content => ({ url: docsUrl, content, title, description }))); + } catch (error) { + console.error(`Error fetching ${docsUrl}: ${error}`); + // エラーが発生した場合もスキップして次のURLに進む + continue; + } } return arcitles; } diff --git a/packages/langchain-stripe-loader/src/index.spec.ts b/packages/langchain-stripe-loader/src/index.spec.ts index 2a06cdb..fc39bda 100644 --- a/packages/langchain-stripe-loader/src/index.spec.ts +++ b/packages/langchain-stripe-loader/src/index.spec.ts @@ -1,5 +1,17 @@ -import { it, expect } from 'vitest'; -import { StripeComDocumentLoader } from './index'; +import { it, expect, vi, describe } from 'vitest'; +import { StripeComDocumentLoader, StripeDocsDocumentLoader } from './index'; +import { SitemapProcessor } from 'stripe-loaders-core'; + +// SitemapProcessorをモック +vi.mock('stripe-loaders-core', () => { + return { + SitemapProcessor: vi.fn().mockImplementation(() => { + return { + fetchAndProcessSitemap: vi.fn().mockResolvedValue(['https://docs.stripe.com/test']) + }; + }) + }; +}); it('silence is golden', () => { expect(true).toBe(true); @@ -10,7 +22,7 @@ it('silence is golden', () => { * Just for local development */ it.skip( - 'test', + 'test for StripeComDocumentLoader', { timeout: 10000, }, @@ -22,3 +34,110 @@ it.skip( expect(documents).toStrictEqual([]); } ); + +describe('StripeDocsDocumentLoader', () => { + it( + 'should load documents successfully', + { + timeout: 10000, + }, + async () => { + // fetchのモック + global.fetch = vi.fn().mockResolvedValue({ + status: 200, + text: vi.fn().mockResolvedValue(` + + + Test Title + + + +
+
+

Test Content

+

This is test content for the Stripe docs.

+
+
+ + + `) + }); + + const loader = new StripeDocsDocumentLoader(); + const documents = await loader.load(); + + // 期待する結果 + expect(documents.length).toBeGreaterThan(0); + expect(documents[0].pageContent).toContain('Test Content'); + expect(documents[0].metadata.title).toBe('Test Title'); + expect(documents[0].metadata.description).toBe('Test Description'); + expect(documents[0].metadata.source).toBe('https://docs.stripe.com/test'); + } + ); + + it( + 'should skip pages with HTTP 404 status', + { + timeout: 10000, + }, + async () => { + // SitemapProcessorのモックを上書き + (SitemapProcessor as any).mockImplementation(() => { + return { + fetchAndProcessSitemap: vi.fn().mockResolvedValue([ + 'https://docs.stripe.com/valid-page', + 'https://docs.stripe.com/not-found-page' + ]) + }; + }); + + // fetchのモック - 最初のURLは成功、2番目のURLは404 + global.fetch = vi.fn() + .mockImplementationOnce(() => Promise.resolve({ + status: 200, + text: () => Promise.resolve(` + + + Valid Page + + + +
+
+

Valid Content

+

This is valid content.

+
+
+ + + `) + })) + .mockImplementationOnce(() => Promise.resolve({ + status: 404, + text: () => Promise.resolve(` + + + 404 Not Found + + +

404 Not Found

+ + + `) + })); + + const loader = new StripeDocsDocumentLoader(); + const documents = await loader.load(); + + // 404ページはスキップされるため、有効なページからのドキュメントのみが含まれる + expect(documents.length).toBeGreaterThan(0); + expect(documents[0].pageContent).toContain('Valid Content'); + expect(documents[0].metadata.title).toBe('Valid Page'); + expect(documents[0].metadata.source).toBe('https://docs.stripe.com/valid-page'); + + // fetchが2回呼ばれたことを確認 + expect(global.fetch).toHaveBeenCalledTimes(2); + } + ); +}); + \ No newline at end of file diff --git a/packages/langchain-stripe-loader/src/utils.ts b/packages/langchain-stripe-loader/src/utils.ts index 33f0008..8848eea 100644 --- a/packages/langchain-stripe-loader/src/utils.ts +++ b/packages/langchain-stripe-loader/src/utils.ts @@ -36,6 +36,12 @@ export function extractBodyFromHTML(htmlString: string) { */ export function extractArticleFromHTML(htmlString: string) { try { + // 入力がnullまたはundefinedの場合は空配列を返す + if (!htmlString) { + console.log('Input HTML is null or undefined'); + return []; + } + // Regular expression to extract article tag and its contents // [\s\S]*? - Non-greedy match for any characters including newlines const articleRegex = /]*>([\s\S]*?)<\/article>/g; @@ -49,8 +55,67 @@ export function extractArticleFromHTML(htmlString: string) { articles.push(match[1].trim()); } + // If no article tags found, try to find content by main-content ID + if (articles.length === 0) { + console.log('No article tags found, trying to find main-content'); + + // より柔軟な方法でコンテンツを抽出 + // 1. まず、main-contentを含む要素を探す + const mainContentMatch = htmlString.match(/<[^>]*id=["']main-content["'][^>]*>([\s\S]*?)<\/[^>]*>/i); + + if (mainContentMatch && mainContentMatch[1]) { + console.log('Found element with main-content ID'); + + // 2. Content-articleクラスを持つ要素を探す + const contentArticleMatch = mainContentMatch[1].match(/]*class=["'][^"']*Content-article[^"']*["'][^>]*>([\s\S]*?)<\/div>/i); + + if (contentArticleMatch && contentArticleMatch[1]) { + console.log('Found content with Content-article class'); + articles.push(contentArticleMatch[1].trim()); + } else { + // 3. Document要素を探す + const documentMatch = mainContentMatch[1].match(/]*class=["'][^"']*Document[^"']*["'][^>]*>([\s\S]*?)<\/div>/i); + + if (documentMatch && documentMatch[1]) { + console.log('Found content with Document class'); + articles.push(documentMatch[1].trim()); + } else { + // 4. main-content内の全コンテンツを使用 + console.log('Using all content within main-content'); + articles.push(mainContentMatch[1].trim()); + } + } + } else { + // main-contentが見つからない場合、bodyタグ内のコンテンツを探す + console.log('No main-content found, trying to find body content'); + const bodyMatch = htmlString.match(/]*>([\s\S]*?)<\/body>/i); + + if (bodyMatch && bodyMatch[1]) { + // Content-articleクラスを持つ要素を探す + const contentArticleMatch = bodyMatch[1].match(/]*class=["'][^"']*Content-article[^"']*["'][^>]*>([\s\S]*?)<\/div>/i); + + if (contentArticleMatch && contentArticleMatch[1]) { + console.log('Found content with Content-article class in body'); + articles.push(contentArticleMatch[1].trim()); + } else { + // Documentクラスを持つ要素を探す + const documentMatch = bodyMatch[1].match(/]*class=["'][^"']*Document[^"']*["'][^>]*>([\s\S]*?)<\/div>/i); + + if (documentMatch && documentMatch[1]) { + console.log('Found content with Document class in body'); + articles.push(documentMatch[1].trim()); + } else { + console.log('No specific content container found'); + } + } + } else { + console.log('No body content found'); + } + } + } + if (articles.length === 0) { - console.log('No article tags found'); + console.log('No content found in the HTML'); return []; } From 3556380503b5544f0ade30b65656543a7bf74d96 Mon Sep 17 00:00:00 2001 From: Hidetaka Okamoto Date: Tue, 11 Mar 2025 15:35:50 +0900 Subject: [PATCH 2/2] update regs --- packages/langchain-stripe-loader/src/utils.ts | 189 ++++++++++-------- 1 file changed, 108 insertions(+), 81 deletions(-) diff --git a/packages/langchain-stripe-loader/src/utils.ts b/packages/langchain-stripe-loader/src/utils.ts index 8848eea..b5ab0d7 100644 --- a/packages/langchain-stripe-loader/src/utils.ts +++ b/packages/langchain-stripe-loader/src/utils.ts @@ -1,38 +1,43 @@ /** - * Function that extracts the content of body tags from HTML using regular expressions - * @param {string} htmlString HTML string - * @returns {string[]} Array of extracted body tag contents + * Extracts the body tag and its contents from an HTML string + * @param {string} htmlString - The HTML string to extract from + * @returns {string[]} Array of extracted body contents */ export function extractBodyFromHTML(htmlString: string) { try { - // Regular expression to extract body tag and its contents - // [\s\S]*? - Non-greedy match for any characters including newlines - const bodyRegex = /]*>([\s\S]*?)<\/body>/g; + if (!htmlString) { + console.log('Input HTML is null or undefined'); + return []; + } - const bodies = []; + // 安全な正規表現を使用してbodyタグを抽出 + // 非貪欲マッチングを使用し、ネストされたタグも考慮 + const bodyRegex = /]*>([\s\S]*?)<\/body>/gi; + const matches = []; let match; - // Find all matches while ((match = bodyRegex.exec(htmlString)) !== null) { - // Add matched content (group 1) to the array - bodies.push(match[1].trim()); + if (match[1]) { + matches.push(match[1].trim()); + } } - if (bodies.length === 0) { + if (matches.length === 0) { console.log('No body tags found'); return []; } - return bodies; + return matches; } catch (error) { console.error('An error occurred:', error); return []; } } + /** - * Function that extracts the content of article tags from HTML using regular expressions - * @param {string} htmlString HTML string - * @returns {string[]} Array of extracted article tag contents + * Extracts article tags and their contents from an HTML string + * @param {string} htmlString - The HTML string to extract from + * @returns {string[]} Array of extracted article contents */ export function extractArticleFromHTML(htmlString: string) { try { @@ -42,86 +47,108 @@ export function extractArticleFromHTML(htmlString: string) { return []; } - // Regular expression to extract article tag and its contents - // [\s\S]*? - Non-greedy match for any characters including newlines - const articleRegex = /]*>([\s\S]*?)<\/article>/g; - - const articles = []; - let match; - - // Find all matches - while ((match = articleRegex.exec(htmlString)) !== null) { - // Add matched content (group 1) to the array - articles.push(match[1].trim()); - } - - // If no article tags found, try to find content by main-content ID - if (articles.length === 0) { - console.log('No article tags found, trying to find main-content'); + const articles: string[] = []; + + // 1. まず、articleタグを探す - より限定的な正規表現に変更 + // 1つのarticleタグを処理する関数を作成 + const processArticleTags = () => { + // articleタグのみに一致するように修正 + const articleRegex = /]*)?>(.*?)<\/article>/gis; + let articleMatch; + let foundArticles = false; + + while ((articleMatch = articleRegex.exec(htmlString)) !== null) { + if (articleMatch[1]) { + articles.push(articleMatch[1].trim()); + foundArticles = true; + } + } - // より柔軟な方法でコンテンツを抽出 - // 1. まず、main-contentを含む要素を探す - const mainContentMatch = htmlString.match(/<[^>]*id=["']main-content["'][^>]*>([\s\S]*?)<\/[^>]*>/i); + return foundArticles; + }; + + // 2. main-contentを持つ要素を探す関数 + const processMainContent = () => { + // タグを限定し、より具体的なパターンに修正 + const mainContentRegex = /<(div|section|main|article)(?:\s+[^>]*?)id=["']main-content["'](?:[^>]*?)>(.*?)<\/\1>/is; + const mainContentMatch = mainContentRegex.exec(htmlString); - if (mainContentMatch && mainContentMatch[1]) { + if (mainContentMatch && mainContentMatch[2]) { console.log('Found element with main-content ID'); - // 2. Content-articleクラスを持つ要素を探す - const contentArticleMatch = mainContentMatch[1].match(/]*class=["'][^"']*Content-article[^"']*["'][^>]*>([\s\S]*?)<\/div>/i); + // Content-articleクラスを持つdivタグを探す + const contentArticleRegex = /]*?)class=["'][^"']*Content-article[^"']*["'](?:[^>]*?)>(.*?)<\/div>/is; + const contentArticleMatch = contentArticleRegex.exec(mainContentMatch[2]); if (contentArticleMatch && contentArticleMatch[1]) { console.log('Found content with Content-article class'); articles.push(contentArticleMatch[1].trim()); - } else { - // 3. Document要素を探す - const documentMatch = mainContentMatch[1].match(/]*class=["'][^"']*Document[^"']*["'][^>]*>([\s\S]*?)<\/div>/i); - - if (documentMatch && documentMatch[1]) { - console.log('Found content with Document class'); - articles.push(documentMatch[1].trim()); - } else { - // 4. main-content内の全コンテンツを使用 - console.log('Using all content within main-content'); - articles.push(mainContentMatch[1].trim()); - } + return true; + } + + // Documentクラスを持つdivタグを探す + const documentRegex = /]*?)class=["'][^"']*Document[^"']*["'](?:[^>]*?)>(.*?)<\/div>/is; + const documentMatch = documentRegex.exec(mainContentMatch[2]); + + if (documentMatch && documentMatch[1]) { + console.log('Found content with Document class'); + articles.push(documentMatch[1].trim()); + return true; } - } else { - // main-contentが見つからない場合、bodyタグ内のコンテンツを探す - console.log('No main-content found, trying to find body content'); - const bodyMatch = htmlString.match(/]*>([\s\S]*?)<\/body>/i); - if (bodyMatch && bodyMatch[1]) { - // Content-articleクラスを持つ要素を探す - const contentArticleMatch = bodyMatch[1].match(/]*class=["'][^"']*Content-article[^"']*["'][^>]*>([\s\S]*?)<\/div>/i); - - if (contentArticleMatch && contentArticleMatch[1]) { - console.log('Found content with Content-article class in body'); - articles.push(contentArticleMatch[1].trim()); - } else { - // Documentクラスを持つ要素を探す - const documentMatch = bodyMatch[1].match(/]*class=["'][^"']*Document[^"']*["'][^>]*>([\s\S]*?)<\/div>/i); - - if (documentMatch && documentMatch[1]) { - console.log('Found content with Document class in body'); - articles.push(documentMatch[1].trim()); - } else { - console.log('No specific content container found'); - } - } - } else { - console.log('No body content found'); + // main-content内の全コンテンツを使用 + console.log('Using all content within main-content'); + articles.push(mainContentMatch[2].trim()); + return true; + } + + return false; + }; + + // 3. bodyタグ内のコンテンツを探す関数 + const processBody = () => { + // bodyタグを探す - 限定的なパターン + const bodyRegex = /]*?)>(.*?)<\/body>/is; + const bodyMatch = bodyRegex.exec(htmlString); + + if (bodyMatch && bodyMatch[1]) { + // Content-articleクラスを持つdivタグを探す + const contentArticleRegex = /]*?)class=["'][^"']*Content-article[^"']*["'](?:[^>]*?)>(.*?)<\/div>/is; + const contentArticleMatch = contentArticleRegex.exec(bodyMatch[1]); + + if (contentArticleMatch && contentArticleMatch[1]) { + console.log('Found content with Content-article class in body'); + articles.push(contentArticleMatch[1].trim()); + return true; + } + + // Documentクラスを持つdivタグを探す + const documentRegex = /]*?)class=["'][^"']*Document[^"']*["'](?:[^>]*?)>(.*?)<\/div>/is; + const documentMatch = documentRegex.exec(bodyMatch[1]); + + if (documentMatch && documentMatch[1]) { + console.log('Found content with Document class in body'); + articles.push(documentMatch[1].trim()); + return true; } } + + console.log('No specific content container found'); + return false; + }; + + // 順番に処理を試行 + if (!processArticleTags()) { + console.log('No article tags found, trying to find main-content'); + if (!processMainContent()) { + console.log('No main-content found, trying to find body content'); + processBody(); + } } - - if (articles.length === 0) { - console.log('No content found in the HTML'); - return []; - } - + return articles; } catch (error) { - console.error('An error occurred:', error); + console.error('Error extracting article from HTML:', error); return []; } -} +} \ No newline at end of file