wpkyoto · hideokamoto · Mar 11, 2025 · Mar 11, 2025 · Mar 11, 2025 · gemini-code-assist
diff --git a/packages/langchain-stripe-loader/src/StripeDocsLoader.ts b/packages/langchain-stripe-loader/src/StripeDocsLoader.ts
@@ -41,15 +41,29 @@ export class StripeDocsDocumentLoader extends BaseDocumentLoader {
     const documentUrls = await this.fetchURLsFromSitemap();
     const arcitles: StripeDocsArticle[] = [];
     for await (const docsUrl of documentUrls) {
-      const response = await fetch(`${docsUrl}?locale=${locale}`);
-      const html = await response.text();
-      const articles = extractArticleFromHTML(html);
-      const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/);
-      const descMatch = html.match(/<meta[^>]*name="description"[^>]*content="([^"]+)"[^>]*>/);
+      try {
+        console.log(`Fetching ${docsUrl}?locale=${locale}`);
+        const response = await fetch(`${docsUrl}?locale=${locale}`);
+
+        // HTTPステータスコードが400以上の場合はスキップ
+        if (response.status >= 400) {
+          console.log(`Skipping ${docsUrl} - HTTP status: ${response.status}`);
+          continue;
+        }
+
+        const html = await response.text();
+        const articles = extractArticleFromHTML(html);
+        const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/);
+        const descMatch = html.match(/<meta[^>]*name="description"[^>]*content="([^"]+)"[^>]*>/);
 
-      const title = titleMatch ? titleMatch[1].trim() : 'Unknown';
-      const description = descMatch ? descMatch[1].trim() : 'No description';
-      arcitles.push(...articles.map(content => ({ url: docsUrl, content, title, description })));
+        const title = titleMatch ? titleMatch[1].trim() : 'Unknown';
+        const description = descMatch ? descMatch[1].trim() : 'No description';
+        arcitles.push(...articles.map(content => ({ url: docsUrl, content, title, description })));
+      } catch (error) {
+        console.error(`Error fetching ${docsUrl}: ${error}`);
+        // エラーが発生した場合もスキップして次のURLに進む
+        continue;
+      }
     }
     return arcitles;
   }

diff --git a/packages/langchain-stripe-loader/src/index.spec.ts b/packages/langchain-stripe-loader/src/index.spec.ts
@@ -1,5 +1,17 @@
-import { it, expect } from 'vitest';
-import { StripeComDocumentLoader } from './index';
+import { it, expect, vi, describe } from 'vitest';
+import { StripeComDocumentLoader, StripeDocsDocumentLoader } from './index';
+import { SitemapProcessor } from 'stripe-loaders-core';
+
+// SitemapProcessorをモック
+vi.mock('stripe-loaders-core', () => {
+  return {
+    SitemapProcessor: vi.fn().mockImplementation(() => {
+      return {
+        fetchAndProcessSitemap: vi.fn().mockResolvedValue(['https://docs.stripe.com/test'])
+      };
+    })
+  };
+});
 
 it('silence is golden', () => {
   expect(true).toBe(true);
@@ -10,7 +22,7 @@ it('silence is golden', () => {
  * Just for local development
  */
 it.skip(
-  'test',
+  'test for StripeComDocumentLoader',
   {
     timeout: 10000,
   },
@@ -22,3 +34,110 @@ it.skip(
     expect(documents).toStrictEqual([]);
   }
 );
+
+describe('StripeDocsDocumentLoader', () => {
+  it(
+    'should load documents successfully',
+    {
+      timeout: 10000,
+    },
+    async () => {
+      // fetchのモック
+      global.fetch = vi.fn().mockResolvedValue({
+        status: 200,
+        text: vi.fn().mockResolvedValue(`
+          <html>
+            <head>
+              <title>Test Title</title>
+              <meta name="description" content="Test Description">
+            </head>
+            <body>
+              <div id="main-content">
+                <div class="Content-article">
+                  <h1>Test Content</h1>
+                  <p>This is test content for the Stripe docs.</p>
+                </div>
+              </div>
+            </body>
+          </html>
+        `)
+      });
+
+      const loader = new StripeDocsDocumentLoader();
+      const documents = await loader.load();
+
+      // 期待する結果
+      expect(documents.length).toBeGreaterThan(0);
+      expect(documents[0].pageContent).toContain('Test Content');
+      expect(documents[0].metadata.title).toBe('Test Title');
+      expect(documents[0].metadata.description).toBe('Test Description');
+      expect(documents[0].metadata.source).toBe('https://docs.stripe.com/test');
+    }
+  );
+
+  it(
+    'should skip pages with HTTP 404 status',
+    {
+      timeout: 10000,
+    },
+    async () => {
+      // SitemapProcessorのモックを上書き
+      (SitemapProcessor as any).mockImplementation(() => {
+        return {
+          fetchAndProcessSitemap: vi.fn().mockResolvedValue([
+            'https://docs.stripe.com/valid-page',
+            'https://docs.stripe.com/not-found-page'
+          ])
+        };
+      });
+
+      // fetchのモック - 最初のURLは成功、2番目のURLは404
+      global.fetch = vi.fn()
+        .mockImplementationOnce(() => Promise.resolve({
+          status: 200,
+          text: () => Promise.resolve(`
+            <html>
+              <head>
+                <title>Valid Page</title>
+                <meta name="description" content="Valid Description">
+              </head>
+              <body>
+                <div id="main-content">
+                  <div class="Content-article">
+                    <h1>Valid Content</h1>
+                    <p>This is valid content.</p>
+                  </div>
+                </div>
+              </body>
+            </html>
+          `)
+        }))
+        .mockImplementationOnce(() => Promise.resolve({
+          status: 404,
+          text: () => Promise.resolve(`
+            <html>
+              <head>
+                <title>404 Not Found</title>
+              </head>
+              <body>
+                <h1>404 Not Found</h1>
+              </body>
+            </html>
+          `)
+        }));
+
+      const loader = new StripeDocsDocumentLoader();
+      const documents = await loader.load();
+
+      // 404ページはスキップされるため、有効なページからのドキュメントのみが含まれる
+      expect(documents.length).toBeGreaterThan(0);
+      expect(documents[0].pageContent).toContain('Valid Content');
+      expect(documents[0].metadata.title).toBe('Valid Page');
+      expect(documents[0].metadata.source).toBe('https://docs.stripe.com/valid-page');
+
+      // fetchが2回呼ばれたことを確認
+      expect(global.fetch).toHaveBeenCalledTimes(2);
+    }
+  );
+});
+
diff --git a/packages/langchain-stripe-loader/src/utils.ts b/packages/langchain-stripe-loader/src/utils.ts
@@ -1,62 +1,154 @@
 /**
- * Function that extracts the content of body tags from HTML using regular expressions
- * @param {string} htmlString HTML string
- * @returns {string[]} Array of extracted body tag contents
+ * Extracts the body tag and its contents from an HTML string
+ * @param {string} htmlString - The HTML string to extract from
+ * @returns {string[]} Array of extracted body contents
  */
 export function extractBodyFromHTML(htmlString: string) {
   try {
-    // Regular expression to extract body tag and its contents
-    // [\s\S]*? - Non-greedy match for any characters including newlines
-    const bodyRegex = /<body[^>]*>([\s\S]*?)<\/body>/g;
+    if (!htmlString) {
+      console.log('Input HTML is null or undefined');
+      return [];
+    }
 
-    const bodies = [];
+    // 安全な正規表現を使用してbodyタグを抽出
+    // 非貪欲マッチングを使用し、ネストされたタグも考慮
+    const bodyRegex = /<body[^>]*>([\s\S]*?)<\/body>/gi;
+    const matches = [];
     let match;
 
-    // Find all matches
     while ((match = bodyRegex.exec(htmlString)) !== null) {
-      // Add matched content (group 1) to the array
-      bodies.push(match[1].trim());
+      if (match[1]) {
+        matches.push(match[1].trim());
+      }
     }
 
-    if (bodies.length === 0) {
+    if (matches.length === 0) {
       console.log('No body tags found');
       return [];
     }
 
-    return bodies;
+    return matches;
   } catch (error) {
     console.error('An error occurred:', error);
     return [];
   }
 }
+
 /**
- * Function that extracts the content of article tags from HTML using regular expressions
- * @param {string} htmlString HTML string
- * @returns {string[]} Array of extracted article tag contents
+ * Extracts article tags and their contents from an HTML string
+ * @param {string} htmlString - The HTML string to extract from
+ * @returns {string[]} Array of extracted article contents
  */
 export function extractArticleFromHTML(htmlString: string) {
   try {
-    // Regular expression to extract article tag and its contents
-    // [\s\S]*? - Non-greedy match for any characters including newlines
-    const articleRegex = /<article[^>]*>([\s\S]*?)<\/article>/g;
-
-    const articles = [];
-    let match;
-
-    // Find all matches
-    while ((match = articleRegex.exec(htmlString)) !== null) {
-      // Add matched content (group 1) to the array
-      articles.push(match[1].trim());
-    }
-
-    if (articles.length === 0) {
-      console.log('No article tags found');
+    // 入力がnullまたはundefinedの場合は空配列を返す
+    if (!htmlString) {
+      console.log('Input HTML is null or undefined');
       return [];
     }
-
+
+    const articles: string[] = [];
+
+    // 1. まず、articleタグを探す - より限定的な正規表現に変更
+    // 1つのarticleタグを処理する関数を作成
+    const processArticleTags = () => {
+      // articleタグのみに一致するように修正
+      const articleRegex = /<article(?:\s+[^>]*)?>(.*?)<\/article>/gis;
+      let articleMatch;
+      let foundArticles = false;
+
+      while ((articleMatch = articleRegex.exec(htmlString)) !== null) {
+        if (articleMatch[1]) {
+          articles.push(articleMatch[1].trim());
+          foundArticles = true;
+        }
+      }
+
+      return foundArticles;
+    };
+
+    // 2. main-contentを持つ要素を探す関数
+    const processMainContent = () => {
+      // タグを限定し、より具体的なパターンに修正
+      const mainContentRegex = /<(div|section|main|article)(?:\s+[^>]*?)id=["']main-content["'](?:[^>]*?)>(.*?)<\/\1>/is;
+      const mainContentMatch = mainContentRegex.exec(htmlString);
+
+      if (mainContentMatch && mainContentMatch[2]) {
+        console.log('Found element with main-content ID');
+
+        // Content-articleクラスを持つdivタグを探す
+        const contentArticleRegex = /<div(?:\s+[^>]*?)class=["'][^"']*Content-article[^"']*["'](?:[^>]*?)>(.*?)<\/div>/is;
+        const contentArticleMatch = contentArticleRegex.exec(mainContentMatch[2]);
+
+        if (contentArticleMatch && contentArticleMatch[1]) {
+          console.log('Found content with Content-article class');
+          articles.push(contentArticleMatch[1].trim());
+          return true;
+        } 
+
+        // Documentクラスを持つdivタグを探す
+        const documentRegex = /<div(?:\s+[^>]*?)class=["'][^"']*Document[^"']*["'](?:[^>]*?)>(.*?)<\/div>/is;
+        const documentMatch = documentRegex.exec(mainContentMatch[2]);
+
+        if (documentMatch && documentMatch[1]) {
+          console.log('Found content with Document class');
+          articles.push(documentMatch[1].trim());
+          return true;
+        }
+
+        // main-content内の全コンテンツを使用
+        console.log('Using all content within main-content');
+        articles.push(mainContentMatch[2].trim());
+        return true;
+      }
+
+      return false;
+    };
+
+    // 3. bodyタグ内のコンテンツを探す関数
+    const processBody = () => {
+      // bodyタグを探す - 限定的なパターン
+      const bodyRegex = /<body(?:\s+[^>]*?)>(.*?)<\/body>/is;
+      const bodyMatch = bodyRegex.exec(htmlString);
+
+      if (bodyMatch && bodyMatch[1]) {
+        // Content-articleクラスを持つdivタグを探す
+        const contentArticleRegex = /<div(?:\s+[^>]*?)class=["'][^"']*Content-article[^"']*["'](?:[^>]*?)>(.*?)<\/div>/is;
+        const contentArticleMatch = contentArticleRegex.exec(bodyMatch[1]);
+
+        if (contentArticleMatch && contentArticleMatch[1]) {
+          console.log('Found content with Content-article class in body');
+          articles.push(contentArticleMatch[1].trim());
+          return true;
+        }
+
+        // Documentクラスを持つdivタグを探す
+        const documentRegex = /<div(?:\s+[^>]*?)class=["'][^"']*Document[^"']*["'](?:[^>]*?)>(.*?)<\/div>/is;
+        const documentMatch = documentRegex.exec(bodyMatch[1]);
+
+        if (documentMatch && documentMatch[1]) {
+          console.log('Found content with Document class in body');
+          articles.push(documentMatch[1].trim());
+          return true;
+        }
+      }
+
+      console.log('No specific content container found');
+      return false;
+    };
+
+    // 順番に処理を試行
+    if (!processArticleTags()) {
+      console.log('No article tags found, trying to find main-content');
+      if (!processMainContent()) {
+        console.log('No main-content found, trying to find body content');
+        processBody();
+      }
+    }
+
     return articles;
   } catch (error) {
-    console.error('An error occurred:', error);
+    console.error('Error extracting article from HTML:', error);
     return [];
   }
-}
+}