From eac640fa33e3d64a8215ee869c407fda154bd3fc Mon Sep 17 00:00:00 2001
From: Hidetaka Okamoto <kokkoku214@gmail.com>
Date: Tue, 11 Mar 2025 15:27:15 +0900
Subject: [PATCH 1/2] Fix: Handle HTTP 404 responses in
 StripeDocsDocumentLoader and improve HTML content extraction

---
 .../src/StripeDocsLoader.ts                   |  30 +++--
 .../langchain-stripe-loader/src/index.spec.ts | 125 +++++++++++++++++-
 packages/langchain-stripe-loader/src/utils.ts |  67 +++++++++-
 3 files changed, 210 insertions(+), 12 deletions(-)

diff --git a/packages/langchain-stripe-loader/src/StripeDocsLoader.ts b/packages/langchain-stripe-loader/src/StripeDocsLoader.ts
index f9833b9..a12e7b9 100644
--- a/packages/langchain-stripe-loader/src/StripeDocsLoader.ts
+++ b/packages/langchain-stripe-loader/src/StripeDocsLoader.ts
@@ -41,15 +41,29 @@ export class StripeDocsDocumentLoader extends BaseDocumentLoader {
     const documentUrls = await this.fetchURLsFromSitemap();
     const arcitles: StripeDocsArticle[] = [];
     for await (const docsUrl of documentUrls) {
-      const response = await fetch(`${docsUrl}?locale=${locale}`);
-      const html = await response.text();
-      const articles = extractArticleFromHTML(html);
-      const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/);
-      const descMatch = html.match(/<meta[^>]*name="description"[^>]*content="([^"]+)"[^>]*>/);
+      try {
+        console.log(`Fetching ${docsUrl}?locale=${locale}`);
+        const response = await fetch(`${docsUrl}?locale=${locale}`);
+        
+        // Skip pages whose HTTP status code is 400 or higher
+        if (response.status >= 400) {
+          console.log(`Skipping ${docsUrl} - HTTP status: ${response.status}`);
+          continue;
+        }
+        
+        const html = await response.text();
+        const articles = extractArticleFromHTML(html);
+        const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/);
+        const descMatch = html.match(/<meta[^>]*name="description"[^>]*content="([^"]+)"[^>]*>/);
 
-      const title = titleMatch ? titleMatch[1].trim() : 'Unknown';
-      const description = descMatch ? descMatch[1].trim() : 'No description';
-      arcitles.push(...articles.map(content => ({ url: docsUrl, content, title, description })));
+        const title = titleMatch ? titleMatch[1].trim() : 'Unknown';
+        const description = descMatch ? descMatch[1].trim() : 'No description';
+        arcitles.push(...articles.map(content => ({ url: docsUrl, content, title, description })));
+      } catch (error) {
+        console.error(`Error fetching ${docsUrl}: ${error}`);
+        // On error, also skip this URL and move on to the next one
+        continue;
+      }
     }
     return arcitles;
   }
diff --git a/packages/langchain-stripe-loader/src/index.spec.ts b/packages/langchain-stripe-loader/src/index.spec.ts
index 2a06cdb..fc39bda 100644
--- a/packages/langchain-stripe-loader/src/index.spec.ts
+++ b/packages/langchain-stripe-loader/src/index.spec.ts
@@ -1,5 +1,17 @@
-import { it, expect } from 'vitest';
-import { StripeComDocumentLoader } from './index';
+import { it, expect, vi, describe } from 'vitest';
+import { StripeComDocumentLoader, StripeDocsDocumentLoader } from './index';
+import { SitemapProcessor } from 'stripe-loaders-core';
+
+// Mock SitemapProcessor from 'stripe-loaders-core': every instance's fetchAndProcessSitemap resolves to a single fixed docs URL
+vi.mock('stripe-loaders-core', () => {
+  return {
+    SitemapProcessor: vi.fn().mockImplementation(() => {
+      return {
+        fetchAndProcessSitemap: vi.fn().mockResolvedValue(['https://docs.stripe.com/test'])
+      };
+    })
+  };
+});
 
 it('silence is golden', () => {
   expect(true).toBe(true);
@@ -10,7 +22,7 @@ it('silence is golden', () => {
  * Just for local development
  */
 it.skip(
-  'test',
+  'test for StripeComDocumentLoader',
   {
     timeout: 10000,
   },
@@ -22,3 +34,110 @@ it.skip(
     expect(documents).toStrictEqual([]);
   }
 );
+
+describe('StripeDocsDocumentLoader', () => {
+  it(
+    'should load documents successfully',
+    {
+      timeout: 10000,
+    },
+    async () => {
+      // Stub global fetch: every request resolves to a 200 response carrying the HTML fixture below
+      global.fetch = vi.fn().mockResolvedValue({
+        status: 200,
+        text: vi.fn().mockResolvedValue(`
+          <html>
+            <head>
+              <title>Test Title</title>
+              <meta name="description" content="Test Description">
+            </head>
+            <body>
+              <div id="main-content">
+                <div class="Content-article">
+                  <h1>Test Content</h1>
+                  <p>This is test content for the Stripe docs.</p>
+                </div>
+              </div>
+            </body>
+          </html>
+        `)
+      });
+      
+      const loader = new StripeDocsDocumentLoader();
+      const documents = await loader.load();
+      
+      // Expected: at least one document whose content, title, description and source come from the fixture
+      expect(documents.length).toBeGreaterThan(0);
+      expect(documents[0].pageContent).toContain('Test Content');
+      expect(documents[0].metadata.title).toBe('Test Title');
+      expect(documents[0].metadata.description).toBe('Test Description');
+      expect(documents[0].metadata.source).toBe('https://docs.stripe.com/test');
+    }
+  );
+
+  it(
+    'should skip pages with HTTP 404 status',
+    {
+      timeout: 10000,
+    },
+    async () => {
+      // Override the SitemapProcessor mock so the sitemap yields two URLs
+      (SitemapProcessor as any).mockImplementation(() => {
+        return {
+          fetchAndProcessSitemap: vi.fn().mockResolvedValue([
+            'https://docs.stripe.com/valid-page',
+            'https://docs.stripe.com/not-found-page'
+          ])
+        };
+      });
+
+      // Stub fetch: the first call succeeds (200), the second call returns 404
+      global.fetch = vi.fn()
+        .mockImplementationOnce(() => Promise.resolve({
+          status: 200,
+          text: () => Promise.resolve(`
+            <html>
+              <head>
+                <title>Valid Page</title>
+                <meta name="description" content="Valid Description">
+              </head>
+              <body>
+                <div id="main-content">
+                  <div class="Content-article">
+                    <h1>Valid Content</h1>
+                    <p>This is valid content.</p>
+                  </div>
+                </div>
+              </body>
+            </html>
+          `)
+        }))
+        .mockImplementationOnce(() => Promise.resolve({
+          status: 404,
+          text: () => Promise.resolve(`
+            <html>
+              <head>
+                <title>404 Not Found</title>
+              </head>
+              <body>
+                <h1>404 Not Found</h1>
+              </body>
+            </html>
+          `)
+        }));
+      
+      const loader = new StripeDocsDocumentLoader();
+      const documents = await loader.load();
+      
+      // The 404 page should be skipped, so the first document must come from the valid page
+      expect(documents.length).toBeGreaterThan(0);
+      expect(documents[0].pageContent).toContain('Valid Content');
+      expect(documents[0].metadata.title).toBe('Valid Page');
+      expect(documents[0].metadata.source).toBe('https://docs.stripe.com/valid-page');
+      
+      // Verify that fetch was called twice (once per sitemap URL)
+      expect(global.fetch).toHaveBeenCalledTimes(2);
+    }
+  );
+});
+  
\ No newline at end of file
diff --git a/packages/langchain-stripe-loader/src/utils.ts b/packages/langchain-stripe-loader/src/utils.ts
index 33f0008..8848eea 100644
--- a/packages/langchain-stripe-loader/src/utils.ts
+++ b/packages/langchain-stripe-loader/src/utils.ts
@@ -36,6 +36,12 @@ export function extractBodyFromHTML(htmlString: string) {
  */
 export function extractArticleFromHTML(htmlString: string) {
   try {
+    // Return an empty array when the input is null, undefined, or an empty string
+    if (!htmlString) {
+      console.log('Input HTML is null or undefined');
+      return [];
+    }
+    
     // Regular expression to extract article tag and its contents
     // [\s\S]*? - Non-greedy match for any characters including newlines
     const articleRegex = /<article[^>]*>([\s\S]*?)<\/article>/g;
@@ -49,8 +55,67 @@ export function extractArticleFromHTML(htmlString: string) {
       articles.push(match[1].trim());
     }
 
+    // If no article tags found, try to find content by main-content ID
+    if (articles.length === 0) {
+      console.log('No article tags found, trying to find main-content');
+      
+      // より柔軟な方法でコンテンツを抽出
+      // 1. まず、main-contentを含む要素を探す
+      const mainContentMatch = htmlString.match(/<[^>]*id=["']main-content["'][^>]*>([\s\S]*?)<\/[^>]*>/i);
+      
+      if (mainContentMatch && mainContentMatch[1]) {
+        console.log('Found element with main-content ID');
+        
+        // 2. Content-articleクラスを持つ要素を探す
+        const contentArticleMatch = mainContentMatch[1].match(/<div[^>]*class=["'][^"']*Content-article[^"']*["'][^>]*>([\s\S]*?)<\/div>/i);
+        
+        if (contentArticleMatch && contentArticleMatch[1]) {
+          console.log('Found content with Content-article class');
+          articles.push(contentArticleMatch[1].trim());
+        } else {
+          // 3. Document要素を探す
+          const documentMatch = mainContentMatch[1].match(/<div[^>]*class=["'][^"']*Document[^"']*["'][^>]*>([\s\S]*?)<\/div>/i);
+          
+          if (documentMatch && documentMatch[1]) {
+            console.log('Found content with Document class');
+            articles.push(documentMatch[1].trim());
+          } else {
+            // 4. main-content内の全コンテンツを使用
+            console.log('Using all content within main-content');
+            articles.push(mainContentMatch[1].trim());
+          }
+        }
+      } else {
+        // main-contentが見つからない場合、bodyタグ内のコンテンツを探す
+        console.log('No main-content found, trying to find body content');
+        const bodyMatch = htmlString.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
+        
+        if (bodyMatch && bodyMatch[1]) {
+          // Content-articleクラスを持つ要素を探す
+          const contentArticleMatch = bodyMatch[1].match(/<div[^>]*class=["'][^"']*Content-article[^"']*["'][^>]*>([\s\S]*?)<\/div>/i);
+          
+          if (contentArticleMatch && contentArticleMatch[1]) {
+            console.log('Found content with Content-article class in body');
+            articles.push(contentArticleMatch[1].trim());
+          } else {
+            // Documentクラスを持つ要素を探す
+            const documentMatch = bodyMatch[1].match(/<div[^>]*class=["'][^"']*Document[^"']*["'][^>]*>([\s\S]*?)<\/div>/i);
+            
+            if (documentMatch && documentMatch[1]) {
+              console.log('Found content with Document class in body');
+              articles.push(documentMatch[1].trim());
+            } else {
+              console.log('No specific content container found');
+            }
+          }
+        } else {
+          console.log('No body content found');
+        }
+      }
+    }
+
     if (articles.length === 0) {
-      console.log('No article tags found');
+      console.log('No content found in the HTML');
       return [];
     }
 

From 3556380503b5544f0ade30b65656543a7bf74d96 Mon Sep 17 00:00:00 2001
From: Hidetaka Okamoto <kokkoku214@gmail.com>
Date: Tue, 11 Mar 2025 15:35:50 +0900
Subject: [PATCH 2/2] Update regexes used for HTML content extraction

---
 packages/langchain-stripe-loader/src/utils.ts | 189 ++++++++++--------
 1 file changed, 108 insertions(+), 81 deletions(-)

diff --git a/packages/langchain-stripe-loader/src/utils.ts b/packages/langchain-stripe-loader/src/utils.ts
index 8848eea..b5ab0d7 100644
--- a/packages/langchain-stripe-loader/src/utils.ts
+++ b/packages/langchain-stripe-loader/src/utils.ts
@@ -1,38 +1,43 @@
 /**
- * Function that extracts the content of body tags from HTML using regular expressions
- * @param {string} htmlString HTML string
- * @returns {string[]} Array of extracted body tag contents
+ * Extracts the inner content of every <body>...</body> element found in an HTML string
+ * @param {string} htmlString - The HTML string to extract from
+ * @returns {string[]} Trimmed content of each non-empty body match; [] for empty input, no match, or on error
  */
 export function extractBodyFromHTML(htmlString: string) {
   try {
-    // Regular expression to extract body tag and its contents
-    // [\s\S]*? - Non-greedy match for any characters including newlines
-    const bodyRegex = /<body[^>]*>([\s\S]*?)<\/body>/g;
+    if (!htmlString) {
+      console.log('Input HTML is null or undefined');
+      return [];
+    }
 
-    const bodies = [];
+    // Extract <body> elements with a regular expression (g = all matches, i = case-insensitive)
+    // [\s\S]*? is a non-greedy match that also spans newlines, so each capture ends at the first </body>
+    const bodyRegex = /<body[^>]*>([\s\S]*?)<\/body>/gi;
+    const matches = [];
     let match;
 
-    // Find all matches
     while ((match = bodyRegex.exec(htmlString)) !== null) {
-      // Add matched content (group 1) to the array
-      bodies.push(match[1].trim());
+      if (match[1]) {
+        matches.push(match[1].trim());
+      }
     }
 
-    if (bodies.length === 0) {
+    if (matches.length === 0) {
       console.log('No body tags found');
       return [];
     }
 
-    return bodies;
+    return matches;
   } catch (error) {
     console.error('An error occurred:', error);
     return [];
   }
 }
+
 /**
- * Function that extracts the content of article tags from HTML using regular expressions
- * @param {string} htmlString HTML string
- * @returns {string[]} Array of extracted article tag contents
+ * Extracts article tags and their contents from an HTML string
+ * @param {string} htmlString - The HTML string to extract from
+ * @returns {string[]} Array of extracted article contents
  */
 export function extractArticleFromHTML(htmlString: string) {
   try {
@@ -42,86 +47,108 @@ export function extractArticleFromHTML(htmlString: string) {
       return [];
     }
     
-    // Regular expression to extract article tag and its contents
-    // [\s\S]*? - Non-greedy match for any characters including newlines
-    const articleRegex = /<article[^>]*>([\s\S]*?)<\/article>/g;
-
-    const articles = [];
-    let match;
-
-    // Find all matches
-    while ((match = articleRegex.exec(htmlString)) !== null) {
-      // Add matched content (group 1) to the array
-      articles.push(match[1].trim());
-    }
-
-    // If no article tags found, try to find content by main-content ID
-    if (articles.length === 0) {
-      console.log('No article tags found, trying to find main-content');
+    const articles: string[] = [];
+    
+    // 1. First look for <article> elements, using a tighter regular expression
+    // Helper: pushes the trimmed content of every non-empty <article> onto `articles`; returns true if any was added
+    const processArticleTags = () => {
+      // Matches only the <article> tag itself (attributes optional); g = all matches, i = case-insensitive, s = dot also matches newlines
+      const articleRegex = /<article(?:\s+[^>]*)?>(.*?)<\/article>/gis;
+      let articleMatch;
+      let foundArticles = false;
+      
+      while ((articleMatch = articleRegex.exec(htmlString)) !== null) {
+        if (articleMatch[1]) {
+          articles.push(articleMatch[1].trim());
+          foundArticles = true;
+        }
+      }
       
-      // より柔軟な方法でコンテンツを抽出
-      // 1. まず、main-contentを含む要素を探す
-      const mainContentMatch = htmlString.match(/<[^>]*id=["']main-content["'][^>]*>([\s\S]*?)<\/[^>]*>/i);
+      return foundArticles;
+    };
+    
+    // 2. Helper: look for an element whose id is "main-content"; returns true if content was pushed onto `articles`
+    const processMainContent = () => {
+      // Limited to div/section/main/article; the lazy (.*?) stops at the FIRST closing tag of that name, so nested same-name elements cut the capture short
+      const mainContentRegex = /<(div|section|main|article)(?:\s+[^>]*?)id=["']main-content["'](?:[^>]*?)>(.*?)<\/\1>/is;
+      const mainContentMatch = mainContentRegex.exec(htmlString);
       
-      if (mainContentMatch && mainContentMatch[1]) {
+      if (mainContentMatch && mainContentMatch[2]) {
         console.log('Found element with main-content ID');
         
-        // 2. Content-articleクラスを持つ要素を探す
-        const contentArticleMatch = mainContentMatch[1].match(/<div[^>]*class=["'][^"']*Content-article[^"']*["'][^>]*>([\s\S]*?)<\/div>/i);
+        // Look for a div whose class attribute contains "Content-article"
+        const contentArticleRegex = /<div(?:\s+[^>]*?)class=["'][^"']*Content-article[^"']*["'](?:[^>]*?)>(.*?)<\/div>/is;
+        const contentArticleMatch = contentArticleRegex.exec(mainContentMatch[2]);
         
         if (contentArticleMatch && contentArticleMatch[1]) {
           console.log('Found content with Content-article class');
           articles.push(contentArticleMatch[1].trim());
-        } else {
-          // 3. Document要素を探す
-          const documentMatch = mainContentMatch[1].match(/<div[^>]*class=["'][^"']*Document[^"']*["'][^>]*>([\s\S]*?)<\/div>/i);
-          
-          if (documentMatch && documentMatch[1]) {
-            console.log('Found content with Document class');
-            articles.push(documentMatch[1].trim());
-          } else {
-            // 4. main-content内の全コンテンツを使用
-            console.log('Using all content within main-content');
-            articles.push(mainContentMatch[1].trim());
-          }
+          return true;
+        } 
+        
+        // Look for a div whose class attribute contains "Document"
+        const documentRegex = /<div(?:\s+[^>]*?)class=["'][^"']*Document[^"']*["'](?:[^>]*?)>(.*?)<\/div>/is;
+        const documentMatch = documentRegex.exec(mainContentMatch[2]);
+        
+        if (documentMatch && documentMatch[1]) {
+          console.log('Found content with Document class');
+          articles.push(documentMatch[1].trim());
+          return true;
         }
-      } else {
-        // main-contentが見つからない場合、bodyタグ内のコンテンツを探す
-        console.log('No main-content found, trying to find body content');
-        const bodyMatch = htmlString.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
         
-        if (bodyMatch && bodyMatch[1]) {
-          // Content-articleクラスを持つ要素を探す
-          const contentArticleMatch = bodyMatch[1].match(/<div[^>]*class=["'][^"']*Content-article[^"']*["'][^>]*>([\s\S]*?)<\/div>/i);
-          
-          if (contentArticleMatch && contentArticleMatch[1]) {
-            console.log('Found content with Content-article class in body');
-            articles.push(contentArticleMatch[1].trim());
-          } else {
-            // Documentクラスを持つ要素を探す
-            const documentMatch = bodyMatch[1].match(/<div[^>]*class=["'][^"']*Document[^"']*["'][^>]*>([\s\S]*?)<\/div>/i);
-            
-            if (documentMatch && documentMatch[1]) {
-              console.log('Found content with Document class in body');
-              articles.push(documentMatch[1].trim());
-            } else {
-              console.log('No specific content container found');
-            }
-          }
-        } else {
-          console.log('No body content found');
+        // Fall back to everything captured inside main-content
+        console.log('Using all content within main-content');
+        articles.push(mainContentMatch[2].trim());
+        return true;
+      }
+      
+      return false;
+    };
+    
+    // 3. Helper: search inside <body> for a known content container; returns true if content was pushed onto `articles`
+    const processBody = () => {
+      // Attributes are optional so a bare <body> also matches (a mandatory \s+ group would silently skip it); s flag lets . span newlines
+      const bodyRegex = /<body(?:\s+[^>]*)?>(.*?)<\/body>/is;
+      const bodyMatch = bodyRegex.exec(htmlString);
+      
+      if (bodyMatch && bodyMatch[1]) {
+        // Look for a div whose class attribute contains "Content-article"
+        const contentArticleRegex = /<div(?:\s+[^>]*?)class=["'][^"']*Content-article[^"']*["'](?:[^>]*?)>(.*?)<\/div>/is;
+        const contentArticleMatch = contentArticleRegex.exec(bodyMatch[1]);
+        
+        if (contentArticleMatch && contentArticleMatch[1]) {
+          console.log('Found content with Content-article class in body');
+          articles.push(contentArticleMatch[1].trim());
+          return true;
+        }
+        
+        // Look for a div whose class attribute contains "Document"
+        const documentRegex = /<div(?:\s+[^>]*?)class=["'][^"']*Document[^"']*["'](?:[^>]*?)>(.*?)<\/div>/is;
+        const documentMatch = documentRegex.exec(bodyMatch[1]);
+        
+        if (documentMatch && documentMatch[1]) {
+          console.log('Found content with Document class in body');
+          articles.push(documentMatch[1].trim());
+          return true;
         }
       }
+      
+      console.log('No specific content container found');
+      return false;
+    };
+    
+    // Try each strategy in order, stopping at the first one that finds content
+    if (!processArticleTags()) {
+      console.log('No article tags found, trying to find main-content');
+      if (!processMainContent()) {
+        console.log('No main-content found, trying to find body content');
+        processBody();
+      }
     }
-
-    if (articles.length === 0) {
-      console.log('No content found in the HTML');
-      return [];
-    }
-
+    
     return articles;
   } catch (error) {
-    console.error('An error occurred:', error);
+    console.error('Error extracting article from HTML:', error);
     return [];
   }
-}
+}
\ No newline at end of file