Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 22 additions & 8 deletions packages/langchain-stripe-loader/src/StripeDocsLoader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,29 @@ export class StripeDocsDocumentLoader extends BaseDocumentLoader {
const documentUrls = await this.fetchURLsFromSitemap();
const arcitles: StripeDocsArticle[] = [];
for await (const docsUrl of documentUrls) {
const response = await fetch(`${docsUrl}?locale=${locale}`);
const html = await response.text();
const articles = extractArticleFromHTML(html);
const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/);
const descMatch = html.match(/<meta[^>]*name="description"[^>]*content="([^"]+)"[^>]*>/);
try {
console.log(`Fetching ${docsUrl}?locale=${locale}`);
const response = await fetch(`${docsUrl}?locale=${locale}`);

// HTTPステータスコードが400以上の場合はスキップ
if (response.status >= 400) {
console.log(`Skipping ${docsUrl} - HTTP status: ${response.status}`);
continue;
}
Comment on lines +48 to +52

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Consider extracting this HTTP status check into a separate function for better readability and reusability. This would also make the code easier to test in isolation.

        if (this.isErrorStatus(response.status)) {
          console.log(`Skipping ${docsUrl} - HTTP status: ${response.status}`);
          continue;
        }


const html = await response.text();
const articles = extractArticleFromHTML(html);
const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/);
const descMatch = html.match(/<meta[^>]*name="description"[^>]*content="([^"]+)"[^>]*>/);

const title = titleMatch ? titleMatch[1].trim() : 'Unknown';
const description = descMatch ? descMatch[1].trim() : 'No description';
arcitles.push(...articles.map(content => ({ url: docsUrl, content, title, description })));
const title = titleMatch ? titleMatch[1].trim() : 'Unknown';
const description = descMatch ? descMatch[1].trim() : 'No description';
arcitles.push(...articles.map(content => ({ url: docsUrl, content, title, description })));
} catch (error) {
console.error(`Error fetching ${docsUrl}: ${error}`);
// エラーが発生した場合もスキップして次のURLに進む
continue;
Comment on lines +63 to +65

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

While the console.error is helpful for debugging, consider adding more context to the error message, such as the specific error type or a stack trace, to facilitate easier troubleshooting. Also, consider whether a more robust logging mechanism should be used instead of console.error.

        console.error(`Error fetching ${docsUrl}: ${error.message}, stack: ${error.stack}`);

}
}
return arcitles;
}
Expand Down
125 changes: 122 additions & 3 deletions packages/langchain-stripe-loader/src/index.spec.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
import { it, expect } from 'vitest';
import { StripeComDocumentLoader } from './index';
import { it, expect, vi, describe } from 'vitest';
import { StripeComDocumentLoader, StripeDocsDocumentLoader } from './index';
import { SitemapProcessor } from 'stripe-loaders-core';

// Mock the 'stripe-loaders-core' module: SitemapProcessor becomes a mock
// constructor whose instances expose fetchAndProcessSitemap, which resolves
// to a single fixed docs URL.
vi.mock('stripe-loaders-core', () => {
return {
SitemapProcessor: vi.fn().mockImplementation(() => {
return {
fetchAndProcessSitemap: vi.fn().mockResolvedValue(['https://docs.stripe.com/test'])
};
})
};
});

it('silence is golden', () => {
expect(true).toBe(true);
Expand All @@ -10,7 +22,7 @@ it('silence is golden', () => {
* Just for local development
*/
it.skip(
'test',
'test for StripeComDocumentLoader',
{
timeout: 10000,
},
Expand All @@ -22,3 +34,110 @@ it.skip(
expect(documents).toStrictEqual([]);
}
);

// Tests for StripeDocsDocumentLoader.load(). Both cases replace global.fetch
// with a vi.fn() stub that returns canned HTML.
// NOTE(review): global.fetch is reassigned and never restored inside this
// block, and the SitemapProcessor.mockImplementation override in the second
// test is not reset here — confirm that later tests do not depend on the
// original implementations.
describe('StripeDocsDocumentLoader', () => {
it(
'should load documents successfully',
{
timeout: 10000,
},
async () => {
// Stub fetch: every call resolves to a status-200 response whose text() is
// a minimal page with a title, a description meta tag and a
// "Content-article" div nested inside "#main-content".
global.fetch = vi.fn().mockResolvedValue({
status: 200,
text: vi.fn().mockResolvedValue(`
<html>
<head>
<title>Test Title</title>
<meta name="description" content="Test Description">
</head>
<body>
<div id="main-content">
<div class="Content-article">
<h1>Test Content</h1>
<p>This is test content for the Stripe docs.</p>
</div>
</div>
</body>
</html>
`)
});

const loader = new StripeDocsDocumentLoader();
const documents = await loader.load();

// Expected results: at least one document, carrying the page content and
// the title/description/source metadata.
expect(documents.length).toBeGreaterThan(0);
expect(documents[0].pageContent).toContain('Test Content');
expect(documents[0].metadata.title).toBe('Test Title');
expect(documents[0].metadata.description).toBe('Test Description');
expect(documents[0].metadata.source).toBe('https://docs.stripe.com/test');
}
);

it(
'should skip pages with HTTP 404 status',
{
timeout: 10000,
},
async () => {
// Override the SitemapProcessor mock so the sitemap yields two URLs
(SitemapProcessor as any).mockImplementation(() => {
return {
fetchAndProcessSitemap: vi.fn().mockResolvedValue([
'https://docs.stripe.com/valid-page',
'https://docs.stripe.com/not-found-page'
])
};
});

// Stub fetch: the first call succeeds (200), the second call returns 404
global.fetch = vi.fn()
.mockImplementationOnce(() => Promise.resolve({
status: 200,
text: () => Promise.resolve(`
<html>
<head>
<title>Valid Page</title>
<meta name="description" content="Valid Description">
</head>
<body>
<div id="main-content">
<div class="Content-article">
<h1>Valid Content</h1>
<p>This is valid content.</p>
</div>
</div>
</body>
</html>
`)
}))
.mockImplementationOnce(() => Promise.resolve({
status: 404,
text: () => Promise.resolve(`
<html>
<head>
<title>404 Not Found</title>
</head>
<body>
<h1>404 Not Found</h1>
</body>
</html>
`)
}));

const loader = new StripeDocsDocumentLoader();
const documents = await loader.load();

// The 404 page is skipped, so only documents from the valid page are included
// NOTE(review): only documents[0] is inspected; nothing here asserts that no
// document has the not-found URL as its source.
expect(documents.length).toBeGreaterThan(0);
expect(documents[0].pageContent).toContain('Valid Content');
expect(documents[0].metadata.title).toBe('Valid Page');
expect(documents[0].metadata.source).toBe('https://docs.stripe.com/valid-page');

// Verify that fetch was called twice (once per sitemap URL)
expect(global.fetch).toHaveBeenCalledTimes(2);
}
);
});

158 changes: 125 additions & 33 deletions packages/langchain-stripe-loader/src/utils.ts
Original file line number Diff line number Diff line change
@@ -1,62 +1,154 @@
/**
 * Extracts the inner content of every `<body>` element in an HTML string.
 *
 * Matching is case-insensitive and non-greedy, so each `<body>…</body>` pair
 * is captured separately. Bodies whose content is empty are ignored.
 *
 * @param htmlString - The HTML string to extract from
 * @returns Trimmed inner HTML of each non-empty body; an empty array when the
 *   input is empty, contains no body, or an unexpected error occurs
 */
export function extractBodyFromHTML(htmlString: string): string[] {
  try {
    if (!htmlString) {
      console.log('Input HTML is null or undefined');
      return [];
    }

    // [\s\S]*? lets the capture span newlines without relying on the dotAll flag.
    const bodyRegex = /<body[^>]*>([\s\S]*?)<\/body>/gi;
    const matches: string[] = [];
    let match: RegExpExecArray | null;

    // The regex is global, so each exec() call resumes after the previous match.
    while ((match = bodyRegex.exec(htmlString)) !== null) {
      if (match[1]) {
        matches.push(match[1].trim());
      }
    }

    if (matches.length === 0) {
      console.log('No body tags found');
      return [];
    }

    return matches;
  } catch (error) {
    // Best effort: extraction problems are logged and reported as "nothing found".
    console.error('An error occurred:', error);
    return [];
  }
}

// Opening-tag patterns used by extractArticleFromHTML. Each one captures the
// tag name in group 1 so the matching closing tag can be located afterwards.
const MAIN_CONTENT_OPEN_TAG =
  /<(div|section|main|article)\s+[^>]*?id=["']main-content["'][^>]*>/i;
const CONTENT_ARTICLE_OPEN_TAG =
  /<(div)\s+[^>]*?class=["'][^"']*Content-article[^"']*["'][^>]*>/i;
const DOCUMENT_OPEN_TAG =
  /<(div)\s+[^>]*?class=["'][^"']*Document[^"']*["'][^>]*>/i;
// Attributes are optional here so that a bare `<body>` is matched as well.
const BODY_OPEN_TAG = /<(body)(?:\s[^>]*)?>/i;

/**
 * Returns the inner HTML of the first element whose opening tag matches
 * `openTag`, or `null` when no such element (or no balanced closing tag) exists.
 *
 * Nested elements with the same tag name are counted, so the element is paired
 * with its own closing tag rather than with the first closing tag that follows.
 *
 * @param html - Markup to search
 * @param openTag - Non-global pattern for the opening tag; group 1 must capture the tag name
 */
function innerHtmlOfFirstElement(html: string, openTag: RegExp): string | null {
  const opening = openTag.exec(html);
  if (!opening) {
    return null;
  }

  const tagName = (opening[1] ?? '').toLowerCase();
  const contentStart = opening.index + opening[0].length;

  // Matches opening and closing tags of exactly this tag name
  // (the lookahead rejects longer names such as <divider>).
  const tagScanner = new RegExp(`<(/?)${tagName}(?=[\\s/>])[^>]*>`, 'gi');
  tagScanner.lastIndex = contentStart;

  let depth = 1;
  let tag: RegExpExecArray | null;
  while ((tag = tagScanner.exec(html)) !== null) {
    if (tag[1] === '/') {
      depth -= 1;
      if (depth === 0) {
        return html.slice(contentStart, tag.index);
      }
    } else if (!tag[0].endsWith('/>')) {
      // Self-closing tags do not open a new nesting level.
      depth += 1;
    }
  }

  return null;
}

/**
 * Looks inside `scopeHtml` for a `Content-article` div and, failing that, a
 * `Document` div, returning the trimmed content of the first non-empty one.
 *
 * @param scopeHtml - Markup to search
 * @param logSuffix - Text appended to the log message to identify the search scope
 */
function findSpecificContainer(scopeHtml: string, logSuffix: string): string | null {
  const contentArticle = innerHtmlOfFirstElement(scopeHtml, CONTENT_ARTICLE_OPEN_TAG);
  if (contentArticle) {
    console.log(`Found content with Content-article class${logSuffix}`);
    return contentArticle.trim();
  }

  const documentContent = innerHtmlOfFirstElement(scopeHtml, DOCUMENT_OPEN_TAG);
  if (documentContent) {
    console.log(`Found content with Document class${logSuffix}`);
    return documentContent.trim();
  }

  return null;
}

/**
 * Extracts the article content from an HTML string.
 *
 * Strategies are tried in order and the first one that yields content wins:
 * 1. every non-empty `<article>` element;
 * 2. the element with `id="main-content"` — narrowed to a `Content-article` or
 *    `Document` div inside it when one exists, otherwise its whole content;
 * 3. a `Content-article` or `Document` div inside `<body>`.
 *
 * @param htmlString - The HTML string to extract from
 * @returns Trimmed inner HTML of each extracted container; an empty array when
 *   the input is empty, nothing matches, or an unexpected error occurs
 */
export function extractArticleFromHTML(htmlString: string): string[] {
  try {
    if (!htmlString) {
      console.log('Input HTML is null or undefined');
      return [];
    }

    const articles: string[] = [];

    // 1. <article> elements. The `s` flag lets `.` span newlines.
    const articleRegex = /<article(?:\s+[^>]*)?>(.*?)<\/article>/gis;
    let articleMatch: RegExpExecArray | null;
    while ((articleMatch = articleRegex.exec(htmlString)) !== null) {
      if (articleMatch[1]) {
        articles.push(articleMatch[1].trim());
      }
    }
    if (articles.length > 0) {
      return articles;
    }

    // 2. The element carrying id="main-content".
    console.log('No article tags found, trying to find main-content');
    const mainContent = innerHtmlOfFirstElement(htmlString, MAIN_CONTENT_OPEN_TAG);
    if (mainContent) {
      console.log('Found element with main-content ID');
      const specific = findSpecificContainer(mainContent, '');
      if (specific !== null) {
        articles.push(specific);
      } else {
        console.log('Using all content within main-content');
        articles.push(mainContent.trim());
      }
      return articles;
    }

    // 3. Known content containers anywhere inside <body>.
    console.log('No main-content found, trying to find body content');
    const body = innerHtmlOfFirstElement(htmlString, BODY_OPEN_TAG);
    const fromBody = body ? findSpecificContainer(body, ' in body') : null;
    if (fromBody !== null) {
      articles.push(fromBody);
    } else {
      console.log('No specific content container found');
    }

    return articles;
  } catch (error) {
    // Best effort: extraction problems are logged and reported as "nothing found".
    console.error('Error extracting article from HTML:', error);
    return [];
  }
}