-
Notifications
You must be signed in to change notification settings - Fork 0
Fix: Handle HTTP 404 responses in StripeDocsDocumentLoader and improv… #2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,15 +41,29 @@ export class StripeDocsDocumentLoader extends BaseDocumentLoader { | |
| const documentUrls = await this.fetchURLsFromSitemap(); | ||
| const arcitles: StripeDocsArticle[] = []; | ||
| for await (const docsUrl of documentUrls) { | ||
| const response = await fetch(`${docsUrl}?locale=${locale}`); | ||
| const html = await response.text(); | ||
| const articles = extractArticleFromHTML(html); | ||
| const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/); | ||
| const descMatch = html.match(/<meta[^>]*name="description"[^>]*content="([^"]+)"[^>]*>/); | ||
| try { | ||
| console.log(`Fetching ${docsUrl}?locale=${locale}`); | ||
| const response = await fetch(`${docsUrl}?locale=${locale}`); | ||
|
|
||
| // HTTPステータスコードが400以上の場合はスキップ | ||
| if (response.status >= 400) { | ||
| console.log(`Skipping ${docsUrl} - HTTP status: ${response.status}`); | ||
| continue; | ||
| } | ||
hideokamoto marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| const html = await response.text(); | ||
| const articles = extractArticleFromHTML(html); | ||
| const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/); | ||
| const descMatch = html.match(/<meta[^>]*name="description"[^>]*content="([^"]+)"[^>]*>/); | ||
|
|
||
| const title = titleMatch ? titleMatch[1].trim() : 'Unknown'; | ||
| const description = descMatch ? descMatch[1].trim() : 'No description'; | ||
| arcitles.push(...articles.map(content => ({ url: docsUrl, content, title, description }))); | ||
| const title = titleMatch ? titleMatch[1].trim() : 'Unknown'; | ||
| const description = descMatch ? descMatch[1].trim() : 'No description'; | ||
| arcitles.push(...articles.map(content => ({ url: docsUrl, content, title, description }))); | ||
| } catch (error) { | ||
| console.error(`Error fetching ${docsUrl}: ${error}`); | ||
| // エラーが発生した場合もスキップして次のURLに進む | ||
| continue; | ||
|
Comment on lines
+63
to
+65
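There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. While the error is logged before skipping to the next URL, consider including the error message and stack trace to make failures easier to debug: console.error(`Error fetching ${docsUrl}: ${error.message}, stack: ${error.stack}`); |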
||
| } | ||
| } | ||
| return arcitles; | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,62 +1,154 @@ | ||
| /** | ||
| * Function that extracts the content of body tags from HTML using regular expressions | ||
| * @param {string} htmlString HTML string | ||
| * @returns {string[]} Array of extracted body tag contents | ||
| * Extracts the body tag and its contents from an HTML string | ||
| * @param {string} htmlString - The HTML string to extract from | ||
| * @returns {string[]} Array of extracted body contents | ||
| */ | ||
| export function extractBodyFromHTML(htmlString: string) { | ||
| try { | ||
| // Regular expression to extract body tag and its contents | ||
| // [\s\S]*? - Non-greedy match for any characters including newlines | ||
| const bodyRegex = /<body[^>]*>([\s\S]*?)<\/body>/g; | ||
| if (!htmlString) { | ||
| console.log('Input HTML is null or undefined'); | ||
| return []; | ||
| } | ||
|
|
||
| const bodies = []; | ||
| // 安全な正規表現を使用してbodyタグを抽出 | ||
| // 非貪欲マッチングを使用し、ネストされたタグも考慮 | ||
| const bodyRegex = /<body[^>]*>([\s\S]*?)<\/body>/gi; | ||
| const matches = []; | ||
| let match; | ||
|
|
||
| // Find all matches | ||
| while ((match = bodyRegex.exec(htmlString)) !== null) { | ||
| // Add matched content (group 1) to the array | ||
| bodies.push(match[1].trim()); | ||
| if (match[1]) { | ||
| matches.push(match[1].trim()); | ||
| } | ||
| } | ||
|
|
||
| if (bodies.length === 0) { | ||
| if (matches.length === 0) { | ||
| console.log('No body tags found'); | ||
| return []; | ||
| } | ||
|
|
||
| return bodies; | ||
| return matches; | ||
| } catch (error) { | ||
| console.error('An error occurred:', error); | ||
| return []; | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Function that extracts the content of article tags from HTML using regular expressions | ||
| * @param {string} htmlString HTML string | ||
| * @returns {string[]} Array of extracted article tag contents | ||
| * Extracts article tags and their contents from an HTML string | ||
| * @param {string} htmlString - The HTML string to extract from | ||
| * @returns {string[]} Array of extracted article contents | ||
| */ | ||
| export function extractArticleFromHTML(htmlString: string) { | ||
| try { | ||
| // Regular expression to extract article tag and its contents | ||
| // [\s\S]*? - Non-greedy match for any characters including newlines | ||
| const articleRegex = /<article[^>]*>([\s\S]*?)<\/article>/g; | ||
|
|
||
| const articles = []; | ||
| let match; | ||
|
|
||
| // Find all matches | ||
| while ((match = articleRegex.exec(htmlString)) !== null) { | ||
| // Add matched content (group 1) to the array | ||
| articles.push(match[1].trim()); | ||
| } | ||
|
|
||
| if (articles.length === 0) { | ||
| console.log('No article tags found'); | ||
| // 入力がnullまたはundefinedの場合は空配列を返す | ||
| if (!htmlString) { | ||
| console.log('Input HTML is null or undefined'); | ||
| return []; | ||
| } | ||
|
|
||
|
|
||
| const articles: string[] = []; | ||
|
|
||
| // 1. まず、articleタグを探す - より限定的な正規表現に変更 | ||
| // 1つのarticleタグを処理する関数を作成 | ||
| const processArticleTags = () => { | ||
| // articleタグのみに一致するように修正 | ||
| const articleRegex = /<article(?:\s+[^>]*)?>(.*?)<\/article>/gis; | ||
| let articleMatch; | ||
| let foundArticles = false; | ||
|
|
||
| while ((articleMatch = articleRegex.exec(htmlString)) !== null) { | ||
| if (articleMatch[1]) { | ||
| articles.push(articleMatch[1].trim()); | ||
| foundArticles = true; | ||
| } | ||
| } | ||
|
|
||
| return foundArticles; | ||
| }; | ||
|
Comment on lines
+50
to
+68
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
|
|
||
| // 2. main-contentを持つ要素を探す関数 | ||
| const processMainContent = () => { | ||
| // タグを限定し、より具体的なパターンに修正 | ||
| const mainContentRegex = /<(div|section|main|article)(?:\s+[^>]*?)id=["']main-content["'](?:[^>]*?)>(.*?)<\/\1>/is; | ||
| const mainContentMatch = mainContentRegex.exec(htmlString); | ||
|
|
||
| if (mainContentMatch && mainContentMatch[2]) { | ||
| console.log('Found element with main-content ID'); | ||
|
|
||
| // Content-articleクラスを持つdivタグを探す | ||
| const contentArticleRegex = /<div(?:\s+[^>]*?)class=["'][^"']*Content-article[^"']*["'](?:[^>]*?)>(.*?)<\/div>/is; | ||
| const contentArticleMatch = contentArticleRegex.exec(mainContentMatch[2]); | ||
|
|
||
| if (contentArticleMatch && contentArticleMatch[1]) { | ||
| console.log('Found content with Content-article class'); | ||
| articles.push(contentArticleMatch[1].trim()); | ||
| return true; | ||
| } | ||
|
|
||
| // Documentクラスを持つdivタグを探す | ||
| const documentRegex = /<div(?:\s+[^>]*?)class=["'][^"']*Document[^"']*["'](?:[^>]*?)>(.*?)<\/div>/is; | ||
| const documentMatch = documentRegex.exec(mainContentMatch[2]); | ||
|
|
||
| if (documentMatch && documentMatch[1]) { | ||
| console.log('Found content with Document class'); | ||
| articles.push(documentMatch[1].trim()); | ||
| return true; | ||
| } | ||
|
|
||
| // main-content内の全コンテンツを使用 | ||
| console.log('Using all content within main-content'); | ||
| articles.push(mainContentMatch[2].trim()); | ||
| return true; | ||
| } | ||
|
|
||
| return false; | ||
| }; | ||
|
|
||
| // 3. bodyタグ内のコンテンツを探す関数 | ||
| const processBody = () => { | ||
| // bodyタグを探す - 限定的なパターン | ||
| const bodyRegex = /<body(?:\s+[^>]*?)>(.*?)<\/body>/is; | ||
| const bodyMatch = bodyRegex.exec(htmlString); | ||
|
|
||
| if (bodyMatch && bodyMatch[1]) { | ||
| // Content-articleクラスを持つdivタグを探す | ||
| const contentArticleRegex = /<div(?:\s+[^>]*?)class=["'][^"']*Content-article[^"']*["'](?:[^>]*?)>(.*?)<\/div>/is; | ||
| const contentArticleMatch = contentArticleRegex.exec(bodyMatch[1]); | ||
|
|
||
| if (contentArticleMatch && contentArticleMatch[1]) { | ||
| console.log('Found content with Content-article class in body'); | ||
| articles.push(contentArticleMatch[1].trim()); | ||
| return true; | ||
| } | ||
|
|
||
| // Documentクラスを持つdivタグを探す | ||
| const documentRegex = /<div(?:\s+[^>]*?)class=["'][^"']*Document[^"']*["'](?:[^>]*?)>(.*?)<\/div>/is; | ||
| const documentMatch = documentRegex.exec(bodyMatch[1]); | ||
|
|
||
| if (documentMatch && documentMatch[1]) { | ||
| console.log('Found content with Document class in body'); | ||
| articles.push(documentMatch[1].trim()); | ||
| return true; | ||
| } | ||
| } | ||
|
|
||
| console.log('No specific content container found'); | ||
| return false; | ||
| }; | ||
|
|
||
| // 順番に処理を試行 | ||
| if (!processArticleTags()) { | ||
| console.log('No article tags found, trying to find main-content'); | ||
| if (!processMainContent()) { | ||
| console.log('No main-content found, trying to find body content'); | ||
| processBody(); | ||
| } | ||
| } | ||
|
|
||
| return articles; | ||
| } catch (error) { | ||
| console.error('An error occurred:', error); | ||
| console.error('Error extracting article from HTML:', error); | ||
| return []; | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Consider extracting this HTTP status check into a separate function for better readability and reusability. This would also make the code easier to test in isolation.