diff --git a/.eslintignore b/.eslintignore
deleted file mode 100644
index 67ab7da..0000000
--- a/.eslintignore
+++ /dev/null
@@ -1,2 +0,0 @@
-dist/
-**/*.test.ts
\ No newline at end of file
diff --git a/.eslintrc.cjs b/.eslintrc.cjs
deleted file mode 100644
index 5929160..0000000
--- a/.eslintrc.cjs
+++ /dev/null
@@ -1,26 +0,0 @@
-module.exports = {
-  parser: '@typescript-eslint/parser',
-  parserOptions: {
-    ecmaVersion: 'latest',
-    sourceType: 'module',
-    project: './tsconfig.json',
-  },
-  plugins: ['@typescript-eslint'],
-  extends: [
-    'eslint:recommended',
-    'plugin:@typescript-eslint/recommended',
-    'plugin:@typescript-eslint/recommended-requiring-type-checking',
-    'plugin:jest/recommended',
-    'prettier',
-  ],
-  env: {
-    node: true,
-    es2022: true,
-    jest: true,
-  },
-  rules: {
-    'no-console': 'warn',
-    '@typescript-eslint/explicit-function-return-type': 'warn',
-    '@typescript-eslint/no-unused-vars': 'error',
-  },
-};
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
deleted file mode 100644
index 71efb1d..0000000
--- a/.github/workflows/publish.yml
+++ /dev/null
@@ -1,27 +0,0 @@
-name: Publish Package
-
-on:
-  push:
-    branches:
-      - main
-
-jobs:
-  publish:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v2
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v2
-        with:
-          node-version: '20'
-
-      - name: Install dependencies
-        run: npm install
-
-      - name: Publish package
-        run: npm publish
-        env:
-          NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..964ab41
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,52 @@
+# https://semantic-release.gitbook.io/semantic-release/recipes/ci-configurations/github-actions
+name: Release
+on:
+  push:
+    branches:
+      - main
+
+permissions:
+  contents: read
+
+jobs:
+  release:
+    name: Release
+
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: write
+      issues: write
+      pull-requests: write
+      id-token: write
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: 'lts/*'
+
+      - name: Install pnpm
+        uses: pnpm/action-setup@v2
+        with:
+          version: 10
+
+      - name: Install dependencies
+        run: pnpm install
+
+      - name: Build
+        run: pnpm build
+
+      - name: Verify the integrity of provenance attestations and registry signatures for installed dependencies
+        run: npm audit signatures
+
+      - name: Release
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
+        run: npx semantic-release
diff --git a/.github/workflows/ci.yml b/.github/workflows/test.yml
similarity index 65%
rename from .github/workflows/ci.yml
rename to .github/workflows/test.yml
index baf5bd8..7f515b5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/test.yml
@@ -1,26 +1,24 @@
-name: CI
+name: Test

 on:
-  push:
-    branches: [main]
   pull_request:
-    branches: [main]
+    branches:
+      - main

 jobs:
   test:
     runs-on: ubuntu-latest

-    strategy:
-      matrix:
-        node-version: [20.x, 22.x]
-
     steps:
-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

-      - name: Use Node.js ${{ matrix.node-version }}
+      - name: Setup Node.js
         uses: actions/setup-node@v4
         with:
-          node-version: ${{ matrix.node-version }}
+          node-version: 'lts/*'

       - name: Install pnpm
         uses: pnpm/action-setup@v2
@@ -36,8 +34,8 @@ jobs:
       - name: Check formatting
         run: pnpm format:check

-      - name: Run tests
-        run: pnpm test
-
       - name: Build
         run: pnpm build
+
+      - name: Run tests
+        run: pnpm test
diff --git a/.releaserc.yml b/.releaserc.yml
new file mode 100644
index 0000000..41810f4
--- /dev/null
+++ b/.releaserc.yml
@@ -0,0 +1,4 @@
+release:
+  branches:
+    - main
+    - next
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..c329342
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Thomas Gambet
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ea964d8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,111 @@
+# Fetch MCP Server
+
+A port of the official [Fetch MCP Server](https://github.com/modelcontextprotocol/servers/tree/main/src/fetch) (Python) for Node.js.
+
+## Description
+
+A [Model Context Protocol](https://modelcontextprotocol.io/) server that provides web content fetching capabilities. This server enables LLMs to retrieve and process content from web pages, converting HTML to markdown for easier consumption.
+
+The fetch tool will truncate the response, but by using the `start_index` argument, you can specify where to start the content extraction. This lets models read a webpage in chunks until they find the information they need.
+
+### Available Tools
+
+- `fetch` - Fetches a URL from the internet and extracts its contents as markdown.
+  - `url` (string, required): URL to fetch
+  - `max_length` (integer, optional): Maximum number of characters to return (default: 5000)
+  - `start_index` (integer, optional): Start content from this character index (default: 0)
+  - `raw` (boolean, optional): Get raw content without markdown conversion (default: false)
+
+### Available Prompts
+
+- `fetch` - Fetch a URL and extract its contents as markdown
+  - `url` (string, required): URL to fetch
+
+## Usage
+
+```json
+"mcpServers": {
+  "fetch": {
+    "command": "npx",
+    "args": ["mcp-fetch-node"]
+  }
+}
+```
+
+```json
+"mcpServers": {
+  "fetch": {
+    "command": "docker",
+    "args": ["run", "-i", "--rm", "tgambet/mcp-fetch-node"]
+  }
+}
+```
+
+### Customization - robots.txt
+
+By default, the server will obey a website's robots.txt file if the request came from the model (via a tool), but not if
+the request was user-initiated (via a prompt). This can be disabled by adding the argument `--ignore-robots-txt` to the
+`args` list in the configuration.
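+
+For example, starting from the `npx` configuration in the Usage section above, robots.txt checking can be disabled like this (illustrative only; the Docker configuration works the same way, with the flag appended to `args`):
+
+```json
+"mcpServers": {
+  "fetch": {
+    "command": "npx",
+    "args": ["mcp-fetch-node", "--ignore-robots-txt"]
+  }
+}
+```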
+
+### Customization - User-agent
+
+By default, depending on whether the request came from the model (via a tool) or was user-initiated (via a prompt),
+the server will use either the user-agent
+
+```
+ModelContextProtocol/1.0 (Autonomous; +https://github.com/tgambet/mcp-fetch-node)
+```
+
+or
+
+```
+ModelContextProtocol/1.0 (User-Specified; +https://github.com/tgambet/mcp-fetch-node)
+```
+
+This can be customized by adding the argument `--user-agent=YourUserAgent` to the `args` list in the configuration.
+
+## Features
+
+- [x] Fetch and extract content from a URL
+- [x] Respect `robots.txt` (can be disabled)
+- [x] User-Agent customization
+- [x] Relevant content extraction
+- [x] Raw content or markdown conversion
+- [x] Pagination
+- [ ] In-memory temporary cache for faster responses, especially when paginating
+- [ ] Logs and progress
+
+## Development
+
+```bash
+pnpm install
+pnpm dev
+pnpm lint:fix
+pnpm format
+pnpm test
+pnpm build
+pnpm start
+# test with MCP CLI
+pnpx @wong2/mcp-cli --sse http://localhost:8080/sse
+```
+
+## Contributing
+
+Contributions are welcome! Please feel free to submit a Pull Request.
+
+## License
+
+[MIT](https://choosealicense.com/licenses/mit/)
+
+## TODO
+
+- [ ] Explain key differences with the original mcp/fetch tool
+- [ ] Add LRU cache
+- [ ] Publish to npm
+- [ ] Dockerize and publish to Docker Hub
+- [ ] Integrate semantic release
+- [ ] Add user logs and progress
+- [ ] Add tests
+- [ ] Add documentation & examples
+- [ ] Add benchmarks for extraction: cf. https://github.com/adbar/trafilatura/blob/master/tests/comparison_small.py
+- [ ] Showcase on FastMCP and MCP repositories
diff --git a/eslint.config.js b/eslint.config.js
new file mode 100644
index 0000000..e82809b
--- /dev/null
+++ b/eslint.config.js
@@ -0,0 +1,21 @@
+// @ts-check
+
+import eslint from '@eslint/js';
+import tseslint from 'typescript-eslint';
+
+export default tseslint.config(
+  eslint.configs.recommended,
+  tseslint.configs.strictTypeChecked,
+  tseslint.configs.stylisticTypeChecked,
+  {
+    languageOptions: {
+      parserOptions: {
+        projectService: true,
+        tsconfigRootDir: import.meta.dirname,
+      },
+    },
+    rules: {
+      '@typescript-eslint/no-explicit-any': 'off',
+    },
+  },
+);
diff --git a/jest.config.cjs b/jest.config.cjs
deleted file mode 100644
index c49d554..0000000
--- a/jest.config.cjs
+++ /dev/null
@@ -1,21 +0,0 @@
-module.exports = {
-  preset: 'ts-jest',
-  testEnvironment: 'node',
-  extensionsToTreatAsEsm: ['.ts'],
-  moduleNameMapper: {
-    '^(\\.{1,2}/.*)\\.js$': '$1',
-  },
-  moduleFileExtensions: ['js', 'ts'],
-  transform: {
-    '^.+\\.ts$': [
-      'ts-jest',
-      {
-        useESM: true,
-      },
-    ],
-  },
-  testMatch: ['**/*.test.ts'],
-  collectCoverage: true,
-  coverageDirectory: 'coverage',
-  coverageProvider: 'v8',
-};
diff --git a/package.json b/package.json
index 21e3d01..07744e0 100644
--- a/package.json
+++ b/package.json
@@ -1,39 +1,63 @@
 {
-  "name": "nodejs-project",
-  "version": "1.0.0",
-  "description": "A Node.js project with best practices",
+  "name": "mcp-fetch-node",
+  "version": "0.0.0-development",
+  "description": "A Model Context Protocol server that provides web content fetching capabilities",
   "type": "module",
   "main": "dist/index.js",
   "scripts": {
-    "start": "node dist/index.js",
-    "dev": "tsx watch src/index.ts",
+    "start": "node dist/main.js",
+    "dev": "tsx watch src/main.ts",
     "build": "tsc",
-    "lint": "eslint . --ext .ts",
-    "lint:fix": "eslint . --ext .ts --fix",
-    "format": "prettier --write \"**/*.{ts,json,md}\"",
+    "lint": "eslint src/**",
+    "lint:fix": "eslint src/** --fix",
+    "format": "prettier --write \"**/*.{ts,json,md,yml,js}\"",
     "format:check": "prettier --check .",
-    "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js"
+    "test": "node --import tsx --test tests/**"
   },
-  "keywords": [],
   "author": "Thomas Gambet",
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/tgambet/mcp-fetch-node.git"
+  },
+  "keywords": [
+    "mcp",
+    "fetch"
+  ],
   "license": "MIT",
   "engines": {
-    "node": ">=20",
+    "node": ">=22",
     "pnpm": ">=10"
   },
+  "publishConfig": {
+    "access": "public"
+  },
+  "files": [
+    "dist",
+    "README.md",
+    "LICENSE"
+  ],
+  "dependencies": {
+    "fastmcp": "^1.16.3",
+    "html-minifier": "^4.0.0",
+    "linkedom": "^0.18.9",
+    "robots-parser": "^3.0.1",
+    "sanitize-html": "^2.14.0",
+    "turndown": "^7.2.0",
+    "turndown-plugin-gfm": "^1.0.2",
+    "zod": "^3.24.2"
+  },
   "devDependencies": {
-    "@types/jest": "^29.5.14",
-    "@types/node": "^20.17.19",
-    "@typescript-eslint/eslint-plugin": "^7.18.0",
-    "@typescript-eslint/parser": "^7.18.0",
-    "eslint": "^8.57.1",
-    "eslint-config-prettier": "^9.1.0",
-    "eslint-plugin-jest": "^27.9.0",
-    "jest": "^29.7.0",
+    "@eslint/js": "^9.20.0",
+    "@types/html-minifier": "^4.0.5",
+    "@types/node": "^22.13.4",
+    "@types/sanitize-html": "^2.13.0",
+    "@types/turndown": "^5.0.5",
+    "eslint": "^9.20.1",
     "prettier": "^3.5.1",
-    "ts-jest": "^29.2.5",
+    "semantic-release": "^24.2.3",
     "tsx": "^4.19.2",
-    "typescript": "^5.7.3"
+    "typescript": "^5.7.3",
+    "typescript-eslint": "^8.24.0"
   },
-  "packageManager": "pnpm@10.4.0+sha512.6b849d0787d97f8f4e1f03a9b8ff8f038e79e153d6f11ae539ae7c435ff9e796df6a862c991502695c7f9e8fac8aeafc1ac5a8dab47e36148d183832d886dd52"
+  "packageManager": "pnpm@10.4.1+sha512.c753b6c3ad7afa13af388fa6d808035a008e30ea9993f58c6663e2bc5ff21679aa834db094987129aa4d488b86df57f7b634981b2f827cdcacc698cc0cfb88af"
 }
diff --git a/src/constants.ts b/src/constants.ts
new file mode 100644
index 0000000..7739512
--- /dev/null
+++ b/src/constants.ts
@@ -0,0 +1,5 @@
+export const DEFAULT_USER_AGENT_AUTONOMOUS =
+  'ModelContextProtocol/1.0 (Autonomous; +https://github.com/tgambet/mcp-fetch-node)';
+
+export const DEFAULT_USER_AGENT_MANUAL =
+  'ModelContextProtocol/1.0 (User-Specified; +https://github.com/tgambet/mcp-fetch-node)';
diff --git a/src/index.ts b/src/index.ts
deleted file mode 100644
index 7707744..0000000
--- a/src/index.ts
+++ /dev/null
@@ -1,6 +0,0 @@
-function main(): void {
-  // Your application code here
-  console.log('Hello, World!');
-}
-
-main();
diff --git a/src/main.ts b/src/main.ts
new file mode 100644
index 0000000..c8aeda7
--- /dev/null
+++ b/src/main.ts
@@ -0,0 +1,42 @@
+import { FastMCP } from 'fastmcp';
+import { fetchPrompt } from './prompts/fetch.prompt.js';
+import { fetchTool } from './tools/fetch.tool.js';
+import { parseArgs } from './utils/parse-args.js';
+
+const args = parseArgs();
+
+const userAgent = args['user-agent'] as string | undefined;
+
+const ignoreRobotsTxt = args['ignore-robots-txt'] as boolean | undefined;
+
+export async function serve() {
+  const server = new FastMCP({
+    name: 'mcp-fetch-node',
+    version: '0.0.0', // TODO: use package.json version?
+  });
+
+  server.on('connect', (event) => {
+    console.log('Client connected');
+    event.session.on('error', (event) => {
+      console.error('Session error:', event.error);
+    });
+  });
+
+  server.on('disconnect', () => {
+    console.log('Client disconnected');
+  });
+
+  server.addTool(fetchTool(userAgent, ignoreRobotsTxt));
+
+  server.addPrompt(fetchPrompt(userAgent));
+
+  await server.start({
+    transportType: 'sse',
+    sse: {
+      endpoint: '/sse',
+      port: 8080, // TODO: make this configurable
+    },
+  });
+}
+
+await serve();
diff --git a/src/prompts/fetch.prompt.ts b/src/prompts/fetch.prompt.ts
new file mode 100644
index 0000000..b0c966a
--- /dev/null
+++ b/src/prompts/fetch.prompt.ts
@@ -0,0 +1,23 @@
+import { UserError } from 'fastmcp';
+import { DEFAULT_USER_AGENT_MANUAL } from '../constants.js';
+import { processURL } from '../utils/process-url.js';
+
+export const fetchPrompt = (userAgent?: string) => ({
+  name: 'fetch',
+  description: 'Fetch a URL and extract its contents as markdown',
+  arguments: [
+    {
+      name: 'url',
+      description: 'URL to fetch',
+      required: true,
+    },
+  ],
+  load: async ({ url }: { url?: string }) => {
+    if (!url) {
+      throw new UserError('Missing required argument: url');
+    }
+    const ua = userAgent ?? DEFAULT_USER_AGENT_MANUAL;
+    const [content, prefix] = await processURL(url, ua, false);
+    return [prefix, content].join('\n');
+  },
+});
diff --git a/src/tests/index.test.ts b/src/tests/index.test.ts
deleted file mode 100644
index a167bab..0000000
--- a/src/tests/index.test.ts
+++ /dev/null
@@ -1,7 +0,0 @@
-import '../index.js';
-
-describe('Main application', () => {
-  it('should run without errors', () => {
-    expect(true).toBe(true);
-  });
-});
diff --git a/src/tools/fetch.tool.ts b/src/tools/fetch.tool.ts
new file mode 100644
index 0000000..791cd07
--- /dev/null
+++ b/src/tools/fetch.tool.ts
@@ -0,0 +1,49 @@
+import { z } from 'zod';
+import { paginate } from '../utils/paginate.js';
+import { processURL } from '../utils/process-url.js';
+import { checkRobotsTxt } from '../utils/check-robots-txt.js';
+import { DEFAULT_USER_AGENT_AUTONOMOUS } from '../constants.js';
+
+export const fetchToolSchema = z.object({
+  url: z.string().describe('URL to fetch.'),
+  max_length: z
+    .number()
+    .min(0)
+    .max(1000000)
+    .default(5000)
+    .describe('Maximum number of characters to return.'),
+  start_index: z
+    .number()
+    .min(0)
+    .default(0)
+    .describe(
+      'Return output starting at this character index, useful if a previous fetch was truncated and more context is required.',
+    ),
+  raw: z
+    .boolean()
+    .default(false)
+    .describe(
+      'Get the actual HTML content of the requested page, without simplification.',
+    ),
+});
+
+export const fetchTool = (userAgent?: string, ignoreRobotsTxt?: boolean) => ({
+  name: 'fetch',
+  description: `Fetches a URL from the internet and optionally extracts its contents as markdown.
+
+This tool grants you internet access. You can fetch the most up-to-date information and let the user know that.`,
+  parameters: fetchToolSchema,
+  execute: async ({
+    url,
+    max_length,
+    start_index,
+    raw,
+  }: z.infer<typeof fetchToolSchema>) => {
+    const ua = userAgent ?? DEFAULT_USER_AGENT_AUTONOMOUS;
+    if (!ignoreRobotsTxt) {
+      await checkRobotsTxt(url, ua);
+    }
+    const [content, prefix] = await processURL(url, ua, raw);
+    return paginate(url, content, prefix, start_index, max_length);
+  },
+});
diff --git a/src/utils/check-robots-txt.ts b/src/utils/check-robots-txt.ts
new file mode 100644
index 0000000..68e6310
--- /dev/null
+++ b/src/utils/check-robots-txt.ts
@@ -0,0 +1,62 @@
+import { URL } from 'url';
+import robotsParser, { Robot } from 'robots-parser';
+
+export class RobotsTxtError extends Error {
+  constructor(message: string, cause?: unknown) {
+    super(message, { cause });
+    this.name = 'RobotsTxtError';
+  }
+}
+
+export async function checkRobotsTxt(
+  targetUrl: string,
+  userAgent: string,
+): Promise<void> {
+  // TODO: check if the targetUrl is a valid URL
+  const { protocol, host } = new URL(targetUrl);
+
+  const robotsTxtUrl = `${protocol}//${host}/robots.txt`;
+
+  try {
+    const response = await fetch(robotsTxtUrl, {
+      headers: { 'User-Agent': userAgent },
+      redirect: 'follow',
+    });
+
+    if (response.status === 401 || response.status === 403) {
+      throw new RobotsTxtError(
+        `When fetching robots.txt (${robotsTxtUrl}), received status ${response.status.toString()}, so assuming that autonomous fetching is not allowed. The user can try manually fetching the page by using the fetch prompt.`,
+      );
+    } else if (response.status >= 400 && response.status < 500) {
+      return;
+    }
+
+    const robotTxt = await response.text();
+
+    const processedRobotTxt = robotTxt
+      .split('\n')
+      .filter((line) => !line.trim().startsWith('#'))
+      .join('\n');
+
+    // @ts-expect-error : bad types
+    const robotsTxt = robotsParser(robotsTxtUrl, processedRobotTxt) as Robot;
+
+    if (robotsTxt.isDisallowed(targetUrl, userAgent)) {
+      throw new RobotsTxtError(
+        `The site's robots.txt (${robotsTxtUrl}) specifies that autonomous fetching of this page is not allowed, ` +
+          `${userAgent}\n` +
+          `${targetUrl}` +
+          `\n${robotTxt}\n\n` +
+          `The assistant must let the user know that it failed to view the page. The assistant may provide further guidance based on the above information.\n` +
+          `The assistant can tell the user that they can try manually fetching the page by using the fetch prompt within their UI.`,
+      );
+    }
+  } catch (error) {
+    if (error instanceof RobotsTxtError) {
+      throw error;
+    }
+    throw new RobotsTxtError(`Failed to verify ${robotsTxtUrl}`, {
+      cause: error,
+    });
+  }
+}
diff --git a/src/utils/extract.ts b/src/utils/extract.ts
new file mode 100644
index 0000000..cfaeab5
--- /dev/null
+++ b/src/utils/extract.ts
@@ -0,0 +1,118 @@
+import { minify } from 'html-minifier';
+import { parseHTML } from 'linkedom';
+import sanitizeHtml from 'sanitize-html';
+
+/* eslint-disable @typescript-eslint/no-unsafe-member-access */
+/* eslint-disable @typescript-eslint/no-unsafe-call */
+/* eslint-disable @typescript-eslint/no-unsafe-assignment */
+/* eslint-disable @typescript-eslint/no-unsafe-return */
+
+export class ExtractError extends Error {
+  constructor(message: string, cause?: unknown) {
+    super(message, { cause });
+    this.name = 'ExtractError';
+  }
+}
+
+export const preProcessHtml = (html: string) => {
+  return html
+    .replace(/<style[^>]*?\/?>([\S\s]*?)<\/style>/gim, '')
+    .replace(/<script[^>]*?\/?>([\S\s]*?)<\/script>/gim, '')
+    .replace(/<template[^>]*?\/?>([\S\s]*?)<\/template>/gim, '');
+};
+
+const nodesToRemove = [
+  'template',
+  'img',
+  'svg',
+  'nav',
+  'footer',
+  'header',
+  'head',
+  'button',
+  'form',
+  'input',
+  'textarea',
+  'select',
+];
+
+export function extract(html: string) {
+  try {
+    // Pre-sanitize the HTML
+    let result = preProcessHtml(html);
+
+    // Sanitize the HTML
+    result = sanitizeHtml(result, {
+      allowedTags: [
+        'html',
+        'body',
+        ...sanitizeHtml.defaults.allowedTags,
+        ...nodesToRemove,
+      ],
+      allowedAttributes: {
+        '*': ['hidden', 'class', 'type', 'aria-hidden', 'href'],
+      },
+      disallowedTagsMode: 'completelyDiscard',
+    });
+
+    // Parse the HTML
+    const { document } = parseHTML(result);
+
+    // Remove unwanted elements
+    document.body
+      .querySelectorAll(
+        [
+          '[hidden]',
+          '[aria-hidden]',
+          '[type="button"]',
+          '.hide-sm',
+          '.sr-only',
+          '.d-none',
+          '.d-sm-none',
+          // TODO check popular CSS frameworks classes
+          ...nodesToRemove,
+        ].join(', '),
+      )
+      ?.forEach((a: any) => a.remove());
+
+    // Remove nav-like lists
+    document.querySelectorAll('ul, table, section').forEach((node: any) => {
+      const list = node.cloneNode(true);
+      list.querySelectorAll('a').forEach((a: any) => {
+        a.innerHTML = '';
+      });
+      const htmlLength = list.innerHTML.length;
+      const textLength = list.innerText.length;
+      if (textLength / htmlLength < 0.2) node.remove();
+    });
+
+    // Remove empty links
+    document.querySelectorAll('a').forEach((a: any) => {
+      if (a.textContent.trim() === '') {
+        a.remove();
+      }
+    });
+
+    // Sanitize again
+    result = sanitizeHtml(document.body.innerHTML as string, {
+      allowedAttributes: { a: ['href'] },
+    });
+
+    // Minify
+    result = minify(result, {
+      collapseWhitespace: true,
+      preserveLineBreaks: false,
+      decodeEntities: true,
+      conservativeCollapse: false,
+      collapseInlineTagWhitespace: false,
+      removeEmptyElements: true,
+    });
+
+    return result;
+  } catch (error) {
+    if (error instanceof ExtractError) {
+      throw error;
+    }
+    throw new ExtractError('Failed to extract content', error);
+  }
+}
diff --git a/src/utils/fetch.ts b/src/utils/fetch.ts
new file mode 100644
index 0000000..aaada6b
--- /dev/null
+++ b/src/utils/fetch.ts
@@ -0,0 +1,32 @@
+export class FetchError extends Error {
+  constructor(message: string, cause?: unknown) {
+    super(message, { cause });
+    this.name = 'FetchError';
+  }
+}
+
+export async function fetch(
+  url: string,
+  userAgent: string,
+): Promise<{ content: string; contentType: string | null }> {
+  try {
+    const response = await global.fetch(url, {
+      redirect: 'follow',
+      headers: { 'User-Agent': userAgent },
+    });
+    if (!response.ok) {
+      throw new FetchError(
+        `Failed to fetch ${url} - status code ${response.status.toString()}`,
+      );
+    }
+    return {
+      content: await response.text(),
+      contentType: response.headers.get('content-type'),
+    };
+  } catch (error) {
+    if (error instanceof FetchError) {
+      throw error;
+    }
+    throw new FetchError(`Failed to fetch ${url}`, error);
+  }
+}
diff --git a/src/utils/format.ts b/src/utils/format.ts
new file mode 100644
index 0000000..e8b1bf3
--- /dev/null
+++ b/src/utils/format.ts
@@ -0,0 +1,49 @@
+import TurndownService from 'turndown';
+// @ts-expect-error : missing types
+import turndownPluginGfm from 'turndown-plugin-gfm';
+
+/* eslint-disable @typescript-eslint/no-unsafe-member-access */
+/* eslint-disable @typescript-eslint/no-unsafe-call */
+/* eslint-disable @typescript-eslint/restrict-template-expressions */
+
+export class FormatError extends Error {
+  constructor(message: string, cause?: unknown) {
+    super(message, { cause });
+    this.name = 'FormatError';
+  }
+}
+
+const turndownService = new TurndownService({
+  headingStyle: 'atx',
+  codeBlockStyle: 'fenced',
+  bulletListMarker: '-',
+  hr: '\n',
+});
+
+const tables = turndownPluginGfm.tables as TurndownService.Plugin;
+
+turndownService.use(tables);
+
+turndownService.addRule('pre', {
+  filter: 'pre',
+  replacement: (content) => {
+    return `\`\`\`\n${content}\n\`\`\``;
+  },
+});
+
+turndownService.addRule('a', {
+  filter: 'a',
+  replacement: (_content, node) => {
+    return node.href && node.innerText.trim()
+      ? `[${node.innerText.trim()}](${node.href})`
+      : '';
+  },
+});
+
+export function format(html: string): string {
+  try {
+    return turndownService.turndown(html);
+  } catch (error) {
+    throw new FormatError('Failed to convert HTML to Markdown', error);
+  }
+}
diff --git a/src/utils/paginate.ts b/src/utils/paginate.ts
new file mode 100644
index 0000000..e9a7a45
--- /dev/null
+++ b/src/utils/paginate.ts
@@ -0,0 +1,26 @@
+export function paginate(
+  url: string,
+  content: string,
+  prefix: string,
+  startIndex: number,
+  maxLength: number,
+) {
+  const originalLength = content.length;
+  let result = content;
+  if (startIndex >= originalLength) {
+    result = 'No more content available.';
+  } else {
+    result = result.slice(startIndex, startIndex + maxLength);
+    if (!result) {
+      result = 'No more content available.';
+    } else {
+      const actualLength = result.length;
+      const remainingLength = originalLength - startIndex - actualLength;
+      if (actualLength === maxLength && remainingLength > 0) {
+        const nextStartIndex = startIndex + actualLength;
+        result += `\n\nContent truncated. Call the fetch tool with a start_index of ${nextStartIndex.toString()} to get more content.`;
+      }
+    }
+  }
+  return [prefix, `Contents of ${url}`, result].join('\n');
+}
diff --git a/src/utils/parse-args.ts b/src/utils/parse-args.ts
new file mode 100644
index 0000000..96cdba0
--- /dev/null
+++ b/src/utils/parse-args.ts
@@ -0,0 +1,22 @@
+type ParsedArgs = Record<string, string | boolean>;
+
+export function parseArgs(args: string[] = process.argv.slice(2)): ParsedArgs {
+  const parsedArgs: ParsedArgs = {};
+
+  for (let i = 0; i < args.length; i++) {
+    const arg = args[i];
+
+    if (arg.startsWith('--')) {
+      const key = arg.slice(2);
+
+      if (i + 1 < args.length && !args[i + 1].startsWith('--')) {
+        parsedArgs[key] = args[i + 1];
+        i++;
+      } else {
+        parsedArgs[key] = true;
+      }
+    }
+  }
+
+  return parsedArgs;
+}
diff --git a/src/utils/process-url.ts b/src/utils/process-url.ts
new file mode 100644
index 0000000..152d828
--- /dev/null
+++ b/src/utils/process-url.ts
@@ -0,0 +1,29 @@
+import { extract } from './extract.js';
+import { format } from './format.js';
+import { fetch } from './fetch.js';
+
+function isHTML(content: string, contentType?: string | null): boolean {
+  return contentType?.includes('text/html') ?? content.includes('<html');
+}
+
+export async function processURL(
+  url: string,
+  userAgent: string,
+  raw: boolean,
+): Promise<[string, string]> {
+  const { content, contentType } = await fetch(url, userAgent);
+
+  if (isHTML(content, contentType) && !raw) {
+    const formatted = format(extract(content));
+    if (!formatted) {
+      return ['<error>Page failed to be simplified from HTML</error>', ''];
+    }
+    return [formatted, ''];
+  }
+
+  if (raw) {
+    return [content, `Here is the raw ${contentType ?? 'unknown'} content:`];
+  }
+
+  return [
+    content,
+    `Content type ${contentType ?? 'unknown'} cannot be simplified to markdown, but here is the raw content:`,
+  ];
+}
diff --git a/tests/index.test.ts b/tests/index.test.ts
new file mode 100644
index 0000000..06ec056
--- /dev/null
+++ b/tests/index.test.ts
@@ -0,0 +1,8 @@
+import assert from 'node:assert';
+import { describe, it } from 'node:test';
+
+describe('Main application', () => {
+  it('dummy test', () => {
+    assert.strictEqual(true, true);
+  });
+});
diff --git a/tsconfig.json b/tsconfig.json
index fcf9a74..528ff8b 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -15,5 +15,5 @@
     "declaration": true
   },
   "include": ["src/**/*"],
-  "exclude": ["node_modules", "dist", "**/*.test.ts"]
+  "exclude": ["node_modules", "dist"]
 }