From de5e62c93ca63bb0136a2ed69bb08b7bc0431cf9 Mon Sep 17 00:00:00 2001 From: Thomas Gambet Date: Sat, 15 Feb 2025 08:10:44 +0100 Subject: [PATCH 01/10] chore: eslint upgrade , readme, license, github actions --- .eslintignore | 2 -- .eslintrc.cjs | 26 --------------- .github/workflows/ci.yml | 12 +++---- .github/workflows/publish.yml | 20 ++++++++---- LICENSE.md | 21 ++++++++++++ README.md | 54 +++++++++++++++++++++++++++++++ eslint.config.js | 18 +++++++++++ jest.config.cjs => jest.config.ts | 13 ++++---- package.json | 31 +++++++++++------- src/index.ts | 10 +++--- src/service.ts | 5 +++ tsconfig.json | 3 +- 12 files changed, 149 insertions(+), 66 deletions(-) delete mode 100644 .eslintignore delete mode 100644 .eslintrc.cjs create mode 100644 LICENSE.md create mode 100644 README.md create mode 100644 eslint.config.js rename jest.config.cjs => jest.config.ts (70%) create mode 100644 src/service.ts diff --git a/.eslintignore b/.eslintignore deleted file mode 100644 index 67ab7da..0000000 --- a/.eslintignore +++ /dev/null @@ -1,2 +0,0 @@ -dist/ -**/*.test.ts \ No newline at end of file diff --git a/.eslintrc.cjs b/.eslintrc.cjs deleted file mode 100644 index 5929160..0000000 --- a/.eslintrc.cjs +++ /dev/null @@ -1,26 +0,0 @@ -module.exports = { - parser: '@typescript-eslint/parser', - parserOptions: { - ecmaVersion: 'latest', - sourceType: 'module', - project: './tsconfig.json', - }, - plugins: ['@typescript-eslint'], - extends: [ - 'eslint:recommended', - 'plugin:@typescript-eslint/recommended', - 'plugin:@typescript-eslint/recommended-requiring-type-checking', - 'plugin:jest/recommended', - 'prettier', - ], - env: { - node: true, - es2022: true, - jest: true, - }, - rules: { - 'no-console': 'warn', - '@typescript-eslint/explicit-function-return-type': 'warn', - '@typescript-eslint/no-unused-vars': 'error', - }, -}; diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index baf5bd8..53195f3 100644 --- a/.github/workflows/ci.yml +++ 
b/.github/workflows/ci.yml @@ -1,8 +1,8 @@ name: CI on: - push: - branches: [main] + # push: + # branches: [main] pull_request: branches: [main] @@ -10,17 +10,13 @@ jobs: test: runs-on: ubuntu-latest - strategy: - matrix: - node-version: [20.x, 22.x] - steps: - uses: actions/checkout@v4 - - name: Use Node.js ${{ matrix.node-version }} + - name: Setup Node.js uses: actions/setup-node@v4 with: - node-version: ${{ matrix.node-version }} + node-version: "lts/*" - name: Install pnpm uses: pnpm/action-setup@v2 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 71efb1d..4739502 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -1,4 +1,4 @@ -name: Publish Package +name: Publish on: push: @@ -11,17 +11,25 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Setup Node.js - uses: actions/setup-node@v2 + uses: actions/setup-node@v4 with: - node-version: '20' + node-version: 'lts/*' + - name: Install pnpm + uses: pnpm/action-setup@v2 + with: + version: 10 + - name: Install dependencies - run: npm install + run: pnpm install + + - name: Build + run: pnpm build - name: Publish package - run: npm publish + run: pnpm publish env: NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..c329342 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Thomas Gambet + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or 
substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..60344da --- /dev/null +++ b/README.md @@ -0,0 +1,54 @@ +# Fetch MCP Server + +A port of [Fetch MCP Server](https://github.com/modelcontextprotocol/servers/tree/main/src/fetch) for Node.js. + +## Description + +A Model Context Protocol server that provides web content fetching capabilities. This server enables LLMs to retrieve and process content from web pages, converting HTML to markdown for easier consumption. + +The fetch tool will truncate the response, but by using the `start_index` argument, you can specify where to start the content extraction. This lets models read a webpage in chunks, until they find the information they need. + +### Available Tools + +- `fetch` - Fetches a URL from the internet and extracts its contents as markdown. 
+ - `url` (string, required): URL to fetch + - `max_length` (integer, optional): Maximum number of characters to return (default: 5000) + - `start_index` (integer, optional): Start content from this character index (default: 0) + - `raw` (boolean, optional): Get raw content without markdown conversion (default: false) + +### Prompts + +- **fetch** + - Fetch a URL and extract its contents as markdown + - Arguments: + - `url` (string, required): URL to fetch + +## Installation + +```bash + +``` + +## Usage + +## Features + +## Development + +```bash +pnpm install +pnpm dev +pnpm lint:fix +pnpm format +pnpm test +pnpm build +pnpm start +``` + +## Contributing + +Contributions are welcome! Please feel free to submit a Pull Request. + +## License + +[MIT](https://choosealicense.com/licenses/mit/) diff --git a/eslint.config.js b/eslint.config.js new file mode 100644 index 0000000..8334381 --- /dev/null +++ b/eslint.config.js @@ -0,0 +1,18 @@ +// @ts-check + +import eslint from '@eslint/js'; +import tseslint from 'typescript-eslint'; + +export default tseslint.config( + eslint.configs.recommended, + tseslint.configs.strictTypeChecked, + tseslint.configs.stylisticTypeChecked, + { + languageOptions: { + parserOptions: { + projectService: true, + tsconfigRootDir: import.meta.dirname, + }, + }, + }, +); \ No newline at end of file diff --git a/jest.config.cjs b/jest.config.ts similarity index 70% rename from jest.config.cjs rename to jest.config.ts index c49d554..d44eb84 100644 --- a/jest.config.cjs +++ b/jest.config.ts @@ -1,4 +1,6 @@ -module.exports = { +import type { Config } from 'jest'; + +const config: Config = { preset: 'ts-jest', testEnvironment: 'node', extensionsToTreatAsEsm: ['.ts'], @@ -7,15 +9,12 @@ module.exports = { }, moduleFileExtensions: ['js', 'ts'], transform: { - '^.+\\.ts$': [ - 'ts-jest', - { - useESM: true, - }, - ], + '^.+\\.ts$': ['ts-jest', { useESM: true }], }, testMatch: ['**/*.test.ts'], collectCoverage: true, coverageDirectory: 'coverage', 
coverageProvider: 'v8', }; + +export default config; diff --git a/package.json b/package.json index 21e3d01..fea5877 100644 --- a/package.json +++ b/package.json @@ -1,39 +1,48 @@ { - "name": "nodejs-project", + "name": "mcp-fetch-node", "version": "1.0.0", - "description": "A Node.js project with best practices", + "description": "A Model Context Protocol server that provides web content fetching capabilities", "type": "module", "main": "dist/index.js", "scripts": { "start": "node dist/index.js", "dev": "tsx watch src/index.ts", "build": "tsc", - "lint": "eslint . --ext .ts", - "lint:fix": "eslint . --ext .ts --fix", + "lint": "eslint src/**", + "lint:fix": "eslint src/** --fix", "format": "prettier --write \"**/*.{ts,json,md}\"", "format:check": "prettier --check .", "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js" }, - "keywords": [], "author": "Thomas Gambet", + "repository": { + "type": "git", + "url": "https://github.com/tgambet/mcp-fetch-node.git" + }, + "keywords": [ + "mcp", + "fetch" + ], "license": "MIT", "engines": { "node": ">=20", "pnpm": ">=10" }, + "dependencies": { + "fastmcp": "^1.16.3" + }, "devDependencies": { + "@eslint/js": "^9.20.0", "@types/jest": "^29.5.14", "@types/node": "^20.17.19", - "@typescript-eslint/eslint-plugin": "^7.18.0", - "@typescript-eslint/parser": "^7.18.0", - "eslint": "^8.57.1", - "eslint-config-prettier": "^9.1.0", - "eslint-plugin-jest": "^27.9.0", + "eslint": "^9.20.1", "jest": "^29.7.0", "prettier": "^3.5.1", "ts-jest": "^29.2.5", + "ts-node": "^10.9.2", "tsx": "^4.19.2", - "typescript": "^5.7.3" + "typescript": "^5.7.3", + "typescript-eslint": "^8.24.0" }, "packageManager": "pnpm@10.4.0+sha512.6b849d0787d97f8f4e1f03a9b8ff8f038e79e153d6f11ae539ae7c435ff9e796df6a862c991502695c7f9e8fac8aeafc1ac5a8dab47e36148d183832d886dd52" } diff --git a/src/index.ts b/src/index.ts index 7707744..a631056 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,6 +1,8 @@ -function main(): void { - // Your application 
code here - console.log('Hello, World!'); +import { fetchPage } from "./service.js"; + +async function main() { + const page = await fetchPage('https://www.google.com'); + console.log(page); } -main(); +await main(); diff --git a/src/service.ts b/src/service.ts new file mode 100644 index 0000000..8439e3d --- /dev/null +++ b/src/service.ts @@ -0,0 +1,5 @@ +export const fetchPage = async (url: string) => { +// const response = await fetch(url); +// return response.text(); +return url +}; diff --git a/tsconfig.json b/tsconfig.json index fcf9a74..9f6a960 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -14,6 +14,5 @@ "sourceMap": true, "declaration": true }, - "include": ["src/**/*"], - "exclude": ["node_modules", "dist", "**/*.test.ts"] + "include": ["src/**/*"] } From 2860332143665404490fd92392efaf7cdb0feef4 Mon Sep 17 00:00:00 2001 From: Thomas Gambet Date: Sat, 15 Feb 2025 17:12:27 +0100 Subject: [PATCH 02/10] feat: initial implementation --- .github/workflows/ci.yml | 6 +- README.md | 48 +++++++++++--- eslint.config.js | 3 + jest.config.ts | 20 ------ package.json | 28 ++++---- src/constants.ts | 4 ++ src/index.ts | 8 --- src/main.ts | 31 +++++++++ src/prompts/fetch.prompt.ts | 23 +++++++ src/service.ts | 5 -- src/tests/index.test.ts | 7 -- src/tools/fetch.tool.ts | 49 ++++++++++++++ src/utils/check-robots-txt.ts | 62 ++++++++++++++++++ src/utils/extract.ts | 118 ++++++++++++++++++++++++++++++++++ src/utils/fetch.ts | 32 +++++++++ src/utils/format.ts | 52 +++++++++++++++ src/utils/paginate.ts | 26 ++++++++ src/utils/parse-args.ts | 22 +++++++ src/utils/process-url.ts | 29 +++++++++ tests/index.test.ts | 8 +++ tsconfig.json | 3 +- 21 files changed, 520 insertions(+), 64 deletions(-) delete mode 100644 jest.config.ts create mode 100644 src/constants.ts delete mode 100644 src/index.ts create mode 100644 src/main.ts create mode 100644 src/prompts/fetch.prompt.ts delete mode 100644 src/service.ts delete mode 100644 src/tests/index.test.ts create mode 100644 
src/tools/fetch.tool.ts create mode 100644 src/utils/check-robots-txt.ts create mode 100644 src/utils/extract.ts create mode 100644 src/utils/fetch.ts create mode 100644 src/utils/format.ts create mode 100644 src/utils/paginate.ts create mode 100644 src/utils/parse-args.ts create mode 100644 src/utils/process-url.ts create mode 100644 tests/index.test.ts diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 53195f3..9c74ffc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,8 +32,8 @@ jobs: - name: Check formatting run: pnpm format:check - - name: Run tests - run: pnpm test - - name: Build run: pnpm build + + - name: Run tests + run: pnpm test diff --git a/README.md b/README.md index 60344da..54e5af1 100644 --- a/README.md +++ b/README.md @@ -16,23 +16,42 @@ The fetch tool will truncate the response, but by using the `start_index` argume - `start_index` (integer, optional): Start content from this character index (default: 0) - `raw` (boolean, optional): Get raw content without markdown conversion (default: false) -### Prompts +### Available Prompts -- **fetch** - - Fetch a URL and extract its contents as markdown - - Arguments: - - `url` (string, required): URL to fetch - -## Installation +- `fetch` - Fetch a URL and extract its contents as markdown + - `url` (string, required): URL to fetch -```bash +## Usage +```json +"mcpServers": { + "fetch": { + "command": "npx", + "args": ["mcp-fetch-node"] + } +} ``` -## Usage +```json +"mcpServers": { + "fetch": { + "command": "docker", + "args": ["run", "-i", "--rm", "tgambet/mcp-fetch-node"] + } +} +``` ## Features +- [x] Fetch and extract content from a URL +- [x] Respect `robots.txt` (can be disabled) +- [x] User-Agent customization +- [x] Relevant content extraction +- [x] Raw content or markdown conversion +- [x] Pagination +- [ ] In-memory temporary cache for faster responses +- [ ] Logs and progress + ## Development ```bash @@ -52,3 +71,14 @@ Contributions are welcome! 
Please feel free to submit a Pull Request. ## License [MIT](https://choosealicense.com/licenses/mit/) + +## TODO + +- [ ] Add LRU cache +- [ ] Publish to npm +- [ ] Dockerize and publish to docker hub +- [ ] Integrate semantic release +- [ ] Add user logs and progress +- [ ] Add tests +- [ ] Add documentation & examples +- [ ] Add benchmarks for extraction: cf https://github.com/adbar/trafilatura/blob/master/tests/comparison_small.py diff --git a/eslint.config.js b/eslint.config.js index 8334381..cf808eb 100644 --- a/eslint.config.js +++ b/eslint.config.js @@ -14,5 +14,8 @@ export default tseslint.config( tsconfigRootDir: import.meta.dirname, }, }, + rules: { + '@typescript-eslint/no-explicit-any': 'off', + }, }, ); \ No newline at end of file diff --git a/jest.config.ts b/jest.config.ts deleted file mode 100644 index d44eb84..0000000 --- a/jest.config.ts +++ /dev/null @@ -1,20 +0,0 @@ -import type { Config } from 'jest'; - -const config: Config = { - preset: 'ts-jest', - testEnvironment: 'node', - extensionsToTreatAsEsm: ['.ts'], - moduleNameMapper: { - '^(\\.{1,2}/.*)\\.js$': '$1', - }, - moduleFileExtensions: ['js', 'ts'], - transform: { - '^.+\\.ts$': ['ts-jest', { useESM: true }], - }, - testMatch: ['**/*.test.ts'], - collectCoverage: true, - coverageDirectory: 'coverage', - coverageProvider: 'v8', -}; - -export default config; diff --git a/package.json b/package.json index fea5877..5df378f 100644 --- a/package.json +++ b/package.json @@ -1,18 +1,18 @@ { "name": "mcp-fetch-node", - "version": "1.0.0", + "version": "0.0.1", "description": "A Model Context Protocol server that provides web content fetching capabilities", "type": "module", "main": "dist/index.js", "scripts": { - "start": "node dist/index.js", - "dev": "tsx watch src/index.ts", + "start": "node dist/main.js", + "dev": "tsx watch src/main.ts", "build": "tsc", "lint": "eslint src/**", "lint:fix": "eslint src/** --fix", "format": "prettier --write \"**/*.{ts,json,md}\"", "format:check": "prettier 
--check .", - "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js" + "test": "node --import tsx --test tests/**" }, "author": "Thomas Gambet", "repository": { @@ -25,21 +25,27 @@ ], "license": "MIT", "engines": { - "node": ">=20", + "node": ">=22", "pnpm": ">=10" }, "dependencies": { - "fastmcp": "^1.16.3" + "fastmcp": "^1.16.3", + "html-minifier": "^4.0.0", + "linkedom": "^0.18.9", + "robots-parser": "^3.0.1", + "sanitize-html": "^2.14.0", + "turndown": "^7.2.0", + "turndown-plugin-gfm": "^1.0.2", + "zod": "^3.24.2" }, "devDependencies": { "@eslint/js": "^9.20.0", - "@types/jest": "^29.5.14", - "@types/node": "^20.17.19", + "@types/html-minifier": "^4.0.5", + "@types/node": "^22.13.4", + "@types/sanitize-html": "^2.13.0", + "@types/turndown": "^5.0.5", "eslint": "^9.20.1", - "jest": "^29.7.0", "prettier": "^3.5.1", - "ts-jest": "^29.2.5", - "ts-node": "^10.9.2", "tsx": "^4.19.2", "typescript": "^5.7.3", "typescript-eslint": "^8.24.0" diff --git a/src/constants.ts b/src/constants.ts new file mode 100644 index 0000000..6599968 --- /dev/null +++ b/src/constants.ts @@ -0,0 +1,4 @@ +export const DEFAULT_USER_AGENT_AUTONOMOUS = + 'ModelContextProtocol/1.0 (Autonomous; +https://github.com/tgambet/mcp-fetch-node)'; +export const DEFAULT_USER_AGENT_MANUAL = + 'ModelContextProtocol/1.0 (User-Specified; +https://github.com/tgambet/mcp-fetch-node)'; diff --git a/src/index.ts b/src/index.ts deleted file mode 100644 index a631056..0000000 --- a/src/index.ts +++ /dev/null @@ -1,8 +0,0 @@ -import { fetchPage } from "./service.js"; - -async function main() { - const page = await fetchPage('https://www.google.com'); - console.log(page); -} - -await main(); diff --git a/src/main.ts b/src/main.ts new file mode 100644 index 0000000..1a4046d --- /dev/null +++ b/src/main.ts @@ -0,0 +1,31 @@ +import { FastMCP } from 'fastmcp'; +import { fetchPrompt } from './prompts/fetch.prompt.js'; +import { fetchTool } from './tools/fetch.tool.js'; +import { parseArgs } from 
'./utils/parse-args.js'; + +const args = parseArgs(); + +const userAgent = args['user-agent'] as string | undefined; + +const ignoreRobotsTxt = args['ignore-robots-txt'] as boolean | undefined; + +export async function serve() { + const server = new FastMCP({ + name: 'mcp-fetch-node', + version: '0.0.1', + }); + + server.addTool(fetchTool(userAgent, ignoreRobotsTxt)); + + server.addPrompt(fetchPrompt(userAgent)); + + await server.start({ + transportType: 'sse', + sse: { + endpoint: '/sse', + port: 8080, + }, + }); +} + +await serve(); diff --git a/src/prompts/fetch.prompt.ts b/src/prompts/fetch.prompt.ts new file mode 100644 index 0000000..b0c966a --- /dev/null +++ b/src/prompts/fetch.prompt.ts @@ -0,0 +1,23 @@ +import { UserError } from 'fastmcp'; +import { DEFAULT_USER_AGENT_MANUAL } from '../constants.js'; +import { processURL } from '../utils/process-url.js'; + +export const fetchPrompt = (userAgent?: string) => ({ + name: 'fetch', + description: 'Fetch a URL and extract its contents as markdown', + arguments: [ + { + name: 'url', + description: 'URL to fetch', + required: true, + }, + ], + load: async ({ url }: { url?: string }) => { + if (!url) { + throw new UserError('Missing required argument: url'); + } + const ua = userAgent ?? 
DEFAULT_USER_AGENT_MANUAL; + const [content, prefix] = await processURL(url, ua, false); + return [prefix, content].join('\n'); + }, +}); diff --git a/src/service.ts b/src/service.ts deleted file mode 100644 index 8439e3d..0000000 --- a/src/service.ts +++ /dev/null @@ -1,5 +0,0 @@ -export const fetchPage = async (url: string) => { -// const response = await fetch(url); -// return response.text(); -return url -}; diff --git a/src/tests/index.test.ts b/src/tests/index.test.ts deleted file mode 100644 index a167bab..0000000 --- a/src/tests/index.test.ts +++ /dev/null @@ -1,7 +0,0 @@ -import '../index.js'; - -describe('Main application', () => { - it('should run without errors', () => { - expect(true).toBe(true); - }); -}); diff --git a/src/tools/fetch.tool.ts b/src/tools/fetch.tool.ts new file mode 100644 index 0000000..791cd07 --- /dev/null +++ b/src/tools/fetch.tool.ts @@ -0,0 +1,49 @@ +import { z } from 'zod'; +import { paginate } from '../utils/paginate.js'; +import { processURL } from '../utils/process-url.js'; +import { checkRobotsTxt } from '../utils/check-robots-txt.js'; +import { DEFAULT_USER_AGENT_AUTONOMOUS } from '../constants.js'; + +export const fetchToolSchema = z.object({ + url: z.string().describe('URL to fetch.'), + max_length: z + .number() + .min(0) + .max(1000000) + .default(5000) + .describe('Maximum number of characters to return.'), + start_index: z + .number() + .min(0) + .default(0) + .describe( + 'Return output starting at this character index, useful if a previous fetch was truncated and more context is required.', + ), + raw: z + .boolean() + .default(false) + .describe( + 'Get the actual HTML content of the requested page, without simplification.', + ), +}); + +export const fetchTool = (userAgent?: string, ignoreRobotsTxt?: boolean) => ({ + name: 'fetch', + description: `Fetches a URL from the internet and optionally extracts its contents as markdown. + +This tool grants you internet access. 
You can fetch the most up-to-date information and let the user know that.`, + parameters: fetchToolSchema, + execute: async ({ + url, + max_length, + start_index, + raw, + }: z.infer) => { + const ua = userAgent ?? DEFAULT_USER_AGENT_AUTONOMOUS; + if (!ignoreRobotsTxt) { + await checkRobotsTxt(url, ua); + } + const [content, prefix] = await processURL(url, ua, raw); + return paginate(url, content, prefix, start_index, max_length); + }, +}); diff --git a/src/utils/check-robots-txt.ts b/src/utils/check-robots-txt.ts new file mode 100644 index 0000000..33e172a --- /dev/null +++ b/src/utils/check-robots-txt.ts @@ -0,0 +1,62 @@ +import { URL } from 'url'; +import robotsParser, { Robot } from 'robots-parser'; + +export class RobotsTxtError extends Error { + constructor(message: string, cause?: unknown) { + super(message, { cause }); + this.name = 'RobotsTxtError'; + } +} + +export async function checkRobotsTxt( + targetUrl: string, + userAgent: string, +): Promise { + // TODO: check if the targetUrl is a valid URL + const { protocol, host } = new URL(targetUrl); + + const robotsTxtUrl = `${protocol}//${host}/robots.txt`; + + try { + const response = await fetch(robotsTxtUrl, { + headers: { 'User-Agent': userAgent }, + redirect: 'follow', + }); + + if (response.status === 401 || response.status === 403) { + throw new RobotsTxtError( + `When fetching robots.txt (${robotsTxtUrl}), received status ${response.status.toString()} so assuming that autonomous fetching is not allowed, the user can try manually fetching by using the fetch prompt`, + ); + } else if (response.status >= 400 && response.status < 500) { + return; + } + + const robotTxt = await response.text(); + + const processedRobotTxt = robotTxt + .split('\n') + .filter((line) => !line.trim().startsWith('#')) + .join('\n'); + + // @ts-expect-error : bad types + const robotsTxt = robotsParser(robotTxtUrl, processedRobotTxt) as Robot; + + if (robotsTxt.isDisallowed(targetUrl, userAgent)) { + throw new RobotsTxtError( + 
`The sites robots.txt (${robotsTxtUrl}), specifies that autonomous fetching of this page is not allowed, ` + + `${userAgent}\n` + + `${targetUrl}` + + `\n${robotTxt}\n\n` + + `The assistant must let the user know that it failed to view the page. The assistant may provide further guidance based on the above information.\n` + + `The assistant can tell the user that they can try manually fetching the page by using the fetch prompt within their UI.`, + ); + } + } catch (error) { + if (error instanceof RobotsTxtError) { + throw error; + } + throw new RobotsTxtError(`Failed to verify ${robotsTxtUrl}`, { + cause: error, + }); + } +} diff --git a/src/utils/extract.ts b/src/utils/extract.ts new file mode 100644 index 0000000..03bbee4 --- /dev/null +++ b/src/utils/extract.ts @@ -0,0 +1,118 @@ +import { minify } from 'html-minifier'; +import { parseHTML } from 'linkedom'; +import sanitizeHtml from 'sanitize-html'; + +/* eslint-disable @typescript-eslint/no-unsafe-member-access */ +/* eslint-disable @typescript-eslint/no-unsafe-call */ +/* eslint-disable @typescript-eslint/no-unsafe-assignment */ +/* eslint-disable @typescript-eslint/no-unsafe-return */ + +export class ExtractError extends Error { + constructor(message: string, cause?: unknown) { + super(message, { cause }); + this.name = 'ExtractError'; + } +} + +export const preProcessHtml = (html: string) => { + return html + .replace(/]*?\/?>([\S\s]*?)<\/style>/gim, '') + .replace(/]*?\/?>([\S\s]*?)<\/script>/gim, '') + .replace(/]*?\/?>([\S\s]*?)<\/template>/gim, ''); +}; + +const nodesToRemove = [ + 'template', + 'img', + 'svg', + 'nav', + 'footer', + 'header', + 'head', + 'button', + 'form', + 'input', + 'textarea', + 'select', +]; + +export function extract(html: string) { + try { + // Pre-sanitize the HTML + let result = preProcessHtml(html); + + // Sanitize the HTML + result = sanitizeHtml(result, { + allowedTags: [ + 'html', + 'body', + ...sanitizeHtml.defaults.allowedTags, + ...nodesToRemove, + ], + 
allowedAttributes: { + '*': ['hidden', 'class', 'type', 'aria-hidden', 'href'], + }, + disallowedTagsMode: 'completelyDiscard', + }); + + // Parse the HTML + const { document } = parseHTML(result); + + // Remove unwanted elements + document.body + .querySelectorAll( + [ + '[hidden]', + '[aria-hidden]', + '[type="button"]', + '.hide-sm', + '.sr-only', + '.d-none', + '.d-sm-none', + // TODO check popular CSS frameworks classes + ...nodesToRemove, + ].join(', '), + ) + ?.forEach((a: any) => a.remove()); + + // Remove nav-liked lists + document.querySelectorAll('ul, table, section').forEach((node: any) => { + const list = node.cloneNode(true); + list.querySelectorAll('a').forEach((a: any) => { + a.innerHTML = ''; + }); + const htmlLength = list.innerHTML.length; + const textLength = list.innerText.length; + if (textLength / htmlLength < 0.2) node.remove(); + }); + + // Remove empty links + document.querySelectorAll('a').forEach((a: any) => { + if (a.textContent.trim() === '') { + a.remove(); + } + }); + + // Sanitize again + result = sanitizeHtml(document.body.innerHTML as string, { + allowedAttributes: { a: ['href'] }, + }); + + // Minify + result = minify(result, { + collapseWhitespace: true, + preserveLineBreaks: false, + decodeEntities: true, + conservativeCollapse: false, + collapseInlineTagWhitespace: false, + removeEmptyElements: true, + }); + + return result.trim(); + } catch (error) { + if (error instanceof ExtractError) { + throw error; + } + throw new ExtractError('Failed to extract content', error); + } +} diff --git a/src/utils/fetch.ts b/src/utils/fetch.ts new file mode 100644 index 0000000..aaada6b --- /dev/null +++ b/src/utils/fetch.ts @@ -0,0 +1,32 @@ +export class FetchError extends Error { + constructor(message: string, cause?: unknown) { + super(message, { cause }); + this.name = 'FetchError'; + } +} + +export async function fetch( + url: string, + userAgent: string, +): Promise<{ content: string; contentType: string | null }> { + try { + const 
response = await global.fetch(url, { + redirect: 'follow', + headers: { 'User-Agent': userAgent }, + }); + if (!response.ok) { + throw new FetchError( + `Failed to fetch ${url} - status code ${response.status.toString()}`, + ); + } + return { + content: await response.text(), + contentType: response.headers.get('content-type'), + }; + } catch (error) { + if (error instanceof FetchError) { + throw error; + } + throw new FetchError(`Failed to fetch ${url}`, error); + } +} diff --git a/src/utils/format.ts b/src/utils/format.ts new file mode 100644 index 0000000..cbf950a --- /dev/null +++ b/src/utils/format.ts @@ -0,0 +1,52 @@ +import TurndownService from 'turndown'; +// @ts-expect-error : missing types +import turndownPluginGfm from 'turndown-plugin-gfm'; + +/* eslint-disable @typescript-eslint/no-unsafe-member-access */ +/* eslint-disable @typescript-eslint/no-unsafe-call */ +/* eslint-disable @typescript-eslint/restrict-template-expressions */ + +export class FormatError extends Error { + constructor(message: string, cause?: unknown) { + super(message, { cause }); + this.name = 'FormatError'; + } +} + +const turndownService = new TurndownService({ + headingStyle: 'atx', + codeBlockStyle: 'fenced', + bulletListMarker: '-', + hr: '\n', +}); + +const tables = turndownPluginGfm.tables as TurndownService.Plugin; + +turndownService.use(tables); + +turndownService.addRule('pre', { + filter: 'pre', + replacement: (content) => { + return `\`\`\`\n${content}\n\`\`\``; + }, +}); + +turndownService.addRule('a', { + filter: 'a', + replacement: (_content, node) => { + return node.href && node.innerText.trim() + ? 
`[${node.innerText.trim()}](${node.href})` + : ''; + }, +}); + +export function format(html: string): string { + try { + return turndownService.turndown(html); + } catch (error) { + if (error instanceof FormatError) { + throw error; + } + throw new FormatError('Failed to convert HTML to Markdown', error); + } +} diff --git a/src/utils/paginate.ts b/src/utils/paginate.ts new file mode 100644 index 0000000..e9a7a45 --- /dev/null +++ b/src/utils/paginate.ts @@ -0,0 +1,26 @@ +export function paginate( + url: string, + content: string, + prefix: string, + startIndex: number, + maxLength: number, +) { + const originalLength = content.length; + let result = content; + if (startIndex >= originalLength) { + result = 'No more content available.'; + } else { + result = result.slice(startIndex, startIndex + maxLength); + if (!result) { + result = 'No more content available.'; + } else { + const actualLength = result.length; + const remainingLength = originalLength - startIndex - actualLength; + if (actualLength === maxLength && remainingLength > 0) { + const nextStartIndex = startIndex + actualLength; + result += `\n\nContent truncated. 
Call the fetch tool with a start_index of ${nextStartIndex.toString()} to get more content.`; + } + } + } + return [prefix, `Contents of ${url}`, result].join('\n'); +} diff --git a/src/utils/parse-args.ts b/src/utils/parse-args.ts new file mode 100644 index 0000000..96cdba0 --- /dev/null +++ b/src/utils/parse-args.ts @@ -0,0 +1,22 @@ +type ParsedArgs = Record; + +export function parseArgs(args: string[] = process.argv.slice(2)): ParsedArgs { + const parsedArgs: ParsedArgs = {}; + + for (let i = 0; i < args.length; i++) { + const arg = args[i]; + + if (arg.startsWith('--')) { + const key = arg.slice(2); + + if (i + 1 < args.length && !args[i + 1].startsWith('--')) { + parsedArgs[key] = args[i + 1]; + i++; + } else { + parsedArgs[key] = true; + } + } + } + + return parsedArgs; +} diff --git a/src/utils/process-url.ts b/src/utils/process-url.ts new file mode 100644 index 0000000..a12d2ff --- /dev/null +++ b/src/utils/process-url.ts @@ -0,0 +1,29 @@ +import { extract } from './extract.js'; +import { format } from './format.js'; +import { fetch } from './fetch.js'; + +function isHTML(content: string, contentType?: string | null): boolean { + return contentType?.includes('text/html') ?? content.includes('Page failed to be simplified from HTML', '']; + } + return [formatted, '']; + } + + if (raw) { + return [content, `Here is the raw ${contentType ?? 'unknown'} content:`]; + } + + return [ + content, + `Content type ${contentType ?? 
'unknown'} cannot be simplified to markdown, but here is the raw content:`, + ]; +} diff --git a/tests/index.test.ts b/tests/index.test.ts new file mode 100644 index 0000000..06ec056 --- /dev/null +++ b/tests/index.test.ts @@ -0,0 +1,8 @@ +import assert from 'node:assert'; +import { describe, it } from 'node:test'; + +describe('Main application', () => { + it('dummy test', () => { + assert.strictEqual(true, true); + }); +}); diff --git a/tsconfig.json b/tsconfig.json index 9f6a960..528ff8b 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -14,5 +14,6 @@ "sourceMap": true, "declaration": true }, - "include": ["src/**/*"] + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"] } From 3cdf684c392d0d83603385a2c7bdd3c14b56971a Mon Sep 17 00:00:00 2001 From: Thomas Gambet Date: Mon, 17 Feb 2025 11:05:42 +0100 Subject: [PATCH 03/10] chore: format --- README.md | 6 +++--- src/utils/process-url.ts | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 54e5af1..5d1c736 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # Fetch MCP Server -A port of [Fetch MCP Server](https://github.com/modelcontextprotocol/servers/tree/main/src/fetch) for Node.js. +A port of the official [Fetch MCP Server](https://github.com/modelcontextprotocol/servers/tree/main/src/fetch) (python) for Node.js. ## Description -A Model Context Protocol server that provides web content fetching capabilities. This server enables LLMs to retrieve and process content from web pages, converting HTML to markdown for easier consumption. +A [Model Context Protocol](https://modelcontextprotocol.io/) server that provides web content fetching capabilities. This server enables LLMs to retrieve and process content from web pages, converting HTML to markdown for easier consumption. The fetch tool will truncate the response, but by using the `start_index` argument, you can specify where to start the content extraction. 
This lets models read a webpage in chunks, until they find the information they need. @@ -49,7 +49,7 @@ The fetch tool will truncate the response, but by using the `start_index` argume - [x] Relevant content extraction - [x] Raw content or markdown conversion - [x] Pagination -- [ ] In-memory temporary cache for faster responses +- [ ] In-memory temporary cache for faster responses, especially when paginating - [ ] Logs and progress ## Development diff --git a/src/utils/process-url.ts b/src/utils/process-url.ts index a12d2ff..152d828 100644 --- a/src/utils/process-url.ts +++ b/src/utils/process-url.ts @@ -8,7 +8,7 @@ function isHTML(content: string, contentType?: string | null): boolean { export async function processURL(url: string, userAgent: string, raw: boolean) { const { content, contentType } = await fetch(url, userAgent); - + if (!raw && isHTML(content, contentType)) { const extracted = extract(content); const formatted = format(extracted); From d223771d6f27c3cdb4d88d4c53aed8d8c6ad3f97 Mon Sep 17 00:00:00 2001 From: Thomas Gambet Date: Mon, 17 Feb 2025 11:07:31 +0100 Subject: [PATCH 04/10] chore: format --- .github/workflows/ci.yml | 2 +- .github/workflows/publish.yml | 2 +- eslint.config.js | 2 +- package.json | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9c74ffc..206b00d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,7 +16,7 @@ jobs: - name: Setup Node.js uses: actions/setup-node@v4 with: - node-version: "lts/*" + node-version: 'lts/*' - name: Install pnpm uses: pnpm/action-setup@v2 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 4739502..b7ffc87 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -22,7 +22,7 @@ jobs: uses: pnpm/action-setup@v2 with: version: 10 - + - name: Install dependencies run: pnpm install diff --git a/eslint.config.js b/eslint.config.js index cf808eb..e82809b 
100644 --- a/eslint.config.js +++ b/eslint.config.js @@ -18,4 +18,4 @@ export default tseslint.config( '@typescript-eslint/no-explicit-any': 'off', }, }, -); \ No newline at end of file +); diff --git a/package.json b/package.json index 5df378f..c7ec630 100644 --- a/package.json +++ b/package.json @@ -10,7 +10,7 @@ "build": "tsc", "lint": "eslint src/**", "lint:fix": "eslint src/** --fix", - "format": "prettier --write \"**/*.{ts,json,md}\"", + "format": "prettier --write \"**/*.{ts,json,md,yml,js}\"", "format:check": "prettier --check .", "test": "node --import tsx --test tests/**" }, From 4e45771290e8b8be2a66fedf8645fc359bf94927 Mon Sep 17 00:00:00 2001 From: Thomas Gambet Date: Mon, 17 Feb 2025 12:56:11 +0100 Subject: [PATCH 05/10] chore: setup semantic release --- .github/workflows/publish.yml | 35 ----------------- .github/workflows/release.yml | 52 ++++++++++++++++++++++++++ .github/workflows/{ci.yml => test.yml} | 4 +- .releaserc.yml | 4 ++ package.json | 5 ++- 5 files changed, 60 insertions(+), 40 deletions(-) delete mode 100644 .github/workflows/publish.yml create mode 100644 .github/workflows/release.yml rename .github/workflows/{ci.yml => test.yml} (93%) create mode 100644 .releaserc.yml diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml deleted file mode 100644 index b7ffc87..0000000 --- a/.github/workflows/publish.yml +++ /dev/null @@ -1,35 +0,0 @@ -name: Publish - -on: - push: - branches: - - main - -jobs: - publish: - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: 'lts/*' - - - name: Install pnpm - uses: pnpm/action-setup@v2 - with: - version: 10 - - - name: Install dependencies - run: pnpm install - - - name: Build - run: pnpm build - - - name: Publish package - run: pnpm publish - env: - NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new 
file mode 100644 index 0000000..896bfb7 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,52 @@ +# https://semantic-release.gitbook.io/semantic-release/recipes/ci-configurations/github-actions +name: Release +on: + push: + branches: + - main + +permissions: + contents: read + +jobs: + release: + name: Release + + runs-on: ubuntu-latest + + permissions: + contents: write + issues: write + pull-requests: write + id-token: write + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: "lts/*" + + - name: Install pnpm + uses: pnpm/action-setup@v2 + with: + version: 10 + + - name: Install dependencies + run: pnpm install + + - name: Build + run: pnpm build + + - name: Verify the integrity of provenance attestations and registry signatures for installed dependencies + run: npm audit signatures + + - name: Release + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + NPM_TOKEN: ${{ secrets.NPM_TOKEN }} + run: npx semantic-release diff --git a/.github/workflows/ci.yml b/.github/workflows/test.yml similarity index 93% rename from .github/workflows/ci.yml rename to .github/workflows/test.yml index 206b00d..44f6e84 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/test.yml @@ -1,8 +1,6 @@ -name: CI +name: Test on: - # push: - # branches: [main] pull_request: branches: [main] diff --git a/.releaserc.yml b/.releaserc.yml new file mode 100644 index 0000000..41810f4 --- /dev/null +++ b/.releaserc.yml @@ -0,0 +1,4 @@ +release: + branches: + - main + - next diff --git a/package.json b/package.json index c7ec630..82a4528 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "mcp-fetch-node", - "version": "0.0.1", + "version": "0.0.0-development", "description": "A Model Context Protocol server that provides web content fetching capabilities", "type": "module", "main": "dist/index.js", @@ -46,9 +46,10 @@ "@types/turndown": "^5.0.5", 
"eslint": "^9.20.1", "prettier": "^3.5.1", + "semantic-release": "^24.2.3", "tsx": "^4.19.2", "typescript": "^5.7.3", "typescript-eslint": "^8.24.0" }, - "packageManager": "pnpm@10.4.0+sha512.6b849d0787d97f8f4e1f03a9b8ff8f038e79e153d6f11ae539ae7c435ff9e796df6a862c991502695c7f9e8fac8aeafc1ac5a8dab47e36148d183832d886dd52" + "packageManager": "pnpm@10.4.1+sha512.c753b6c3ad7afa13af388fa6d808035a008e30ea9993f58c6663e2bc5ff21679aa834db094987129aa4d488b86df57f7b634981b2f827cdcacc698cc0cfb88af" } From bde3de850f91387db227b3890832c0d6e6f04d5e Mon Sep 17 00:00:00 2001 From: Thomas Gambet Date: Mon, 17 Feb 2025 12:58:01 +0100 Subject: [PATCH 06/10] chore: format --- .github/workflows/release.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 896bfb7..964ab41 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -29,16 +29,16 @@ jobs: - name: Setup Node.js uses: actions/setup-node@v4 with: - node-version: "lts/*" + node-version: 'lts/*' - name: Install pnpm uses: pnpm/action-setup@v2 with: version: 10 - + - name: Install dependencies run: pnpm install - + - name: Build run: pnpm build From 50ca7ec9817a6132178266e50ead3cdcb1b55b19 Mon Sep 17 00:00:00 2001 From: Thomas Gambet Date: Mon, 17 Feb 2025 15:25:25 +0100 Subject: [PATCH 07/10] chore: prepare for publishing --- .github/workflows/test.yml | 8 ++++++-- package.json | 8 ++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 44f6e84..dd7ec69 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,14 +2,18 @@ name: Test on: pull_request: - branches: [main] + branches: + - main jobs: test: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Setup Node.js uses: actions/setup-node@v4 diff --git a/package.json b/package.json 
index 82a4528..07744e0 100644 --- a/package.json +++ b/package.json @@ -28,6 +28,14 @@ "node": ">=22", "pnpm": ">=10" }, + "publishConfig": { + "access": "public" + }, + "files": [ + "dist", + "README.md", + "LICENSE" + ], "dependencies": { "fastmcp": "^1.16.3", "html-minifier": "^4.0.0", From 097e3531b58b9abdbcabbf177a3e163047761e24 Mon Sep 17 00:00:00 2001 From: Thomas Gambet Date: Mon, 17 Feb 2025 17:13:10 +0100 Subject: [PATCH 08/10] chore: format --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index dd7ec69..7f515b5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,7 +2,7 @@ name: Test on: pull_request: - branches: + branches: - main jobs: From 4178ef203850e82dd579f265271976ad69d09acd Mon Sep 17 00:00:00 2001 From: Thomas Gambet Date: Mon, 17 Feb 2025 17:33:17 +0100 Subject: [PATCH 09/10] fix: checkRobotsTxt, listen to errors --- README.md | 2 ++ src/main.ts | 11 +++++++++++ src/utils/check-robots-txt.ts | 2 +- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5d1c736..600dcd4 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,8 @@ pnpm format pnpm test pnpm build pnpm start +# test with MCP CLI +npx @wong2/mcp-cli --sse http://localhost:8080/sse ``` ## Contributing diff --git a/src/main.ts b/src/main.ts index 1a4046d..569041c 100644 --- a/src/main.ts +++ b/src/main.ts @@ -15,6 +15,17 @@ export async function serve() { version: '0.0.1', }); + server.on('connect', (event) => { + console.log('Client connected'); + event.session.on('error', (event) => { + console.error('Session error:', event.error); + }); + }); + + server.on('disconnect', (event) => { + console.log('Client disconnected:', event.session); + }); + server.addTool(fetchTool(userAgent, ignoreRobotsTxt)); server.addPrompt(fetchPrompt(userAgent)); diff --git a/src/utils/check-robots-txt.ts b/src/utils/check-robots-txt.ts index 
33e172a..68e6310 100644 --- a/src/utils/check-robots-txt.ts +++ b/src/utils/check-robots-txt.ts @@ -39,7 +39,7 @@ export async function checkRobotsTxt( .join('\n'); // @ts-expect-error : bad types - const robotsTxt = robotsParser(robotTxtUrl, processedRobotTxt) as Robot; + const robotsTxt = robotsParser(robotsTxtUrl, processedRobotTxt) as Robot; if (robotsTxt.isDisallowed(targetUrl, userAgent)) { throw new RobotsTxtError( From 703dec8b12ef3d251f06e493512df0abd7d59fc7 Mon Sep 17 00:00:00 2001 From: Thomas Gambet Date: Mon, 17 Feb 2025 19:36:35 +0100 Subject: [PATCH 10/10] chore: doc --- README.md | 27 ++++++++++++++++++++++++++- src/constants.ts | 1 + src/main.ts | 8 ++++---- src/utils/extract.ts | 2 +- src/utils/format.ts | 3 --- 5 files changed, 32 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 600dcd4..ea964d8 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,29 @@ The fetch tool will truncate the response, but by using the `start_index` argume } ``` +### Customization - robots.txt + +By default, the server will obey a website's robots.txt file if the request came from the model (via a tool), but not if +the request was user initiated (via a prompt). This can be disabled by adding the argument `--ignore-robots-txt` to the +`args` list in the configuration. + +### Customization - User-agent + +By default, depending on if the request came from the model (via a tool), or was user initiated (via a prompt), the +server will use either the user-agent + +``` +ModelContextProtocol/1.0 (Autonomous; +https://github.com/tgambet/mcp-fetch-node) +``` + +or + +``` +ModelContextProtocol/1.0 (User-Specified; +https://github.com/tgambet/mcp-fetch-node) +``` + +This can be customized by adding the argument `--user-agent=YourUserAgent` to the `args` list in the configuration.
+ ## Features - [x] Fetch and extract content from a URL @@ -63,7 +86,7 @@ pnpm test pnpm build pnpm start # test with MCP CLI -npx @wong2/mcp-cli --sse http://localhost:8080/sse +pnpx @wong2/mcp-cli --sse http://localhost:8080/sse ``` ## Contributing @@ -76,6 +99,7 @@ Contributions are welcome! Please feel free to submit a Pull Request. ## TODO +- [ ] Explain key differences with the original mcp/fetch tool - [ ] Add LRU cache - [ ] Publish to npm - [ ] Dockerize and publish to docker hub @@ -84,3 +108,4 @@ Contributions are welcome! Please feel free to submit a Pull Request. - [ ] Add tests - [ ] Add documentation & examples - [ ] Add benchmarks for extraction: cf https://github.com/adbar/trafilatura/blob/master/tests/comparison_small.py +- [ ] Showcase on FastMCP and MCP repositories diff --git a/src/constants.ts b/src/constants.ts index 6599968..7739512 100644 --- a/src/constants.ts +++ b/src/constants.ts @@ -1,4 +1,5 @@ export const DEFAULT_USER_AGENT_AUTONOMOUS = 'ModelContextProtocol/1.0 (Autonomous; +https://github.com/tgambet/mcp-fetch-node)'; + export const DEFAULT_USER_AGENT_MANUAL = 'ModelContextProtocol/1.0 (User-Specified; +https://github.com/tgambet/mcp-fetch-node)'; diff --git a/src/main.ts b/src/main.ts index 569041c..c8aeda7 100644 --- a/src/main.ts +++ b/src/main.ts @@ -12,7 +12,7 @@ const ignoreRobotsTxt = args['ignore-robots-txt'] as boolean | undefined; export async function serve() { const server = new FastMCP({ name: 'mcp-fetch-node', - version: '0.0.1', + version: '0.0.0', // TODO: use package.json version? 
}); server.on('connect', (event) => { @@ -22,8 +22,8 @@ export async function serve() { }); }); - server.on('disconnect', (event) => { - console.log('Client disconnected:', event.session); + server.on('disconnect', () => { + console.log('Client disconnected'); }); server.addTool(fetchTool(userAgent, ignoreRobotsTxt)); @@ -34,7 +34,7 @@ export async function serve() { transportType: 'sse', sse: { endpoint: '/sse', - port: 8080, + port: 8080, // TODO: make this configurable }, }); } diff --git a/src/utils/extract.ts b/src/utils/extract.ts index 03bbee4..cfaeab5 100644 --- a/src/utils/extract.ts +++ b/src/utils/extract.ts @@ -108,7 +108,7 @@ export function extract(html: string) { removeEmptyElements: true, }); - return result.trim(); + return result; } catch (error) { if (error instanceof ExtractError) { throw error; diff --git a/src/utils/format.ts b/src/utils/format.ts index cbf950a..e8b1bf3 100644 --- a/src/utils/format.ts +++ b/src/utils/format.ts @@ -44,9 +44,6 @@ export function format(html: string): string { try { return turndownService.turndown(html); } catch (error) { - if (error instanceof FormatError) { - throw error; - } throw new FormatError('Failed to convert HTML to Markdown', error); } }