Skip to content

Commit

Permalink
fix: timeout if page can't be crawled
Browse files Browse the repository at this point in the history
fixes #5
  • Loading branch information
tobiasbueschel committed Mar 26, 2023
1 parent 9c3c2d2 commit 8e48f80
Show file tree
Hide file tree
Showing 2 changed files with 116 additions and 60 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ flowchart LR
E --> F[ChatGPT uses this context to provide an answer]
```

Please note: the current implementation feeds Google Search results to `gpt-3.5-turbo` and does not include previous messages in subsequent queries to avoid surpassing the token limit.

## License

This project is licensed under the [MIT license](./license).
174 changes: 114 additions & 60 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,6 @@ const convert = compile({
],
});

// Store any previous chats
let previousChat = [];

async function startCli() {
rl.question(
chalk.bgHex("#00A67E").white("🧠 Ask me anything:") + " ",
Expand All @@ -52,82 +49,118 @@ async function startCli() {
async function searchGPT(userPrompt) {
process.stdout.write(chalk.dim("> Starting Google Search..."));

previousChat = [
// Step 1: perform Google Search
// We crawl the first 5 pages returned from Google Search as it often contains the result of the query.
// As a fallback, we also include all snippets from other search result pages in case the answer is not
// included in the crawled page already.
const searchResults = await getGoogleSearchResults(userPrompt);
const [context, urlReference] =
(await getTextOfSearchResults(searchResults)) || [];

// Step 2: build up chat messages by providing search result context and user prompt
const chatMessages = [
{
role: "system",
content: `You are my AI assistant and I want you to assume today is ${new Date().toDateString()}.`,
},
{
role: "assistant",
content: context,
},
{
role: "user",
content: `With the information in the assistant's last message, answer this: ${userPrompt}`,
},
];

// Step 1: perform Google Search
// We crawl the first page returned in the Google Search as it often contains the result of the query.
// As a fallback, we also include all snippets from the other search result pages in case the answer is not
// included in the first page already.
const searchResults = await getGoogleSearchResults(userPrompt);
const [firstpage, ...remainingPages] = searchResults.items;
const urlToCheck = firstpage.link;

process.stdout.cursorTo(0);
process.stdout.write(chalk.dim(`> Checking: ${urlToCheck}`));

// Fetch raw HTML of first page & get main content
const htmlString = await fetch(urlToCheck);
let context = convert(await htmlString.text());

// Get all Google Search snippets, clean them up and add to the text
context += remainingPages
.reduce((allPages, currentPage) => `${allPages} ${currentPage.snippet}`, "")
.replaceAll("...", " "); // Remove "..." from Google snippet results;

// Note: we must stay below the max token amount of OpenAI's API.
// Max token amount: 4096, 1 token ~= 4 chars in English
// Hence, we should roughly ensure we stay below 10,000 characters for the input
// and leave the remaining the tokens for the answer.
// - https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
// - https://platform.openai.com/docs/api-reference/chat/create
context = context
.replaceAll("\n", " ") // Remove any new lines from raw HTML of first page
.trim()
.substring(0, 10000);

// Provide OpenAI with the context from the Google Search
previousChat.push({
role: "assistant",
content: context,
});

// Step 2: feed search results into OpenAI and answer original question
previousChat.push({
role: "user",
content: `With the information in the assistant's last message, answer this: ${userPrompt}`,
});

const finalResponse = await getOpenAIChatCompletion(previousChat);
// Step 2: reach out to OpenAI to answer original user prompt with attached context
const finalResponse = await getOpenAIChatCompletion(chatMessages);

process.stdout.clearLine(0);
process.stdout.cursorTo(0);

console.log("\n" + chalk.green("> ") + chalk.white(finalResponse));
console.log(chalk.dim(`> Know more: ${urlToCheck}` + "\n"));
console.log(chalk.dim(`> Know more: ${urlReference}` + "\n"));

return finalResponse;
}

async function getGoogleSearchResults(searchTerm) {
/**
* Crawl the first page of Google Search results and get the main content
* If the first page is not accessible, try the next page and so on.
*/
async function getTextOfSearchResults(searchResults) {
try {
const response = await fetch(
`https://www.googleapis.com/customsearch/v1\?key\=${GOOGLE_SEARCH_API_KEY}\&cx=${GOOGLE_SEARCH_ID}\&q\=${searchTerm}`
let urlReference = "";

// Get all Google Search snippets, clean them up by removing "..." and add to the text context
let context = searchResults.items.reduce(
(allPages, currentPage) =>
`${allPages} ${currentPage.snippet.replaceAll("...", " ")}`,
""
);

const data = await response.json();
return data;
// Loop over searchResults.items until we find a page that is accessible, break if we try more than 5 pages or we reached the end of searchResults.items
for (let i = 0; i < searchResults.items.length && i < 5; i++) {
const urlToCheck = searchResults.items[i].link;

process.stdout.clearLine(0);
process.stdout.cursorTo(0);
process.stdout.write(chalk.dim(`> Checking: ${urlToCheck}`));

// Fetch the HTML of the page & get main content. If we get a non 200-code, we try the next page.
// if fetch request gets stuck for more than 5 seconds, we try the next page.
const response = await Promise.race([
fetch(urlToCheck),
new Promise((resolve) => setTimeout(() => resolve(undefined), 5000)),
]);

if (!response?.ok) {
continue;
}

// Get the full text from the raw HTML and remove any new lines from it as we don't need them
const fullText = convert(await response.text())
.replaceAll("\n", " ")
.trim();
context = fullText + context;
urlReference = urlToCheck;

break;
}

// Note: we must stay below the max token amount of OpenAI's API.
// Max token amount: 4096, 1 token ~= 4 chars in English
// Hence, we should roughly ensure we stay below 10,000 characters for the input
// and leave the remaining the tokens for the answer.
// - https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
// - https://platform.openai.com/docs/api-reference/chat/create
context = context.substring(0, 10000);

return [context, urlReference];
} catch (error) {
console.error(error);
}
}

/**
* Fetch the first page of Google Search results
*/
async function getGoogleSearchResults(searchTerm) {
const response = await makeFetch(
`https://www.googleapis.com/customsearch/v1\?key\=${GOOGLE_SEARCH_API_KEY}\&cx=${GOOGLE_SEARCH_ID}\&q\=${searchTerm}`
);
const data = await response.json();
return data;
}

/**
* Call OpenAI's chat API to answer the user's prompt with the context from Google Search
*/
async function getOpenAIChatCompletion(previousChat) {
try {
const response = await fetch("https://api.openai.com/v1/chat/completions", {
const response = await makeFetch(
"https://api.openai.com/v1/chat/completions",
{
method: "POST",
headers: {
"Content-Type": "application/json",
Expand All @@ -137,12 +170,33 @@ async function getOpenAIChatCompletion(previousChat) {
model: "gpt-3.5-turbo",
messages: previousChat,
}),
});
}
);

const { choices } = await response.json();
return choices[0].message.content;
}

const { choices } = await response.json();
return choices[0].message.content;
/**
* Helper function to make fetch requests
*/
async function makeFetch(url, options) {
try {
const response = await fetch(url, options);
// The Promise returned from fetch() won’t reject on HTTP error status even if the response is an HTTP 404 or 500.
if (response.ok) {
return response;
}
// for all other status codes (e.g., 404, 500), this will log the error and stop the e2e tests
console.error(
`The ${options.method} ${url}" request failed with code: ${response.status} and message: ${response.statusText}`
);
} catch (error) {
console.error(error);
// if the request is rejected due to e.g., network failures, this will log the error
console.error(
`The ${options.method} ${url}" request failed due to a network error`,
error
);
}
}

Expand Down

0 comments on commit 8e48f80

Please sign in to comment.