fix: timeout if page can't be crawled

fixes #5
tobiasbueschel · Mar 26, 2023 · 8e48f80 · 8e48f80
1 parent 9c3c2d2
commit 8e48f80
Show file tree

Hide file tree

Showing 2 changed files with 116 additions and 60 deletions.
diff --git a/README.md b/README.md
@@ -53,6 +53,8 @@ flowchart LR
   E --> F[ChatGPT uses this context to provide an answer]
 ```
 
+Please note: the current implementation feeds Google Search results to `gpt-3.5-turbo` and does not include previous messages in subsequent queries to avoid surpassing the token limit.
+
 ## License
 
 This project is licensed under the [MIT license](./license).
diff --git a/index.js b/index.js
@@ -36,9 +36,6 @@ const convert = compile({
   ],
 });
 
-// Store any previous chats
-let previousChat = [];
-
 async function startCli() {
   rl.question(
     chalk.bgHex("#00A67E").white("🧠 Ask me anything:") + " ",
@@ -52,82 +49,118 @@ async function startCli() {
 async function searchGPT(userPrompt) {
   process.stdout.write(chalk.dim("> Starting Google Search..."));
 
-  previousChat = [
+  // Step 1: perform Google Search
+  // We crawl the first accessible page among the top 5 Google Search results, as it often contains the answer to the query.
+  // As a fallback, we also include the snippets of all search results in case the answer is not
+  // included in the crawled page already.
+  const searchResults = await getGoogleSearchResults(userPrompt);
+  const [context, urlReference] =
+    (await getTextOfSearchResults(searchResults)) || [];
+
+  // Step 2: build up chat messages by providing search result context and user prompt
+  const chatMessages = [
     {
       role: "system",
       content: `You are my AI assistant and I want you to assume today is ${new Date().toDateString()}.`,
     },
+    {
+      role: "assistant",
+      content: context,
+    },
+    {
+      role: "user",
+      content: `With the information in the assistant's last message, answer this: ${userPrompt}`,
+    },
   ];
 
-  // Step 1: perform Google Search
-  // We crawl the first page returned in the Google Search as it often contains the result of the query.
-  // As a fallback, we also include all snippets from the other search result pages in case the answer is not
-  // included in the first page already.
-  const searchResults = await getGoogleSearchResults(userPrompt);
-  const [firstpage, ...remainingPages] = searchResults.items;
-  const urlToCheck = firstpage.link;
-
-  process.stdout.cursorTo(0);
-  process.stdout.write(chalk.dim(`> Checking: ${urlToCheck}`));
-
-  // Fetch raw HTML of first page & get main content
-  const htmlString = await fetch(urlToCheck);
-  let context = convert(await htmlString.text());
-
-  // Get all Google Search snippets, clean them up and add to the text
-  context += remainingPages
-    .reduce((allPages, currentPage) => `${allPages} ${currentPage.snippet}`, "")
-    .replaceAll("...", " "); // Remove "..." from Google snippet results;
-
-  // Note: we must stay below the max token amount of OpenAI's API.
-  // Max token amount: 4096, 1 token ~= 4 chars in English
-  // Hence, we should roughly ensure we stay below 10,000 characters for the input
-  // and leave the remaining the tokens for the answer.
-  // - https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
-  // - https://platform.openai.com/docs/api-reference/chat/create
-  context = context
-    .replaceAll("\n", " ") // Remove any new lines from raw HTML of first page
-    .trim()
-    .substring(0, 10000);
-
-  // Provide OpenAI with the context from the Google Search
-  previousChat.push({
-    role: "assistant",
-    content: context,
-  });
-
-  // Step 2: feed search results into OpenAI and answer original question
-  previousChat.push({
-    role: "user",
-    content: `With the information in the assistant's last message, answer this: ${userPrompt}`,
-  });
-
-  const finalResponse = await getOpenAIChatCompletion(previousChat);
+  // Step 3: reach out to OpenAI to answer the original user prompt with the attached context
+  const finalResponse = await getOpenAIChatCompletion(chatMessages);
 
   process.stdout.clearLine(0);
   process.stdout.cursorTo(0);
+
   console.log("\n" + chalk.green("> ") + chalk.white(finalResponse));
-  console.log(chalk.dim(`> Know more: ${urlToCheck}` + "\n"));
+  console.log(chalk.dim(`> Know more: ${urlReference}` + "\n"));
 
   return finalResponse;
 }
 
-async function getGoogleSearchResults(searchTerm) {
+/**
+ * Crawl the first accessible page of the Google Search results (trying at most 5 pages) and get its main content.
+ * If a page is not accessible, try the next one, and so on.
+ */
+async function getTextOfSearchResults(searchResults) {
   try {
-    const response = await fetch(
-      `https://www.googleapis.com/customsearch/v1\?key\=${GOOGLE_SEARCH_API_KEY}\&cx=${GOOGLE_SEARCH_ID}\&q\=${searchTerm}`
+    let urlReference = "";
+
+    // Get all Google Search snippets, clean them up by removing "..." and add to the text context
+    let context = searchResults.items.reduce(
+      (allPages, currentPage) =>
+        `${allPages} ${currentPage.snippet.replaceAll("...", " ")}`,
+      ""
     );
 
-    const data = await response.json();
-    return data;
+    // Loop over searchResults.items until we find a page that is accessible, break if we try more than 5 pages or we reached the end of searchResults.items
+    for (let i = 0; i < searchResults.items.length && i < 5; i++) {
+      const urlToCheck = searchResults.items[i].link;
+
+      process.stdout.clearLine(0);
+      process.stdout.cursorTo(0);
+      process.stdout.write(chalk.dim(`> Checking: ${urlToCheck}`));
+
+      // Fetch the HTML of the page & get main content. If the response status is not OK (non-2xx), we try the next page.
+      // If the fetch request gets stuck for more than 5 seconds, we also try the next page.
+      const response = await Promise.race([
+        fetch(urlToCheck),
+        new Promise((resolve) => setTimeout(() => resolve(undefined), 5000)),
+      ]);
+
+      if (!response?.ok) {
+        continue;
+      }
+
+      // Get the full text from the raw HTML and remove any new lines from it as we don't need them
+      const fullText = convert(await response.text())
+        .replaceAll("\n", " ")
+        .trim();
+      context = fullText + context;
+      urlReference = urlToCheck;
+
+      break;
+    }
+
+    // Note: we must stay below the max token amount of OpenAI's API.
+    // Max token amount: 4096, 1 token ~= 4 chars in English
+    // Hence, we should roughly ensure we stay below 10,000 characters for the input
+    // and leave the remaining tokens for the answer.
+    // - https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
+    // - https://platform.openai.com/docs/api-reference/chat/create
+    context = context.substring(0, 10000);
+
+    return [context, urlReference];
   } catch (error) {
     console.error(error);
   }
 }
 
+/**
+ * Fetch the first page of Google Search results
+ */
+async function getGoogleSearchResults(searchTerm) {
+  const response = await makeFetch(
+    `https://www.googleapis.com/customsearch/v1\?key\=${GOOGLE_SEARCH_API_KEY}\&cx=${GOOGLE_SEARCH_ID}\&q\=${searchTerm}`
+  );
+  const data = await response.json();
+  return data;
+}
+
+/**
+ * Call OpenAI's chat API to answer the user's prompt with the context from Google Search
+ */
 async function getOpenAIChatCompletion(previousChat) {
-  try {
-    const response = await fetch("https://api.openai.com/v1/chat/completions", {
+  const response = await makeFetch(
+    "https://api.openai.com/v1/chat/completions",
+    {
       method: "POST",
       headers: {
         "Content-Type": "application/json",
@@ -137,12 +170,33 @@ async function getOpenAIChatCompletion(previousChat) {
         model: "gpt-3.5-turbo",
         messages: previousChat,
       }),
-    });
+    }
+  );
+
+  const { choices } = await response.json();
+  return choices[0].message.content;
+}
 
-    const { choices } = await response.json();
-    return choices[0].message.content;
+/**
+ * Helper function to make fetch requests
+ */
+async function makeFetch(url, options) {
+  try {
+    const response = await fetch(url, options);
+    // The Promise returned from fetch() won’t reject on HTTP error status even if the response is an HTTP 404 or 500.
+    if (response.ok) {
+      return response;
+    }
+    // for all other status codes (e.g., 404, 500), log the error; no response is returned in that case
+    console.error(
+      `The ${options.method} ${url}" request failed with code: ${response.status} and message: ${response.statusText}`
+    );
   } catch (error) {
-    console.error(error);
+    // if the request is rejected due to e.g., network failures, this will log the error
+    console.error(
+      `The ${options.method} ${url}" request failed due to a network error`,
+      error
+    );
   }
 }