@@ -165,13 +165,13 @@ const server = createServer(async (request, response) => {
     }
     console.timeEnd("function-exec");
 
-    // Now that we have a tool result, let's use it to call the model. Note that we're calling the model
-    // via the Models API, instead of the Copilot Chat API, so that if we're in the execute-model tool we
-    // can switch out the default model name for the requested model. We could change this in the future
-    // if we want to handle rate-limited users more gracefully or the model difference becomes a problem.
+    // Now that we have a tool result, let's use it to call the model.
     try {
+      let stream: AsyncIterable<any>;
+
       if (functionToCall.name === executeModel.definition.name) {
-        // fetch the model data from the index (already in-memory) so we have all the information we need
+        // First, let's write a reference with the model we're executing.
+        // Fetch the model data from the index (already in-memory) so we have all the information we need
         // to build out the reference URLs
         const modelData = await modelsAPI.getModelFromIndex(functionCallRes.model);
         const sseData = {
@@ -189,15 +189,30 @@ const server = createServer(async (request, response) => {
         };
         const event = createReferencesEvent([sseData]);
         response.write(event);
-      }
 
-      // We should keep all optional parameters out of this call, so it can work for any model (in case we've
-      // just run the execute-model tool).
-      const stream = await modelsAPI.inference.chat.completions.create({
-        model: functionCallRes.model,
-        messages: functionCallRes.messages,
-        stream: true,
-      });
+        if (["o1-mini", "o1-preview"].includes(args.model)) {
+          // for non-streaming models, we need to still stream the response back, so we build the stream ourselves
+          stream = (async function* () {
+            const result = await modelsAPI.inference.chat.completions.create({
+              model: functionCallRes.model,
+              messages: functionCallRes.messages
+            });
+            yield result;
+          })();
+        } else {
+          stream = await modelsAPI.inference.chat.completions.create({
+            model: functionCallRes.model,
+            messages: functionCallRes.messages,
+            stream: true
+          });
+        }
+      } else {
+        stream = await capiClient.chat.completions.create({
+          stream: true,
+          model: "gpt-4o",
+          messages: functionCallRes.messages,
+        });
+      }
 
       console.time("streaming");
       for await (const chunk of stream) {
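
For reference, the key technique in this change is wrapping a single, non-streaming completion in an immediately-invoked async generator, so the downstream `for await` loop can consume streaming and non-streaming models the same way. The following is a minimal, self-contained TypeScript sketch of that pattern; `Completion`, `fetchCompletion`, and `asStream` are hypothetical names used only for illustration and are not part of the extension's code.

// Hypothetical minimal shape of a chat completion result (illustration only).
type Completion = { choices: Array<{ message: { content: string } }> };

// Stand-in for a non-streaming API call (illustration only).
async function fetchCompletion(): Promise<Completion> {
  return { choices: [{ message: { content: "hello" } }] };
}

// Wrap a single pending result in an immediately-invoked async generator,
// producing an AsyncIterable that yields exactly one "chunk".
function asStream(pending: Promise<Completion>): AsyncIterable<Completion> {
  return (async function* () {
    yield await pending;
  })();
}

async function main() {
  // The same consuming loop works whether the source streamed or not.
  for await (const chunk of asStream(fetchCompletion())) {
    console.log(chunk.choices[0].message.content);
  }
}

main().catch(console.error);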