Let application pass session options to the runtime, allow float16 for LLM KV-cache #631

Open · wants to merge 3 commits into main
63 changes: 18 additions & 45 deletions package-lock.json

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions package.json
@@ -38,12 +38,12 @@
   },
   "homepage": "https://github.com/xenova/transformers.js#readme",
   "dependencies": {
-    "onnxruntime-web": "1.14.0",
+    "onnxruntime-web": "1.17.1",
     "sharp": "^0.32.0",
     "@huggingface/jinja": "^0.2.1"
   },
   "optionalDependencies": {
-    "onnxruntime-node": "1.14.0"
+    "onnxruntime-node": "1.17.1"
   },
   "devDependencies": {
     "@types/jest": "^29.5.1",
64 changes: 38 additions & 26 deletions src/models.js
100644 → 100755
@@ -123,23 +123,29 @@ async function constructSession(pretrained_model_name_or_path, fileName, options
     let buffer = await getModelFile(pretrained_model_name_or_path, modelFileName, true, options);

     try {
-        return await InferenceSession.create(buffer, {
-            executionProviders,
-        });
-    } catch (err) {
-        // If the execution provider was only wasm, throw the error
-        if (executionProviders.length === 1 && executionProviders[0] === 'wasm') {
-            throw err;
+        let opt = options.session_options || {};
+
+        // use the default execution providers if the application did not specify any
+        if (opt.executionProviders === undefined) {
+            opt.executionProviders = executionProviders;
+        }
+
+        // handle ONNX external data files
+        if (opt.externalData !== undefined) {
+            for (let i = 0; i < opt.externalData.length; i++) {
+                const ext = opt.externalData[i];
+                // if the external data is given as a string, fetch the file and replace the string with its contents
+                if (typeof ext.data === "string") {
+                    const ext_buffer = await getModelFile(pretrained_model_name_or_path, ext.data, true, options);
+                    ext.data = ext_buffer;
+                }
+            }
         }
-        console.warn(err);
-        console.warn(
-            'Something went wrong during model construction (most likely a missing operation). ' +
-            'Using `wasm` as a fallback. '
-        )
-        return await InferenceSession.create(buffer, {
-            executionProviders: ['wasm']
-        });
+        return await InferenceSession.create(buffer, opt);
+    } catch (err) {
+        // if session creation fails, let the application handle it, i.e. if webgpu fails and we
+        // fall back to wasm, let the application decide whether to use a quantized model, etc.
+        throw err;
     }
 }
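With this change, backend failures are no longer silently retried on wasm: the application owns the session configuration and the fallback policy. A minimal sketch of what a caller might now pass (the model id and external-data file name are hypothetical; the { path, data } entry shape follows onnxruntime-web's externalData session option):

    import { AutoModelForCausalLM } from '@xenova/transformers';

    // Hypothetical model id and weight file, for illustration only.
    const model = await AutoModelForCausalLM.from_pretrained('my-org/my-llm', {
        session_options: {
            // Ask for WebGPU instead of the default wasm provider.
            executionProviders: ['webgpu'],
            // Weights stored outside the .onnx file: constructSession fetches
            // the string `data` and replaces it with the downloaded buffer.
            externalData: [
                { path: 'model.onnx_data', data: 'model.onnx_data' },
            ],
        },
    });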

@@ -741,6 +747,7 @@ export class PreTrainedModel extends Callable {
         local_files_only = false,
         revision = 'main',
         model_file_name = null,
+        session_options = {},
     } = {}) {

         let options = {
@@ -751,6 +758,7 @@ export class PreTrainedModel extends Callable {
             local_files_only,
             revision,
             model_file_name,
+            session_options,
         }

         const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
@@ -1296,6 +1304,8 @@ export class PreTrainedModel extends Callable {
         } else {
             // TODO support batches (i.e., batch_size > 1)
             const batch_size = 1;
+            const dtype = this.config.precision || 'float32';
+            const empty = (dtype === 'float16') ? new Uint16Array() : [];

             // @ts-ignore
             if (this.config.is_encoder_decoder && (this.add_encoder_pkv ?? true)) {
@@ -1305,26 +1315,26 @@ export class PreTrainedModel extends Callable {
                 let decoder_dims = [batch_size, this.num_decoder_heads, 0, this.decoder_dim_kv];
                 // @ts-ignore
                 for (let i = 0; i < this.num_decoder_layers; ++i) {
-                    decoderFeeds[`past_key_values.${i}.encoder.key`] = new Tensor('float32', [], encoder_dims)
-                    decoderFeeds[`past_key_values.${i}.encoder.value`] = new Tensor('float32', [], encoder_dims)
-                    decoderFeeds[`past_key_values.${i}.decoder.key`] = new Tensor('float32', [], decoder_dims)
-                    decoderFeeds[`past_key_values.${i}.decoder.value`] = new Tensor('float32', [], decoder_dims)
+                    decoderFeeds[`past_key_values.${i}.encoder.key`] = new Tensor(dtype, empty, encoder_dims)
+                    decoderFeeds[`past_key_values.${i}.encoder.value`] = new Tensor(dtype, empty, encoder_dims)
+                    decoderFeeds[`past_key_values.${i}.decoder.key`] = new Tensor(dtype, empty, decoder_dims)
+                    decoderFeeds[`past_key_values.${i}.decoder.value`] = new Tensor(dtype, empty, decoder_dims)
                 }
             } else if (this.config.model_type === 'falcon') {
                 // NOTE: Custom implementation for Falcon
                 // @ts-ignore
                 let dims = [batch_size * this.num_heads, 0, this.dim_kv]
                 // @ts-ignore
                 for (let i = 0; i < this.num_layers; ++i) {
-                    decoderFeeds[`past_key_values.${i}.key`] = new Tensor('float32', [], dims)
-                    decoderFeeds[`past_key_values.${i}.value`] = new Tensor('float32', [], dims)
+                    decoderFeeds[`past_key_values.${i}.key`] = new Tensor(dtype, empty, dims)
+                    decoderFeeds[`past_key_values.${i}.value`] = new Tensor(dtype, empty, dims)
                 }
             } else if (this.config.multi_query) { // e.g., for `gpt_bigcode`
                 // @ts-ignore
                 let dims = [batch_size * this.num_heads, 0, 2 * this.dim_kv]
                 // @ts-ignore
                 for (let i = 0; i < this.num_layers; ++i) {
-                    decoderFeeds[`past_key_values.${i}.key_value`] = new Tensor('float32', [], dims)
+                    decoderFeeds[`past_key_values.${i}.key_value`] = new Tensor(dtype, empty, dims)
                 }
             } else if (this.config.model_type === 'bloom') {
                 // NOTE: Custom implementation for Bloom
@@ -1335,16 +1345,16 @@ export class PreTrainedModel extends Callable {
                 let valueDims = [batch_size * this.num_heads, 0, this.dim_kv] // [batch_size x num_heads, past_sequence_length, 64]
                 // @ts-ignore
                 for (let i = 0; i < this.num_layers; ++i) {
-                    decoderFeeds[`past_key_values.${i}.key`] = new Tensor('float32', [], keyDims)
-                    decoderFeeds[`past_key_values.${i}.value`] = new Tensor('float32', [], valueDims)
+                    decoderFeeds[`past_key_values.${i}.key`] = new Tensor(dtype, empty, keyDims)
+                    decoderFeeds[`past_key_values.${i}.value`] = new Tensor(dtype, empty, valueDims)
                 }
             } else { // Decoder-only
                 // @ts-ignore
                 let dims = [batch_size, this.num_heads, 0, this.dim_kv]
                 // @ts-ignore
                 for (let i = 0; i < this.num_layers; ++i) {
-                    decoderFeeds[`past_key_values.${i}.key`] = new Tensor('float32', [], dims)
-                    decoderFeeds[`past_key_values.${i}.value`] = new Tensor('float32', [], dims)
+                    decoderFeeds[`past_key_values.${i}.key`] = new Tensor(dtype, empty, dims)
+                    decoderFeeds[`past_key_values.${i}.value`] = new Tensor(dtype, empty, dims)
                 }
             }
         }
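The empty cache buffer is a Uint16Array for float16 because JavaScript has no Float16Array: each 16-bit element carries the raw IEEE-754 half-precision bit pattern that the runtime reads for a 'float16' tensor. A short sketch of that packing, a hypothetical helper not taken from this PR (mantissa is truncated, denormals flush to zero):

    // Convert a JS number to its float16 bit pattern. Shown only to
    // illustrate why Uint16Array can back a float16 tensor.
    function toHalfBits(value) {
        const f32 = new Float32Array(1);
        const u32 = new Uint32Array(f32.buffer);
        f32[0] = value;
        const bits = u32[0];
        const sign = (bits >>> 16) & 0x8000;
        const exp = ((bits >>> 23) & 0xff) - 127 + 15; // re-bias exponent
        const frac = (bits >>> 13) & 0x3ff;            // top 10 mantissa bits
        if (exp <= 0) return sign;                     // underflow: signed zero
        if (exp >= 31) return sign | 0x7c00;           // overflow: infinity
        return sign | (exp << 10) | frac;
    }

    const halfData = new Uint16Array([toHalfBits(0.5), toHalfBits(-2.0)]);
    // e.g. new Tensor('float16', halfData, [2]) would wrap this buffer.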
@@ -5380,6 +5390,7 @@ export class PretrainedMixin {
         local_files_only = false,
         revision = 'main',
         model_file_name = null,
+        session_options = {},
     } = {}) {

         let options = {
@@ -5390,6 +5401,7 @@ export class PretrainedMixin {
             local_files_only,
             revision,
             model_file_name,
+            session_options,
         }
         config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
         if (!options.config) {
2 changes: 2 additions & 0 deletions src/pipelines.js
100644 → 100755
@@ -3019,6 +3019,7 @@ export async function pipeline(
         cache_dir = null,
         local_files_only = false,
         revision = 'main',
+        session_options = {},
     } = {}
 ) {
     // Helper method to construct pipeline
@@ -3046,6 +3047,7 @@ export async function pipeline(
         cache_dir,
         local_files_only,
         revision,
+        session_options,
     }

     const classes = new Map([
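The new option threads through the pipeline() factory unchanged, so runtime configuration can be set at the top-level API as well. A hedged usage sketch (the provider list is illustrative, and Xenova/gpt2 stands in for any text-generation checkpoint):

    import { pipeline } from '@xenova/transformers';

    // session_options is forwarded to each InferenceSession the pipeline
    // creates; execution providers are tried in the order given.
    const generator = await pipeline('text-generation', 'Xenova/gpt2', {
        session_options: { executionProviders: ['webgpu', 'wasm'] },
    });
    const output = await generator('Hello, my name is');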
1 change: 1 addition & 0 deletions src/utils/hub.js
100644 → 100755
@@ -30,6 +30,7 @@ if (!globalThis.ReadableStream) {
  * since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier allowed by git.
  * NOTE: This setting is ignored for local requests.
  * @property {string} [model_file_name=null] If specified, load the model with this name (excluding the .onnx suffix). Currently only valid for encoder- or decoder-only models.
+ * @property {{}} [session_options={}] Session options passed to the runtime.
  */

 class FileResponse {