From 20c5fe1cdd845ea17583ddaaeb8b439c2de6cc13 Mon Sep 17 00:00:00 2001 From: M S Date: Sat, 5 Aug 2023 13:57:10 -0500 Subject: [PATCH 01/13] updated source to supoort mps and cuds --- README.md | 30 +-- .../instruction-following/Llama-v2.yaml | 2 +- convert-to-flexgen.py | 63 ----- docs/Extensions.md | 233 ++++++++++++------ docs/FlexGen.md | 64 ----- docs/Low-VRAM-guide.md | 53 ---- docs/README.md | 2 - docs/llama.cpp-models.md | 53 ---- extensions/api/blocking_api.py | 13 +- extensions/api/streaming_api.py | 9 +- extensions/api/util.py | 1 - extensions/example/script.py | 137 ++++++++++ extensions/llava/script.py | 8 - extensions/multimodal/script.py | 9 + extensions/openai/README.md | 117 +++++---- extensions/openai/completions.py | 117 ++++++--- extensions/openai/defaults.py | 2 - extensions/openai/embeddings.py | 31 ++- extensions/openai/errors.py | 8 +- extensions/openai/images.py | 28 ++- extensions/openai/moderations.py | 15 +- extensions/openai/script.py | 28 ++- extensions/openai/tokens.py | 7 +- extensions/openai/utils.py | 4 +- extensions/send_pictures/script.py | 16 +- extensions/superbooga/download_urls.py | 5 +- extensions/whisper_stt/script.py | 10 + models/config.yaml | 2 +- modules/ComputeDevice.py | 164 ++++++++++++ modules/chat.py | 6 +- modules/deepspeed_parameters.py | 2 +- modules/extensions.py | 23 +- modules/llamacpp_hf.py | 2 + modules/loaders.py | 4 + modules/models_settings.py | 2 - modules/shared.py | 24 +- modules/text_generation.py | 67 ----- modules/training.py | 4 +- modules/ui.py | 10 +- modules/utils.py | 5 +- server.py | 70 ++++-- settings-template.yaml | 2 +- 42 files changed, 828 insertions(+), 624 deletions(-) delete mode 100644 convert-to-flexgen.py delete mode 100644 docs/FlexGen.md delete mode 100644 docs/Low-VRAM-guide.md delete mode 100644 docs/llama.cpp-models.md create mode 100644 extensions/example/script.py delete mode 100644 extensions/llava/script.py create mode 100644 modules/ComputeDevice.py diff --git a/README.md b/README.md index ae485898..a703b01d 100644 --- a/README.md +++ b/README.md @@ -36,9 +36,9 @@ Anyone who would like to assist with supporting Apple Silicon, let me know. Ther - [AutoGPTQ](#autogptq) - [ExLlama](#exllama) - [GPTQ-for-LLaMa](#gptq-for-llama) - - [FlexGen](#flexgen) - [DeepSpeed](#deepspeed) - [RWKV](#rwkv) + - [RoPE (for llama.cpp and ExLlama only)](#rope-for-llamacpp-and-exllama-only) - [Gradio](#gradio) - [API](#api) - [Multimodal](#multimodal) @@ -47,7 +47,6 @@ Anyone who would like to assist with supporting Apple Silicon, let me know. Ther - [Community](#community) - [Credits](#credits) - ## Features * 3 interface modes: default, notebook, and chat @@ -56,7 +55,7 @@ Anyone who would like to assist with supporting Apple Silicon, let me know. 
Ther
 * LoRA: load and unload LoRAs on the fly, load multiple LoRAs at the same time, train a new LoRA
 * Precise instruction templates for chat mode, including Alpaca, Vicuna, Open Assistant, Dolly, Koala, ChatGLM, MOSS, RWKV-Raven, Galactica, StableLM, WizardLM, Baize, Ziya, Chinese-Vicuna, MPT, INCITE, Wizard Mega, KoAlpaca, Vigogne, Bactrian, h2o, and OpenBuddy
 * [Multimodal pipelines, including LLaVA and MiniGPT-4](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal)
-* 8-bit and 4-bit inference through bitsandbytes **CPU only mode for macOS, bitsandbytes does not support Apple Silicon M1/M2 processors**
+* 8-bit and 4-bit inference through bitsandbytes **CPU only mode for macOS, bitsandbytes does not support Apple Silicon GPU**
 * CPU mode for transformers models
 * [DeepSpeed ZeRO-3 inference](docs/DeepSpeed.md)
 * [Extensions](docs/Extensions.md)
@@ -165,7 +164,7 @@ Optionally, you can use the following command-line flags:
 
 | Flag | Description |
 |--------------------------------------------|-------------|
-| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, flexgen |
+| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv |
 
 #### Accelerate/transformers
 
@@ -203,8 +202,8 @@ Optionally, you can use the following command-line flags:
 | `--n_batch` | Maximum number of prompt tokens to batch together when calling llama_eval. |
 | `--no-mmap` | Prevent mmap from being used. |
 | `--mlock` | Force the system to keep the model in RAM. |
-| `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. |
-| `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS. Set this to 1000000000 to offload all layers to the GPU. |
+| `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. Does not apply to the Apple Silicon GPU, which uses unified memory. |
+| `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS, or with Metal for the Apple Silicon GPU. Load the model and look for `llama_model_load_internal: n_layer` in the STDERR output to see how many layers the model has, then set this value to that number (or possibly n + 2). This setting is very sensitive: setting it too high will overrun the data area or tensor cache and cause a segmentation fault. |
 | `--n_ctx N_CTX` | Size of the prompt context. |
 | `--llama_cpp_seed SEED` | Seed for llama-cpp models. Default 0 (random). |
 | `--n_gqa N_GQA` | grouped-query attention. Must be 8 for llama2 70b. |
@@ -226,8 +225,6 @@ Optionally, you can use the following command-line flags:
 |------------------|-------------|
 |`--gpu-split` | Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. `20,7,7` |
 |`--max_seq_len MAX_SEQ_LEN` | Maximum sequence length. |
-|`--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. |
-|`--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Same as above. Use either this or compress_pos_emb, not both. 
` #### GPTQ-for-LLaMa @@ -243,14 +240,6 @@ Optionally, you can use the following command-line flags: | `--warmup_autotune` | (triton) Enable warmup autotune. | | `--fused_mlp` | (triton) Enable fused mlp. | -#### FlexGen - -| Flag | Description | -|------------------|-------------| -| `--percent PERCENT [PERCENT ...]` | FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0). | -| `--compress-weight` | FlexGen: Whether to compress weight (default: False).| -| `--pin-weight [PIN_WEIGHT]` | FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%). | - #### DeepSpeed | Flag | Description | @@ -266,6 +255,13 @@ Optionally, you can use the following command-line flags: | `--rwkv-strategy RWKV_STRATEGY` | RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". | | `--rwkv-cuda-on` | RWKV: Compile the CUDA kernel for better performance. | +#### RoPE (for llama.cpp and ExLlama only) + +| Flag | Description | +|------------------|-------------| +|`--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. | +|`--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both. | + #### Gradio | Flag | Description | @@ -293,8 +289,6 @@ Optionally, you can use the following command-line flags: |---------------------------------------|-------------| | `--multimodal-pipeline PIPELINE` | The multimodal pipeline to use. Examples: `llava-7b`, `llava-13b`. | -Out of memory errors? [Check the low VRAM guide](docs/Low-VRAM-guide.md). - ## Presets Inference settings presets can be created under `presets/` as yaml files. These files are detected automatically at startup. diff --git a/characters/instruction-following/Llama-v2.yaml b/characters/instruction-following/Llama-v2.yaml index a3af0e87..d259dd39 100644 --- a/characters/instruction-following/Llama-v2.yaml +++ b/characters/instruction-following/Llama-v2.yaml @@ -1,4 +1,4 @@ user: "" bot: "" turn_template: "<|user|><|user-message|> [/INST] <|bot|><|bot-message|> [INST] " -context: "[INST] <>\nAnswer the questions.\n<>\n" +context: "[INST] <>\nAnswer the questions.\n<>\n\n" diff --git a/convert-to-flexgen.py b/convert-to-flexgen.py deleted file mode 100644 index 7654593b..00000000 --- a/convert-to-flexgen.py +++ /dev/null @@ -1,63 +0,0 @@ -''' - -Converts a transformers model to a format compatible with flexgen. - -''' - -import argparse -import os -from pathlib import Path - -import numpy as np -import torch -from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoTokenizer - -parser = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=54)) -parser.add_argument('MODEL', type=str, default=None, nargs='?', help="Path to the input model.") -args = parser.parse_args() - - -def disable_torch_init(): - """ - Disable the redundant torch default initialization to accelerate model creation. 
- """ - import torch - global torch_linear_init_backup - global torch_layer_norm_init_backup - - torch_linear_init_backup = torch.nn.Linear.reset_parameters - setattr(torch.nn.Linear, "reset_parameters", lambda self: None) - - torch_layer_norm_init_backup = torch.nn.LayerNorm.reset_parameters - setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) - - -def restore_torch_init(): - """Rollback the change made by disable_torch_init.""" - import torch - setattr(torch.nn.Linear, "reset_parameters", torch_linear_init_backup) - setattr(torch.nn.LayerNorm, "reset_parameters", torch_layer_norm_init_backup) - - -if __name__ == '__main__': - path = Path(args.MODEL) - model_name = path.name - - print(f"Loading {model_name}...") - # disable_torch_init() - model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, low_cpu_mem_usage=True) - # restore_torch_init() - - tokenizer = AutoTokenizer.from_pretrained(path) - - out_folder = Path(f"models/{model_name}-np") - if not Path(out_folder).exists(): - os.mkdir(out_folder) - - print(f"Saving the converted model to {out_folder}...") - for name, param in tqdm(list(model.model.named_parameters())): - name = name.replace("decoder.final_layer_norm", "decoder.layer_norm") - param_path = os.path.join(out_folder, name) - with open(param_path, "wb") as f: - np.save(f, param.cpu().detach().numpy()) diff --git a/docs/Extensions.md b/docs/Extensions.md index e156456b..c7d1aa36 100644 --- a/docs/Extensions.md +++ b/docs/Extensions.md @@ -1,45 +1,47 @@ -Extensions are defined by files named `script.py` inside subfolders of `text-generation-webui/extensions`. They are loaded at startup if specified with the `--extensions` flag. +# Extensions + +Extensions are defined by files named `script.py` inside subfolders of `text-generation-webui/extensions`. They are loaded at startup if the folder name is specified after the `--extensions` flag. For instance, `extensions/silero_tts/script.py` gets loaded with `python server.py --extensions silero_tts`. ## [text-generation-webui-extensions](https://github.com/oobabooga/text-generation-webui-extensions) -The link above contains a directory of user extensions for text-generation-webui. +The repository above contains a directory of user extensions. -If you create an extension, you are welcome to host it in a GitHub repository and submit it to the list above. +If you create an extension, you are welcome to host it in a GitHub repository and submit a PR adding it to the list. ## Built-in extensions -Most of these have been created by the extremely talented contributors that you can find here: [contributors](https://github.com/oobabooga/text-generation-webui/graphs/contributors?from=2022-12-18&to=&type=a). - |Extension|Description| |---------|-----------| -|[api](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/api)| Creates an API with two endpoints, one for streaming at `/api/v1/stream` port 5005 and another for blocking at `/api/v1/generate` port 5000. This is the main API for this web UI. | +|[api](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/api)| Creates an API with two endpoints, one for streaming at `/api/v1/stream` port 5005 and another for blocking at `/api/v1/generate` port 5000. This is the main API for the webui. | +|[openai](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/openai)| Creates an API that mimics the OpenAI API and can be used as a drop-in replacement. 
| +|[multimodal](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal) | Adds multimodality support (text+images). For a detailed description see [README.md](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal/README.md) in the extension directory. | |[google_translate](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/google_translate)| Automatically translates inputs and outputs using Google Translate.| -|[character_bias](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/character_bias)| Just a very simple example that biases the bot's responses in chat mode.| -|[gallery](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/gallery/)| Creates a gallery with the chat characters and their pictures. | -|[silero_tts](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/silero_tts)| Text-to-speech extension using [Silero](https://github.com/snakers4/silero-models). When used in chat mode, it replaces the responses with an audio widget. | +|[silero_tts](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/silero_tts)| Text-to-speech extension using [Silero](https://github.com/snakers4/silero-models). When used in chat mode, responses are replaced with an audio widget. | |[elevenlabs_tts](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/elevenlabs_tts)| Text-to-speech extension using the [ElevenLabs](https://beta.elevenlabs.io/) API. You need an API key to use it. | -|[send_pictures](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/send_pictures/)| Creates an image upload field that can be used to send images to the bot in chat mode. Captions are automatically generated using BLIP. | |[whisper_stt](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/whisper_stt)| Allows you to enter your inputs in chat mode using your microphone. | |[sd_api_pictures](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/sd_api_pictures)| Allows you to request pictures from the bot in chat mode, which will be generated using the AUTOMATIC1111 Stable Diffusion API. See examples [here](https://github.com/oobabooga/text-generation-webui/pull/309). | -|[multimodal](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal) | Adds multimodality support (text+images). For a detailed description see [README.md](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal/README.md) in the extension directory. | -|[openai](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/openai)| Creates an API that mimics the OpenAI API and can be used as a drop-in replacement. | +|[character_bias](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/character_bias)| Just a very simple example that adds a hidden string at the beginning of the bot's reply in chat mode. | +|[send_pictures](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/send_pictures/)| Creates an image upload field that can be used to send images to the bot in chat mode. Captions are automatically generated using BLIP. | +|[gallery](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/gallery/)| Creates a gallery with the chat characters and their pictures. 
| |[superbooga](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/superbooga)| An extension that uses ChromaDB to create an arbitrarily large pseudocontext, taking as input text files, URLs, or pasted text. Based on https://github.com/kaiokendev/superbig. | +|[ngrok](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/ngrok)| Allows you to access the web UI remotely using the ngrok reverse tunnel service (free). It's an alternative to the built-in Gradio `--share` feature. | +|[perplexity_colors](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/perplexity_colors)| Colors each token in the output text by its associated probability, as derived from the model logits. | ## How to write an extension -script.py may define the special functions and variables below. - -#### Predefined functions +The extensions framework is based on special functions and variables that you can define in `script.py`. The functions are the following: | Function | Description | |-------------|-------------| +| `def setup()` | Is executed when the extension gets imported. | | `def ui()` | Creates custom gradio elements when the UI is launched. | | `def custom_css()` | Returns custom CSS as a string. It is applied whenever the web UI is loaded. | | `def custom_js()` | Same as above but for javascript. | | `def input_modifier(string, state)` | Modifies the input string before it enters the model. In chat mode, it is applied to the user message. Otherwise, it is applied to the entire prompt. | | `def output_modifier(string, state)` | Modifies the output string before it is presented in the UI. In chat mode, it is applied to the bot's reply. Otherwise, it is applied to the entire output. | +| `def chat_input_modifier(text, visible_text, state)` | Modifies both the visible and internal inputs in chat mode. Can be used to hijack the chat input with custom content. | | `def bot_prefix_modifier(string, state)` | Applied in chat mode to the prefix for the bot's reply. | | `def state_modifier(state)` | Modifies the dictionary containing the UI input parameters before it is used by the text generation functions. | | `def history_modifier(history)` | Modifies the chat history before the text generation in chat mode begins. | @@ -48,9 +50,7 @@ script.py may define the special functions and variables below. | `def tokenizer_modifier(state, prompt, input_ids, input_embeds)` | Modifies the `input_ids`/`input_embeds` fed to the model. Should return `prompt`, `input_ids`, `input_embeds`. See the `multimodal` extension for an example. | | `def custom_tokenized_length(prompt)` | Used in conjunction with `tokenizer_modifier`, returns the length in tokens of `prompt`. See the `multimodal` extension for an example. | -#### `params` dictionary - -In this dictionary, `display_name` is used to define the displayed name of the extension in the UI, and `is_tab` is used to define whether the extension should appear in a new tab. By default, extensions appear at the bottom of the "Text generation" tab. +Additionally, you can define a special `params` dictionary. In it, the `display_name` key is used to define the displayed name of the extension in the UI, and the `is_tab` key is used to define whether the extension should appear in a new tab. By default, extensions appear at the bottom of the "Text generation" tab. Example: @@ -61,7 +61,7 @@ params = { } ``` -Additionally, `params` may contain variables that you want to be customizable through a `settings.json` file. 
For instance, assuming the extension is in `extensions/google_translate`, the variable `language string` in +The `params` dict may also contain variables that you want to be customizable through a `settings.yaml` file. For instance, assuming the extension is in `extensions/google_translate`, the variable `language string` in ```python params = { @@ -71,32 +71,19 @@ params = { } ``` -can be customized by adding a key called `google_translate-language string` to `settings.json`: +can be customized by adding a key called `google_translate-language string` to `settings.yaml`: ```python -"google_translate-language string": "fr", +google_translate-language string: 'fr' ``` -That is, the syntax is `extension_name-variable_name`. - -#### `input_hijack` dictionary - -```python -input_hijack = { - 'state': False, - 'value': ["", ""] -} -``` -This is only used in chat mode. If your extension sets `input_hijack['state'] = True` at any moment, the next call to `modules.chat.chatbot_wrapper` will use the values inside `input_hijack['value']` as the user input for text generation. See the `send_pictures` extension above for an example. - -Additionally, your extension can set the value to be a callback in the form of `def cb(text: str, visible_text: str) -> [str, str]`. See the `multimodal` extension above for an example. +That is, the syntax for the key is `extension_name-variable_name`. ## Using multiple extensions at the same time -In order to use your extension, you must start the web UI with the `--extensions` flag followed by the name of your extension (the folder under `text-generation-webui/extension` where `script.py` resides). - -You can activate more than one extension at a time by providing their names separated by spaces. The input, output, and bot prefix modifiers will be applied in the specified order. +You can activate more than one extension at a time by providing their names separated by spaces after `--extensions`. The input, output, and bot prefix modifiers will be applied in the specified order. +Example: ``` python server.py --extensions enthusiasm translate # First apply enthusiasm, then translate @@ -106,56 +93,150 @@ python server.py --extensions translate enthusiasm # First apply translate, then Do note, that for: - `custom_generate_chat_prompt` - `custom_generate_reply` -- `tokenizer_modifier` - `custom_tokenized_length` only the first declaration encountered will be used and the rest will be ignored. -## The `bot_prefix_modifier` - -In chat mode, this function modifies the prefix for a new bot message. For instance, if your bot is named `Marie Antoinette`, the default prefix for a new message will be - -``` -Marie Antoinette: -``` - -Using `bot_prefix_modifier`, you can change it to: - -``` -Marie Antoinette: *I am very enthusiastic* -``` - -Marie Antoinette will become very enthusiastic in all her messages. - -## `custom_generate_reply` example +## A full example -Once defined in a `script.py`, this function is executed in place of the main generation functions. You can use it to connect the web UI to an external API, or to load a custom model that is not supported yet. - -Note that in chat mode, this function must only return the new text, whereas in other modes it must return the original prompt + the new text. +The source code below can be found at [extensions/example/script.py](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/example/script.py). 
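+A quick way to experiment with it is to launch the web UI with the extension enabled (the folder name `example` comes from the file path above) and add your own transformations or print statements to the functions below:
+
+```
+python server.py --extensions example
+```
+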
```python -import datetime - -def custom_generate_reply(question, original_question, seed, state, stopping_strings): - cumulative = '' - for i in range(10): - cumulative += f"Counting: {i}...\n" - yield cumulative +""" +An example of extension. It does nothing, but you can add transformations +before the return statements to customize the webui behavior. - cumulative += f"Done! {str(datetime.datetime.now())}" - yield cumulative -``` +Starting from history_modifier and ending in output_modifier, the +functions are declared in the same order that they are called at +generation time. +""" -## `custom_generate_chat_prompt` example +import torch +from modules import chat +from modules.text_generation import ( + decode, + encode, + generate_reply, +) +from transformers import LogitsProcessor -Below is an extension that just reproduces the default prompt generator in `modules/chat.py`. You can modify it freely to come up with your own prompts in chat mode. +params = { + "display_name": "Example Extension", + "is_tab": False, +} -```python -from modules import chat +class MyLogits(LogitsProcessor): + """ + Manipulates the probabilities for the next token before it gets sampled. + Used in the logits_processor_modifier function below. + """ + def __init__(self): + pass + + def __call__(self, input_ids, scores): + # probs = torch.softmax(scores, dim=-1, dtype=torch.float) + # probs[0] /= probs[0].sum() + # scores = torch.log(probs / (1 - probs)) + return scores + +def history_modifier(history): + """ + Modifies the chat history. + Only used in chat mode. + """ + return history + +def state_modifier(state): + """ + Modifies the state variable, which is a dictionary containing the input + values in the UI like sliders and checkboxes. + """ + return state + +def chat_input_modifier(text, visible_text, state): + """ + Modifies the user input string in chat mode (visible_text). + You can also modify the internal representation of the user + input (text) to change how it will appear in the prompt. + """ + return text, visible_text + +def input_modifier(string, state): + """ + In default/notebook modes, modifies the whole prompt. + + In chat mode, it is the same as chat_input_modifier but only applied + to "text", here called "string", and not to "visible_text". + """ + return string + +def bot_prefix_modifier(string, state): + """ + Modifies the prefix for the next bot reply in chat mode. + By default, the prefix will be something like "Bot Name:". + """ + return string + +def tokenizer_modifier(state, prompt, input_ids, input_embeds): + """ + Modifies the input ids and embeds. + Used by the multimodal extension to put image embeddings in the prompt. + Only used by loaders that use the transformers library for sampling. + """ + return prompt, input_ids, input_embeds + +def logits_processor_modifier(processor_list, input_ids): + """ + Adds logits processors to the list, allowing you to access and modify + the next token probabilities. + Only used by loaders that use the transformers library for sampling. + """ + processor_list.append(MyLogits()) + return processor_list + +def output_modifier(string, state): + """ + Modifies the LLM output before it gets presented. + + In chat mode, the modified version goes into history['visible'], + and the original version goes into history['internal']. 
+ """ + return string def custom_generate_chat_prompt(user_input, state, **kwargs): - - # Do something with kwargs['history'] or state - - return chat.generate_chat_prompt(user_input, state, **kwargs) + """ + Replaces the function that generates the prompt from the chat history. + Only used in chat mode. + """ + result = chat.generate_chat_prompt(user_input, state, **kwargs) + return result + +def custom_css(): + """ + Returns a CSS string that gets appended to the CSS for the webui. + """ + return '' + +def custom_js(): + """ + Returns a javascript string that gets appended to the javascript + for the webui. + """ + return '' + +def setup(): + """ + Gets executed only once, when the extension is imported. + """ + pass + +def ui(): + """ + Gets executed when the UI is drawn. Custom gradio elements and + their corresponding event handlers should be defined here. + + To learn about gradio components, check out the docs: + https://gradio.app/docs/ + """ + pass ``` diff --git a/docs/FlexGen.md b/docs/FlexGen.md deleted file mode 100644 index 931cc36f..00000000 --- a/docs/FlexGen.md +++ /dev/null @@ -1,64 +0,0 @@ ->FlexGen is a high-throughput generation engine for running large language models with limited GPU memory (e.g., a 16GB T4 GPU or a 24GB RTX3090 gaming card!). - -https://github.com/FMInference/FlexGen - -## Installation - -No additional installation steps are necessary. FlexGen is in the `requirements.txt` file for this project. - -## Converting a model - -FlexGen only works with the OPT model, and it needs to be converted to numpy format before starting the web UI: - -``` -python convert-to-flexgen.py models/opt-1.3b/ -``` - -The output will be saved to `models/opt-1.3b-np/`. - -## Usage - -The basic command is the following: - -``` -python server.py --model opt-1.3b --loader flexgen -``` - -For large models, the RAM usage may be too high and your computer may freeze. If that happens, you can try this: - -``` -python server.py --model opt-1.3b --loader flexgen --compress-weight -``` - -With this second command, I was able to run both OPT-6.7b and OPT-13B with **2GB VRAM**, and the speed was good in both cases. - -You can also manually set the offload strategy with - -``` -python server.py --model opt-1.3b --loader flexgen --percent 0 100 100 0 100 0 -``` - -where the six numbers after `--percent` are: - -``` -the percentage of weight on GPU -the percentage of weight on CPU -the percentage of attention cache on GPU -the percentage of attention cache on CPU -the percentage of activations on GPU -the percentage of activations on CPU -``` - -You should typically only change the first two numbers. If their sum is less than 100, the remaining layers will be offloaded to the disk, by default into the `text-generation-webui/cache` folder. - -## Performance - -In my experiments with OPT-30B using a RTX 3090 on Linux, I have obtained these results: - -* `--loader flexgen --compress-weight --percent 0 100 100 0 100 0`: 0.99 seconds per token. -* `--loader flexgen --compress-weight --percent 100 0 100 0 100 0`: 0.765 seconds per token. - -## Limitations - -* Only works with the OPT models. -* Only two generation parameters are available: `temperature` and `do_sample`. 
\ No newline at end of file diff --git a/docs/Low-VRAM-guide.md b/docs/Low-VRAM-guide.md deleted file mode 100644 index 7814ecb0..00000000 --- a/docs/Low-VRAM-guide.md +++ /dev/null @@ -1,53 +0,0 @@ -If you GPU is not large enough to fit a 16-bit model, try these in the following order: - -### Load the model in 8-bit mode - -``` -python server.py --load-in-8bit -``` - -### Load the model in 4-bit mode - -``` -python server.py --load-in-4bit -``` - -### Split the model across your GPU and CPU - -``` -python server.py --auto-devices -``` - -If you can load the model with this command but it runs out of memory when you try to generate text, try increasingly limiting the amount of memory allocated to the GPU until the error stops happening: - -``` -python server.py --auto-devices --gpu-memory 10 -python server.py --auto-devices --gpu-memory 9 -python server.py --auto-devices --gpu-memory 8 -... -``` - -where the number is in GiB. - -For finer control, you can also specify the unit in MiB explicitly: - -``` -python server.py --auto-devices --gpu-memory 8722MiB -python server.py --auto-devices --gpu-memory 4725MiB -python server.py --auto-devices --gpu-memory 3500MiB -... -``` - -### Send layers to a disk cache - -As a desperate last measure, you can split the model across your GPU, CPU, and disk: - -``` -python server.py --auto-devices --disk -``` - -With this, I am able to load a 30b model into my RTX 3090, but it takes 10 seconds to generate 1 word. - -### DeepSpeed (experimental) - -An experimental alternative to all of the above is to use DeepSpeed: [guide](DeepSpeed.md). diff --git a/docs/README.md b/docs/README.md index 06b73b84..f3829855 100644 --- a/docs/README.md +++ b/docs/README.md @@ -8,10 +8,8 @@ * [Docker](Docker.md) * [ExLlama](ExLlama.md) * [Extensions](Extensions.md) -* [FlexGen](FlexGen.md) * [Generation parameters](Generation-parameters.md) * [GPTQ models (4 bit mode)](GPTQ-models-(4-bit-mode).md) -* [llama.cpp models](llama.cpp-models.md) * [LLaMA model](LLaMA-model.md) * [LoRA](LoRA.md) * [Low VRAM guide](Low-VRAM-guide.md) diff --git a/docs/llama.cpp-models.md b/docs/llama.cpp-models.md deleted file mode 100644 index bcf3c046..00000000 --- a/docs/llama.cpp-models.md +++ /dev/null @@ -1,53 +0,0 @@ -# Using llama.cpp in the web UI - -## Setting up the models - -#### Pre-converted - -Place the model in the `models` folder, making sure that its name contains `ggml` somewhere and ends in `.bin`. - -#### Convert LLaMA yourself - -Follow the instructions in the llama.cpp README to generate the `ggml-model.bin` file: https://github.com/ggerganov/llama.cpp#usage - -## GPU acceleration - -Enabled with the `--n-gpu-layers` parameter. - -* If you have enough VRAM, use a high number like `--n-gpu-layers 200000` to offload all layers to the GPU. -* Otherwise, start with a low number like `--n-gpu-layers 10` and then gradually increase it until you run out of memory. - -To use this feature, you need to manually compile and install `llama-cpp-python` with GPU support. 
- -#### Linux - -``` -pip uninstall -y llama-cpp-python -CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir -``` - -#### Windows - -``` -pip uninstall -y llama-cpp-python -set CMAKE_ARGS="-DLLAMA_CUBLAS=on" -set FORCE_CMAKE=1 -pip install llama-cpp-python --no-cache-dir -``` - -#### macOS - -``` -pip uninstall -y llama-cpp-python -CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir -``` - -Here you can find the different compilation options for OpenBLAS / cuBLAS / CLBlast: https://pypi.org/project/llama-cpp-python/ - -## Performance - -This was the performance of llama-7b int4 on my i5-12400F (cpu only): - -> Output generated in 33.07 seconds (6.05 tokens/s, 200 tokens, context 17) - -You can change the number of threads with `--threads N`. diff --git a/extensions/api/blocking_api.py b/extensions/api/blocking_api.py index edc6d8f4..fbbc5ec1 100644 --- a/extensions/api/blocking_api.py +++ b/extensions/api/blocking_api.py @@ -7,10 +7,15 @@ from modules.chat import generate_chat_reply from modules.LoRA import add_lora_to_model from modules.models import load_model, unload_model -from modules.models_settings import (get_model_settings_from_yamls, - update_model_parameters) -from modules.text_generation import (encode, generate_reply, - stop_everything_event) +from modules.models_settings import ( + get_model_settings_from_yamls, + update_model_parameters +) +from modules.text_generation import ( + encode, + generate_reply, + stop_everything_event +) from modules.utils import get_available_models diff --git a/extensions/api/streaming_api.py b/extensions/api/streaming_api.py index 88359e3e..6afa827d 100644 --- a/extensions/api/streaming_api.py +++ b/extensions/api/streaming_api.py @@ -2,12 +2,15 @@ import json from threading import Thread -from websockets.server import serve - -from extensions.api.util import build_parameters, try_start_cloudflared, with_api_lock +from extensions.api.util import ( + build_parameters, + try_start_cloudflared, + with_api_lock +) from modules import shared from modules.chat import generate_chat_reply from modules.text_generation import generate_reply +from websockets.server import serve PATH = '/api/v1/stream' diff --git a/extensions/api/util.py b/extensions/api/util.py index a9d581eb..2358b7d2 100644 --- a/extensions/api/util.py +++ b/extensions/api/util.py @@ -10,7 +10,6 @@ from modules.chat import load_character_memoized from modules.presets import load_preset_memoized - # We use a thread local to store the asyncio lock, so that each thread # has its own lock. This isn't strictly necessary, but it makes it # such that if we can support multiple worker threads in the future, diff --git a/extensions/example/script.py b/extensions/example/script.py new file mode 100644 index 00000000..669749c0 --- /dev/null +++ b/extensions/example/script.py @@ -0,0 +1,137 @@ +""" +An example of extension. It does nothing, but you can add transformations +before the return statements to customize the webui behavior. + +Starting from history_modifier and ending in output_modifier, the +functions are declared in the same order that they are called at +generation time. 
+""" + +import torch +from modules import chat +from modules.text_generation import ( + decode, + encode, + generate_reply, +) +from transformers import LogitsProcessor + +params = { + "display_name": "Example Extension", + "is_tab": False, +} + +class MyLogits(LogitsProcessor): + """ + Manipulates the probabilities for the next token before it gets sampled. + Used in the logits_processor_modifier function below. + """ + def __init__(self): + pass + + def __call__(self, input_ids, scores): + # probs = torch.softmax(scores, dim=-1, dtype=torch.float) + # probs[0] /= probs[0].sum() + # scores = torch.log(probs / (1 - probs)) + return scores + +def history_modifier(history): + """ + Modifies the chat history. + Only used in chat mode. + """ + return history + +def state_modifier(state): + """ + Modifies the state variable, which is a dictionary containing the input + values in the UI like sliders and checkboxes. + """ + return state + +def chat_input_modifier(text, visible_text, state): + """ + Modifies the user input string in chat mode (visible_text). + You can also modify the internal representation of the user + input (text) to change how it will appear in the prompt. + """ + return text, visible_text + +def input_modifier(string, state): + """ + In default/notebook modes, modifies the whole prompt. + + In chat mode, it is the same as chat_input_modifier but only applied + to "text", here called "string", and not to "visible_text". + """ + return string + +def bot_prefix_modifier(string, state): + """ + Modifies the prefix for the next bot reply in chat mode. + By default, the prefix will be something like "Bot Name:". + """ + return string + +def tokenizer_modifier(state, prompt, input_ids, input_embeds): + """ + Modifies the input ids and embeds. + Used by the multimodal extension to put image embeddings in the prompt. + Only used by loaders that use the transformers library for sampling. + """ + return prompt, input_ids, input_embeds + +def logits_processor_modifier(processor_list, input_ids): + """ + Adds logits processors to the list, allowing you to access and modify + the next token probabilities. + Only used by loaders that use the transformers library for sampling. + """ + processor_list.append(MyLogits()) + return processor_list + +def output_modifier(string, state): + """ + Modifies the LLM output before it gets presented. + + In chat mode, the modified version goes into history['visible'], + and the original version goes into history['internal']. + """ + return string + +def custom_generate_chat_prompt(user_input, state, **kwargs): + """ + Replaces the function that generates the prompt from the chat history. + Only used in chat mode. + """ + result = chat.generate_chat_prompt(user_input, state, **kwargs) + return result + +def custom_css(): + """ + Returns a CSS string that gets appended to the CSS for the webui. + """ + return '' + +def custom_js(): + """ + Returns a javascript string that gets appended to the javascript + for the webui. + """ + return '' + +def setup(): + """ + Gets executed only once, when the extension is imported. + """ + pass + +def ui(): + """ + Gets executed when the UI is drawn. Custom gradio elements and + their corresponding event handlers should be defined here. 
+ + To learn about gradio components, check out the docs: + https://gradio.app/docs/ + """ + pass diff --git a/extensions/llava/script.py b/extensions/llava/script.py deleted file mode 100644 index 781d584b..00000000 --- a/extensions/llava/script.py +++ /dev/null @@ -1,8 +0,0 @@ -import gradio as gr - -from modules.logging_colors import logger - - -def ui(): - gr.Markdown("### This extension is deprecated, use \"multimodal\" extension instead") - logger.error("LLaVA extension is deprecated, use \"multimodal\" extension instead") diff --git a/extensions/multimodal/script.py b/extensions/multimodal/script.py index b3f654e4..8bc26315 100644 --- a/extensions/multimodal/script.py +++ b/extensions/multimodal/script.py @@ -35,6 +35,15 @@ multimodal_embedder: MultimodalEmbedder = None +def chat_input_modifier(text, visible_text, state): + global input_hijack + if input_hijack['state']: + input_hijack['state'] = False + return input_hijack['value'](text, visible_text) + else: + return text, visible_text + + def add_chat_picture(picture, text, visible_text): # resize the image, so that shortest edge is at least 224 (size for CLIP), and at most 300 (to keep history manageable) max_hw, min_hw = max(picture.size), min(picture.size) diff --git a/extensions/openai/README.md b/extensions/openai/README.md index 7bbc1e83..2083734a 100644 --- a/extensions/openai/README.md +++ b/extensions/openai/README.md @@ -1,17 +1,15 @@ # An OpenedAI API (openai like) This extension creates an API that works kind of like openai (ie. api.openai.com). -It's incomplete so far but perhaps is functional enough for you. ## Setup & installation -Optional (for flask_cloudflared, embeddings): - +Install the requirements: ``` pip3 install -r requirements.txt ``` -It listens on tcp port 5001 by default. You can use the OPENEDAI_PORT environment variable to change this. +It listens on ```tcp port 5001``` by default. You can use the ```OPENEDAI_PORT``` environment variable to change this. Make sure you enable it in server launch parameters, it should include: @@ -21,13 +19,30 @@ Make sure you enable it in server launch parameters, it should include: You can also use the ``--listen`` argument to make the server available on the networ, and/or the ```--share``` argument to enable a public Cloudflare endpoint. -To enable the basic image generation support (txt2img) set the environment variable SD_WEBUI_URL to point to your Stable Diffusion API ([Automatic1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui)). +To enable the basic image generation support (txt2img) set the environment variable ```SD_WEBUI_URL``` to point to your Stable Diffusion API ([Automatic1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui)). For example: ``` SD_WEBUI_URL=http://127.0.0.1:7861 ``` +## Quick start + +1. Install the requirements.txt (pip) +2. Enable the ```openeai``` module (--extensions openai), restart the server. +3. Configure the openai client + +Most openai application can be configured to connect the API if you set the following environment variables: + +```shell +# Sample .env file: +OPENAI_API_KEY=sk-111111111111111111111111111111111111111111111111 +OPENAI_API_BASE=http://0.0.0.0:5001/v1 +``` + +If needed, replace 0.0.0.0 with the IP/port of your server. + + ### Models This has been successfully tested with Alpaca, Koala, Vicuna, WizardLM and their variants, (ex. gpt4-x-alpaca, GPT4all-snoozy, stable-vicuna, wizard-vicuna, etc.) and many others. Models that have been trained for **Instruction Following** work best. 
If you test with other models please let me know how it goes. Less than satisfying results (so far) from: RWKV-4-Raven, llama, mpt-7b-instruct/chat. @@ -36,7 +51,7 @@ For best results across all API endpoints, a model like [vicuna-13b-v1.3-GPTQ](h For good results with the [Completions](https://platform.openai.com/docs/api-reference/completions) API endpoint, in addition to the above models, you can also try using a base model like [falcon-7b](https://huggingface.co/tiiuae/falcon-7b) or Llama. -For good results with the [ChatCompletions](https://platform.openai.com/docs/api-reference/chat) or [Edits](https://platform.openai.com/docs/api-reference/edits) API endpoints you can use almost any model trained for instruction following - within the limits of the model. Be sure that the proper instruction template is detected and loaded or the results will not be good. +For good results with the [ChatCompletions](https://platform.openai.com/docs/api-reference/chat) or [Edits](https://platform.openai.com/docs/api-reference/edits) API endpoints you can use almost any model trained for instruction following. Be sure that the proper instruction template is detected and loaded or the results will not be good. For the proper instruction format to be detected you need to have a matching model entry in your ```models/config.yaml``` file. Be sure to keep this file up to date. A matching instruction template file in the characters/instruction-following/ folder will loaded and applied to format messages correctly for the model - this is critical for good results. @@ -76,7 +91,7 @@ Embeddings requires ```sentence-transformers``` installed, but chat and completi | all-mpnet-base-v2 | 768 | 384 | 2800 | 420M | 63.3 | | all-MiniLM-L6-v2 | 384 | 256 | 14200 | 80M | 58.8 | -In short, the all-MiniLM-L6-v2 model is 5x faster, 5x smaller ram, 2x smaller storage, and still offers good quality. Stats from (https://www.sbert.net/docs/pretrained_models.html). To change the model from the default you can set the environment variable OPENEDAI_EMBEDDING_MODEL, ex. "OPENEDAI_EMBEDDING_MODEL=all-MiniLM-L6-v2". +In short, the all-MiniLM-L6-v2 model is 5x faster, 5x smaller ram, 2x smaller storage, and still offers good quality. Stats from (https://www.sbert.net/docs/pretrained_models.html). To change the model from the default you can set the environment variable ```OPENEDAI_EMBEDDING_MODEL```, ex. "OPENEDAI_EMBEDDING_MODEL=all-MiniLM-L6-v2". Warning: You cannot mix embeddings from different models even if they have the same dimensions. They are not comparable. @@ -85,26 +100,27 @@ Warning: You cannot mix embeddings from different models even if they have the s Almost everything you use it with will require you to set a dummy OpenAI API key environment variable. -With the [official python openai client](https://github.com/openai/openai-python), you can set the OPENAI_API_BASE environment variable before you import the openai module, like so: +With the [official python openai client](https://github.com/openai/openai-python), set the ```OPENAI_API_BASE``` environment variables: -``` +```shell +# Sample .env file: OPENAI_API_KEY=sk-111111111111111111111111111111111111111111111111 -OPENAI_API_BASE=http://127.0.0.1:5001/v1 +OPENAI_API_BASE=http://0.0.0.0:5001/v1 ``` -If needed, replace 127.0.0.1 with the IP/port of your server. +If needed, replace 0.0.0.0 with the IP/port of your server. 
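+As an alternative to environment variables, the official python client (v0.x) can also be pointed at the server directly in code. A minimal sketch; adjust the address to match your server:
+
+```python
+import openai
+
+openai.api_key = "sk-111111111111111111111111111111111111111111111111"  # any dummy key is accepted
+openai.api_base = "http://0.0.0.0:5001/v1"  # send requests to this extension instead of api.openai.com
+```
+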
-If using .env files to save the OPENAI_API_BASE and OPENAI_API_KEY variables, you can ensure compatibility by loading the .env file before loading the openai module, like so in python: +If using .env files to save the ```OPENAI_API_BASE``` and ```OPENAI_API_KEY``` variables, make sure the .env file is loaded before the openai module is imported: -``` +```python from dotenv import load_dotenv -load_dotenv() +load_dotenv() # make sure the environment variables are set before import import openai ``` With the [official Node.js openai client](https://github.com/openai/openai-node) it is slightly more more complex because the environment variables are not used by default, so small source code changes may be required to use the environment variables, like so: -``` +```js const openai = OpenAI(Configuration({ apiKey: process.env.OPENAI_API_KEY, basePath: process.env.OPENAI_API_BASE, @@ -113,7 +129,7 @@ const openai = OpenAI(Configuration({ For apps made with the [chatgpt-api Node.js client library](https://github.com/transitive-bullshit/chatgpt-api): -``` +```js const api = new ChatGPTAPI({ apiKey: process.env.OPENAI_API_KEY, apiBaseUrl: process.env.OPENAI_API_BASE, @@ -127,39 +143,43 @@ The OpenAI API is well documented, you can view the documentation here: https:// Examples of how to use the Completions API in Python can be found here: https://platform.openai.com/examples Not all of them will work with all models unfortunately, See the notes on Models for how to get the best results. -Here is a simple python example of how you can use the Edit endpoint as a translator. +Here is a simple python example. ```python +import os +os.environ['OPENAI_API_KEY']="sk-111111111111111111111111111111111111111111111111" +os.environ['OPENAI_API_BASE']="http://0.0.0.0:5001/v1" import openai -response = openai.Edit.create( + +response = openai.ChatCompletion.create( model="x", - instruction="Translate this into French", - input="Our mission is to ensure that artificial general intelligence benefits all of humanity.", + messages = [{ 'role': 'system', 'content': "Answer in a consistent style." }, + {'role': 'user', 'content': "Teach me about patience."}, + {'role': 'assistant', 'content': "The river that carves the deepest valley flows from a modest spring; the grandest symphony originates from a single note; the most intricate tapestry begins with a solitary thread."}, + {'role': 'user', 'content': "Teach me about the ocean."}, + ] ) -print(response['choices'][0]['text']) -# Sample Output: -# Notre mission est de garantir que l'intelligence artificielle généralisée profite à tous les membres de l'humanité. 
+text = response['choices'][0]['message']['content'] +print(text) ``` - - ## Compatibility & not so compatibility | API endpoint | tested with | notes | | --- | --- | --- | -| /v1/models | openai.Model.list() | Lists models, Currently loaded model first, plus some compatibility options | -| /v1/models/{id} | openai.Model.get() | returns whatever you ask for, model does nothing yet anyways | -| /v1/text_completion | openai.Completion.create() | the most tested, only supports single string input so far, variable quality based on the model | -| /v1/chat/completions | openai.ChatCompletion.create() | Quality depends a lot on the model | -| /v1/edits | openai.Edit.create() | Works the best of all, perfect for instruction following models | +| /v1/chat/completions | openai.ChatCompletion.create() | Use it with instruction following models | +| /v1/embeddings | openai.Embedding.create() | Using SentenceTransformer embeddings | | /v1/images/generations | openai.Image.create() | Bare bones, no model configuration, response_format='b64_json' only. | -| /v1/embeddings | openai.Embedding.create() | Using Sentence Transformer, dimensions are different and may never be directly comparable to openai embeddings. | -| /v1/moderations | openai.Moderation.create() | does nothing. successfully. | +| /v1/moderations | openai.Moderation.create() | Basic initial support via embeddings | +| /v1/models | openai.Model.list() | Lists models, Currently loaded model first, plus some compatibility options | +| /v1/models/{id} | openai.Model.get() | returns whatever you ask for | +| /v1/edits | openai.Edit.create() | Deprecated by openai, good with instruction following models | +| /v1/text_completion | openai.Completion.create() | Legacy endpoint, doesn't support array input, variable quality based on the model | | /v1/completions | openai api completions.create | Legacy endpoint (v0.25) | | /v1/engines/*/embeddings | python-openai v0.25 | Legacy endpoint | | /v1/engines/*/generate | openai engines.generate | Legacy endpoint | | /v1/engines | openai engines.list | Legacy Lists models | -| /v1/engines/{model_name} | openai engines.get -i {model_name} | You can use this legacy endpoint to load models via the api | +| /v1/engines/{model_name} | openai engines.get -i {model_name} | You can use this legacy endpoint to load models via the api or command line | | /v1/images/edits | openai.Image.create_edit() | not yet supported | | /v1/images/variations | openai.Image.create_variation() | not yet supported | | /v1/audio/\* | openai.Audio.\* | not yet supported | @@ -167,7 +187,7 @@ print(response['choices'][0]['text']) | /v1/fine-tunes\* | openai.FineTune.\* | not yet supported | | /v1/search | openai.search, engines.search | not yet supported | -The model name setting is ignored in completions, but you may need to adjust the maximum token length to fit the model (ie. set to <2048 tokens instead of 4096, 8k, etc). To mitigate some of this, the max_tokens value is halved until it is less than truncation_length for the model (typically 2k). +Because of the differences in OpenAI model context sizes (2k, 4k, 8k, 16k, etc,) you may need to adjust the max_tokens to fit into the context of the model you choose. Streaming, temperature, top_p, max_tokens, stop, should all work as expected, but not all parameters are mapped correctly. 
@@ -175,41 +195,29 @@ Some hacky mappings: | OpenAI | text-generation-webui | note | | --- | --- | --- | +| model | - | Ignored, the model is not changed | | frequency_penalty | encoder_repetition_penalty | this seems to operate with a different scale and defaults, I tried to scale it based on range & defaults, but the results are terrible. hardcoded to 1.18 until there is a better way | | presence_penalty | repetition_penalty | same issues as frequency_penalty, hardcoded to 1.0 | -| best_of | top_k | default is 1 | -| stop | custom_stopping_strings | this is also stuffed with ['\n###', "\n{user prompt}", "{user prompt}" ] for good measure. | +| best_of | top_k | default is 1 (top_k is 20 for chat, which doesn't support best_of) | | n | 1 | variations are not supported yet. | | 1 | num_beams | hardcoded to 1 | | 1.0 | typical_p | hardcoded to 1.0 | -| max_tokens | max_new_tokens | For Text Completions max_tokens is set smaller than the truncation_length minus the prompt length. This can cause no input to be generated if the prompt is too large. For ChatCompletions, the older chat messages may be dropped to fit the max_new_tokens requested | -| logprobs | - | not supported yet | -| logit_bias | - | not supported yet | +| logprobs & logit_bias | - | experimental, llama only, transformers-kin only (ExLlama_HF ok), can also use llama tokens if 'model' is not an openai model or will convert from tiktoken for the openai model specified in 'model' | | messages.name | - | not supported yet | | user | - | not supported yet | | functions/function_call | - | function calls are not supported yet | -defaults are mostly from openai, so are different. I use the openai defaults where I can and try to scale them to the webui defaults with the same intent. ### Applications -Almost everything needs the OPENAI_API_KEY environment variable set, for example: -``` -OPENAI_API_KEY=sk-111111111111111111111111111111111111111111111111 -``` -Some apps are picky about key format, but 'dummy' or 'sk-dummy' also work in most cases. -Most application will work if you also set: -``` -OPENAI_API_BASE=http://127.0.0.1:5001/v1 -``` -but there are some exceptions. +Almost everything needs the ```OPENAI_API_KEY``` and ```OPENAI_API_BASE``` environment variable set, but there are some exceptions. -| Compatibility | Application/Library | url | notes / setting | +| Compatibility | Application/Library | Website | Notes | | --- | --- | --- | --- | | ✅❌ | openai-python (v0.25+) | https://github.com/openai/openai-python | only the endpoints from above are working. OPENAI_API_BASE=http://127.0.0.1:5001/v1 | | ✅❌ | openai-node | https://github.com/openai/openai-node | only the endpoints from above are working. environment variables don't work by default, but can be configured (see above) | | ✅❌ | chatgpt-api | https://github.com/transitive-bullshit/chatgpt-api | only the endpoints from above are working. environment variables don't work by default, but can be configured (see above) | -| ✅ | anse | https://github.com/anse-app/anse | API Key & URL configurable in UI | +| ✅ | anse | https://github.com/anse-app/anse | API Key & URL configurable in UI, Images also work | | ✅ | shell_gpt | https://github.com/TheR1D/shell_gpt | OPENAI_API_HOST=http://127.0.0.1:5001 | | ✅ | gpt-shell | https://github.com/jla/gpt-shell | OPENAI_API_BASE=http://127.0.0.1:5001/v1 | | ✅ | gpt-discord-bot | https://github.com/openai/gpt-discord-bot | OPENAI_API_BASE=http://127.0.0.1:5001/v1 | @@ -221,11 +229,12 @@ but there are some exceptions. 
| ❌ | guidance | https://github.com/microsoft/guidance | logit_bias and logprobs not yet supported | ## Future plans +* better error handling * model changing, esp. something for swapping loras or embedding models * consider switching to FastAPI + starlette for SSE (openai SSE seems non-standard) ## Bugs? Feedback? Comments? Pull requests? -To enable debugging and get copious output you can set the OPENEDAI_DEBUG=1 environment variable. +To enable debugging and get copious output you can set the ```OPENEDAI_DEBUG=1``` environment variable. -Are all appreciated, please @matatonic and I'll try to get back to you as soon as possible. +Are all appreciated, please @matatonic and I'll try to get back to you as soon as possible. \ No newline at end of file diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 23c5dbee..e1baa249 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -18,41 +18,50 @@ class LogitsBiasProcessor(LogitsProcessor): def __init__(self, logit_bias={}): self.logit_bias = logit_bias - super().__init__() + if self.logit_bias: + self.keys = list([int(key) for key in self.logit_bias.keys()]) + values = [ self.logit_bias[str(key)] for key in self.keys ] + self.values = torch.tensor(values, dtype=torch.float, device=shared.model.device) + debug_msg(f"{self})") def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor: if self.logit_bias: - keys = list([int(key) for key in self.logit_bias.keys()]) - values = list([int(val) for val in self.logit_bias.values()]) - logits[0, keys] += torch.tensor(values).cuda() - + debug_msg(logits[0, self.keys], " + ", self.values) + logits[0, self.keys] += self.values + debug_msg(" --> ", logits[0, self.keys]) + debug_msg(" max/min ", float(torch.max(logits[0])), float(torch.min(logits[0]))) return logits + def __repr__(self): + return f"<{self.__class__.__name__}(logit_bias={self.logit_bias})>" class LogprobProcessor(LogitsProcessor): def __init__(self, logprobs=None): self.logprobs = logprobs self.token_alternatives = {} - super().__init__() def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor: if self.logprobs is not None: # 0-5 log_e_probabilities = F.log_softmax(logits, dim=1) - # XXX hack. should find the selected token and include the prob of that - # ... but we just +1 here instead because we don't know it yet. - top_values, top_indices = torch.topk(log_e_probabilities, k=self.logprobs + 1) - top_tokens = [decode(tok) for tok in top_indices[0]] - self.token_alternatives = dict(zip(top_tokens, top_values[0].tolist())) + top_values, top_indices = torch.topk(log_e_probabilities, k=self.logprobs+1) + top_tokens = [ decode(tok) for tok in top_indices[0] ] + top_probs = [ float(x) for x in top_values[0] ] + self.token_alternatives = dict(zip(top_tokens, top_probs)) + debug_msg(f"{self.__class__.__name__}(logprobs+1={self.logprobs+1}, token_alternatives={self.token_alternatives})") return logits + def __repr__(self): + return f"<{self.__class__.__name__}(logprobs={self.logprobs}, token_alternatives={self.token_alternatives})>" + def convert_logprobs_to_tiktoken(model, logprobs): - try: - encoder = tiktoken.encoding_for_model(model) - # just pick the first one if it encodes to multiple tokens... 99.9% not required and maybe worse overall. 
- return dict([(encoder.decode([encoder.encode(token)[0]]), prob) for token, prob in logprobs.items()]) - except KeyError: - # assume native tokens if we can't find the tokenizer +# more problems than it's worth. +# try: +# encoder = tiktoken.encoding_for_model(model) +# # just pick the first one if it encodes to multiple tokens... 99.9% not required and maybe worse overall. +# return dict([(encoder.decode([encoder.encode(token)[0]]), prob) for token, prob in logprobs.items()]) +# except KeyError: +# # assume native tokens if we can't find the tokenizer return logprobs @@ -73,8 +82,8 @@ def marshal_common_params(body): req_params['requested_model'] = body.get('model', shared.model_name) req_params['suffix'] = default(body, 'suffix', req_params['suffix']) - req_params['temperature'] = clamp(default(body, 'temperature', req_params['temperature']), 0.001, 1.999) # fixup absolute 0.0/2.0 - req_params['top_p'] = clamp(default(body, 'top_p', req_params['top_p']), 0.001, 1.0) + req_params['temperature'] = clamp(default(body, 'temperature', req_params['temperature']), 0.01, 1.99) # fixup absolute 0.0/2.0 + req_params['top_p'] = clamp(default(body, 'top_p', req_params['top_p']), 0.01, 1.0) n = default(body, 'n', 1) if n != 1: raise InvalidRequestError(message="Only n = 1 is supported.", param='n') @@ -87,6 +96,11 @@ def marshal_common_params(body): # presence_penalty - ignored # frequency_penalty - ignored + + # pass through unofficial params + req_params['repetition_penalty'] = default(body, 'repetition_penalty', req_params['repetition_penalty']) + req_params['encoder_repetition_penalty'] = default(body, 'encoder_repetition_penalty', req_params['encoder_repetition_penalty']) + # user - ignored logits_processor = [] @@ -98,9 +112,11 @@ def marshal_common_params(body): encoder = tiktoken.encoding_for_model(req_params['requested_model']) new_logit_bias = {} for logit, bias in logit_bias.items(): - for x in encode(encoder.decode([int(logit)]))[0]: + for x in encode(encoder.decode([int(logit)]), add_special_tokens=False)[0]: + if int(x) in [0, 1, 2, 29871]: # XXX LLAMA tokens + continue new_logit_bias[str(int(x))] = bias - print(logit_bias, '->', new_logit_bias) + debug_msg('logit_bias_map', logit_bias, '->', new_logit_bias) logit_bias = new_logit_bias except KeyError: pass # assume native tokens if we can't find the tokenizer @@ -134,11 +150,11 @@ def messages_to_prompt(body: dict, req_params: dict, max_tokens): messages = body['messages'] role_formats = { - 'user': 'user: {message}\n', - 'assistant': 'assistant: {message}\n', + 'user': 'User: {message}\n', + 'assistant': 'Assistant: {message}\n', 'system': '{message}', - 'context': 'You are a helpful assistant. Answer as concisely as possible.', - 'prompt': 'assistant:', + 'context': 'You are a helpful assistant. Answer as concisely as possible.\nUser: I want your assistance.\nAssistant: Sure! 
What can I do for you?', + 'prompt': 'Assistant:', } if not 'stopping_strings' in req_params: @@ -151,10 +167,10 @@ def messages_to_prompt(body: dict, req_params: dict, max_tokens): template = instruct['turn_template'] system_message_template = "{message}" - system_message_default = instruct['context'] + system_message_default = instruct.get('context', '') # can be missing bot_start = template.find('<|bot|>') # So far, 100% of instruction templates have this token - user_message_template = template[:bot_start].replace('<|user-message|>', '{message}').replace('<|user|>', instruct['user']) - bot_message_template = template[bot_start:].replace('<|bot-message|>', '{message}').replace('<|bot|>', instruct['bot']) + user_message_template = template[:bot_start].replace('<|user-message|>', '{message}').replace('<|user|>', instruct.get('user', '')) + bot_message_template = template[bot_start:].replace('<|bot-message|>', '{message}').replace('<|bot|>', instruct.get('bot', '')) bot_prompt = bot_message_template[:bot_message_template.find('{message}')].rstrip(' ') role_formats = { @@ -173,13 +189,13 @@ def messages_to_prompt(body: dict, req_params: dict, max_tokens): debug_msg(f"Loaded instruction role format: {shared.settings['instruction_template']}") except Exception as e: - req_params['stopping_strings'].extend(['\nuser:']) + req_params['stopping_strings'].extend(['\nUser:', 'User:']) # XXX User: prompt here also print(f"Exception: When loading characters/instruction-following/{shared.settings['instruction_template']}.yaml: {repr(e)}") print("Warning: Loaded default instruction-following template for model.") else: - req_params['stopping_strings'].extend(['\nuser:']) + req_params['stopping_strings'].extend(['\nUser:', 'User:']) # XXX User: prompt here also print("Warning: Loaded default instruction-following template for model.") system_msgs = [] @@ -194,6 +210,11 @@ def messages_to_prompt(body: dict, req_params: dict, max_tokens): context_msg = end_line(role_formats['system'].format(message=body['prompt'])) + context_msg for m in messages: + if 'role' not in m: + raise InvalidRequestError(message="messages: missing role", param='messages') + if 'content' not in m: + raise InvalidRequestError(message="messages: missing content", param='messages') + role = m['role'] content = m['content'] # name = m.get('name', None) @@ -215,12 +236,12 @@ def messages_to_prompt(body: dict, req_params: dict, max_tokens): if token_count >= req_params['truncation_length']: err_msg = f"This model maximum context length is {req_params['truncation_length']} tokens. However, your messages resulted in over {token_count} tokens." - raise InvalidRequestError(message=err_msg) + raise InvalidRequestError(message=err_msg, param='messages') if max_tokens > 0 and token_count + max_tokens > req_params['truncation_length']: err_msg = f"This model maximum context length is {req_params['truncation_length']} tokens. However, your messages resulted in over {token_count} tokens and max_tokens is {max_tokens}." 
print(f"Warning: ${err_msg}") - # raise InvalidRequestError(message=err_msg) + # raise InvalidRequestError(message=err_msg, params='max_tokens') return prompt, token_count @@ -251,6 +272,10 @@ def chat_completions(body: dict, is_legacy: bool = False) -> dict: # format the prompt from messages prompt, token_count = messages_to_prompt(body, req_params, max_tokens) + # set real max, avoid deeper errors + if req_params['max_new_tokens'] + token_count >= req_params['truncation_length']: + req_params['max_new_tokens'] = req_params['truncation_length'] - token_count + # generate reply ####################################### debug_msg({'prompt': prompt, 'req_params': req_params}) stopping_strings = req_params.pop('stopping_strings', []) @@ -267,7 +292,7 @@ def chat_completions(body: dict, is_legacy: bool = False) -> dict: completion_token_count = len(encode(answer)[0]) stop_reason = "stop" - if token_count + completion_token_count >= req_params['truncation_length'] or completion_token_count >= max_tokens: + if token_count + completion_token_count >= req_params['truncation_length'] or completion_token_count >= req_params['max_new_tokens']: stop_reason = "length" resp = { @@ -323,6 +348,10 @@ def stream_chat_completions(body: dict, is_legacy: bool = False): # format the prompt from messages prompt, token_count = messages_to_prompt(body, req_params, max_tokens) + # set real max, avoid deeper errors + if req_params['max_new_tokens'] + token_count >= req_params['truncation_length']: + req_params['max_new_tokens'] = req_params['truncation_length'] - token_count + def chat_streaming_chunk(content): # begin streaming chunk = { @@ -352,7 +381,6 @@ def chat_streaming_chunk(content): debug_msg({'prompt': prompt, 'req_params': req_params}) stopping_strings = req_params.pop('stopping_strings', []) - logprob_proc = req_params.pop('logprob_proc', None) generator = generate_reply(prompt, req_params, stopping_strings=stopping_strings, is_chat=False) @@ -375,13 +403,17 @@ def chat_streaming_chunk(content): if len_seen == 0 and new_content[0] == ' ': new_content = new_content[1:] - completion_token_count += len(encode(new_content)[0]) chunk = chat_streaming_chunk(new_content) yield chunk + # to get the correct token_count, strip leading space if present + if answer and answer[0] == ' ': + answer = answer[1:] + + completion_token_count = len(encode(answer)[0]) stop_reason = "stop" - if token_count + completion_token_count >= req_params['truncation_length'] or completion_token_count >= max_tokens: + if token_count + completion_token_count >= req_params['truncation_length'] or completion_token_count >= req_params['max_new_tokens']: stop_reason = "length" chunk = chat_streaming_chunk('') @@ -413,7 +445,7 @@ def completions(body: dict, is_legacy: bool = False): if prompt and isinstance(prompt[0], int): try: encoder = tiktoken.encoding_for_model(requested_model) - prompt = encode(encoder.decode(prompt))[0] + prompt = encoder.decode(prompt) except KeyError: prompt = decode(prompt)[0] else: @@ -441,7 +473,6 @@ def completions(body: dict, is_legacy: bool = False): # generate reply ####################################### debug_msg({'prompt': prompt, 'req_params': req_params}) stopping_strings = req_params.pop('stopping_strings', []) - logprob_proc = req_params.pop('logprob_proc', None) generator = generate_reply(prompt, req_params, stopping_strings=stopping_strings, is_chat=False) answer = '' @@ -475,7 +506,7 @@ def completions(body: dict, is_legacy: bool = False): } } - if logprob_proc: + if logprob_proc and 
logprob_proc.token_alternatives: top_logprobs = convert_logprobs_to_tiktoken(model=requested_model, logprobs=logprob_proc.token_alternatives) resp[resp_list][0]["logprobs"] = {'top_logprobs': [top_logprobs]} else: @@ -504,7 +535,7 @@ def stream_completions(body: dict, is_legacy: bool = False): if prompt and isinstance(prompt[0], int): try: encoder = tiktoken.encoding_for_model(requested_model) - prompt = encode(encoder.decode(prompt))[0] + prompt = encoder.decode(prompt) except KeyError: prompt = decode(prompt)[0] else: @@ -579,9 +610,13 @@ def text_streaming_chunk(content): chunk = text_streaming_chunk(new_content) - completion_token_count += len(encode(new_content)[0]) yield chunk + # to get the correct count, we strip the leading space if present + if answer and answer[0] == ' ': + answer = answer[1:] + + completion_token_count = len(encode(answer)[0]) stop_reason = "stop" if token_count + completion_token_count >= req_params['truncation_length'] or completion_token_count >= max_tokens: stop_reason = "length" diff --git a/extensions/openai/defaults.py b/extensions/openai/defaults.py index 7c4f1c44..52f0d641 100644 --- a/extensions/openai/defaults.py +++ b/extensions/openai/defaults.py @@ -46,8 +46,6 @@ def get_default_req_params(): return copy.deepcopy(default_req_params) # little helper to get defaults if arg is present but None and should be the same type as default. - - def default(dic, key, default): val = dic.get(key, default) if type(val) != type(default): diff --git a/extensions/openai/embeddings.py b/extensions/openai/embeddings.py index c02bb933..be4cd80b 100644 --- a/extensions/openai/embeddings.py +++ b/extensions/openai/embeddings.py @@ -1,43 +1,54 @@ import os from sentence_transformers import SentenceTransformer +import numpy as np from extensions.openai.utils import float_list_to_base64, debug_msg from extensions.openai.errors import * st_model = os.environ["OPENEDAI_EMBEDDING_MODEL"] if "OPENEDAI_EMBEDDING_MODEL" in os.environ else "all-mpnet-base-v2" embeddings_model = None +# OPENEDAI_EMBEDDING_DEVICE: auto (best or cpu), cpu, cuda, ipu, xpu, mkldnn, opengl, opencl, ideep, hip, ve, fpga, ort, xla, lazy, vulkan, mps, meta, hpu, mtia, privateuseone +embeddings_device = os.environ.get("OPENEDAI_EMBEDDING_DEVICE", "cpu") +if embeddings_device.lower() == 'auto': + embeddings_device = None - -def load_embedding_model(model): +def load_embedding_model(model: str) -> SentenceTransformer: + global embeddings_device, embeddings_model try: - emb_model = SentenceTransformer(model) - print(f"\nLoaded embedding model: {model}, max sequence length: {emb_model.max_seq_length}") + embeddings_model = 'loading...' # flag + # see: https://www.sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer + emb_model = SentenceTransformer(model, device=embeddings_device) + # ... emb_model.device doesn't seem to work, always cpu anyways? 
but specify cpu anyways to free more VRAM + print(f"\nLoaded embedding model: {model} on {emb_model.device} [always seems to say 'cpu', even if 'cuda'], max sequence length: {emb_model.max_seq_length}") except Exception as e: - print(f"\nError: Failed to load embedding model: {model}") + embeddings_model = None raise ServiceUnavailableError(f"Error: Failed to load embedding model: {model}", internal_message=repr(e)) return emb_model -def get_embeddings_model(): +def get_embeddings_model() -> SentenceTransformer: global embeddings_model, st_model if st_model and not embeddings_model: embeddings_model = load_embedding_model(st_model) # lazy load the model return embeddings_model -def get_embeddings_model_name(): +def get_embeddings_model_name() -> str: global st_model return st_model -def embeddings(input: list, encoding_format: str): +def get_embeddings(input: list) -> np.ndarray: + return get_embeddings_model().encode(input, convert_to_numpy=True, normalize_embeddings=True, convert_to_tensor=False, device=embeddings_device) + +def embeddings(input: list, encoding_format: str) -> dict: - embeddings = get_embeddings_model().encode(input).tolist() + embeddings = get_embeddings(input) if encoding_format == "base64": data = [{"object": "embedding", "embedding": float_list_to_base64(emb), "index": n} for n, emb in enumerate(embeddings)] else: - data = [{"object": "embedding", "embedding": emb, "index": n} for n, emb in enumerate(embeddings)] + data = [{"object": "embedding", "embedding": emb.tolist(), "index": n} for n, emb in enumerate(embeddings)] response = { "object": "list", diff --git a/extensions/openai/errors.py b/extensions/openai/errors.py index ff519c4f..838d1e7c 100644 --- a/extensions/openai/errors.py +++ b/extensions/openai/errors.py @@ -13,8 +13,8 @@ def __repr__(self): class InvalidRequestError(OpenAIError): - def __init__(self, message, param, code=400, error_type='InvalidRequestError', internal_message=''): - super(OpenAIError, self).__init__(message, code, error_type, internal_message) + def __init__(self, message, param, code=400, internal_message=''): + super().__init__(message, code, internal_message) self.param = param def __repr__(self): @@ -27,5 +27,5 @@ def __repr__(self): class ServiceUnavailableError(OpenAIError): - def __init__(self, message=None, code=500, error_type='ServiceUnavailableError', internal_message=''): - super(OpenAIError, self).__init__(message, code, error_type, internal_message) + def __init__(self, message="Service unavailable, please try again later.", code=503, internal_message=''): + super().__init__(message, code, internal_message) diff --git a/extensions/openai/images.py b/extensions/openai/images.py index d2be3192..9fdb625e 100644 --- a/extensions/openai/images.py +++ b/extensions/openai/images.py @@ -9,12 +9,16 @@ def generations(prompt: str, size: str, response_format: str, n: int): # Low effort implementation for compatibility. With only "prompt" being passed and assuming DALL-E # the results will be limited and likely poor. SD has hundreds of models and dozens of settings. # If you want high quality tailored results you should just use the Stable Diffusion API directly. - # it's too general an API to try and shape the result with specific tags like "masterpiece", etc, - # Will probably work best with the stock SD models. - # SD configuration is beyond the scope of this API. + # it's too general an API to try and shape the result with specific tags like negative prompts + # or "masterpiece", etc. 
SD configuration is beyond the scope of this API. # At this point I will not add the edits and variations endpoints (ie. img2img) because they # require changing the form data handling to accept multipart form data, also to properly support # url return types will require file management and a web serving files... Perhaps later! + base_model_size = 512 if not 'SD_BASE_MODEL_SIZE' in os.environ else int(os.environ.get('SD_BASE_MODEL_SIZE', 512)) + sd_defaults = { + 'sampler_name': 'DPM++ 2M Karras', # vast improvement + 'steps': 30, + } width, height = [int(x) for x in size.split('x')] # ignore the restrictions on size @@ -24,8 +28,21 @@ def generations(prompt: str, size: str, response_format: str, n: int): 'width': width, 'height': height, 'batch_size': n, - 'restore_faces': True, # slightly less horrible } + payload.update(sd_defaults) + + scale = min(width, height) / base_model_size + if scale >= 1.2: + # for better performance with the default size (1024), and larger res. + scaler = { + 'width': width // scale, + 'height': height // scale, + 'hr_scale': scale, + 'enable_hr': True, + 'hr_upscaler': 'Latent', + 'denoising_strength': 0.68, + } + payload.update(scaler) resp = { 'created': int(time.time()), @@ -38,7 +55,8 @@ def generations(prompt: str, size: str, response_format: str, n: int): response = requests.post(url=sd_url, json=payload) r = response.json() if response.status_code != 200 or 'images' not in r: - raise ServiceUnavailableError(r.get('detail', [{'msg': 'Unknown error calling Stable Diffusion'}])[0]['msg'], code=response.status_code) + print(r) + raise ServiceUnavailableError(r.get('error', 'Unknown error calling Stable Diffusion'), code=response.status_code, internal_message=r.get('errors',None)) # r['parameters']... for b64_json in r['images']: if response_format == 'b64_json': diff --git a/extensions/openai/moderations.py b/extensions/openai/moderations.py index 66dfec9f..5b06a672 100644 --- a/extensions/openai/moderations.py +++ b/extensions/openai/moderations.py @@ -1,7 +1,7 @@ import time import numpy as np from numpy.linalg import norm -from extensions.openai.embeddings import get_embeddings_model +from extensions.openai.embeddings import get_embeddings moderations_disabled = False # return 0/false @@ -11,21 +11,21 @@ flag_threshold = 0.5 -def get_category_embeddings(): +def get_category_embeddings() -> dict: global category_embeddings, categories if category_embeddings is None: - embeddings = get_embeddings_model().encode(categories).tolist() + embeddings = get_embeddings(categories).tolist() category_embeddings = dict(zip(categories, embeddings)) return category_embeddings -def cosine_similarity(a, b): +def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float: return np.dot(a, b) / (norm(a) * norm(b)) # seems most openai like with all-mpnet-base-v2 -def mod_score(a, b): +def mod_score(a: np.ndarray, b: np.ndarray) -> float: return 2.0 * np.dot(a, b) @@ -37,8 +37,7 @@ def moderations(input): "results": [], } - embeddings_model = get_embeddings_model() - if not embeddings_model or moderations_disabled: + if moderations_disabled: results['results'] = [{ 'categories': dict([(C, False) for C in categories]), 'category_scores': dict([(C, 0.0) for C in categories]), @@ -53,7 +52,7 @@ def moderations(input): input = [input] for in_str in input: - for ine in embeddings_model.encode([in_str]).tolist(): + for ine in get_embeddings([in_str]): category_scores = dict([(C, mod_score(category_embeddings[C], ine)) for C in categories]) category_flags = dict([(C, 
bool(category_scores[C] > flag_threshold)) for C in categories]) flagged = any(category_flags.values()) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index a0a5bcf6..86f2deb7 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -55,11 +55,13 @@ def start_sse(self): def send_sse(self, chunk: dict): response = 'data: ' + json.dumps(chunk) + '\r\n\r\n' - debug_msg(response) + debug_msg(response[:-4]) self.wfile.write(response.encode('utf-8')) def end_sse(self): - self.wfile.write('data: [DONE]\r\n\r\n'.encode('utf-8')) + response = 'data: [DONE]\r\n\r\n' + debug_msg(response[:-4]) + self.wfile.write(response.encode('utf-8')) def return_json(self, ret: dict, code: int = 200, no_debug=False): self.send_response(code) @@ -84,6 +86,7 @@ def openai_error(self, message, code=500, error_type='APIError', param='', inter } } if internal_message: + print(error_type, message) print(internal_message) # error_resp['internal_message'] = internal_message @@ -93,12 +96,10 @@ def openai_error_handler(func): def wrapper(self): try: func(self) - except ServiceUnavailableError as e: - self.openai_error(e.message, e.code, e.error_type, internal_message=e.internal_message) except InvalidRequestError as e: - self.openai_error(e.message, e.code, e.error_type, e.param, internal_message=e.internal_message) + self.openai_error(e.message, e.code, e.__class__.__name__, e.param, internal_message=e.internal_message) except OpenAIError as e: - self.openai_error(e.message, e.code, e.error_type, internal_message=e.internal_message) + self.openai_error(e.message, e.code, e.__class__.__name__, internal_message=e.internal_message) except Exception as e: self.openai_error(repr(e), 500, 'OpenAIError', internal_message=traceback.format_exc()) @@ -143,8 +144,7 @@ def do_POST(self): if '/completions' in self.path or '/generate' in self.path: if not shared.model: - self.openai_error("No model loaded.") - return + raise ServiceUnavailableError("No model loaded.") is_legacy = '/generate' in self.path is_streaming = body.get('stream', False) @@ -176,8 +176,7 @@ def do_POST(self): # deprecated if not shared.model: - self.openai_error("No model loaded.") - return + raise ServiceUnavailableError("No model loaded.") req_params = get_default_req_params() @@ -190,7 +189,10 @@ def do_POST(self): self.return_json(response) - elif '/images/generations' in self.path and 'SD_WEBUI_URL' in os.environ: + elif '/images/generations' in self.path: + if not 'SD_WEBUI_URL' in os.environ: + raise ServiceUnavailableError("Stable Diffusion not available. 
SD_WEBUI_URL not set.") + prompt = body['prompt'] size = default(body, 'size', '1024x1024') response_format = default(body, 'response_format', 'url') # or b64_json @@ -256,11 +258,11 @@ def run_server(): try: from flask_cloudflared import _run_cloudflared public_url = _run_cloudflared(params['port'], params['port'] + 1) - print(f'Starting OpenAI compatible api at\nOPENAI_API_BASE={public_url}/v1') + print(f'OpenAI compatible API ready at: OPENAI_API_BASE={public_url}/v1') except ImportError: print('You should install flask_cloudflared manually') else: - print(f'Starting OpenAI compatible api:\nOPENAI_API_BASE=http://{server_addr[0]}:{server_addr[1]}/v1') + print(f'OpenAI compatible API ready at: OPENAI_API_BASE=http://{server_addr[0]}:{server_addr[1]}/v1') server.serve_forever() diff --git a/extensions/openai/tokens.py b/extensions/openai/tokens.py index f243c3c9..f8d6737a 100644 --- a/extensions/openai/tokens.py +++ b/extensions/openai/tokens.py @@ -1,6 +1,6 @@ from extensions.openai.utils import float_list_to_base64 from modules.text_generation import encode, decode - +import numpy as np def token_count(prompt): tokens = encode(prompt)[0] @@ -12,14 +12,13 @@ def token_count(prompt): } -def token_encode(input, encoding_format=''): +def token_encode(input, encoding_format): # if isinstance(input, list): tokens = encode(input)[0] return { 'results': [{ - 'encoding_format': encoding_format, - 'tokens': float_list_to_base64(tokens) if encoding_format == "base64" else tokens, + 'tokens': tokens, 'length': len(tokens), }] } diff --git a/extensions/openai/utils.py b/extensions/openai/utils.py index 0c9441a3..abc1acbc 100644 --- a/extensions/openai/utils.py +++ b/extensions/openai/utils.py @@ -3,9 +3,9 @@ import numpy as np -def float_list_to_base64(float_list): +def float_list_to_base64(float_array: np.ndarray) -> str: # Convert the list to a float32 array that the OpenAPI client expects - float_array = np.array(float_list, dtype="float32") + #float_array = np.array(float_list, dtype="float32") # Get raw bytes bytes_array = float_array.tobytes() diff --git a/extensions/send_pictures/script.py b/extensions/send_pictures/script.py index 63421743..39c9362a 100644 --- a/extensions/send_pictures/script.py +++ b/extensions/send_pictures/script.py @@ -9,8 +9,6 @@ from modules.ui import gather_interface_values from modules.utils import gradio -# If 'state' is True, will hijack the next chat generation with -# custom input text given by 'value' in the format [text, visible_text] input_hijack = { 'state': False, 'value': ["", ""] @@ -20,6 +18,15 @@ model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=torch.float32).to("cpu") +def chat_input_modifier(text, visible_text, state): + global input_hijack + if input_hijack['state']: + input_hijack['state'] = False + return input_hijack['value'] + else: + return text, visible_text + + def caption_image(raw_image): inputs = processor(raw_image.convert('RGB'), return_tensors="pt").to("cpu", torch.float32) out = model.generate(**inputs, max_new_tokens=100) @@ -42,7 +49,10 @@ def ui(): # Prepare the input hijack, update the interface values, call the generation function, and clear the picture picture_select.upload( - lambda picture, name1, name2: input_hijack.update({"state": True, "value": generate_chat_picture(picture, name1, name2)}), [picture_select, shared.gradio['name1'], shared.gradio['name2']], None).then( + lambda picture, name1, name2: input_hijack.update({ + "state": True, + "value": 
generate_chat_picture(picture, name1, name2)
+        }), [picture_select, shared.gradio['name1'], shared.gradio['name2']], None).then(
         gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.generate_chat_reply_wrapper, shared.input_params, gradio('display', 'history'), show_progress=False).then(
         lambda: None, None, picture_select, show_progress=False)
diff --git a/extensions/superbooga/download_urls.py b/extensions/superbooga/download_urls.py
index efe300d2..424a9885 100644
--- a/extensions/superbooga/download_urls.py
+++ b/extensions/superbooga/download_urls.py
@@ -4,7 +4,10 @@
 
 
 def download_single(url):
-    response = requests.get(url, timeout=5)
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+    }
+    response = requests.get(url, headers=headers, timeout=5)
     if response.status_code == 200:
         return response.content
     else:
diff --git a/extensions/whisper_stt/script.py b/extensions/whisper_stt/script.py
index 1e07ad2c..cdc55687 100644
--- a/extensions/whisper_stt/script.py
+++ b/extensions/whisper_stt/script.py
@@ -16,6 +16,15 @@
 }
 
 
+def chat_input_modifier(text, visible_text, state):
+    global input_hijack
+    if input_hijack['state']:
+        input_hijack['state'] = False
+        return input_hijack['value']
+    else:
+        return text, visible_text
+
+
 def do_stt(audio, whipser_model, whipser_language):
     transcription = ""
     r = sr.Recognizer()
@@ -56,6 +65,7 @@ def ui():
     audio.change(
         auto_transcribe, [audio, auto_submit, whipser_model, whipser_language], [shared.gradio['textbox'], audio]).then(
         None, auto_submit, None, _js="(check) => {if (check) { document.getElementById('Generate').click() }}")
+
     whipser_model.change(lambda x: params.update({"whipser_model": x}), whipser_model, None)
     whipser_language.change(lambda x: params.update({"whipser_language": x}), whipser_language, None)
     auto_submit.change(lambda x: params.update({"auto_submit": x}), auto_submit, None)
diff --git a/models/config.yaml b/models/config.yaml
index 86d7293f..f9d0a4c0 100644
--- a/models/config.yaml
+++ b/models/config.yaml
@@ -274,9 +274,9 @@ TheBloke_WizardLM-30B-GPTQ:
   instruction_template: 'Alpaca'
 .*llama-(2|v2):
   truncation_length: 4096
+  rms_norm_eps: 5.0e-6
 .*llama-(2|v2).*chat:
   mode: 'instruct'
   instruction_template: 'Llama-v2'
 .*llama.*70b.*ggml.*\.bin:
   n_gqa: 8
-  rms_norm_eps: 1.0e-5
diff --git a/modules/ComputeDevice.py b/modules/ComputeDevice.py
new file mode 100644
index 00000000..999ab086
--- /dev/null
+++ b/modules/ComputeDevice.py
@@ -0,0 +1,164 @@
+import math
+import os
+import psutil
+import re
+import torch
+import unittest
+
+
+# from modules import shared
+import modules.shared as shared
+from modules.logging_colors import logger
+
+class ComputeDevice:
+    '''
+    Keep a list of all instances so we can use class methods for operating on all of them at once, like resetting, re-initializing, or anything else we might want to do.
+    '''
+    devices = []
+
+    def __init__(self, device_type=None):
+        if device_type and ':' in device_type:
+            self.device_type, self.local_rank = device_type.split(':')
+            self.local_rank = int(self.local_rank)
+        else:
+            self.device_type = device_type if device_type else self.select_device()
+            self.local_rank = self.get_local_rank()
+
+        self.device = torch.device(self.device_type, self.local_rank)
+        ComputeDevice.devices.append(self)
+
+        # Initialize memory attributes
+        self.system_memory = None
+        self.gpu_memory = None
+        self.cpu_memory = None
+        # Calculate memory
+        self.total_mem = self.calculate_memory()
+        # Call the methods to set the device and memory attributes
+        self.select_device()
+        self.calculate_memory()
+
+    @classmethod
+    def clear_all_cache(cls):
+        '''
+        This frees all cache space used by every device of the ComputeDevice class we have created.
+        '''
+        for device in list(cls.devices):  # iterate over a copy, since clear_cache() removes the device from the list
+            device.clear_cache()
+
+    def clear_cache(self):
+        '''
+        This clears the cache for the torch device passed to us.
+        '''
+        if self.device_type == 'cuda':
+            torch.cuda.empty_cache()
+        elif self.device_type == 'mps':
+            torch.mps.empty_cache()
+
+        # Remove the device from the list
+        ComputeDevice.devices.remove(self)
+
+    def get_local_rank(self):
+        '''
+        Get the local rank, as assigned in the config or as an environment variable.
+        '''
+        try:
+            local_rank = shared.args.local_rank
+        except TypeError:
+            local_rank = int(os.getenv("LOCAL_RANK", "0"))
+        return local_rank
+
+    def select_device(self):
+        '''
+        This will contain the logic to select the appropriate device (CUDA, MPS, CPU)
+
+        Default is CPU
+
+        Local rank is just an index of the torch device.
+
+        The statement: torch.device('cuda:0')
+        Is identical to: torch.device('cuda', 0)
+
+        '''
+        local_rank = self.get_local_rank()
+        if torch.cuda.is_available():
+            return 'cuda'
+        elif torch.backends.mps.is_available():
+            return 'mps'
+        else:
+            return 'cpu'
+
+    def calculate_memory(self):
+        '''
+        Perform all memory calculations to determine total system memory, total GPU memory, and CPU memory available for use by the application. Some of these are adjusted by amounts for reservations specified in the config files.
+        '''
+        self.system_memory = math.floor(psutil.virtual_memory().total / (1024 * 1024))
+
+        # Check for MPS, CUDA, or CPU and calculate total memory accordingly
+        if torch.backends.mps.is_available():
+            self.gpu_memory = [self.system_memory]
+        elif torch.cuda.is_available():
+            self.gpu_memory = [math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024)) for i in range(torch.cuda.device_count())]
+        else:
+            self.gpu_memory = [self.system_memory]
+
+        # Calculate default reserved GPU memory
+        self.default_gpu_mem = []
+        if shared.args.gpu_memory is not None and len(shared.args.gpu_memory) > 0:
+            for i in shared.args.gpu_memory:
+                if 'mib' in i.lower():
+                    self.default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)))
+                else:
+                    self.default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)) * 1000)
+        while len(self.default_gpu_mem) < len(self.gpu_memory):
+            self.default_gpu_mem.append(0)
+
+        # Calculate default reserved CPU memory
+        if shared.args.cpu_memory is not None:
+            self.cpu_memory = int(re.sub('[a-zA-Z ]', '', shared.args.cpu_memory))
+        else:
+            self.cpu_memory = 0
+
+        # Calculate the total available memory for the application
+        self.total_mem = [gm - dgm for gm, dgm in zip(self.gpu_memory, self.default_gpu_mem)]
+        self.total_mem.append(self.system_memory - self.cpu_memory)
+
+
+
+# Unit testing for this class.
+class TestComputeDevice(unittest.TestCase):
+    def setUp(self):
+        self.device = ComputeDevice('cpu')
+
+    def test_device_type(self):
+        self.assertEqual(self.device.device_type, 'cpu')
+
+    def test_local_rank(self):
+        self.assertEqual(self.device.local_rank, 0)
+
+    def test_device(self):
+        self.assertEqual(self.device.device.type, 'cpu')
+
+    def test_memory_calculation(self):
+        self.assertIsNotNone(self.device.system_memory)
+        self.assertIsNotNone(self.device.gpu_memory)
+        self.assertIsNotNone(self.device.cpu_memory)
+
+    def test_clear_cache(self):
+        # This is a bit tricky to test as it doesn't return anything
+        # But at least we can check it doesn't raise an error
+        try:
+            self.device.clear_cache()
+        except Exception as e:
+            self.fail(f"clear_cache raised an exception: {e}")
+
+    def test_clear_all_cache(self):
+        # Similar to test_clear_cache
+        try:
+            ComputeDevice.clear_all_cache()
+        except Exception as e:
+            self.fail(f"clear_all_cache raised an exception: {e}")
+
+# If this is run directly from the command line, rather than imported, it will
+# run the unit tests
+if __name__ == '__main__':
+    unittest.main()
diff --git a/modules/chat.py b/modules/chat.py
index d2423555..f684768b 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -175,10 +175,8 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
 
     # Preparing the input
     if not any((regenerate, _continue)):
-        text, visible_text = apply_extensions('input_hijack', text, visible_text)
-        if visible_text is None:
-            visible_text = text
-
+        visible_text = text
+        text, visible_text = apply_extensions('chat_input', text, visible_text, state)
         text = apply_extensions('input', text, state)
 
     # *Is typing...*
diff --git a/modules/deepspeed_parameters.py b/modules/deepspeed_parameters.py
index 9116f579..f170a385 100644
--- a/modules/deepspeed_parameters.py
+++ b/modules/deepspeed_parameters.py
@@ -1,6 +1,6 @@
 def generate_ds_config(ds_bf16, train_batch_size, nvme_offload_dir):
     '''
-    DeepSpeed configration
+    DeepSpeed configuration
     https://huggingface.co/docs/transformers/main_classes/deepspeed
     '''
 
diff --git a/modules/extensions.py b/modules/extensions.py
index faf6cf6d..76b6be8b 100644
--- a/modules/extensions.py
+++ b/modules/extensions.py
@@ -1,13 +1,12 @@
 import traceback
 from functools import partial
+from inspect import signature
 
 import gradio as gr
 
 import extensions
 import modules.shared as shared
 from modules.logging_colors import logger
-from inspect import signature
-
 
 state = {}
 available_extensions = []
@@ -66,15 +65,11 @@ def _apply_string_extensions(function_name, text, state):
     return text
 
 
-# Input hijack of extensions
-def _apply_input_hijack(text, visible_text):
+# Extension functions that modify the chat input
+def _apply_chat_input_extensions(text, visible_text, state):
     for extension, _ in iterator():
-        if hasattr(extension, 'input_hijack') and extension.input_hijack['state']:
-            extension.input_hijack['state'] = False
-            if callable(extension.input_hijack['value']):
-                text, visible_text = extension.input_hijack['value'](text, visible_text)
-            else:
-                text, visible_text = extension.input_hijack['value']
+        if hasattr(extension, 'chat_input_modifier'):
+            text, visible_text = extension.chat_input_modifier(text, visible_text, state)
 
     return text, visible_text
 
@@ -120,7 +115,11 @@ def _apply_tokenizer_extensions(function_name, state, prompt, input_ids, input_e
 def _apply_logits_processor_extensions(function_name, processor_list, input_ids):
     for extension, _ in iterator():
         if hasattr(extension, function_name):
-            
getattr(extension, function_name)(processor_list, input_ids) + result = getattr(extension, function_name)(processor_list, input_ids) + if type(result) is list: + processor_list = result + + return processor_list # Get prompt length in tokens after applying extension functions which override the default tokenizer output @@ -187,12 +186,12 @@ def create_extensions_tabs(): EXTENSION_MAP = { "input": partial(_apply_string_extensions, "input_modifier"), "output": partial(_apply_string_extensions, "output_modifier"), + "chat_input": _apply_chat_input_extensions, "state": _apply_state_modifier_extensions, "history": _apply_history_modifier_extensions, "bot_prefix": partial(_apply_string_extensions, "bot_prefix_modifier"), "tokenizer": partial(_apply_tokenizer_extensions, "tokenizer_modifier"), 'logits_processor': partial(_apply_logits_processor_extensions, 'logits_processor_modifier'), - "input_hijack": _apply_input_hijack, "custom_generate_chat_prompt": _apply_custom_generate_chat_prompt, "custom_generate_reply": _apply_custom_generate_reply, "tokenized_length": _apply_custom_tokenized_length, diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py index 94d893c4..bbc51100 100644 --- a/modules/llamacpp_hf.py +++ b/modules/llamacpp_hf.py @@ -99,6 +99,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P 'n_gpu_layers': shared.args.n_gpu_layers, 'rope_freq_base': 10000 * shared.args.alpha_value ** (64/63.), 'rope_freq_scale': 1.0 / shared.args.compress_pos_emb, + 'n_gqa': shared.args.n_gqa or None, + 'rms_norm_eps': shared.args.rms_norm_eps or None, 'logits_all': True, } diff --git a/modules/loaders.py b/modules/loaders.py index b760128f..c55cf0ff 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -30,6 +30,8 @@ ], 'llama.cpp': [ 'n_ctx', + 'n_gqa', + 'rms_norm_eps', 'n_gpu_layers', 'n_batch', 'threads', @@ -42,6 +44,8 @@ ], 'llamacpp_HF': [ 'n_ctx', + 'n_gqa', + 'rms_norm_eps', 'n_gpu_layers', 'n_batch', 'threads', diff --git a/modules/models_settings.py b/modules/models_settings.py index 3f37e48d..9319582e 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -30,8 +30,6 @@ def infer_loader(model_name): loader = 'llama.cpp' elif re.match('.*rwkv.*\.pth', model_name.lower()): loader = 'RWKV' - elif shared.args.flexgen: - loader = 'FlexGen' else: loader = 'Transformers' diff --git a/modules/shared.py b/modules/shared.py index f0a426a0..8558bbc0 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -31,8 +31,11 @@ # For restarting the interface need_restart = False +# Graceful Shutdown +run_server = True + settings = { - 'dark_theme': False, + 'dark_theme': True, 'autoload_model': False, 'max_new_tokens': 200, 'max_new_tokens_min': 1, @@ -96,7 +99,7 @@ def str2bool(v): parser.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.') # Model loader -parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, flexgen') +parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv') # Accelerate/transformers parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. 
Warning: Training on CPU is extremely slow.') @@ -128,6 +131,8 @@ def str2bool(v): parser.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.') parser.add_argument('--n_ctx', type=int, default=2048, help='Size of the prompt context.') parser.add_argument('--llama_cpp_seed', type=int, default=0, help='Seed for llama-cpp models. Default 0 (random)') +parser.add_argument('--n_gqa', type=int, default=0, help='grouped-query attention. Must be 8 for llama2 70b.') +parser.add_argument('--rms_norm_eps', type=float, default=0, help='Must be 1e-5 for llama2 70b.') # GPTQ parser.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.') @@ -152,14 +157,6 @@ def str2bool(v): # ExLlama parser.add_argument('--gpu-split', type=str, help="Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. 20,7,7") parser.add_argument('--max_seq_len', type=int, default=2048, help="Maximum sequence length.") -parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.") -parser.add_argument('--alpha_value', type=int, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Same as above. Use either this or compress_pos_emb, not both.") - -# FlexGen -parser.add_argument('--flexgen', action='store_true', help='DEPRECATED') -parser.add_argument('--percent', type=int, nargs="+", default=[0, 100, 100, 0, 100, 0], help='FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0).') -parser.add_argument("--compress-weight", action="store_true", help="FlexGen: activate weight compression.") -parser.add_argument("--pin-weight", type=str2bool, nargs="?", const=True, default=True, help="FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%%).") # DeepSpeed parser.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.') @@ -170,6 +167,10 @@ def str2bool(v): parser.add_argument('--rwkv-strategy', type=str, default=None, help='RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8".') parser.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile the CUDA kernel for better performance.') +# RoPE +parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.") +parser.add_argument('--alpha_value', type=int, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both.") + # Gradio parser.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.') parser.add_argument('--listen-host', type=str, help='The hostname that the server will use.') @@ -198,9 +199,6 @@ def str2bool(v): if args.gptq_for_llama: logger.warning('--gptq-for-llama has been deprecated and will be removed soon. Use --loader gptq-for-llama instead.') args.loader = 'gptq-for-llama' -if args.flexgen: - logger.warning('--flexgen has been deprecated and will be removed soon. 
Use --loader flexgen instead.') - args.loader = 'FlexGen' # Security warnings if args.trust_remote_code: diff --git a/modules/text_generation.py b/modules/text_generation.py index d3939d3f..e1be6aa3 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -53,8 +53,6 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel'] or shared.args.cpu: return input_ids - elif shared.args.flexgen: - return input_ids.numpy() elif shared.args.deepspeed: return input_ids.to(device=local_rank) elif torch.backends.mps.is_available(): @@ -182,8 +180,6 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False): if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel']: generate_func = generate_reply_custom - elif shared.args.flexgen: - generate_func = generate_reply_flexgen else: generate_func = generate_reply_HF @@ -339,66 +335,3 @@ def generate_reply_custom(question, original_question, seed, state, stopping_str new_tokens = len(encode(original_question + reply)[0]) - original_tokens print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') return - - -def generate_reply_flexgen(question, original_question, seed, state, stopping_strings=None, is_chat=False): - generate_params = {} - for k in ['max_new_tokens', 'do_sample', 'temperature']: - generate_params[k] = state[k] - - if state['stream']: - generate_params['max_new_tokens'] = 8 - - # Encode the input - input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state)) - output = input_ids[0] - - # Find the eos tokens - eos_token_ids = [shared.tokenizer.eos_token_id] if shared.tokenizer.eos_token_id is not None else [] - if not state['ban_eos_token']: - generate_params['stop'] = eos_token_ids[-1] - - # Add the encoded tokens to generate_params - question, input_ids, inputs_embeds = apply_extensions('tokenizer', state, question, input_ids, None) - original_input_ids = input_ids - generate_params.update({'inputs': input_ids}) - if inputs_embeds is not None: - generate_params.update({'inputs_embeds': inputs_embeds}) - - t0 = time.time() - try: - if not is_chat: - yield '' - - # Generate the entire reply at once. 
- if not state['stream']: - with torch.no_grad(): - output = shared.model.generate(**generate_params)[0] - - yield get_reply_from_output_ids(output, input_ids, original_question, state, is_chat=is_chat) - - # Stream the output naively for FlexGen since it doesn't support 'stopping_criteria' - else: - for i in range(state['max_new_tokens'] // 8 + 1): - if shared.stop_everything: - break - - clear_torch_cache() - with torch.no_grad(): - output = shared.model.generate(**generate_params)[0] - - if np.count_nonzero(np.isin(input_ids[0], eos_token_ids)) < np.count_nonzero(np.isin(output, eos_token_ids)): - break - - yield get_reply_from_output_ids(output, original_input_ids, original_question, state) - input_ids = np.reshape(output, (1, output.shape[0])) - generate_params.update({'inputs': input_ids}) - - except Exception: - traceback.print_exc() - finally: - t1 = time.time() - original_tokens = len(original_input_ids[0]) - new_tokens = len(output) - (original_tokens if not shared.is_seq2seq else 0) - print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') - return diff --git a/modules/training.py b/modules/training.py index 1f8e5e5e..c98fded2 100644 --- a/modules/training.py +++ b/modules/training.py @@ -445,9 +445,9 @@ def tokenize(prompt, append_eos_token=False): def generate_prompt(data_point: dict[str, str]): for options, data in format_data.items(): - if set(options.split(',')) == set(x[0] for x in data_point.items() if (x[1] is not None and len(x[1].strip()) > 0)): + if set(options.split(',')) == set(x[0] for x in data_point.items() if (type(x[1]) is str and len(x[1].strip()) > 0)): for key, val in data_point.items(): - if val is not None: + if type(val) is str: data = data.replace(f'%{key}%', val) return data raise RuntimeError(f'Data-point "{data_point}" has no keyset match within format "{list(format_data.keys())}"') diff --git a/modules/ui.py b/modules/ui.py index 704be925..d9b3a131 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -61,6 +61,8 @@ def list_model_elements(): 'mlock', 'n_gpu_layers', 'n_ctx', + 'n_gqa', + 'rms_norm_eps', 'llama_cpp_seed', 'gpu_split', 'max_seq_len', @@ -161,7 +163,10 @@ def apply_interface_values(state, use_persistent=False): class ToolButton(gr.Button, gr.components.IOComponent): - """Small button with single emoji as text, fits inside gradio forms""" + """ + Small button with single emoji as text, fits inside gradio forms + Copied from https://github.com/AUTOMATIC1111/stable-diffusion-webui + """ def __init__(self, **kwargs): super().__init__(**kwargs) @@ -171,6 +176,9 @@ def get_block_name(self): def create_refresh_button(refresh_component, refresh_method, refreshed_args, elem_class): + """ + Copied from https://github.com/AUTOMATIC1111/stable-diffusion-webui + """ def refresh(): refresh_method() args = refreshed_args() if callable(refreshed_args) else refreshed_args diff --git a/modules/utils.py b/modules/utils.py index e257de2d..9ae5dc86 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -71,10 +71,7 @@ def natural_keys(text): def get_available_models(): - if shared.args.flexgen: - return sorted([re.sub('-np$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if item.name.endswith('-np')], key=natural_keys) - else: - return sorted([re.sub('.pth$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json', '.yaml'))], key=natural_keys) + return 
sorted([re.sub('.pth$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json', '.yaml'))], key=natural_keys)
 
 
 def get_available_presets():
diff --git a/server.py b/server.py
index 7231dfef..a52655a7 100644
--- a/server.py
+++ b/server.py
@@ -1,5 +1,6 @@
 import os
 import warnings
+import cProfile
 
 from modules.logging_colors import logger
 from modules.block_requests import OpenMonkeyPatch, RequestBlocker
@@ -215,10 +216,14 @@ def create_model_menus():
                         shared.gradio['transformers_info'] = gr.Markdown('load-in-4bit params:')
                         shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype)
                         shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type)
-                        shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads)
-                        shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch)
+
                         shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=1024, value=shared.args.n_gpu_layers)
                         shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=16384, step=256, label="n_ctx", value=shared.args.n_ctx)
+                        shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads)
+                        shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch)
+                        shared.gradio['n_gqa'] = gr.Slider(minimum=0, maximum=16, step=1, label="n_gqa", value=shared.args.n_gqa, info='grouped-query attention. Must be 8 for llama2 70b.')
+                        shared.gradio['rms_norm_eps'] = gr.Slider(minimum=0, maximum=1e-5, step=1e-6, label="rms_norm_eps", value=shared.args.rms_norm_eps, info='5e-6 is a good value for llama2 70b.')
+
                         shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=str(shared.args.wbits) if shared.args.wbits > 0 else "None")
                         shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=str(shared.args.groupsize) if shared.args.groupsize > 0 else "None")
                         shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None", "llama", "opt", "gptj"], value=shared.args.model_type or "None")
@@ -246,6 +251,7 @@ def create_model_menus():
                         shared.gradio['low_vram'] = gr.Checkbox(label="low-vram", value=shared.args.low_vram)
                         shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
                         shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed)
+#                        shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17')
                         shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.')
                         shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa is currently 2x faster than AutoGPTQ on some systems. It is installed by default with the one-click installers. 
Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
                         shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).')
@@ -318,7 +324,7 @@ def create_settings_menus(default_preset):
     with gr.Row():
         with gr.Column():
             with gr.Row():
-                shared.gradio['preset_menu'] = gr.Dropdown(choices=utils.get_available_presets(), value=default_preset if not shared.args.flexgen else 'Naive', label='Generation parameters preset', elem_classes='slim-dropdown')
+                shared.gradio['preset_menu'] = gr.Dropdown(choices=utils.get_available_presets(), value=default_preset, label='Generation parameters preset', elem_classes='slim-dropdown')
                 ui.create_refresh_button(shared.gradio['preset_menu'], lambda: None, lambda: {'choices': utils.get_available_presets()}, 'refresh-button')
                 shared.gradio['save_preset'] = gr.Button('💾', elem_classes='refresh-button')
                 shared.gradio['delete_preset'] = gr.Button('🗑️', elem_classes='refresh-button')
@@ -373,7 +379,6 @@ def create_settings_menus(default_preset):
                         1) Midnight Enigma
                         2) Yara
                         3) Shortwave
-                        4) Kobold-Godlike
 
                         ### Temperature
                         Primary factor to control randomness of outputs. 0 = deterministic (only the most likely token is used). Higher value = more randomness.
@@ -566,6 +571,8 @@ def set_interface_arguments(interface_mode, extensions, bool_active):
 
     shared.need_restart = True
 
+def shutdown_server(interface_mode, extensions, bool_active):
+    shared.run_server = False
 
 def create_interface():
 
@@ -591,6 +598,21 @@ def create_interface():
     if shared.args.extensions is not None and len(shared.args.extensions) > 0:
         extensions_module.load_extensions()
 
+    # Forcing some events to be triggered on page load
+    shared.persistent_interface_state.update({
+        'loader': shared.args.loader or 'Transformers',
+    })
+
+    if shared.is_chat():
+        shared.persistent_interface_state.update({
+            'mode': shared.settings['mode'],
+            'character_menu': shared.args.character or shared.settings['character'],
+            'instruction_template': shared.settings['instruction_template']
+        })
+
+    if Path("cache/pfp_character.png").exists():
+        Path("cache/pfp_character.png").unlink()
+
     # css/js strings
     css = ui.css if not shared.is_chat() else ui.css + ui.chat_css
     js = ui.main_js if not shared.is_chat() else ui.main_js + ui.chat_js
@@ -838,11 +860,18 @@ def create_interface():
                         extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.')
                         extension_status = gr.Markdown()
 
+                shared.gradio['stop_server'] = gr.Button("Shut Down the Server", elem_classes="small-button", variant="primary")
+
             extension_name.submit(
                 clone_or_pull_repository, extension_name, extension_status, show_progress=False).then(
                 lambda: gr.update(choices=utils.get_available_extensions(), value=shared.args.extensions), None, gradio('extensions_menu'))
 
+            # Stop Server
+            shared.gradio['stop_server'].click(
+                shutdown_server, gradio('interface_modes_menu', 'extensions_menu', 'bool_menu'), None).then(
+                lambda: None, None, None, _js='() => {document.body.innerHTML=\'

Shutting Down Server

\'; setTimeout(function(){location.reload()},2500); return []}')
+
             # Reset interface event
             shared.gradio['reset_interface'].click(
                 set_interface_arguments, gradio('interface_modes_menu', 'extensions_menu', 'bool_menu'), None).then(
@@ -1052,11 +1081,11 @@ def create_interface():
 
     create_file_saving_event_handlers()
 
-    shared.gradio['interface'].load(lambda: None, None, None, _js=f"() => {{{js}}}")
-    shared.gradio['interface'].load(partial(ui.apply_interface_values, {}, use_persistent=True), None, gradio(ui.list_interface_input_elements()), show_progress=False)
     if shared.settings['dark_theme']:
         shared.gradio['interface'].load(lambda: None, None, None, _js="() => document.getElementsByTagName('body')[0].classList.add('dark')")
 
+    shared.gradio['interface'].load(lambda: None, None, None, _js=f"() => {{{js}}}")
+    shared.gradio['interface'].load(partial(ui.apply_interface_values, {}, use_persistent=True), None, gradio(ui.list_interface_input_elements()), show_progress=False)
     if shared.is_chat():
         shared.gradio['interface'].load(chat.redraw_html, shared.reload_inputs, gradio('display'))
 
@@ -1075,7 +1104,7 @@ def create_interface():
     shared.gradio['interface'].launch(prevent_thread_lock=True, share=shared.args.share, server_port=shared.args.listen_port, inbrowser=shared.args.auto_launch, auth=auth)
 
 
-if __name__ == "__main__":
+def main():
     # Loading custom settings
     settings_file = None
     if shared.args.settings is not None and Path(shared.args.settings).exists():
@@ -1102,6 +1131,8 @@ def create_interface():
         'skip_special_tokens': shared.settings['skip_special_tokens'],
         'custom_stopping_strings': shared.settings['custom_stopping_strings'],
         'truncation_length': shared.settings['truncation_length'],
+        'n_gqa': 0,
+        'rms_norm_eps': 0,
     }
 
     shared.model_config.move_to_end('.*', last=False)  # Move to the beginning
@@ -1152,26 +1183,11 @@ def create_interface():
     if shared.args.lora:
         add_lora_to_model(shared.args.lora)
 
-    # Forcing some events to be triggered on page load
-    shared.persistent_interface_state.update({
-        'loader': shared.args.loader or 'Transformers',
-    })
-
-    if shared.is_chat():
-        shared.persistent_interface_state.update({
-            'mode': shared.settings['mode'],
-            'character_menu': shared.args.character or shared.settings['character'],
-            'instruction_template': shared.settings['instruction_template']
-        })
-
-    if Path("cache/pfp_character.png").exists():
-        Path("cache/pfp_character.png").unlink()
-
     shared.generation_lock = Lock()
+
     # Launch the web UI
     create_interface()
-    while True:
+    while shared.run_server:
         time.sleep(0.5)
         if shared.need_restart:
             shared.need_restart = False
@@ -1179,3 +1195,11 @@ def create_interface():
         shared.gradio['interface'].close()
         time.sleep(0.5)
         create_interface()
+
+if __name__ == "__main__":
+    if os.getenv('DEBUG_PROF') == '1':
+        logger.info("Profiling activated, sending information to output.prof")
+        cProfile.run('main()', 'output.prof')
+    else:
+        main()
+
diff --git a/settings-template.yaml b/settings-template.yaml
index de2c73d3..62351e54 100644
--- a/settings-template.yaml
+++ b/settings-template.yaml
@@ -1,4 +1,4 @@
-dark_theme: false
+dark_theme: True
 autoload_model: false
 max_new_tokens: 200
 max_new_tokens_min: 1

From 82a9a62f4b22ccadab820c437a7436b129cfbc18 Mon Sep 17 00:00:00 2001
From: M S 
Date: Sat, 5 Aug 2023 14:23:43 -0500
Subject: [PATCH 02/13] More changes.
--- .gitignore | 1 + docker/.dockerignore | 9 ---- docker/.env.example | 30 ----------- docker/Dockerfile | 68 ------------------------ docker/docker-compose.yml | 32 ----------- modules/AutoGPTQ_loader.py | 3 +- modules/ComputeDevice.py | 106 ++++++++++++++++++++++++++++++------- modules/GPTQ_loader.py | 18 +++---- modules/RWKV.py | 1 + modules/callbacks.py | 4 +- modules/llamacpp_hf.py | 3 +- modules/llamacpp_model.py | 6 ++- modules/models.py | 30 +++++------ modules/sampler_hijack.py | 4 +- modules/text_generation.py | 19 +++---- server.py | 78 ++++++++++++--------------- 16 files changed, 166 insertions(+), 246 deletions(-) delete mode 100644 docker/.dockerignore delete mode 100644 docker/.env.example delete mode 100644 docker/Dockerfile delete mode 100644 docker/docker-compose.yml diff --git a/.gitignore b/.gitignore index a529ce65..6f766ebc 100644 --- a/.gitignore +++ b/.gitignore @@ -37,3 +37,4 @@ Thumbs.db *.swp .*.un~ +ouput.prof diff --git a/docker/.dockerignore b/docker/.dockerignore deleted file mode 100644 index 6073533e..00000000 --- a/docker/.dockerignore +++ /dev/null @@ -1,9 +0,0 @@ -.env -Dockerfile -/characters -/loras -/models -/presets -/prompts -/softprompts -/training diff --git a/docker/.env.example b/docker/.env.example deleted file mode 100644 index 3119a9f0..00000000 --- a/docker/.env.example +++ /dev/null @@ -1,30 +0,0 @@ -# by default the Dockerfile specifies these versions: 3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX -# however for me to work i had to specify the exact version for my card ( 2060 ) it was 7.5 -# https://developer.nvidia.com/cuda-gpus you can find the version for your card here -TORCH_CUDA_ARCH_LIST=7.5 - -# these commands worked for me with roughly 4.5GB of vram -CLI_ARGS=--model llama-7b-4bit --wbits 4 --listen --auto-devices - -# the following examples have been tested with the files linked in docs/README_docker.md: -# example running 13b with 4bit/128 groupsize : CLI_ARGS=--model llama-13b-4bit-128g --wbits 4 --listen --groupsize 128 --pre_layer 25 -# example with loading api extension and public share: CLI_ARGS=--model llama-7b-4bit --wbits 4 --listen --auto-devices --no-stream --extensions api --share -# example running 7b with 8bit groupsize : CLI_ARGS=--model llama-7b --load-in-8bit --listen --auto-devices - -# the port the webui binds to on the host -HOST_PORT=7860 -# the port the webui binds to inside the container -CONTAINER_PORT=7860 - -# the port the api binds to on the host -HOST_API_PORT=5000 -# the port the api binds to inside the container -CONTAINER_API_PORT=5000 - -# the port the api stream endpoint binds to on the host -HOST_API_STREAM_PORT=5005 -# the port the api stream endpoint binds to inside the container -CONTAINER_API_STREAM_PORT=5005 - -# the version used to install text-generation-webui from -WEBUI_VERSION=HEAD diff --git a/docker/Dockerfile b/docker/Dockerfile deleted file mode 100644 index 7cc0ff15..00000000 --- a/docker/Dockerfile +++ /dev/null @@ -1,68 +0,0 @@ -FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as builder - -RUN apt-get update && \ - apt-get install --no-install-recommends -y git vim build-essential python3-dev python3-venv && \ - rm -rf /var/lib/apt/lists/* - -RUN git clone https://github.com/oobabooga/GPTQ-for-LLaMa /build - -WORKDIR /build - -RUN python3 -m venv /build/venv -RUN . 
/build/venv/bin/activate && \ - pip3 install --upgrade pip setuptools wheel && \ - pip3 install torch torchvision torchaudio && \ - pip3 install -r requirements.txt - -# https://developer.nvidia.com/cuda-gpus -# for a rtx 2060: ARG TORCH_CUDA_ARCH_LIST="7.5" -ARG TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX" -RUN . /build/venv/bin/activate && \ - python3 setup_cuda.py bdist_wheel -d . - -FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04 - -LABEL maintainer="Your Name " -LABEL description="Docker image for GPTQ-for-LLaMa and Text Generation WebUI" - -RUN apt-get update && \ - apt-get install --no-install-recommends -y python3-dev libportaudio2 libasound-dev git python3 python3-pip make g++ && \ - rm -rf /var/lib/apt/lists/* - -RUN --mount=type=cache,target=/root/.cache/pip pip3 install virtualenv -RUN mkdir /app - -WORKDIR /app - -ARG WEBUI_VERSION -RUN test -n "${WEBUI_VERSION}" && git reset --hard ${WEBUI_VERSION} || echo "Using provided webui source" - -RUN virtualenv /app/venv -RUN . /app/venv/bin/activate && \ - pip3 install --upgrade pip setuptools wheel && \ - pip3 install torch torchvision torchaudio - -COPY --from=builder /build /app/repositories/GPTQ-for-LLaMa -RUN . /app/venv/bin/activate && \ - pip3 install /app/repositories/GPTQ-for-LLaMa/*.whl - -COPY extensions/api/requirements.txt /app/extensions/api/requirements.txt -COPY extensions/elevenlabs_tts/requirements.txt /app/extensions/elevenlabs_tts/requirements.txt -COPY extensions/google_translate/requirements.txt /app/extensions/google_translate/requirements.txt -COPY extensions/silero_tts/requirements.txt /app/extensions/silero_tts/requirements.txt -COPY extensions/whisper_stt/requirements.txt /app/extensions/whisper_stt/requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/api && pip3 install -r requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/elevenlabs_tts && pip3 install -r requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/google_translate && pip3 install -r requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/silero_tts && pip3 install -r requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/whisper_stt && pip3 install -r requirements.txt - -COPY requirements.txt /app/requirements.txt -RUN . /app/venv/bin/activate && \ - pip3 install -r requirements.txt - -RUN cp /app/venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so /app/venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so - -COPY . /app/ -ENV CLI_ARGS="" -CMD . /app/venv/bin/activate && python3 server.py ${CLI_ARGS} diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml deleted file mode 100644 index 46b27580..00000000 --- a/docker/docker-compose.yml +++ /dev/null @@ -1,32 +0,0 @@ -version: "3.3" -services: - text-generation-webui: - build: - context: . 
-      args:
-        # specify which cuda version your card supports: https://developer.nvidia.com/cuda-gpus
-        TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
-        WEBUI_VERSION: ${WEBUI_VERSION:-HEAD}
-    env_file: .env
-    ports:
-      - "${HOST_PORT:-7860}:${CONTAINER_PORT:-7860}"
-      - "${HOST_API_PORT:-5000}:${CONTAINER_API_PORT:-5000}"
-      - "${HOST_API_STREAM_PORT:-5005}:${CONTAINER_API_STREAM_PORT:-5005}"
-    stdin_open: true
-    tty: true
-    volumes:
-      - ./characters:/app/characters
-      - ./extensions:/app/extensions
-      - ./loras:/app/loras
-      - ./models:/app/models
-      - ./presets:/app/presets
-      - ./prompts:/app/prompts
-      - ./softprompts:/app/softprompts
-      - ./training:/app/training
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              device_ids: ['0']
-              capabilities: [gpu]
diff --git a/modules/AutoGPTQ_loader.py b/modules/AutoGPTQ_loader.py
index 0d41ac0a..ac43337f 100644
--- a/modules/AutoGPTQ_loader.py
+++ b/modules/AutoGPTQ_loader.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 
 from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
+from modules.ComputeDevice import gpu_dev
 
 import modules.shared as shared
 from modules.logging_colors import logger
@@ -41,7 +42,7 @@ def load_quantized(model_name):
     # Define the params for AutoGPTQForCausalLM.from_quantized
     params = {
         'model_basename': pt_path.stem,
-        'device': "cuda:0" if not shared.args.cpu else "cpu",
+        'device': gpu_dev(),
         'use_triton': shared.args.triton,
         'inject_fused_attention': not shared.args.no_inject_fused_attention,
         'inject_fused_mlp': not shared.args.no_inject_fused_mlp,
diff --git a/modules/ComputeDevice.py b/modules/ComputeDevice.py
index 999ab086..29b9468a 100644
--- a/modules/ComputeDevice.py
+++ b/modules/ComputeDevice.py
@@ -10,6 +10,78 @@
 import modules.shared as shared
 from modules.logging_colors import logger
 
+
+def gpu_available():
+    '''
+    Reports whether a GPU-accelerated torch.device is available on this machine.
+    The device returned by get_gpu() is used as the default for any tensor
+    operations done without an explicit device index or number. The device index
+    only applies to CUDA devices, not MPS.
+
+    Returns True for cuda and mps
+            False for cpu
+    '''
+    return get_gpu()[0]
+
+
+def gpu_dev():
+    '''
+    Returns the default compute device for GPU acceleration.
+
+    Returns torch.device object as cuda, mps, or cpu
+    '''
+    return get_gpu()[1]
+
+
+def get_gpu():
+    '''
+    Checks for GPU acceleration with either cuda or mps, falling back to cpu.
+
+    This should only really need to be called once, but it currently runs every time
+    we check for an active GPU device. When this gets moved into a ComputeDevice
+    class, we can take care of this with class variables and methods.
+
+    Returns a tuple (has_gpu, gpu_dev)
+        has_gpu: True if cuda or mps is available
+        gpu_dev: the device found for compute
+    '''
+    # We don't *HAVE* to set a local rank index for each compute device, but it doesn't
+    # hurt anything if we do. This is mostly for CUDA and distributed setups.
+    local_rank = get_local_rank()
+    if torch.cuda.is_available():
+        logger.info("Using CUDA GPU Acceleration for Torch Device")
+        # torch.cuda.set_device(local_rank)
+        return True, torch.device("cuda", local_rank)
+    elif torch.backends.mps.is_available():
+        logger.info("Using MPS GPU Acceleration for Torch Device")
+        return True, torch.device("mps", local_rank)
+    else:
+        logger.warning("CPU only! No GPU acceleration available. Possible performance impact.")
+        return False, torch.device("cpu", local_rank)
+
+
+def clear_gpu_cache():
+    '''
+    Clears the cache for the default torch device.
+    Less than optimal, but should do for now.
+    '''
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    elif torch.backends.mps.is_available():
+        torch.mps.empty_cache()
+
+
+def get_local_rank():
+    '''
+    Gets the local rank as assigned in the config or as an environment variable.
+    '''
+    try:
+        local_rank = shared.args.local_rank
+    except TypeError:
+        local_rank = int(os.getenv("LOCAL_RANK", "0"))
+    return local_rank
+
+
 class ComputeDevice:
     '''
     Keep a list of all instances so we can use class methods for operating on all of them at once, like resetting, re-initializing, or anything else we might want to do.
@@ -32,9 +104,6 @@ def __init__(self, device_type=None):
         self.gpu_memory = None
         self.cpu_memory = None
         # Calculate memory
-        self.total_mem = self.calculate_memory()
-        # Call the methods to set the device and memory attributes
-        self.select_device()
         self.calculate_memory()
 
     @classmethod
@@ -79,7 +148,6 @@ def select_device(self):
 
         Is identical to:    torch.device('cuda', 0)
         '''
-        local_rank = self.get_local_rank()
         if torch.cuda.is_available():
             return 'cuda'
         elif torch.backends.mps.is_available():
@@ -87,40 +155,42 @@
         else:
             return 'cpu'
 
-    def calculate_memory(self):
+    @classmethod
+    def calculate_memory(cls):
         '''
         Perform all memory calculations to determine total system memory, total GPU memory,
         and CPU memory available for use by the application. Some of these are adjusted by
         amounts for reservations specified in the config files.
         '''
-        self.system_memory = math.floor(psutil.virtual_memory().total / (1024 * 1024))
+        cls.system_memory = math.floor(psutil.virtual_memory().total / (1024 * 1024))
 
         # Check for MPS, CUDA, or CPU and calculate total memory accordingly
         if torch.backends.mps.is_available():
-            self.gpu_memory = [self.system_memory]
+            cls.gpu_memory = [cls.system_memory]
         elif torch.cuda.is_available():
-            self.gpu_memory = [math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024)) for i in range(torch.cuda.device_count())]
+            cls.gpu_memory = [math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024)) for i in range(torch.cuda.device_count())]
         else:
-            self.gpu_memory = [self.system_memory]
+            cls.gpu_memory = [cls.system_memory]
 
         # Calculate default reserved GPU memory
-        self.default_gpu_mem = []
+        cls.default_gpu_mem = []
         if shared.args.gpu_memory is not None and len(shared.args.gpu_memory) > 0:
             for i in shared.args.gpu_memory:
                 if 'mib' in i.lower():
-                    self.default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)))
+                    cls.default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)))
                 else:
-                    self.default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)) * 1000)
-            while len(self.default_gpu_mem) < len(self.gpu_memory):
-                self.default_gpu_mem.append(0)
+                    cls.default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)) * 1000)
+            while len(cls.default_gpu_mem) < len(cls.gpu_memory):
+                cls.default_gpu_mem.append(0)
 
         # Calculate default reserved CPU memory
         if shared.args.cpu_memory is not None:
-            self.cpu_memory = int(re.sub('[a-zA-Z ]', '', shared.args.cpu_memory))
+            cls.cpu_reserved_memory = int(re.sub('[a-zA-Z ]', '', shared.args.cpu_memory))
         else:
-            self.cpu_memory = 0
+            cls.cpu_reserved_memory = 0
 
         # Calculate the total available memory for the application
-        self.total_mem = [gm - dgm for gm, dgm in zip(self.gpu_memory, self.default_gpu_mem)]
-        self.total_mem.append(self.system_memory - self.cpu_memory)
+        cls.total_mem = [gm - dgm for gm, dgm in zip(cls.gpu_memory, cls.default_gpu_mem)]
+        cls.total_mem.append(cls.system_memory - cls.cpu_reserved_memory)
+
diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py
index ddc5f9a5..84833302 100644
--- a/modules/GPTQ_loader.py
+++ b/modules/GPTQ_loader.py
@@ -10,6 +10,7 @@
 
 import modules.shared as shared
 from modules.logging_colors import logger
+from modules.ComputeDevice import gpu_dev
 
 sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
 
@@ -171,10 +172,10 @@ def load_quantized(model_name):
         else:
             pre_layer = shared.args.pre_layer
 
-        model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, pre_layer)
+        GPTQ_model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, pre_layer)
     else:
         threshold = False if model_type == 'gptj' else 128
-        model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, kernel_switch_threshold=threshold)
+        GPTQ_model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, kernel_switch_threshold=threshold)
 
         # accelerate offload (doesn't work properly)
         if shared.args.gpu_memory or torch.cuda.device_count() > 1:
@@ -187,15 +188,14 @@ def load_quantized(model_name):
                 max_memory['cpu'] = f'{max_cpu_memory}GiB' if not re.match('.*ib$', max_cpu_memory.lower()) else max_cpu_memory
             else:
-                max_memory = accelerate.utils.get_balanced_memory(model)
+                max_memory = accelerate.utils.get_balanced_memory(GPTQ_model)
 
-            device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"])
+            device_map = accelerate.infer_auto_device_map(GPTQ_model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"])
             logger.info("Using the following device map for the quantized model:", device_map)
             # https://huggingface.co/docs/accelerate/package_reference/big_modeling#accelerate.dispatch_model
-            model = accelerate.dispatch_model(model, device_map=device_map, offload_buffers=True)
+            GPTQ_model = accelerate.dispatch_model(GPTQ_model, device_map=device_map, offload_buffers=True)
 
-    # No offload
-    elif not shared.args.cpu:
-        model = model.to(torch.device('cuda:0'))
+    gpu = gpu_dev()
+    GPTQ_model = GPTQ_model.to(gpu)
 
-    return model
+    return GPTQ_model
diff --git a/modules/RWKV.py b/modules/RWKV.py
index 35d69986..7ed18519 100644
--- a/modules/RWKV.py
+++ b/modules/RWKV.py
@@ -7,6 +7,7 @@
 
 import modules.shared as shared
 from modules.callbacks import Iteratorize
+from modules.ComputeDevice import gpu_dev
 
 np.set_printoptions(precision=4, suppress=True, linewidth=200)
 
diff --git a/modules/callbacks.py b/modules/callbacks.py
index 1fa95e47..42e61a2e 100644
--- a/modules/callbacks.py
+++ b/modules/callbacks.py
@@ -7,6 +7,7 @@
 import transformers
 
 import modules.shared as shared
+from modules.ComputeDevice import clear_gpu_cache
 
 
 class _StopEverythingStoppingCriteria(transformers.StoppingCriteria):
@@ -90,5 +91,4 @@ def __exit__(self, exc_type, exc_val, exc_tb):
 
 def clear_torch_cache():
     gc.collect()
-    if not shared.args.cpu:
-        torch.cuda.empty_cache()
+    clear_gpu_cache()
\ No newline at end of file
diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py
index bbc51100..472c45fb 100644
--- a/modules/llamacpp_hf.py
+++ b/modules/llamacpp_hf.py
@@ -10,6 +10,7 @@
 
 from modules import shared
 from modules.logging_colors import logger
+from modules.ComputeDevice import get_gpu
 
 
 class LlamacppHF(PreTrainedModel):
@@ -30,7 +31,7 @@ def
prepare_inputs_for_generation(self, input_ids, **kwargs): @property def device(self) -> torch.device: - return torch.device(0) + return get_gpu() def __call__(self, *args, **kwargs): # TODO: Some decoding methods (such as Contrastive Search) may not work at this time diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index 180b0f37..d8d41cdd 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -25,8 +25,8 @@ class LlamaCppModel: def __init__(self): self.initialized = False - def __del__(self): - self.model.__del__() +# def __del__(self): +# self.model.__del__() @classmethod def from_pretrained(self, path): @@ -53,6 +53,8 @@ def from_pretrained(self, path): 'n_gpu_layers': shared.args.n_gpu_layers, 'rope_freq_base': 10000 * shared.args.alpha_value ** (64/63.), 'rope_freq_scale': 1.0 / shared.args.compress_pos_emb, + 'n_gqa': shared.args.n_gqa or None, + 'rms_norm_eps': shared.args.rms_norm_eps or None, } result.model = Llama(**params) diff --git a/modules/models.py b/modules/models.py index 232d5fa6..3c005967 100644 --- a/modules/models.py +++ b/modules/models.py @@ -21,6 +21,7 @@ from modules import llama_attn_hijack, sampler_hijack from modules.logging_colors import logger from modules.models_settings import infer_loader +from modules.ComputeDevice import get_gpu, gpu_available transformers.logging.set_verbosity_error() @@ -35,9 +36,8 @@ from modules.deepspeed_parameters import generate_ds_config # Distributed setup - local_rank = shared.args.local_rank if shared.args.local_rank is not None else int(os.getenv("LOCAL_RANK", "0")) + gpu_dev = get_gpu() world_size = int(os.getenv("WORLD_SIZE", "1")) - torch.cuda.set_device(local_rank) deepspeed.init_distributed() ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir) dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration @@ -78,23 +78,23 @@ def load_model(model_name, loader=None): shared.args.loader = loader output = load_func_map[loader](model_name) if type(output) is tuple: - model, tokenizer = output + lcl_model, lcl_tokenizer = output else: - model = output - if model is None: + lcl_model = output + if lcl_model is None: return None, None else: - tokenizer = load_tokenizer(model_name, model) + lcl_tokenizer = load_tokenizer(model_name, lcl_model) # Hijack attention with xformers if any((shared.args.xformers, shared.args.sdp_attention)): llama_attn_hijack.hijack_llama_attention() logger.info(f"Loaded the model in {(time.time()-t0):.2f} seconds.\n") - return model, tokenizer + return lcl_model, lcl_tokenizer -def load_tokenizer(model_name, model): +def load_tokenizer(model_name, tokenizer): tokenizer = None path_to_model = Path(f"{shared.args.model_dir}/{model_name}/") if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists(): @@ -145,13 +145,9 @@ def huggingface_loader(model_name): LoaderClass = AutoModelForCausalLM # Load the model in simple 16-bit mode by default - if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None]): + if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.lcllcl is not None, shared.args.cpu_memory is not None]): model = 
LoaderClass.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}"), low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16, trust_remote_code=shared.args.trust_remote_code) - if torch.backends.mps.is_available(): - device = torch.device('mps') - model = model.to(device) - else: - model = model.cuda() + model = model.to() # DeepSpeed ZeRO-3 elif shared.args.deepspeed: @@ -167,8 +163,7 @@ def huggingface_loader(model_name): "trust_remote_code": shared.args.trust_remote_code } - if not any((shared.args.cpu, torch.cuda.is_available(), torch.backends.mps.is_available())): - logger.warning("torch.cuda.is_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.") + if not gpu_available(): shared.args.cpu = True if shared.args.cpu: @@ -250,7 +245,8 @@ def flexgen_loader(model_name): def RWKV_loader(model_name): from modules.RWKV import RWKVModel, RWKVTokenizer - model = RWKVModel.from_pretrained(Path(f'{shared.args.model_dir}/{model_name}'), dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16", device="cpu" if shared.args.cpu else "cuda") + gpu_dev = get_gpu() + model = RWKVModel.from_pretrained(Path(f'{shared.args.model_dir}/{model_name}'), dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16", device=gpu_dev) tokenizer = RWKVTokenizer.from_pretrained(Path(shared.args.model_dir)) return model, tokenizer diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py index 0a86b4fd..08ab4826 100644 --- a/modules/sampler_hijack.py +++ b/modules/sampler_hijack.py @@ -9,6 +9,7 @@ LogitsProcessorList, TemperatureLogitsWarper ) +from modules.ComputeDevice import get_gpu class TailFreeLogitsWarper(LogitsWarper): @@ -106,7 +107,8 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to # Normalize the probabilities of the remaining words prob_topk = torch.softmax(sorted_logits, dim=0) - prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True).to('cuda') + gpu_dev = get_gpu() + prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True).to(gpu_dev) observed_surprise = -math.log2(prob_topk[prev_i]) self.e = observed_surprise - self.mirostat_tau diff --git a/modules/text_generation.py b/modules/text_generation.py index e1be6aa3..e42b87f3 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -19,7 +19,8 @@ from modules.extensions import apply_extensions from modules.html_generator import generate_4chan_html, generate_basic_html from modules.logging_colors import logger -from modules.models import clear_torch_cache, local_rank +from modules.models import clear_torch_cache +from modules.ComputeDevice import get_gpu def generate_reply(*args, **kwargs): @@ -53,13 +54,9 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel'] or shared.args.cpu: return input_ids - elif shared.args.deepspeed: - return input_ids.to(device=local_rank) - elif torch.backends.mps.is_available(): - device = torch.device('mps') - return input_ids.to(device) - else: - return input_ids.cuda() + + get_gpu() + return gpu.to() def get_encoded_length(prompt): @@ -124,8 +121,6 @@ def set_manual_seed(seed): seed = random.randint(1, 2**31) torch.manual_seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(seed) return seed @@ -246,7 +241,6 @@ def generate_reply_HF(question, original_question, seed, state, 
stopping_strings # Encode the input input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state)) output = input_ids[0] - cuda = not any((shared.args.cpu, shared.args.deepspeed)) # Add the encoded tokens to generate_params question, input_ids, inputs_embeds = apply_extensions('tokenizer', state, question, input_ids, None) @@ -277,8 +271,7 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings if not state['stream']: with torch.no_grad(): output = shared.model.generate(**generate_params)[0] - if cuda: - output = output.cuda() + output = output.to() yield get_reply_from_output_ids(output, input_ids, original_question, state, is_chat=is_chat) diff --git a/server.py b/server.py index a52655a7..5ae1e039 100644 --- a/server.py +++ b/server.py @@ -1,20 +1,24 @@ -import os -import warnings -import cProfile - -from modules.logging_colors import logger -from modules.block_requests import OpenMonkeyPatch, RequestBlocker - -os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False' -os.environ['BITSANDBYTES_NOWELCOME'] = '1' -warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated') - -with RequestBlocker(): - import gradio as gr - import matplotlib -matplotlib.use('Agg') # This fixes LaTeX rendering on some systems - +from functools import partial +from pathlib import Path +from threading import Lock +from PIL import Image +from modules import chat, loaders, presets, shared, training, ui, utils +from modules.block_requests import OpenMonkeyPatch, RequestBlocker +from modules.extensions import apply_extensions +from modules.github import clone_or_pull_repository +from modules.html_generator import chat_html_wrapper +from modules.logging_colors import logger +from modules.LoRA import add_lora_to_model +from modules.models import load_model, unload_model +from modules.models_settings import (apply_model_settings_to_state, + get_model_settings_from_yamls, + save_model_settings, + update_model_parameters) +from modules.text_generation import (generate_reply_wrapper, + get_encoded_length, stop_everything_event) +from modules.utils import gradio +import cProfile import importlib import json import math @@ -23,34 +27,21 @@ import sys import time import traceback -from functools import partial -from pathlib import Path -from threading import Lock - +import warnings import psutil import torch import yaml -from PIL import Image - import modules.extensions as extensions_module -from modules import chat, loaders, presets, shared, training, ui, utils -from modules.extensions import apply_extensions -from modules.github import clone_or_pull_repository -from modules.html_generator import chat_html_wrapper -from modules.LoRA import add_lora_to_model -from modules.models import load_model, unload_model -from modules.models_settings import ( - apply_model_settings_to_state, - get_model_settings_from_yamls, - save_model_settings, - update_model_parameters -) -from modules.text_generation import ( - generate_reply_wrapper, - get_encoded_length, - stop_everything_event -) -from modules.utils import gradio + +os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False' +os.environ['BITSANDBYTES_NOWELCOME'] = '1' +warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated') + +with RequestBlocker(): + import gradio as gr + +matplotlib.use('Agg') # This fixes LaTeX rendering on some systems + def load_model_wrapper(selected_model, loader, autoload=False): @@ -572,6 +563,7 @@ def 
set_interface_arguments(interface_mode, extensions, bool_active): shared.need_restart = True def shutdown_server(interface_mode, extensions, bool_active): + stop_everything_event() shared.run_server = False def create_interface(): @@ -1197,9 +1189,9 @@ def main(): create_interface() if __name__ == "__main__": - if os.getenv('DEBUG_PROF') == 1: - logger.info(f"Profiling activated sending information to outlut.prof") - cProfile.run(main(), 'output.prof') + if os.getenv('DEBUG_PROF') == "1": + logger.info(f"Profiling activated sending information to output.prof") + cProfile.run('main()', "outout.prof") else: main() From 38e55930bb5269f9bd4d8385d01988529095ccb3 Mon Sep 17 00:00:00 2001 From: M S Date: Sun, 6 Aug 2023 13:20:18 -0500 Subject: [PATCH 03/13] mpre changes. --- .env.default | 33 +++ .env.local | 85 +++++++ .gitignore | 1 + README.md | 30 +-- .../instruction-following/Llama-v2.yaml | 2 +- convert-to-flexgen.py | 63 ----- docker/.dockerignore | 9 - docker/.env.example | 30 --- docker/Dockerfile | 68 ----- docker/docker-compose.yml | 32 --- docs/Extensions.md | 233 +++++++++++------ docs/FlexGen.md | 64 ----- docs/Low-VRAM-guide.md | 53 ---- docs/README.md | 2 - docs/llama.cpp-models.md | 53 ---- extensions/api/blocking_api.py | 13 +- extensions/api/streaming_api.py | 9 +- extensions/api/util.py | 1 - extensions/example/script.py | 137 ++++++++++ extensions/llava/script.py | 8 - extensions/multimodal/script.py | 9 + extensions/openai/README.md | 117 +++++---- extensions/openai/completions.py | 117 ++++++--- extensions/openai/defaults.py | 2 - extensions/openai/embeddings.py | 31 ++- extensions/openai/errors.py | 8 +- extensions/openai/images.py | 28 ++- extensions/openai/moderations.py | 15 +- extensions/openai/script.py | 28 ++- extensions/openai/tokens.py | 7 +- extensions/openai/utils.py | 4 +- extensions/send_pictures/script.py | 16 +- extensions/silero_tts/script.py | 6 +- extensions/superbooga/download_urls.py | 5 +- extensions/whisper_stt/script.py | 10 + models/config.yaml | 2 +- modules/AutoGPTQ_loader.py | 3 +- modules/ComputeDevice.py | 234 ++++++++++++++++++ modules/GPTQ_loader.py | 18 +- modules/RWKV.py | 1 + modules/callbacks.py | 4 +- modules/chat.py | 6 +- modules/deepspeed_parameters.py | 2 +- modules/extensions.py | 23 +- modules/llamacpp_hf.py | 5 +- modules/llamacpp_model.py | 6 +- modules/loaders.py | 4 + modules/models.py | 30 +-- modules/models_settings.py | 2 - modules/sampler_hijack.py | 4 +- modules/shared.py | 24 +- modules/text_generation.py | 74 +----- modules/training.py | 4 +- modules/ui.py | 10 +- modules/utils.py | 5 +- oobainst | 101 ++++++++ oobastart | 94 +++++++ server.py | 140 ++++++----- settings-template.yaml | 2 +- 59 files changed, 1286 insertions(+), 841 deletions(-) create mode 100644 .env.default create mode 100644 .env.local delete mode 100644 convert-to-flexgen.py delete mode 100644 docker/.dockerignore delete mode 100644 docker/.env.example delete mode 100644 docker/Dockerfile delete mode 100644 docker/docker-compose.yml delete mode 100644 docs/FlexGen.md delete mode 100644 docs/Low-VRAM-guide.md delete mode 100644 docs/llama.cpp-models.md create mode 100644 extensions/example/script.py delete mode 100644 extensions/llava/script.py create mode 100644 modules/ComputeDevice.py create mode 100644 oobainst create mode 100755 oobastart diff --git a/.env.default b/.env.default new file mode 100644 index 00000000..3f156c15 --- /dev/null +++ b/.env.default @@ -0,0 +1,33 @@ +#/bin/bash + +# This is for overriding the setup_*.sh, webui.py, 
server.py, and other +# oogabooga environment variables. If they are not set here, they will +# default to what the one-click-installers use as their defaults. +# +export OOBABOOGA_INST=="${OOBABOOGA_INST:=${OOBABOOGA_BASE}/oobagooga_instal}" +export OOBABOOGA_BASE="${OOBABOOGA_BASE:=${PWD}}" +export OOBABOOGA_WEBUI="${OOBABOOGA_BASE}/text-generation-webui" +export OOBABOOGA_OPTS="--chat" + +# This file will be overwritten by updating with a pull if new variables +# are added to support the environment. +# +# The following was generated by the following command line in my login +# shell environment. I already have miniconda/conda installed. +# +# conda info --all | \ +# grep '^CONDA' | \ +# awk -F': ' '{print "export " $1 "=\"${"$1":="$2"}\""}' +# + +export CONDA_DEFAULT_ENV="${CONDA_DEFAULT_ENV:=base}" +export CONDA_OOBABOOGA_ENV="${CONDA_OOBABOOGA_DEFAULT_ENV:=textgen}" + +export CONDA_EXE="${OOBABOOGA_INST}/installer_files/conda/bin/conda" +export CONDA_EXE="${CONDA_EXE:=${OOBABOOGA_INST}/installer_files/conda/bin/conda}" +export CONDA_PREFIX="${OOBABOOGA_INST}/installer_files/condaj/envs/textgen" +export CONDA_PREFIX="${OOBABOOGA_INST}/installer_files/condaj/envs/textgen" +export CONDA_PYTHON_EXE="${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda/bin/python" +export CONDA_PYTHON_EXE="${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda/bin/python" +export CONDA_ROOT="${CONDA_ROOT:=${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda}" +export CONDA_ROOT="${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda" diff --git a/.env.local b/.env.local new file mode 100644 index 00000000..86b8e4ba --- /dev/null +++ b/.env.local @@ -0,0 +1,85 @@ +#/bin/bash + +# This is for overriding the setup_*.sh, webui.py, server.py, and other +# oogabooga environment variables. If they are not set here, they will +# default to what the one-click-installers use as their defaults. +# +export OOBABOOGA_INST=="oobabooga" +export OOBABOOGA_BASE="${HOME}/${OOBABOOGA_BASE}" +export OOBABOOGA_WEBUI="${OOBABOOGA_BASE}/text-generation-webui-macos" +export OOBABOOGA_OPTS="--chat --verbose " + +# This file will be overwritten by updating with a pull if new variables +# are added to support the environment. +# +# export CONDA_DEFAULT_ENV="${CONDA_DEFAULT_ENV:=base}" +# export CONDA_OOBABOOGA_ENV="${CONDA_OOBABOOGA_DEFAULT_ENV:=textgen}" +# export CONDA_EXE="${OOBABOOGA_INST}/installer_files/conda/bin/conda" +# export CONDA_PREFIX="${OOBABOOGA_INST}/installer_files/condaj/envs/textgen" +# export CONDA_PYTHON_EXE="${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda/bin/python" +# export CONDA_ROOT="${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda" + +# The following was generated by the following command line in my login +# shell environment. I already have miniconda/conda installed. +# +# conda info --all | grep '^CONDA' | awk -F': ' '{print "export " $1 "=\"${"$1":="$2"}\""} +# +export CONDA_DEFAULT_ENV="${CONDA_DEFAULT_ENV:=base}" +export CONDA_OOBABOOGA_ENV="${CONDA_OOBABOOGA_DEFAULT_ENV:=textgen.00}" +export CONDA_EXE="${CONDA_EXE:=/Users/mps/miniconda3/bin/conda}" +export CONDA_PREFIX="${CONDA_PREFIX:=/Users/mps/miniconda3}" +export CONDA_PROMPT_MODIFIER="${CONDA_PROMPT_MODIFIER:=(base) }" +export CONDA_PYTHON_EXE="${CONDA_PYTHON_EXE:=/Users/mps/miniconda3/bin/python}" +export CONDA_ROOT="${CONDA_ROOT:=/Users/mps/miniconda3}"#/bin/bash + +# This is for overriding the setup_*.sh, webui.py, server.py, and other +# oogabooga environment variables. 
If they are not set here, they will +# default to what the one-click-installers use as their defaults. + +OOBABOOGA_BASE="oobabooga_macos" + +OOBABOOGA_INST="${HOME}/${OOBABOOGA_BASE}" + +export CONDA_DEFAULT_ENV="textgen" +export CONDA_EXE="${OOBABOOGA_INST}/installer_files/conda/bin/conda" +export CONDA_PREFIX="${OOBABOOGA_INST}/installer_files/condaj/envs/textgen" +export CONDA_PROMPT_MODIFIER="() " +export CONDA_PYTHON_EXE="${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda/bin/python" +export CONDA_ROOT="${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda"#/bin/bash + +# This is for overriding the setup_*.sh, webui.py, server.py, and other +# oogabooga environment variables. If they are not set here, they will +# default to what the one-click-installers use as their defaults. +# +export OOBABOOGA_INST=="oobabooga-test-install" +export OOBABOOGA_BASE="${HOME}/${OOBABOOGA_BASE}" +export OOBABOOGA_WEBUI="${OOBABOOGA_BASE}/text-generation-webui" +export OOBABOOGA_OPTS="--chat --verbose " + +# This file will be overwritten by updating with a pull if new variables +# are added to support the environment. +# +# export CONDA_DEFAULT_ENV="${CONDA_DEFAULT_ENV:=base}" +# export CONDA_OOBABOOGA_ENV="${CONDA_OOBABOOGA_DEFAULT_ENV:=textgen}" +# export CONDA_EXE="${OOBABOOGA_INST}/installer_files/conda/bin/conda" +# export CONDA_PREFIX="${OOBABOOGA_INST}/installer_files/condaj/envs/textgen" +# export CONDA_PYTHON_EXE="${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda/bin/python" +# export CONDA_ROOT="${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda" + +# The following was generated by the following command line in my login +# shell environment. I already have miniconda/conda installed. +# +# conda info --all | \ +# grep '^CONDA' | \ +# awk -F': ' '{print "export " $1 "=\"${"$1":="$2"}\""}' + +export CONDA_ROOT="/Users/mps/miniconda3" +export CONDA_PREFIX="${CONDA_ROOT}" +# export CONDA_PROMPT_MODIFIER=" (base)" +export CONDA_PYTHON_EXE="${CONDA_ROOT}/bin/python" +export CONDA_EXE="${CONDA_DEFAULT_ENV}/bin/conda" + +# Set the default Conda environment and the environment for the Web GUI +# +export CONDA_DEFAULT_ENV="base" +export CONDA_OOBABOOGA_ENV="textgen" diff --git a/.gitignore b/.gitignore index a529ce65..716d894e 100644 --- a/.gitignore +++ b/.gitignore @@ -37,3 +37,4 @@ Thumbs.db *.swp .*.un~ +*.prof diff --git a/README.md b/README.md index ae485898..a703b01d 100644 --- a/README.md +++ b/README.md @@ -36,9 +36,9 @@ Anyone who would like to assist with supporting Apple Silicon, let me know. Ther - [AutoGPTQ](#autogptq) - [ExLlama](#exllama) - [GPTQ-for-LLaMa](#gptq-for-llama) - - [FlexGen](#flexgen) - [DeepSpeed](#deepspeed) - [RWKV](#rwkv) + - [RoPE (for llama.cpp and ExLlama only)](#rope-for-llamacpp-and-exllama-only) - [Gradio](#gradio) - [API](#api) - [Multimodal](#multimodal) @@ -47,7 +47,6 @@ Anyone who would like to assist with supporting Apple Silicon, let me know. Ther - [Community](#community) - [Credits](#credits) - ## Features * 3 interface modes: default, notebook, and chat @@ -56,7 +55,7 @@ Anyone who would like to assist with supporting Apple Silicon, let me know. 
Ther * LoRA: load and unload LoRAs on the fly, load multiple LoRAs at the same time, train a new LoRA * Precise instruction templates for chat mode, including Alpaca, Vicuna, Open Assistant, Dolly, Koala, ChatGLM, MOSS, RWKV-Raven, Galactica, StableLM, WizardLM, Baize, Ziya, Chinese-Vicuna, MPT, INCITE, Wizard Mega, KoAlpaca, Vigogne, Bactrian, h2o, and OpenBuddy * [Multimodal pipelines, including LLaVA and MiniGPT-4](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal) -* 8-bit and 4-bit inference through bitsandbytes **CPU only mode for macOS, bitsandbytes does not support Apple Silicon M1/M2 processors** +* 8-bit and 4-bit inference through bitsandbytes **CPU only mode for macOS, bitsandbytes does not support Apple Silicon GPU** * CPU mode for transformers models * [DeepSpeed ZeRO-3 inference](docs/DeepSpeed.md) * [Extensions](docs/Extensions.md) @@ -165,7 +164,7 @@ Optionally, you can use the following command-line flags: | Flag | Description | |--------------------------------------------|-------------| -| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, flexgen | +| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv | #### Accelerate/transformers @@ -203,8 +202,8 @@ Optionally, you can use the following command-line flags: | `--n_batch` | Maximum number of prompt tokens to batch together when calling llama_eval. | | `--no-mmap` | Prevent mmap from being used. | | `--mlock` | Force the system to keep the model in RAM. | -| `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. | -| `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS. Set this to 1000000000 to offload all layers to the GPU. | +| `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. Does not apply for Apple Silicon GPU since it uses unified memory. | +| `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with Apple Silicon GPU Support for BLAS and llama-cpp using Metal. Load the model and look for **llama_model_load_internal: n_layer in ths STDERR and this will show you the number of layers in the model. Set this value to that number or possibly n + 2, This si very sensitive now an will overrun your data area or tensor cache causing a segmentation fault. | | `--n_ctx N_CTX` | Size of the prompt context. | | `--llama_cpp_seed SEED` | Seed for llama-cpp models. Default 0 (random). | | `--n_gqa N_GQA` | grouped-query attention. Must be 8 for llama2 70b. | @@ -226,8 +225,6 @@ Optionally, you can use the following command-line flags: |------------------|-------------| |`--gpu-split` | Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. `20,7,7` | |`--max_seq_len MAX_SEQ_LEN` | Maximum sequence length. | -|`--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. | -|`--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Same as above. Use either this or compress_pos_emb, not both. 
` #### GPTQ-for-LLaMa @@ -243,14 +240,6 @@ Optionally, you can use the following command-line flags: | `--warmup_autotune` | (triton) Enable warmup autotune. | | `--fused_mlp` | (triton) Enable fused mlp. | -#### FlexGen - -| Flag | Description | -|------------------|-------------| -| `--percent PERCENT [PERCENT ...]` | FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0). | -| `--compress-weight` | FlexGen: Whether to compress weight (default: False).| -| `--pin-weight [PIN_WEIGHT]` | FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%). | - #### DeepSpeed | Flag | Description | @@ -266,6 +255,13 @@ Optionally, you can use the following command-line flags: | `--rwkv-strategy RWKV_STRATEGY` | RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". | | `--rwkv-cuda-on` | RWKV: Compile the CUDA kernel for better performance. | +#### RoPE (for llama.cpp and ExLlama only) + +| Flag | Description | +|------------------|-------------| +|`--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. | +|`--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both. | + #### Gradio | Flag | Description | @@ -293,8 +289,6 @@ Optionally, you can use the following command-line flags: |---------------------------------------|-------------| | `--multimodal-pipeline PIPELINE` | The multimodal pipeline to use. Examples: `llava-7b`, `llava-13b`. | -Out of memory errors? [Check the low VRAM guide](docs/Low-VRAM-guide.md). - ## Presets Inference settings presets can be created under `presets/` as yaml files. These files are detected automatically at startup. diff --git a/characters/instruction-following/Llama-v2.yaml b/characters/instruction-following/Llama-v2.yaml index a3af0e87..d259dd39 100644 --- a/characters/instruction-following/Llama-v2.yaml +++ b/characters/instruction-following/Llama-v2.yaml @@ -1,4 +1,4 @@ user: "" bot: "" turn_template: "<|user|><|user-message|> [/INST] <|bot|><|bot-message|>
[INST] " -context: "[INST] <>\nAnswer the questions.\n<>\n" +context: "[INST] <>\nAnswer the questions.\n<>\n\n" diff --git a/convert-to-flexgen.py b/convert-to-flexgen.py deleted file mode 100644 index 7654593b..00000000 --- a/convert-to-flexgen.py +++ /dev/null @@ -1,63 +0,0 @@ -''' - -Converts a transformers model to a format compatible with flexgen. - -''' - -import argparse -import os -from pathlib import Path - -import numpy as np -import torch -from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoTokenizer - -parser = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=54)) -parser.add_argument('MODEL', type=str, default=None, nargs='?', help="Path to the input model.") -args = parser.parse_args() - - -def disable_torch_init(): - """ - Disable the redundant torch default initialization to accelerate model creation. - """ - import torch - global torch_linear_init_backup - global torch_layer_norm_init_backup - - torch_linear_init_backup = torch.nn.Linear.reset_parameters - setattr(torch.nn.Linear, "reset_parameters", lambda self: None) - - torch_layer_norm_init_backup = torch.nn.LayerNorm.reset_parameters - setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) - - -def restore_torch_init(): - """Rollback the change made by disable_torch_init.""" - import torch - setattr(torch.nn.Linear, "reset_parameters", torch_linear_init_backup) - setattr(torch.nn.LayerNorm, "reset_parameters", torch_layer_norm_init_backup) - - -if __name__ == '__main__': - path = Path(args.MODEL) - model_name = path.name - - print(f"Loading {model_name}...") - # disable_torch_init() - model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, low_cpu_mem_usage=True) - # restore_torch_init() - - tokenizer = AutoTokenizer.from_pretrained(path) - - out_folder = Path(f"models/{model_name}-np") - if not Path(out_folder).exists(): - os.mkdir(out_folder) - - print(f"Saving the converted model to {out_folder}...") - for name, param in tqdm(list(model.model.named_parameters())): - name = name.replace("decoder.final_layer_norm", "decoder.layer_norm") - param_path = os.path.join(out_folder, name) - with open(param_path, "wb") as f: - np.save(f, param.cpu().detach().numpy()) diff --git a/docker/.dockerignore b/docker/.dockerignore deleted file mode 100644 index 6073533e..00000000 --- a/docker/.dockerignore +++ /dev/null @@ -1,9 +0,0 @@ -.env -Dockerfile -/characters -/loras -/models -/presets -/prompts -/softprompts -/training diff --git a/docker/.env.example b/docker/.env.example deleted file mode 100644 index 3119a9f0..00000000 --- a/docker/.env.example +++ /dev/null @@ -1,30 +0,0 @@ -# by default the Dockerfile specifies these versions: 3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX -# however for me to work i had to specify the exact version for my card ( 2060 ) it was 7.5 -# https://developer.nvidia.com/cuda-gpus you can find the version for your card here -TORCH_CUDA_ARCH_LIST=7.5 - -# these commands worked for me with roughly 4.5GB of vram -CLI_ARGS=--model llama-7b-4bit --wbits 4 --listen --auto-devices - -# the following examples have been tested with the files linked in docs/README_docker.md: -# example running 13b with 4bit/128 groupsize : CLI_ARGS=--model llama-13b-4bit-128g --wbits 4 --listen --groupsize 128 --pre_layer 25 -# example with loading api extension and public share: CLI_ARGS=--model llama-7b-4bit --wbits 4 --listen --auto-devices --no-stream --extensions api --share -# example running 7b with 8bit groupsize : 
CLI_ARGS=--model llama-7b --load-in-8bit --listen --auto-devices - -# the port the webui binds to on the host -HOST_PORT=7860 -# the port the webui binds to inside the container -CONTAINER_PORT=7860 - -# the port the api binds to on the host -HOST_API_PORT=5000 -# the port the api binds to inside the container -CONTAINER_API_PORT=5000 - -# the port the api stream endpoint binds to on the host -HOST_API_STREAM_PORT=5005 -# the port the api stream endpoint binds to inside the container -CONTAINER_API_STREAM_PORT=5005 - -# the version used to install text-generation-webui from -WEBUI_VERSION=HEAD diff --git a/docker/Dockerfile b/docker/Dockerfile deleted file mode 100644 index 7cc0ff15..00000000 --- a/docker/Dockerfile +++ /dev/null @@ -1,68 +0,0 @@ -FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as builder - -RUN apt-get update && \ - apt-get install --no-install-recommends -y git vim build-essential python3-dev python3-venv && \ - rm -rf /var/lib/apt/lists/* - -RUN git clone https://github.com/oobabooga/GPTQ-for-LLaMa /build - -WORKDIR /build - -RUN python3 -m venv /build/venv -RUN . /build/venv/bin/activate && \ - pip3 install --upgrade pip setuptools wheel && \ - pip3 install torch torchvision torchaudio && \ - pip3 install -r requirements.txt - -# https://developer.nvidia.com/cuda-gpus -# for a rtx 2060: ARG TORCH_CUDA_ARCH_LIST="7.5" -ARG TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX" -RUN . /build/venv/bin/activate && \ - python3 setup_cuda.py bdist_wheel -d . - -FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04 - -LABEL maintainer="Your Name " -LABEL description="Docker image for GPTQ-for-LLaMa and Text Generation WebUI" - -RUN apt-get update && \ - apt-get install --no-install-recommends -y python3-dev libportaudio2 libasound-dev git python3 python3-pip make g++ && \ - rm -rf /var/lib/apt/lists/* - -RUN --mount=type=cache,target=/root/.cache/pip pip3 install virtualenv -RUN mkdir /app - -WORKDIR /app - -ARG WEBUI_VERSION -RUN test -n "${WEBUI_VERSION}" && git reset --hard ${WEBUI_VERSION} || echo "Using provided webui source" - -RUN virtualenv /app/venv -RUN . /app/venv/bin/activate && \ - pip3 install --upgrade pip setuptools wheel && \ - pip3 install torch torchvision torchaudio - -COPY --from=builder /build /app/repositories/GPTQ-for-LLaMa -RUN . /app/venv/bin/activate && \ - pip3 install /app/repositories/GPTQ-for-LLaMa/*.whl - -COPY extensions/api/requirements.txt /app/extensions/api/requirements.txt -COPY extensions/elevenlabs_tts/requirements.txt /app/extensions/elevenlabs_tts/requirements.txt -COPY extensions/google_translate/requirements.txt /app/extensions/google_translate/requirements.txt -COPY extensions/silero_tts/requirements.txt /app/extensions/silero_tts/requirements.txt -COPY extensions/whisper_stt/requirements.txt /app/extensions/whisper_stt/requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/api && pip3 install -r requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/elevenlabs_tts && pip3 install -r requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/google_translate && pip3 install -r requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/silero_tts && pip3 install -r requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip . 
/app/venv/bin/activate && cd extensions/whisper_stt && pip3 install -r requirements.txt - -COPY requirements.txt /app/requirements.txt -RUN . /app/venv/bin/activate && \ - pip3 install -r requirements.txt - -RUN cp /app/venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so /app/venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so - -COPY . /app/ -ENV CLI_ARGS="" -CMD . /app/venv/bin/activate && python3 server.py ${CLI_ARGS} diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml deleted file mode 100644 index 46b27580..00000000 --- a/docker/docker-compose.yml +++ /dev/null @@ -1,32 +0,0 @@ -version: "3.3" -services: - text-generation-webui: - build: - context: . - args: - # specify which cuda version your card supports: https://developer.nvidia.com/cuda-gpus - TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} - WEBUI_VERSION: ${WEBUI_VERSION:-HEAD} - env_file: .env - ports: - - "${HOST_PORT:-7860}:${CONTAINER_PORT:-7860}" - - "${HOST_API_PORT:-5000}:${CONTAINER_API_PORT:-5000}" - - "${HOST_API_STREAM_PORT:-5005}:${CONTAINER_API_STREAM_PORT:-5005}" - stdin_open: true - tty: true - volumes: - - ./characters:/app/characters - - ./extensions:/app/extensions - - ./loras:/app/loras - - ./models:/app/models - - ./presets:/app/presets - - ./prompts:/app/prompts - - ./softprompts:/app/softprompts - - ./training:/app/training - deploy: - resources: - reservations: - devices: - - driver: nvidia - device_ids: ['0'] - capabilities: [gpu] diff --git a/docs/Extensions.md b/docs/Extensions.md index e156456b..c7d1aa36 100644 --- a/docs/Extensions.md +++ b/docs/Extensions.md @@ -1,45 +1,47 @@ -Extensions are defined by files named `script.py` inside subfolders of `text-generation-webui/extensions`. They are loaded at startup if specified with the `--extensions` flag. +# Extensions + +Extensions are defined by files named `script.py` inside subfolders of `text-generation-webui/extensions`. They are loaded at startup if the folder name is specified after the `--extensions` flag. For instance, `extensions/silero_tts/script.py` gets loaded with `python server.py --extensions silero_tts`. ## [text-generation-webui-extensions](https://github.com/oobabooga/text-generation-webui-extensions) -The link above contains a directory of user extensions for text-generation-webui. +The repository above contains a directory of user extensions. -If you create an extension, you are welcome to host it in a GitHub repository and submit it to the list above. +If you create an extension, you are welcome to host it in a GitHub repository and submit a PR adding it to the list. ## Built-in extensions -Most of these have been created by the extremely talented contributors that you can find here: [contributors](https://github.com/oobabooga/text-generation-webui/graphs/contributors?from=2022-12-18&to=&type=a). - |Extension|Description| |---------|-----------| -|[api](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/api)| Creates an API with two endpoints, one for streaming at `/api/v1/stream` port 5005 and another for blocking at `/api/v1/generate` port 5000. This is the main API for this web UI. | +|[api](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/api)| Creates an API with two endpoints, one for streaming at `/api/v1/stream` port 5005 and another for blocking at `/api/v1/generate` port 5000. This is the main API for the webui. 
| +|[openai](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/openai)| Creates an API that mimics the OpenAI API and can be used as a drop-in replacement. | +|[multimodal](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal) | Adds multimodality support (text+images). For a detailed description see [README.md](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal/README.md) in the extension directory. | |[google_translate](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/google_translate)| Automatically translates inputs and outputs using Google Translate.| -|[character_bias](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/character_bias)| Just a very simple example that biases the bot's responses in chat mode.| -|[gallery](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/gallery/)| Creates a gallery with the chat characters and their pictures. | -|[silero_tts](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/silero_tts)| Text-to-speech extension using [Silero](https://github.com/snakers4/silero-models). When used in chat mode, it replaces the responses with an audio widget. | +|[silero_tts](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/silero_tts)| Text-to-speech extension using [Silero](https://github.com/snakers4/silero-models). When used in chat mode, responses are replaced with an audio widget. | |[elevenlabs_tts](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/elevenlabs_tts)| Text-to-speech extension using the [ElevenLabs](https://beta.elevenlabs.io/) API. You need an API key to use it. | -|[send_pictures](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/send_pictures/)| Creates an image upload field that can be used to send images to the bot in chat mode. Captions are automatically generated using BLIP. | |[whisper_stt](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/whisper_stt)| Allows you to enter your inputs in chat mode using your microphone. | |[sd_api_pictures](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/sd_api_pictures)| Allows you to request pictures from the bot in chat mode, which will be generated using the AUTOMATIC1111 Stable Diffusion API. See examples [here](https://github.com/oobabooga/text-generation-webui/pull/309). | -|[multimodal](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal) | Adds multimodality support (text+images). For a detailed description see [README.md](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal/README.md) in the extension directory. | -|[openai](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/openai)| Creates an API that mimics the OpenAI API and can be used as a drop-in replacement. | +|[character_bias](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/character_bias)| Just a very simple example that adds a hidden string at the beginning of the bot's reply in chat mode. | +|[send_pictures](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/send_pictures/)| Creates an image upload field that can be used to send images to the bot in chat mode. Captions are automatically generated using BLIP. 
| +|[gallery](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/gallery/)| Creates a gallery with the chat characters and their pictures. | |[superbooga](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/superbooga)| An extension that uses ChromaDB to create an arbitrarily large pseudocontext, taking as input text files, URLs, or pasted text. Based on https://github.com/kaiokendev/superbig. | +|[ngrok](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/ngrok)| Allows you to access the web UI remotely using the ngrok reverse tunnel service (free). It's an alternative to the built-in Gradio `--share` feature. | +|[perplexity_colors](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/perplexity_colors)| Colors each token in the output text by its associated probability, as derived from the model logits. | ## How to write an extension -script.py may define the special functions and variables below. - -#### Predefined functions +The extensions framework is based on special functions and variables that you can define in `script.py`. The functions are the following: | Function | Description | |-------------|-------------| +| `def setup()` | Is executed when the extension gets imported. | | `def ui()` | Creates custom gradio elements when the UI is launched. | | `def custom_css()` | Returns custom CSS as a string. It is applied whenever the web UI is loaded. | | `def custom_js()` | Same as above but for javascript. | | `def input_modifier(string, state)` | Modifies the input string before it enters the model. In chat mode, it is applied to the user message. Otherwise, it is applied to the entire prompt. | | `def output_modifier(string, state)` | Modifies the output string before it is presented in the UI. In chat mode, it is applied to the bot's reply. Otherwise, it is applied to the entire output. | +| `def chat_input_modifier(text, visible_text, state)` | Modifies both the visible and internal inputs in chat mode. Can be used to hijack the chat input with custom content. | | `def bot_prefix_modifier(string, state)` | Applied in chat mode to the prefix for the bot's reply. | | `def state_modifier(state)` | Modifies the dictionary containing the UI input parameters before it is used by the text generation functions. | | `def history_modifier(history)` | Modifies the chat history before the text generation in chat mode begins. | @@ -48,9 +50,7 @@ script.py may define the special functions and variables below. | `def tokenizer_modifier(state, prompt, input_ids, input_embeds)` | Modifies the `input_ids`/`input_embeds` fed to the model. Should return `prompt`, `input_ids`, `input_embeds`. See the `multimodal` extension for an example. | | `def custom_tokenized_length(prompt)` | Used in conjunction with `tokenizer_modifier`, returns the length in tokens of `prompt`. See the `multimodal` extension for an example. | -#### `params` dictionary - -In this dictionary, `display_name` is used to define the displayed name of the extension in the UI, and `is_tab` is used to define whether the extension should appear in a new tab. By default, extensions appear at the bottom of the "Text generation" tab. +Additionally, you can define a special `params` dictionary. In it, the `display_name` key is used to define the displayed name of the extension in the UI, and the `is_tab` key is used to define whether the extension should appear in a new tab. By default, extensions appear at the bottom of the "Text generation" tab. 
Example: @@ -61,7 +61,7 @@ params = { } ``` -Additionally, `params` may contain variables that you want to be customizable through a `settings.json` file. For instance, assuming the extension is in `extensions/google_translate`, the variable `language string` in +The `params` dict may also contain variables that you want to be customizable through a `settings.yaml` file. For instance, assuming the extension is in `extensions/google_translate`, the variable `language string` in ```python params = { @@ -71,32 +71,19 @@ params = { } ``` -can be customized by adding a key called `google_translate-language string` to `settings.json`: +can be customized by adding a key called `google_translate-language string` to `settings.yaml`: ```python -"google_translate-language string": "fr", +google_translate-language string: 'fr' ``` -That is, the syntax is `extension_name-variable_name`. - -#### `input_hijack` dictionary - -```python -input_hijack = { - 'state': False, - 'value': ["", ""] -} -``` -This is only used in chat mode. If your extension sets `input_hijack['state'] = True` at any moment, the next call to `modules.chat.chatbot_wrapper` will use the values inside `input_hijack['value']` as the user input for text generation. See the `send_pictures` extension above for an example. - -Additionally, your extension can set the value to be a callback in the form of `def cb(text: str, visible_text: str) -> [str, str]`. See the `multimodal` extension above for an example. +That is, the syntax for the key is `extension_name-variable_name`. ## Using multiple extensions at the same time -In order to use your extension, you must start the web UI with the `--extensions` flag followed by the name of your extension (the folder under `text-generation-webui/extension` where `script.py` resides). - -You can activate more than one extension at a time by providing their names separated by spaces. The input, output, and bot prefix modifiers will be applied in the specified order. +You can activate more than one extension at a time by providing their names separated by spaces after `--extensions`. The input, output, and bot prefix modifiers will be applied in the specified order. +Example: ``` python server.py --extensions enthusiasm translate # First apply enthusiasm, then translate @@ -106,56 +93,150 @@ python server.py --extensions translate enthusiasm # First apply translate, then Do note, that for: - `custom_generate_chat_prompt` - `custom_generate_reply` -- `tokenizer_modifier` - `custom_tokenized_length` only the first declaration encountered will be used and the rest will be ignored. -## The `bot_prefix_modifier` - -In chat mode, this function modifies the prefix for a new bot message. For instance, if your bot is named `Marie Antoinette`, the default prefix for a new message will be - -``` -Marie Antoinette: -``` - -Using `bot_prefix_modifier`, you can change it to: - -``` -Marie Antoinette: *I am very enthusiastic* -``` - -Marie Antoinette will become very enthusiastic in all her messages. - -## `custom_generate_reply` example +## A full example -Once defined in a `script.py`, this function is executed in place of the main generation functions. You can use it to connect the web UI to an external API, or to load a custom model that is not supported yet. - -Note that in chat mode, this function must only return the new text, whereas in other modes it must return the original prompt + the new text. 
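
Before the full listing, here is a compressed sketch of the ideas above: a `params` value that `settings.yaml` can override through the `extension_name-variable_name` key, plus a simple `output_modifier`. The extension name `enthusiasm` (from the commands above) and the `exclamation marks` key are made up for illustration and are not part of the repository.

```python
# Hypothetical extensions/enthusiasm/script.py (illustrative only)
params = {
    "display_name": "Enthusiasm",
    "is_tab": False,
    "exclamation marks": 3,  # overridable via "enthusiasm-exclamation marks" in settings.yaml
}

def output_modifier(string, state):
    # Append the configured number of exclamation marks to every reply.
    return string + "!" * int(params["exclamation marks"])
```
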
+The source code below can be found at [extensions/example/script.py](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/example/script.py). ```python -import datetime - -def custom_generate_reply(question, original_question, seed, state, stopping_strings): - cumulative = '' - for i in range(10): - cumulative += f"Counting: {i}...\n" - yield cumulative +""" +An example of extension. It does nothing, but you can add transformations +before the return statements to customize the webui behavior. - cumulative += f"Done! {str(datetime.datetime.now())}" - yield cumulative -``` +Starting from history_modifier and ending in output_modifier, the +functions are declared in the same order that they are called at +generation time. +""" -## `custom_generate_chat_prompt` example +import torch +from modules import chat +from modules.text_generation import ( + decode, + encode, + generate_reply, +) +from transformers import LogitsProcessor -Below is an extension that just reproduces the default prompt generator in `modules/chat.py`. You can modify it freely to come up with your own prompts in chat mode. +params = { + "display_name": "Example Extension", + "is_tab": False, +} -```python -from modules import chat +class MyLogits(LogitsProcessor): + """ + Manipulates the probabilities for the next token before it gets sampled. + Used in the logits_processor_modifier function below. + """ + def __init__(self): + pass + + def __call__(self, input_ids, scores): + # probs = torch.softmax(scores, dim=-1, dtype=torch.float) + # probs[0] /= probs[0].sum() + # scores = torch.log(probs / (1 - probs)) + return scores + +def history_modifier(history): + """ + Modifies the chat history. + Only used in chat mode. + """ + return history + +def state_modifier(state): + """ + Modifies the state variable, which is a dictionary containing the input + values in the UI like sliders and checkboxes. + """ + return state + +def chat_input_modifier(text, visible_text, state): + """ + Modifies the user input string in chat mode (visible_text). + You can also modify the internal representation of the user + input (text) to change how it will appear in the prompt. + """ + return text, visible_text + +def input_modifier(string, state): + """ + In default/notebook modes, modifies the whole prompt. + + In chat mode, it is the same as chat_input_modifier but only applied + to "text", here called "string", and not to "visible_text". + """ + return string + +def bot_prefix_modifier(string, state): + """ + Modifies the prefix for the next bot reply in chat mode. + By default, the prefix will be something like "Bot Name:". + """ + return string + +def tokenizer_modifier(state, prompt, input_ids, input_embeds): + """ + Modifies the input ids and embeds. + Used by the multimodal extension to put image embeddings in the prompt. + Only used by loaders that use the transformers library for sampling. + """ + return prompt, input_ids, input_embeds + +def logits_processor_modifier(processor_list, input_ids): + """ + Adds logits processors to the list, allowing you to access and modify + the next token probabilities. + Only used by loaders that use the transformers library for sampling. + """ + processor_list.append(MyLogits()) + return processor_list + +def output_modifier(string, state): + """ + Modifies the LLM output before it gets presented. + + In chat mode, the modified version goes into history['visible'], + and the original version goes into history['internal']. 
+ """ + return string def custom_generate_chat_prompt(user_input, state, **kwargs): - - # Do something with kwargs['history'] or state - - return chat.generate_chat_prompt(user_input, state, **kwargs) + """ + Replaces the function that generates the prompt from the chat history. + Only used in chat mode. + """ + result = chat.generate_chat_prompt(user_input, state, **kwargs) + return result + +def custom_css(): + """ + Returns a CSS string that gets appended to the CSS for the webui. + """ + return '' + +def custom_js(): + """ + Returns a javascript string that gets appended to the javascript + for the webui. + """ + return '' + +def setup(): + """ + Gets executed only once, when the extension is imported. + """ + pass + +def ui(): + """ + Gets executed when the UI is drawn. Custom gradio elements and + their corresponding event handlers should be defined here. + + To learn about gradio components, check out the docs: + https://gradio.app/docs/ + """ + pass ``` diff --git a/docs/FlexGen.md b/docs/FlexGen.md deleted file mode 100644 index 931cc36f..00000000 --- a/docs/FlexGen.md +++ /dev/null @@ -1,64 +0,0 @@ ->FlexGen is a high-throughput generation engine for running large language models with limited GPU memory (e.g., a 16GB T4 GPU or a 24GB RTX3090 gaming card!). - -https://github.com/FMInference/FlexGen - -## Installation - -No additional installation steps are necessary. FlexGen is in the `requirements.txt` file for this project. - -## Converting a model - -FlexGen only works with the OPT model, and it needs to be converted to numpy format before starting the web UI: - -``` -python convert-to-flexgen.py models/opt-1.3b/ -``` - -The output will be saved to `models/opt-1.3b-np/`. - -## Usage - -The basic command is the following: - -``` -python server.py --model opt-1.3b --loader flexgen -``` - -For large models, the RAM usage may be too high and your computer may freeze. If that happens, you can try this: - -``` -python server.py --model opt-1.3b --loader flexgen --compress-weight -``` - -With this second command, I was able to run both OPT-6.7b and OPT-13B with **2GB VRAM**, and the speed was good in both cases. - -You can also manually set the offload strategy with - -``` -python server.py --model opt-1.3b --loader flexgen --percent 0 100 100 0 100 0 -``` - -where the six numbers after `--percent` are: - -``` -the percentage of weight on GPU -the percentage of weight on CPU -the percentage of attention cache on GPU -the percentage of attention cache on CPU -the percentage of activations on GPU -the percentage of activations on CPU -``` - -You should typically only change the first two numbers. If their sum is less than 100, the remaining layers will be offloaded to the disk, by default into the `text-generation-webui/cache` folder. - -## Performance - -In my experiments with OPT-30B using a RTX 3090 on Linux, I have obtained these results: - -* `--loader flexgen --compress-weight --percent 0 100 100 0 100 0`: 0.99 seconds per token. -* `--loader flexgen --compress-weight --percent 100 0 100 0 100 0`: 0.765 seconds per token. - -## Limitations - -* Only works with the OPT models. -* Only two generation parameters are available: `temperature` and `do_sample`. 
\ No newline at end of file diff --git a/docs/Low-VRAM-guide.md b/docs/Low-VRAM-guide.md deleted file mode 100644 index 7814ecb0..00000000 --- a/docs/Low-VRAM-guide.md +++ /dev/null @@ -1,53 +0,0 @@ -If you GPU is not large enough to fit a 16-bit model, try these in the following order: - -### Load the model in 8-bit mode - -``` -python server.py --load-in-8bit -``` - -### Load the model in 4-bit mode - -``` -python server.py --load-in-4bit -``` - -### Split the model across your GPU and CPU - -``` -python server.py --auto-devices -``` - -If you can load the model with this command but it runs out of memory when you try to generate text, try increasingly limiting the amount of memory allocated to the GPU until the error stops happening: - -``` -python server.py --auto-devices --gpu-memory 10 -python server.py --auto-devices --gpu-memory 9 -python server.py --auto-devices --gpu-memory 8 -... -``` - -where the number is in GiB. - -For finer control, you can also specify the unit in MiB explicitly: - -``` -python server.py --auto-devices --gpu-memory 8722MiB -python server.py --auto-devices --gpu-memory 4725MiB -python server.py --auto-devices --gpu-memory 3500MiB -... -``` - -### Send layers to a disk cache - -As a desperate last measure, you can split the model across your GPU, CPU, and disk: - -``` -python server.py --auto-devices --disk -``` - -With this, I am able to load a 30b model into my RTX 3090, but it takes 10 seconds to generate 1 word. - -### DeepSpeed (experimental) - -An experimental alternative to all of the above is to use DeepSpeed: [guide](DeepSpeed.md). diff --git a/docs/README.md b/docs/README.md index 06b73b84..f3829855 100644 --- a/docs/README.md +++ b/docs/README.md @@ -8,10 +8,8 @@ * [Docker](Docker.md) * [ExLlama](ExLlama.md) * [Extensions](Extensions.md) -* [FlexGen](FlexGen.md) * [Generation parameters](Generation-parameters.md) * [GPTQ models (4 bit mode)](GPTQ-models-(4-bit-mode).md) -* [llama.cpp models](llama.cpp-models.md) * [LLaMA model](LLaMA-model.md) * [LoRA](LoRA.md) * [Low VRAM guide](Low-VRAM-guide.md) diff --git a/docs/llama.cpp-models.md b/docs/llama.cpp-models.md deleted file mode 100644 index bcf3c046..00000000 --- a/docs/llama.cpp-models.md +++ /dev/null @@ -1,53 +0,0 @@ -# Using llama.cpp in the web UI - -## Setting up the models - -#### Pre-converted - -Place the model in the `models` folder, making sure that its name contains `ggml` somewhere and ends in `.bin`. - -#### Convert LLaMA yourself - -Follow the instructions in the llama.cpp README to generate the `ggml-model.bin` file: https://github.com/ggerganov/llama.cpp#usage - -## GPU acceleration - -Enabled with the `--n-gpu-layers` parameter. - -* If you have enough VRAM, use a high number like `--n-gpu-layers 200000` to offload all layers to the GPU. -* Otherwise, start with a low number like `--n-gpu-layers 10` and then gradually increase it until you run out of memory. - -To use this feature, you need to manually compile and install `llama-cpp-python` with GPU support. 
- -#### Linux - -``` -pip uninstall -y llama-cpp-python -CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir -``` - -#### Windows - -``` -pip uninstall -y llama-cpp-python -set CMAKE_ARGS="-DLLAMA_CUBLAS=on" -set FORCE_CMAKE=1 -pip install llama-cpp-python --no-cache-dir -``` - -#### macOS - -``` -pip uninstall -y llama-cpp-python -CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir -``` - -Here you can find the different compilation options for OpenBLAS / cuBLAS / CLBlast: https://pypi.org/project/llama-cpp-python/ - -## Performance - -This was the performance of llama-7b int4 on my i5-12400F (cpu only): - -> Output generated in 33.07 seconds (6.05 tokens/s, 200 tokens, context 17) - -You can change the number of threads with `--threads N`. diff --git a/extensions/api/blocking_api.py b/extensions/api/blocking_api.py index edc6d8f4..fbbc5ec1 100644 --- a/extensions/api/blocking_api.py +++ b/extensions/api/blocking_api.py @@ -7,10 +7,15 @@ from modules.chat import generate_chat_reply from modules.LoRA import add_lora_to_model from modules.models import load_model, unload_model -from modules.models_settings import (get_model_settings_from_yamls, - update_model_parameters) -from modules.text_generation import (encode, generate_reply, - stop_everything_event) +from modules.models_settings import ( + get_model_settings_from_yamls, + update_model_parameters +) +from modules.text_generation import ( + encode, + generate_reply, + stop_everything_event +) from modules.utils import get_available_models diff --git a/extensions/api/streaming_api.py b/extensions/api/streaming_api.py index 88359e3e..6afa827d 100644 --- a/extensions/api/streaming_api.py +++ b/extensions/api/streaming_api.py @@ -2,12 +2,15 @@ import json from threading import Thread -from websockets.server import serve - -from extensions.api.util import build_parameters, try_start_cloudflared, with_api_lock +from extensions.api.util import ( + build_parameters, + try_start_cloudflared, + with_api_lock +) from modules import shared from modules.chat import generate_chat_reply from modules.text_generation import generate_reply +from websockets.server import serve PATH = '/api/v1/stream' diff --git a/extensions/api/util.py b/extensions/api/util.py index a9d581eb..2358b7d2 100644 --- a/extensions/api/util.py +++ b/extensions/api/util.py @@ -10,7 +10,6 @@ from modules.chat import load_character_memoized from modules.presets import load_preset_memoized - # We use a thread local to store the asyncio lock, so that each thread # has its own lock. This isn't strictly necessary, but it makes it # such that if we can support multiple worker threads in the future, diff --git a/extensions/example/script.py b/extensions/example/script.py new file mode 100644 index 00000000..669749c0 --- /dev/null +++ b/extensions/example/script.py @@ -0,0 +1,137 @@ +""" +An example of extension. It does nothing, but you can add transformations +before the return statements to customize the webui behavior. + +Starting from history_modifier and ending in output_modifier, the +functions are declared in the same order that they are called at +generation time. 
+""" + +import torch +from modules import chat +from modules.text_generation import ( + decode, + encode, + generate_reply, +) +from transformers import LogitsProcessor + +params = { + "display_name": "Example Extension", + "is_tab": False, +} + +class MyLogits(LogitsProcessor): + """ + Manipulates the probabilities for the next token before it gets sampled. + Used in the logits_processor_modifier function below. + """ + def __init__(self): + pass + + def __call__(self, input_ids, scores): + # probs = torch.softmax(scores, dim=-1, dtype=torch.float) + # probs[0] /= probs[0].sum() + # scores = torch.log(probs / (1 - probs)) + return scores + +def history_modifier(history): + """ + Modifies the chat history. + Only used in chat mode. + """ + return history + +def state_modifier(state): + """ + Modifies the state variable, which is a dictionary containing the input + values in the UI like sliders and checkboxes. + """ + return state + +def chat_input_modifier(text, visible_text, state): + """ + Modifies the user input string in chat mode (visible_text). + You can also modify the internal representation of the user + input (text) to change how it will appear in the prompt. + """ + return text, visible_text + +def input_modifier(string, state): + """ + In default/notebook modes, modifies the whole prompt. + + In chat mode, it is the same as chat_input_modifier but only applied + to "text", here called "string", and not to "visible_text". + """ + return string + +def bot_prefix_modifier(string, state): + """ + Modifies the prefix for the next bot reply in chat mode. + By default, the prefix will be something like "Bot Name:". + """ + return string + +def tokenizer_modifier(state, prompt, input_ids, input_embeds): + """ + Modifies the input ids and embeds. + Used by the multimodal extension to put image embeddings in the prompt. + Only used by loaders that use the transformers library for sampling. + """ + return prompt, input_ids, input_embeds + +def logits_processor_modifier(processor_list, input_ids): + """ + Adds logits processors to the list, allowing you to access and modify + the next token probabilities. + Only used by loaders that use the transformers library for sampling. + """ + processor_list.append(MyLogits()) + return processor_list + +def output_modifier(string, state): + """ + Modifies the LLM output before it gets presented. + + In chat mode, the modified version goes into history['visible'], + and the original version goes into history['internal']. + """ + return string + +def custom_generate_chat_prompt(user_input, state, **kwargs): + """ + Replaces the function that generates the prompt from the chat history. + Only used in chat mode. + """ + result = chat.generate_chat_prompt(user_input, state, **kwargs) + return result + +def custom_css(): + """ + Returns a CSS string that gets appended to the CSS for the webui. + """ + return '' + +def custom_js(): + """ + Returns a javascript string that gets appended to the javascript + for the webui. + """ + return '' + +def setup(): + """ + Gets executed only once, when the extension is imported. + """ + pass + +def ui(): + """ + Gets executed when the UI is drawn. Custom gradio elements and + their corresponding event handlers should be defined here. 
+
+    To learn about gradio components, check out the docs:
+    https://gradio.app/docs/
+    """
+    pass
diff --git a/extensions/llava/script.py b/extensions/llava/script.py
deleted file mode 100644
index 781d584b..00000000
--- a/extensions/llava/script.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import gradio as gr
-
-from modules.logging_colors import logger
-
-
-def ui():
-    gr.Markdown("### This extension is deprecated, use \"multimodal\" extension instead")
-    logger.error("LLaVA extension is deprecated, use \"multimodal\" extension instead")
diff --git a/extensions/multimodal/script.py b/extensions/multimodal/script.py
index b3f654e4..8bc26315 100644
--- a/extensions/multimodal/script.py
+++ b/extensions/multimodal/script.py
@@ -35,6 +35,15 @@
 multimodal_embedder: MultimodalEmbedder = None


+def chat_input_modifier(text, visible_text, state):
+    global input_hijack
+    if input_hijack['state']:
+        input_hijack['state'] = False
+        return input_hijack['value'](text, visible_text)
+    else:
+        return text, visible_text
+
+
 def add_chat_picture(picture, text, visible_text):
     # resize the image, so that shortest edge is at least 224 (size for CLIP), and at most 300 (to keep history manageable)
     max_hw, min_hw = max(picture.size), min(picture.size)
diff --git a/extensions/openai/README.md b/extensions/openai/README.md
index 7bbc1e83..2083734a 100644
--- a/extensions/openai/README.md
+++ b/extensions/openai/README.md
@@ -1,17 +1,15 @@
 # An OpenedAI API (openai like)

 This extension creates an API that works kind of like openai (ie. api.openai.com).
-It's incomplete so far but perhaps is functional enough for you.

 ## Setup & installation

-Optional (for flask_cloudflared, embeddings):
-
+Install the requirements:
 ```
 pip3 install -r requirements.txt
 ```

-It listens on tcp port 5001 by default. You can use the OPENEDAI_PORT environment variable to change this.
+It listens on ```tcp port 5001``` by default. You can use the ```OPENEDAI_PORT``` environment variable to change this.

 Make sure you enable it in server launch parameters, it should include:

@@ -21,13 +19,30 @@

 You can also use the ``--listen`` argument to make the server available on the network, and/or the ```--share``` argument to enable a public Cloudflare endpoint.

-To enable the basic image generation support (txt2img) set the environment variable SD_WEBUI_URL to point to your Stable Diffusion API ([Automatic1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui)).
+To enable the basic image generation support (txt2img) set the environment variable ```SD_WEBUI_URL``` to point to your Stable Diffusion API ([Automatic1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui)).

 For example:
 ```
 SD_WEBUI_URL=http://127.0.0.1:7861
 ```

+## Quick start
+
+1. Install the requirements.txt (pip)
+2. Enable the ```openai``` module (--extensions openai), restart the server.
+3. Configure the openai client
+
+Most openai applications can be configured to connect to the API if you set the following environment variables:
+
+```shell
+# Sample .env file:
+OPENAI_API_KEY=sk-111111111111111111111111111111111111111111111111
+OPENAI_API_BASE=http://0.0.0.0:5001/v1
+```
+
+If needed, replace 0.0.0.0 with the IP/port of your server.
+
+
 ### Models

 This has been successfully tested with Alpaca, Koala, Vicuna, WizardLM and their variants, (ex. gpt4-x-alpaca, GPT4all-snoozy, stable-vicuna, wizard-vicuna, etc.) and many others. Models that have been trained for **Instruction Following** work best.
If you test with other models please let me know how it goes. Less than satisfying results (so far) from: RWKV-4-Raven, llama, mpt-7b-instruct/chat. @@ -36,7 +51,7 @@ For best results across all API endpoints, a model like [vicuna-13b-v1.3-GPTQ](h For good results with the [Completions](https://platform.openai.com/docs/api-reference/completions) API endpoint, in addition to the above models, you can also try using a base model like [falcon-7b](https://huggingface.co/tiiuae/falcon-7b) or Llama. -For good results with the [ChatCompletions](https://platform.openai.com/docs/api-reference/chat) or [Edits](https://platform.openai.com/docs/api-reference/edits) API endpoints you can use almost any model trained for instruction following - within the limits of the model. Be sure that the proper instruction template is detected and loaded or the results will not be good. +For good results with the [ChatCompletions](https://platform.openai.com/docs/api-reference/chat) or [Edits](https://platform.openai.com/docs/api-reference/edits) API endpoints you can use almost any model trained for instruction following. Be sure that the proper instruction template is detected and loaded or the results will not be good. For the proper instruction format to be detected you need to have a matching model entry in your ```models/config.yaml``` file. Be sure to keep this file up to date. A matching instruction template file in the characters/instruction-following/ folder will loaded and applied to format messages correctly for the model - this is critical for good results. @@ -76,7 +91,7 @@ Embeddings requires ```sentence-transformers``` installed, but chat and completi | all-mpnet-base-v2 | 768 | 384 | 2800 | 420M | 63.3 | | all-MiniLM-L6-v2 | 384 | 256 | 14200 | 80M | 58.8 | -In short, the all-MiniLM-L6-v2 model is 5x faster, 5x smaller ram, 2x smaller storage, and still offers good quality. Stats from (https://www.sbert.net/docs/pretrained_models.html). To change the model from the default you can set the environment variable OPENEDAI_EMBEDDING_MODEL, ex. "OPENEDAI_EMBEDDING_MODEL=all-MiniLM-L6-v2". +In short, the all-MiniLM-L6-v2 model is 5x faster, 5x smaller ram, 2x smaller storage, and still offers good quality. Stats from (https://www.sbert.net/docs/pretrained_models.html). To change the model from the default you can set the environment variable ```OPENEDAI_EMBEDDING_MODEL```, ex. "OPENEDAI_EMBEDDING_MODEL=all-MiniLM-L6-v2". Warning: You cannot mix embeddings from different models even if they have the same dimensions. They are not comparable. @@ -85,26 +100,27 @@ Warning: You cannot mix embeddings from different models even if they have the s Almost everything you use it with will require you to set a dummy OpenAI API key environment variable. -With the [official python openai client](https://github.com/openai/openai-python), you can set the OPENAI_API_BASE environment variable before you import the openai module, like so: +With the [official python openai client](https://github.com/openai/openai-python), set the ```OPENAI_API_BASE``` environment variables: -``` +```shell +# Sample .env file: OPENAI_API_KEY=sk-111111111111111111111111111111111111111111111111 -OPENAI_API_BASE=http://127.0.0.1:5001/v1 +OPENAI_API_BASE=http://0.0.0.0:5001/v1 ``` -If needed, replace 127.0.0.1 with the IP/port of your server. +If needed, replace 0.0.0.0 with the IP/port of your server. 
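
Instead of environment variables, the same values can also be assigned directly on the `openai` module. A minimal sketch with the official python client (the key is a dummy value; adjust the host and port to match your server):

```python
import openai

# The server does not check the key, but the client library requires one to be set.
openai.api_key = "sk-111111111111111111111111111111111111111111111111"
# Point the client at the local OpenedAI server instead of api.openai.com.
openai.api_base = "http://127.0.0.1:5001/v1"

# Quick sanity check: the currently loaded model should be listed first.
print(openai.Model.list())
```
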
-If using .env files to save the OPENAI_API_BASE and OPENAI_API_KEY variables, you can ensure compatibility by loading the .env file before loading the openai module, like so in python: +If using .env files to save the ```OPENAI_API_BASE``` and ```OPENAI_API_KEY``` variables, make sure the .env file is loaded before the openai module is imported: -``` +```python from dotenv import load_dotenv -load_dotenv() +load_dotenv() # make sure the environment variables are set before import import openai ``` With the [official Node.js openai client](https://github.com/openai/openai-node) it is slightly more more complex because the environment variables are not used by default, so small source code changes may be required to use the environment variables, like so: -``` +```js const openai = OpenAI(Configuration({ apiKey: process.env.OPENAI_API_KEY, basePath: process.env.OPENAI_API_BASE, @@ -113,7 +129,7 @@ const openai = OpenAI(Configuration({ For apps made with the [chatgpt-api Node.js client library](https://github.com/transitive-bullshit/chatgpt-api): -``` +```js const api = new ChatGPTAPI({ apiKey: process.env.OPENAI_API_KEY, apiBaseUrl: process.env.OPENAI_API_BASE, @@ -127,39 +143,43 @@ The OpenAI API is well documented, you can view the documentation here: https:// Examples of how to use the Completions API in Python can be found here: https://platform.openai.com/examples Not all of them will work with all models unfortunately, See the notes on Models for how to get the best results. -Here is a simple python example of how you can use the Edit endpoint as a translator. +Here is a simple python example. ```python +import os +os.environ['OPENAI_API_KEY']="sk-111111111111111111111111111111111111111111111111" +os.environ['OPENAI_API_BASE']="http://0.0.0.0:5001/v1" import openai -response = openai.Edit.create( + +response = openai.ChatCompletion.create( model="x", - instruction="Translate this into French", - input="Our mission is to ensure that artificial general intelligence benefits all of humanity.", + messages = [{ 'role': 'system', 'content': "Answer in a consistent style." }, + {'role': 'user', 'content': "Teach me about patience."}, + {'role': 'assistant', 'content': "The river that carves the deepest valley flows from a modest spring; the grandest symphony originates from a single note; the most intricate tapestry begins with a solitary thread."}, + {'role': 'user', 'content': "Teach me about the ocean."}, + ] ) -print(response['choices'][0]['text']) -# Sample Output: -# Notre mission est de garantir que l'intelligence artificielle généralisée profite à tous les membres de l'humanité. 
+text = response['choices'][0]['message']['content'] +print(text) ``` - - ## Compatibility & not so compatibility | API endpoint | tested with | notes | | --- | --- | --- | -| /v1/models | openai.Model.list() | Lists models, Currently loaded model first, plus some compatibility options | -| /v1/models/{id} | openai.Model.get() | returns whatever you ask for, model does nothing yet anyways | -| /v1/text_completion | openai.Completion.create() | the most tested, only supports single string input so far, variable quality based on the model | -| /v1/chat/completions | openai.ChatCompletion.create() | Quality depends a lot on the model | -| /v1/edits | openai.Edit.create() | Works the best of all, perfect for instruction following models | +| /v1/chat/completions | openai.ChatCompletion.create() | Use it with instruction following models | +| /v1/embeddings | openai.Embedding.create() | Using SentenceTransformer embeddings | | /v1/images/generations | openai.Image.create() | Bare bones, no model configuration, response_format='b64_json' only. | -| /v1/embeddings | openai.Embedding.create() | Using Sentence Transformer, dimensions are different and may never be directly comparable to openai embeddings. | -| /v1/moderations | openai.Moderation.create() | does nothing. successfully. | +| /v1/moderations | openai.Moderation.create() | Basic initial support via embeddings | +| /v1/models | openai.Model.list() | Lists models, Currently loaded model first, plus some compatibility options | +| /v1/models/{id} | openai.Model.get() | returns whatever you ask for | +| /v1/edits | openai.Edit.create() | Deprecated by openai, good with instruction following models | +| /v1/text_completion | openai.Completion.create() | Legacy endpoint, doesn't support array input, variable quality based on the model | | /v1/completions | openai api completions.create | Legacy endpoint (v0.25) | | /v1/engines/*/embeddings | python-openai v0.25 | Legacy endpoint | | /v1/engines/*/generate | openai engines.generate | Legacy endpoint | | /v1/engines | openai engines.list | Legacy Lists models | -| /v1/engines/{model_name} | openai engines.get -i {model_name} | You can use this legacy endpoint to load models via the api | +| /v1/engines/{model_name} | openai engines.get -i {model_name} | You can use this legacy endpoint to load models via the api or command line | | /v1/images/edits | openai.Image.create_edit() | not yet supported | | /v1/images/variations | openai.Image.create_variation() | not yet supported | | /v1/audio/\* | openai.Audio.\* | not yet supported | @@ -167,7 +187,7 @@ print(response['choices'][0]['text']) | /v1/fine-tunes\* | openai.FineTune.\* | not yet supported | | /v1/search | openai.search, engines.search | not yet supported | -The model name setting is ignored in completions, but you may need to adjust the maximum token length to fit the model (ie. set to <2048 tokens instead of 4096, 8k, etc). To mitigate some of this, the max_tokens value is halved until it is less than truncation_length for the model (typically 2k). +Because of the differences in OpenAI model context sizes (2k, 4k, 8k, 16k, etc,) you may need to adjust the max_tokens to fit into the context of the model you choose. Streaming, temperature, top_p, max_tokens, stop, should all work as expected, but not all parameters are mapped correctly. 
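
To make those parameters concrete, here is a minimal streaming request with the official python client. This is only a sketch: it assumes the extension is enabled, a model is loaded, and the environment variables from the setup section are set; the parameter values are arbitrary.

```python
import openai

response = openai.Completion.create(
    model="x",               # the model name does not switch models on the server
    prompt="Say this is a test.",
    max_tokens=64,           # keep prompt tokens + max_tokens within the model's context
    temperature=0.7,
    top_p=0.9,
    stop=["\n"],
    stream=True,             # tokens are streamed back as they are generated
)

for chunk in response:
    print(chunk["choices"][0]["text"], end="", flush=True)
print()
```
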
@@ -175,41 +195,29 @@ Some hacky mappings: | OpenAI | text-generation-webui | note | | --- | --- | --- | +| model | - | Ignored, the model is not changed | | frequency_penalty | encoder_repetition_penalty | this seems to operate with a different scale and defaults, I tried to scale it based on range & defaults, but the results are terrible. hardcoded to 1.18 until there is a better way | | presence_penalty | repetition_penalty | same issues as frequency_penalty, hardcoded to 1.0 | -| best_of | top_k | default is 1 | -| stop | custom_stopping_strings | this is also stuffed with ['\n###', "\n{user prompt}", "{user prompt}" ] for good measure. | +| best_of | top_k | default is 1 (top_k is 20 for chat, which doesn't support best_of) | | n | 1 | variations are not supported yet. | | 1 | num_beams | hardcoded to 1 | | 1.0 | typical_p | hardcoded to 1.0 | -| max_tokens | max_new_tokens | For Text Completions max_tokens is set smaller than the truncation_length minus the prompt length. This can cause no input to be generated if the prompt is too large. For ChatCompletions, the older chat messages may be dropped to fit the max_new_tokens requested | -| logprobs | - | not supported yet | -| logit_bias | - | not supported yet | +| logprobs & logit_bias | - | experimental, llama only, transformers-kin only (ExLlama_HF ok), can also use llama tokens if 'model' is not an openai model or will convert from tiktoken for the openai model specified in 'model' | | messages.name | - | not supported yet | | user | - | not supported yet | | functions/function_call | - | function calls are not supported yet | -defaults are mostly from openai, so are different. I use the openai defaults where I can and try to scale them to the webui defaults with the same intent. ### Applications -Almost everything needs the OPENAI_API_KEY environment variable set, for example: -``` -OPENAI_API_KEY=sk-111111111111111111111111111111111111111111111111 -``` -Some apps are picky about key format, but 'dummy' or 'sk-dummy' also work in most cases. -Most application will work if you also set: -``` -OPENAI_API_BASE=http://127.0.0.1:5001/v1 -``` -but there are some exceptions. +Almost everything needs the ```OPENAI_API_KEY``` and ```OPENAI_API_BASE``` environment variable set, but there are some exceptions. -| Compatibility | Application/Library | url | notes / setting | +| Compatibility | Application/Library | Website | Notes | | --- | --- | --- | --- | | ✅❌ | openai-python (v0.25+) | https://github.com/openai/openai-python | only the endpoints from above are working. OPENAI_API_BASE=http://127.0.0.1:5001/v1 | | ✅❌ | openai-node | https://github.com/openai/openai-node | only the endpoints from above are working. environment variables don't work by default, but can be configured (see above) | | ✅❌ | chatgpt-api | https://github.com/transitive-bullshit/chatgpt-api | only the endpoints from above are working. environment variables don't work by default, but can be configured (see above) | -| ✅ | anse | https://github.com/anse-app/anse | API Key & URL configurable in UI | +| ✅ | anse | https://github.com/anse-app/anse | API Key & URL configurable in UI, Images also work | | ✅ | shell_gpt | https://github.com/TheR1D/shell_gpt | OPENAI_API_HOST=http://127.0.0.1:5001 | | ✅ | gpt-shell | https://github.com/jla/gpt-shell | OPENAI_API_BASE=http://127.0.0.1:5001/v1 | | ✅ | gpt-discord-bot | https://github.com/openai/gpt-discord-bot | OPENAI_API_BASE=http://127.0.0.1:5001/v1 | @@ -221,11 +229,12 @@ but there are some exceptions. 
| ❌ | guidance | https://github.com/microsoft/guidance | logit_bias and logprobs not yet supported | ## Future plans +* better error handling * model changing, esp. something for swapping loras or embedding models * consider switching to FastAPI + starlette for SSE (openai SSE seems non-standard) ## Bugs? Feedback? Comments? Pull requests? -To enable debugging and get copious output you can set the OPENEDAI_DEBUG=1 environment variable. +To enable debugging and get copious output you can set the ```OPENEDAI_DEBUG=1``` environment variable. -Are all appreciated, please @matatonic and I'll try to get back to you as soon as possible. +Are all appreciated, please @matatonic and I'll try to get back to you as soon as possible. \ No newline at end of file diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 23c5dbee..e1baa249 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -18,41 +18,50 @@ class LogitsBiasProcessor(LogitsProcessor): def __init__(self, logit_bias={}): self.logit_bias = logit_bias - super().__init__() + if self.logit_bias: + self.keys = list([int(key) for key in self.logit_bias.keys()]) + values = [ self.logit_bias[str(key)] for key in self.keys ] + self.values = torch.tensor(values, dtype=torch.float, device=shared.model.device) + debug_msg(f"{self})") def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor: if self.logit_bias: - keys = list([int(key) for key in self.logit_bias.keys()]) - values = list([int(val) for val in self.logit_bias.values()]) - logits[0, keys] += torch.tensor(values).cuda() - + debug_msg(logits[0, self.keys], " + ", self.values) + logits[0, self.keys] += self.values + debug_msg(" --> ", logits[0, self.keys]) + debug_msg(" max/min ", float(torch.max(logits[0])), float(torch.min(logits[0]))) return logits + def __repr__(self): + return f"<{self.__class__.__name__}(logit_bias={self.logit_bias})>" class LogprobProcessor(LogitsProcessor): def __init__(self, logprobs=None): self.logprobs = logprobs self.token_alternatives = {} - super().__init__() def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor: if self.logprobs is not None: # 0-5 log_e_probabilities = F.log_softmax(logits, dim=1) - # XXX hack. should find the selected token and include the prob of that - # ... but we just +1 here instead because we don't know it yet. - top_values, top_indices = torch.topk(log_e_probabilities, k=self.logprobs + 1) - top_tokens = [decode(tok) for tok in top_indices[0]] - self.token_alternatives = dict(zip(top_tokens, top_values[0].tolist())) + top_values, top_indices = torch.topk(log_e_probabilities, k=self.logprobs+1) + top_tokens = [ decode(tok) for tok in top_indices[0] ] + top_probs = [ float(x) for x in top_values[0] ] + self.token_alternatives = dict(zip(top_tokens, top_probs)) + debug_msg(f"{self.__class__.__name__}(logprobs+1={self.logprobs+1}, token_alternatives={self.token_alternatives})") return logits + def __repr__(self): + return f"<{self.__class__.__name__}(logprobs={self.logprobs}, token_alternatives={self.token_alternatives})>" + def convert_logprobs_to_tiktoken(model, logprobs): - try: - encoder = tiktoken.encoding_for_model(model) - # just pick the first one if it encodes to multiple tokens... 99.9% not required and maybe worse overall. 
- return dict([(encoder.decode([encoder.encode(token)[0]]), prob) for token, prob in logprobs.items()]) - except KeyError: - # assume native tokens if we can't find the tokenizer +# more problems than it's worth. +# try: +# encoder = tiktoken.encoding_for_model(model) +# # just pick the first one if it encodes to multiple tokens... 99.9% not required and maybe worse overall. +# return dict([(encoder.decode([encoder.encode(token)[0]]), prob) for token, prob in logprobs.items()]) +# except KeyError: +# # assume native tokens if we can't find the tokenizer return logprobs @@ -73,8 +82,8 @@ def marshal_common_params(body): req_params['requested_model'] = body.get('model', shared.model_name) req_params['suffix'] = default(body, 'suffix', req_params['suffix']) - req_params['temperature'] = clamp(default(body, 'temperature', req_params['temperature']), 0.001, 1.999) # fixup absolute 0.0/2.0 - req_params['top_p'] = clamp(default(body, 'top_p', req_params['top_p']), 0.001, 1.0) + req_params['temperature'] = clamp(default(body, 'temperature', req_params['temperature']), 0.01, 1.99) # fixup absolute 0.0/2.0 + req_params['top_p'] = clamp(default(body, 'top_p', req_params['top_p']), 0.01, 1.0) n = default(body, 'n', 1) if n != 1: raise InvalidRequestError(message="Only n = 1 is supported.", param='n') @@ -87,6 +96,11 @@ def marshal_common_params(body): # presence_penalty - ignored # frequency_penalty - ignored + + # pass through unofficial params + req_params['repetition_penalty'] = default(body, 'repetition_penalty', req_params['repetition_penalty']) + req_params['encoder_repetition_penalty'] = default(body, 'encoder_repetition_penalty', req_params['encoder_repetition_penalty']) + # user - ignored logits_processor = [] @@ -98,9 +112,11 @@ def marshal_common_params(body): encoder = tiktoken.encoding_for_model(req_params['requested_model']) new_logit_bias = {} for logit, bias in logit_bias.items(): - for x in encode(encoder.decode([int(logit)]))[0]: + for x in encode(encoder.decode([int(logit)]), add_special_tokens=False)[0]: + if int(x) in [0, 1, 2, 29871]: # XXX LLAMA tokens + continue new_logit_bias[str(int(x))] = bias - print(logit_bias, '->', new_logit_bias) + debug_msg('logit_bias_map', logit_bias, '->', new_logit_bias) logit_bias = new_logit_bias except KeyError: pass # assume native tokens if we can't find the tokenizer @@ -134,11 +150,11 @@ def messages_to_prompt(body: dict, req_params: dict, max_tokens): messages = body['messages'] role_formats = { - 'user': 'user: {message}\n', - 'assistant': 'assistant: {message}\n', + 'user': 'User: {message}\n', + 'assistant': 'Assistant: {message}\n', 'system': '{message}', - 'context': 'You are a helpful assistant. Answer as concisely as possible.', - 'prompt': 'assistant:', + 'context': 'You are a helpful assistant. Answer as concisely as possible.\nUser: I want your assistance.\nAssistant: Sure! 
What can I do for you?', + 'prompt': 'Assistant:', } if not 'stopping_strings' in req_params: @@ -151,10 +167,10 @@ def messages_to_prompt(body: dict, req_params: dict, max_tokens): template = instruct['turn_template'] system_message_template = "{message}" - system_message_default = instruct['context'] + system_message_default = instruct.get('context', '') # can be missing bot_start = template.find('<|bot|>') # So far, 100% of instruction templates have this token - user_message_template = template[:bot_start].replace('<|user-message|>', '{message}').replace('<|user|>', instruct['user']) - bot_message_template = template[bot_start:].replace('<|bot-message|>', '{message}').replace('<|bot|>', instruct['bot']) + user_message_template = template[:bot_start].replace('<|user-message|>', '{message}').replace('<|user|>', instruct.get('user', '')) + bot_message_template = template[bot_start:].replace('<|bot-message|>', '{message}').replace('<|bot|>', instruct.get('bot', '')) bot_prompt = bot_message_template[:bot_message_template.find('{message}')].rstrip(' ') role_formats = { @@ -173,13 +189,13 @@ def messages_to_prompt(body: dict, req_params: dict, max_tokens): debug_msg(f"Loaded instruction role format: {shared.settings['instruction_template']}") except Exception as e: - req_params['stopping_strings'].extend(['\nuser:']) + req_params['stopping_strings'].extend(['\nUser:', 'User:']) # XXX User: prompt here also print(f"Exception: When loading characters/instruction-following/{shared.settings['instruction_template']}.yaml: {repr(e)}") print("Warning: Loaded default instruction-following template for model.") else: - req_params['stopping_strings'].extend(['\nuser:']) + req_params['stopping_strings'].extend(['\nUser:', 'User:']) # XXX User: prompt here also print("Warning: Loaded default instruction-following template for model.") system_msgs = [] @@ -194,6 +210,11 @@ def messages_to_prompt(body: dict, req_params: dict, max_tokens): context_msg = end_line(role_formats['system'].format(message=body['prompt'])) + context_msg for m in messages: + if 'role' not in m: + raise InvalidRequestError(message="messages: missing role", param='messages') + if 'content' not in m: + raise InvalidRequestError(message="messages: missing content", param='messages') + role = m['role'] content = m['content'] # name = m.get('name', None) @@ -215,12 +236,12 @@ def messages_to_prompt(body: dict, req_params: dict, max_tokens): if token_count >= req_params['truncation_length']: err_msg = f"This model maximum context length is {req_params['truncation_length']} tokens. However, your messages resulted in over {token_count} tokens." - raise InvalidRequestError(message=err_msg) + raise InvalidRequestError(message=err_msg, param='messages') if max_tokens > 0 and token_count + max_tokens > req_params['truncation_length']: err_msg = f"This model maximum context length is {req_params['truncation_length']} tokens. However, your messages resulted in over {token_count} tokens and max_tokens is {max_tokens}." 
print(f"Warning: ${err_msg}") - # raise InvalidRequestError(message=err_msg) + # raise InvalidRequestError(message=err_msg, params='max_tokens') return prompt, token_count @@ -251,6 +272,10 @@ def chat_completions(body: dict, is_legacy: bool = False) -> dict: # format the prompt from messages prompt, token_count = messages_to_prompt(body, req_params, max_tokens) + # set real max, avoid deeper errors + if req_params['max_new_tokens'] + token_count >= req_params['truncation_length']: + req_params['max_new_tokens'] = req_params['truncation_length'] - token_count + # generate reply ####################################### debug_msg({'prompt': prompt, 'req_params': req_params}) stopping_strings = req_params.pop('stopping_strings', []) @@ -267,7 +292,7 @@ def chat_completions(body: dict, is_legacy: bool = False) -> dict: completion_token_count = len(encode(answer)[0]) stop_reason = "stop" - if token_count + completion_token_count >= req_params['truncation_length'] or completion_token_count >= max_tokens: + if token_count + completion_token_count >= req_params['truncation_length'] or completion_token_count >= req_params['max_new_tokens']: stop_reason = "length" resp = { @@ -323,6 +348,10 @@ def stream_chat_completions(body: dict, is_legacy: bool = False): # format the prompt from messages prompt, token_count = messages_to_prompt(body, req_params, max_tokens) + # set real max, avoid deeper errors + if req_params['max_new_tokens'] + token_count >= req_params['truncation_length']: + req_params['max_new_tokens'] = req_params['truncation_length'] - token_count + def chat_streaming_chunk(content): # begin streaming chunk = { @@ -352,7 +381,6 @@ def chat_streaming_chunk(content): debug_msg({'prompt': prompt, 'req_params': req_params}) stopping_strings = req_params.pop('stopping_strings', []) - logprob_proc = req_params.pop('logprob_proc', None) generator = generate_reply(prompt, req_params, stopping_strings=stopping_strings, is_chat=False) @@ -375,13 +403,17 @@ def chat_streaming_chunk(content): if len_seen == 0 and new_content[0] == ' ': new_content = new_content[1:] - completion_token_count += len(encode(new_content)[0]) chunk = chat_streaming_chunk(new_content) yield chunk + # to get the correct token_count, strip leading space if present + if answer and answer[0] == ' ': + answer = answer[1:] + + completion_token_count = len(encode(answer)[0]) stop_reason = "stop" - if token_count + completion_token_count >= req_params['truncation_length'] or completion_token_count >= max_tokens: + if token_count + completion_token_count >= req_params['truncation_length'] or completion_token_count >= req_params['max_new_tokens']: stop_reason = "length" chunk = chat_streaming_chunk('') @@ -413,7 +445,7 @@ def completions(body: dict, is_legacy: bool = False): if prompt and isinstance(prompt[0], int): try: encoder = tiktoken.encoding_for_model(requested_model) - prompt = encode(encoder.decode(prompt))[0] + prompt = encoder.decode(prompt) except KeyError: prompt = decode(prompt)[0] else: @@ -441,7 +473,6 @@ def completions(body: dict, is_legacy: bool = False): # generate reply ####################################### debug_msg({'prompt': prompt, 'req_params': req_params}) stopping_strings = req_params.pop('stopping_strings', []) - logprob_proc = req_params.pop('logprob_proc', None) generator = generate_reply(prompt, req_params, stopping_strings=stopping_strings, is_chat=False) answer = '' @@ -475,7 +506,7 @@ def completions(body: dict, is_legacy: bool = False): } } - if logprob_proc: + if logprob_proc and 
logprob_proc.token_alternatives: top_logprobs = convert_logprobs_to_tiktoken(model=requested_model, logprobs=logprob_proc.token_alternatives) resp[resp_list][0]["logprobs"] = {'top_logprobs': [top_logprobs]} else: @@ -504,7 +535,7 @@ def stream_completions(body: dict, is_legacy: bool = False): if prompt and isinstance(prompt[0], int): try: encoder = tiktoken.encoding_for_model(requested_model) - prompt = encode(encoder.decode(prompt))[0] + prompt = encoder.decode(prompt) except KeyError: prompt = decode(prompt)[0] else: @@ -579,9 +610,13 @@ def text_streaming_chunk(content): chunk = text_streaming_chunk(new_content) - completion_token_count += len(encode(new_content)[0]) yield chunk + # to get the correct count, we strip the leading space if present + if answer and answer[0] == ' ': + answer = answer[1:] + + completion_token_count = len(encode(answer)[0]) stop_reason = "stop" if token_count + completion_token_count >= req_params['truncation_length'] or completion_token_count >= max_tokens: stop_reason = "length" diff --git a/extensions/openai/defaults.py b/extensions/openai/defaults.py index 7c4f1c44..52f0d641 100644 --- a/extensions/openai/defaults.py +++ b/extensions/openai/defaults.py @@ -46,8 +46,6 @@ def get_default_req_params(): return copy.deepcopy(default_req_params) # little helper to get defaults if arg is present but None and should be the same type as default. - - def default(dic, key, default): val = dic.get(key, default) if type(val) != type(default): diff --git a/extensions/openai/embeddings.py b/extensions/openai/embeddings.py index c02bb933..be4cd80b 100644 --- a/extensions/openai/embeddings.py +++ b/extensions/openai/embeddings.py @@ -1,43 +1,54 @@ import os from sentence_transformers import SentenceTransformer +import numpy as np from extensions.openai.utils import float_list_to_base64, debug_msg from extensions.openai.errors import * st_model = os.environ["OPENEDAI_EMBEDDING_MODEL"] if "OPENEDAI_EMBEDDING_MODEL" in os.environ else "all-mpnet-base-v2" embeddings_model = None +# OPENEDAI_EMBEDDING_DEVICE: auto (best or cpu), cpu, cuda, ipu, xpu, mkldnn, opengl, opencl, ideep, hip, ve, fpga, ort, xla, lazy, vulkan, mps, meta, hpu, mtia, privateuseone +embeddings_device = os.environ.get("OPENEDAI_EMBEDDING_DEVICE", "cpu") +if embeddings_device.lower() == 'auto': + embeddings_device = None - -def load_embedding_model(model): +def load_embedding_model(model: str) -> SentenceTransformer: + global embeddings_device, embeddings_model try: - emb_model = SentenceTransformer(model) - print(f"\nLoaded embedding model: {model}, max sequence length: {emb_model.max_seq_length}") + embeddings_model = 'loading...' # flag + # see: https://www.sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer + emb_model = SentenceTransformer(model, device=embeddings_device) + # ... emb_model.device doesn't seem to work, always cpu anyways? 
but specify cpu anyways to free more VRAM + print(f"\nLoaded embedding model: {model} on {emb_model.device} [always seems to say 'cpu', even if 'cuda'], max sequence length: {emb_model.max_seq_length}") except Exception as e: - print(f"\nError: Failed to load embedding model: {model}") + embeddings_model = None raise ServiceUnavailableError(f"Error: Failed to load embedding model: {model}", internal_message=repr(e)) return emb_model -def get_embeddings_model(): +def get_embeddings_model() -> SentenceTransformer: global embeddings_model, st_model if st_model and not embeddings_model: embeddings_model = load_embedding_model(st_model) # lazy load the model return embeddings_model -def get_embeddings_model_name(): +def get_embeddings_model_name() -> str: global st_model return st_model -def embeddings(input: list, encoding_format: str): +def get_embeddings(input: list) -> np.ndarray: + return get_embeddings_model().encode(input, convert_to_numpy=True, normalize_embeddings=True, convert_to_tensor=False, device=embeddings_device) + +def embeddings(input: list, encoding_format: str) -> dict: - embeddings = get_embeddings_model().encode(input).tolist() + embeddings = get_embeddings(input) if encoding_format == "base64": data = [{"object": "embedding", "embedding": float_list_to_base64(emb), "index": n} for n, emb in enumerate(embeddings)] else: - data = [{"object": "embedding", "embedding": emb, "index": n} for n, emb in enumerate(embeddings)] + data = [{"object": "embedding", "embedding": emb.tolist(), "index": n} for n, emb in enumerate(embeddings)] response = { "object": "list", diff --git a/extensions/openai/errors.py b/extensions/openai/errors.py index ff519c4f..838d1e7c 100644 --- a/extensions/openai/errors.py +++ b/extensions/openai/errors.py @@ -13,8 +13,8 @@ def __repr__(self): class InvalidRequestError(OpenAIError): - def __init__(self, message, param, code=400, error_type='InvalidRequestError', internal_message=''): - super(OpenAIError, self).__init__(message, code, error_type, internal_message) + def __init__(self, message, param, code=400, internal_message=''): + super().__init__(message, code, internal_message) self.param = param def __repr__(self): @@ -27,5 +27,5 @@ def __repr__(self): class ServiceUnavailableError(OpenAIError): - def __init__(self, message=None, code=500, error_type='ServiceUnavailableError', internal_message=''): - super(OpenAIError, self).__init__(message, code, error_type, internal_message) + def __init__(self, message="Service unavailable, please try again later.", code=503, internal_message=''): + super().__init__(message, code, internal_message) diff --git a/extensions/openai/images.py b/extensions/openai/images.py index d2be3192..9fdb625e 100644 --- a/extensions/openai/images.py +++ b/extensions/openai/images.py @@ -9,12 +9,16 @@ def generations(prompt: str, size: str, response_format: str, n: int): # Low effort implementation for compatibility. With only "prompt" being passed and assuming DALL-E # the results will be limited and likely poor. SD has hundreds of models and dozens of settings. # If you want high quality tailored results you should just use the Stable Diffusion API directly. - # it's too general an API to try and shape the result with specific tags like "masterpiece", etc, - # Will probably work best with the stock SD models. - # SD configuration is beyond the scope of this API. + # it's too general an API to try and shape the result with specific tags like negative prompts + # or "masterpiece", etc. 
SD configuration is beyond the scope of this API. # At this point I will not add the edits and variations endpoints (ie. img2img) because they # require changing the form data handling to accept multipart form data, also to properly support # url return types will require file management and a web serving files... Perhaps later! + base_model_size = 512 if not 'SD_BASE_MODEL_SIZE' in os.environ else int(os.environ.get('SD_BASE_MODEL_SIZE', 512)) + sd_defaults = { + 'sampler_name': 'DPM++ 2M Karras', # vast improvement + 'steps': 30, + } width, height = [int(x) for x in size.split('x')] # ignore the restrictions on size @@ -24,8 +28,21 @@ def generations(prompt: str, size: str, response_format: str, n: int): 'width': width, 'height': height, 'batch_size': n, - 'restore_faces': True, # slightly less horrible } + payload.update(sd_defaults) + + scale = min(width, height) / base_model_size + if scale >= 1.2: + # for better performance with the default size (1024), and larger res. + scaler = { + 'width': width // scale, + 'height': height // scale, + 'hr_scale': scale, + 'enable_hr': True, + 'hr_upscaler': 'Latent', + 'denoising_strength': 0.68, + } + payload.update(scaler) resp = { 'created': int(time.time()), @@ -38,7 +55,8 @@ def generations(prompt: str, size: str, response_format: str, n: int): response = requests.post(url=sd_url, json=payload) r = response.json() if response.status_code != 200 or 'images' not in r: - raise ServiceUnavailableError(r.get('detail', [{'msg': 'Unknown error calling Stable Diffusion'}])[0]['msg'], code=response.status_code) + print(r) + raise ServiceUnavailableError(r.get('error', 'Unknown error calling Stable Diffusion'), code=response.status_code, internal_message=r.get('errors',None)) # r['parameters']... for b64_json in r['images']: if response_format == 'b64_json': diff --git a/extensions/openai/moderations.py b/extensions/openai/moderations.py index 66dfec9f..5b06a672 100644 --- a/extensions/openai/moderations.py +++ b/extensions/openai/moderations.py @@ -1,7 +1,7 @@ import time import numpy as np from numpy.linalg import norm -from extensions.openai.embeddings import get_embeddings_model +from extensions.openai.embeddings import get_embeddings moderations_disabled = False # return 0/false @@ -11,21 +11,21 @@ flag_threshold = 0.5 -def get_category_embeddings(): +def get_category_embeddings() -> dict: global category_embeddings, categories if category_embeddings is None: - embeddings = get_embeddings_model().encode(categories).tolist() + embeddings = get_embeddings(categories).tolist() category_embeddings = dict(zip(categories, embeddings)) return category_embeddings -def cosine_similarity(a, b): +def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float: return np.dot(a, b) / (norm(a) * norm(b)) # seems most openai like with all-mpnet-base-v2 -def mod_score(a, b): +def mod_score(a: np.ndarray, b: np.ndarray) -> float: return 2.0 * np.dot(a, b) @@ -37,8 +37,7 @@ def moderations(input): "results": [], } - embeddings_model = get_embeddings_model() - if not embeddings_model or moderations_disabled: + if moderations_disabled: results['results'] = [{ 'categories': dict([(C, False) for C in categories]), 'category_scores': dict([(C, 0.0) for C in categories]), @@ -53,7 +52,7 @@ def moderations(input): input = [input] for in_str in input: - for ine in embeddings_model.encode([in_str]).tolist(): + for ine in get_embeddings([in_str]): category_scores = dict([(C, mod_score(category_embeddings[C], ine)) for C in categories]) category_flags = dict([(C, 
bool(category_scores[C] > flag_threshold)) for C in categories]) flagged = any(category_flags.values()) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index a0a5bcf6..86f2deb7 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -55,11 +55,13 @@ def start_sse(self): def send_sse(self, chunk: dict): response = 'data: ' + json.dumps(chunk) + '\r\n\r\n' - debug_msg(response) + debug_msg(response[:-4]) self.wfile.write(response.encode('utf-8')) def end_sse(self): - self.wfile.write('data: [DONE]\r\n\r\n'.encode('utf-8')) + response = 'data: [DONE]\r\n\r\n' + debug_msg(response[:-4]) + self.wfile.write(response.encode('utf-8')) def return_json(self, ret: dict, code: int = 200, no_debug=False): self.send_response(code) @@ -84,6 +86,7 @@ def openai_error(self, message, code=500, error_type='APIError', param='', inter } } if internal_message: + print(error_type, message) print(internal_message) # error_resp['internal_message'] = internal_message @@ -93,12 +96,10 @@ def openai_error_handler(func): def wrapper(self): try: func(self) - except ServiceUnavailableError as e: - self.openai_error(e.message, e.code, e.error_type, internal_message=e.internal_message) except InvalidRequestError as e: - self.openai_error(e.message, e.code, e.error_type, e.param, internal_message=e.internal_message) + self.openai_error(e.message, e.code, e.__class__.__name__, e.param, internal_message=e.internal_message) except OpenAIError as e: - self.openai_error(e.message, e.code, e.error_type, internal_message=e.internal_message) + self.openai_error(e.message, e.code, e.__class__.__name__, internal_message=e.internal_message) except Exception as e: self.openai_error(repr(e), 500, 'OpenAIError', internal_message=traceback.format_exc()) @@ -143,8 +144,7 @@ def do_POST(self): if '/completions' in self.path or '/generate' in self.path: if not shared.model: - self.openai_error("No model loaded.") - return + raise ServiceUnavailableError("No model loaded.") is_legacy = '/generate' in self.path is_streaming = body.get('stream', False) @@ -176,8 +176,7 @@ def do_POST(self): # deprecated if not shared.model: - self.openai_error("No model loaded.") - return + raise ServiceUnavailableError("No model loaded.") req_params = get_default_req_params() @@ -190,7 +189,10 @@ def do_POST(self): self.return_json(response) - elif '/images/generations' in self.path and 'SD_WEBUI_URL' in os.environ: + elif '/images/generations' in self.path: + if not 'SD_WEBUI_URL' in os.environ: + raise ServiceUnavailableError("Stable Diffusion not available. 
SD_WEBUI_URL not set.") + prompt = body['prompt'] size = default(body, 'size', '1024x1024') response_format = default(body, 'response_format', 'url') # or b64_json @@ -256,11 +258,11 @@ def run_server(): try: from flask_cloudflared import _run_cloudflared public_url = _run_cloudflared(params['port'], params['port'] + 1) - print(f'Starting OpenAI compatible api at\nOPENAI_API_BASE={public_url}/v1') + print(f'OpenAI compatible API ready at: OPENAI_API_BASE={public_url}/v1') except ImportError: print('You should install flask_cloudflared manually') else: - print(f'Starting OpenAI compatible api:\nOPENAI_API_BASE=http://{server_addr[0]}:{server_addr[1]}/v1') + print(f'OpenAI compatible API ready at: OPENAI_API_BASE=http://{server_addr[0]}:{server_addr[1]}/v1') server.serve_forever() diff --git a/extensions/openai/tokens.py b/extensions/openai/tokens.py index f243c3c9..f8d6737a 100644 --- a/extensions/openai/tokens.py +++ b/extensions/openai/tokens.py @@ -1,6 +1,6 @@ from extensions.openai.utils import float_list_to_base64 from modules.text_generation import encode, decode - +import numpy as np def token_count(prompt): tokens = encode(prompt)[0] @@ -12,14 +12,13 @@ def token_count(prompt): } -def token_encode(input, encoding_format=''): +def token_encode(input, encoding_format): # if isinstance(input, list): tokens = encode(input)[0] return { 'results': [{ - 'encoding_format': encoding_format, - 'tokens': float_list_to_base64(tokens) if encoding_format == "base64" else tokens, + 'tokens': tokens, 'length': len(tokens), }] } diff --git a/extensions/openai/utils.py b/extensions/openai/utils.py index 0c9441a3..abc1acbc 100644 --- a/extensions/openai/utils.py +++ b/extensions/openai/utils.py @@ -3,9 +3,9 @@ import numpy as np -def float_list_to_base64(float_list): +def float_list_to_base64(float_array: np.ndarray) -> str: # Convert the list to a float32 array that the OpenAPI client expects - float_array = np.array(float_list, dtype="float32") + #float_array = np.array(float_list, dtype="float32") # Get raw bytes bytes_array = float_array.tobytes() diff --git a/extensions/send_pictures/script.py b/extensions/send_pictures/script.py index 63421743..39c9362a 100644 --- a/extensions/send_pictures/script.py +++ b/extensions/send_pictures/script.py @@ -9,8 +9,6 @@ from modules.ui import gather_interface_values from modules.utils import gradio -# If 'state' is True, will hijack the next chat generation with -# custom input text given by 'value' in the format [text, visible_text] input_hijack = { 'state': False, 'value': ["", ""] @@ -20,6 +18,15 @@ model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=torch.float32).to("cpu") +def chat_input_modifier(text, visible_text, state): + global input_hijack + if input_hijack['state']: + input_hijack['state'] = False + return input_hijack['value'] + else: + return text, visible_text + + def caption_image(raw_image): inputs = processor(raw_image.convert('RGB'), return_tensors="pt").to("cpu", torch.float32) out = model.generate(**inputs, max_new_tokens=100) @@ -42,7 +49,10 @@ def ui(): # Prepare the input hijack, update the interface values, call the generation function, and clear the picture picture_select.upload( - lambda picture, name1, name2: input_hijack.update({"state": True, "value": generate_chat_picture(picture, name1, name2)}), [picture_select, shared.gradio['name1'], shared.gradio['name2']], None).then( + lambda picture, name1, name2: input_hijack.update({ + "state": True, + "value": 
generate_chat_picture(picture, name1, name2) + }), [picture_select, shared.gradio['name1'], shared.gradio['name2']], None).then( gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.generate_chat_reply_wrapper, shared.input_params, gradio('display', 'history'), show_progress=False).then( lambda: None, None, picture_select, show_progress=False) diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py index 04f34db3..6e1d0d4e 100644 --- a/extensions/silero_tts/script.py +++ b/extensions/silero_tts/script.py @@ -4,9 +4,11 @@ import gradio as gr import torch +from modules.logging_colors import logger from extensions.silero_tts import tts_preprocessor from modules import chat, shared from modules.utils import gradio +from modules.ComputeDevice import gpu_dev torch._C._jit_set_profiling_mode(False) @@ -17,7 +19,7 @@ 'language': 'en', 'model_id': 'v3_en', 'sample_rate': 48000, - 'device': 'cpu', + 'device': None, 'show_text': False, 'autoplay': True, 'voice_pitch': 'medium', @@ -54,7 +56,7 @@ def load_model(): print(f'\nSilero TTS cache not found at {torch_cache_path}. Attempting to download...') tts_model, example_text = torch.hub.load(repo_or_dir='snakers4/silero-models', model='silero_tts', language=params['language'], speaker=params['model_id']) - tts_model.to(params['device']) + tts_model.to(gpu_dev()) return tts_model diff --git a/extensions/superbooga/download_urls.py b/extensions/superbooga/download_urls.py index efe300d2..424a9885 100644 --- a/extensions/superbooga/download_urls.py +++ b/extensions/superbooga/download_urls.py @@ -4,7 +4,10 @@ def download_single(url): - response = requests.get(url, timeout=5) + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3' + } + response = requests.get(url, headers=headers, timeout=5) if response.status_code == 200: return response.content else: diff --git a/extensions/whisper_stt/script.py b/extensions/whisper_stt/script.py index 1e07ad2c..cdc55687 100644 --- a/extensions/whisper_stt/script.py +++ b/extensions/whisper_stt/script.py @@ -16,6 +16,15 @@ } +def chat_input_modifier(text, visible_text, state): + global input_hijack + if input_hijack['state']: + input_hijack['state'] = False + return input_hijack['value'] + else: + return text, visible_text + + def do_stt(audio, whipser_model, whipser_language): transcription = "" r = sr.Recognizer() @@ -56,6 +65,7 @@ def ui(): audio.change( auto_transcribe, [audio, auto_submit, whipser_model, whipser_language], [shared.gradio['textbox'], audio]).then( None, auto_submit, None, _js="(check) => {if (check) { document.getElementById('Generate').click() }}") + whipser_model.change(lambda x: params.update({"whipser_model": x}), whipser_model, None) whipser_language.change(lambda x: params.update({"whipser_language": x}), whipser_language, None) auto_submit.change(lambda x: params.update({"auto_submit": x}), auto_submit, None) diff --git a/models/config.yaml b/models/config.yaml index 86d7293f..f9d0a4c0 100644 --- a/models/config.yaml +++ b/models/config.yaml @@ -274,9 +274,9 @@ TheBloke_WizardLM-30B-GPTQ: instruction_template: 'Alpaca' .*llama-(2|v2): truncation_length: 4096 + rms_norm_eps: 5.0e-6 .*llama-(2|v2).*chat: mode: 'instruct' instruction_template: 'Llama-v2' .*llama.*70b.*ggml.*\.bin: n_gqa: 8 - rms_norm_eps: 1.0e-5 diff --git a/modules/AutoGPTQ_loader.py b/modules/AutoGPTQ_loader.py index 0d41ac0a..ac43337f 100644 --- 
a/modules/AutoGPTQ_loader.py
+++ b/modules/AutoGPTQ_loader.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 
 from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
+from modules.ComputeDevice import gpu_dev
 
 import modules.shared as shared
 from modules.logging_colors import logger
@@ -41,7 +42,7 @@ def load_quantized(model_name):
     # Define the params for AutoGPTQForCausalLM.from_quantized
     params = {
         'model_basename': pt_path.stem,
-        'device': "cuda:0" if not shared.args.cpu else "cpu",
+        'device': gpu_dev(),
         'use_triton': shared.args.triton,
         'inject_fused_attention': not shared.args.no_inject_fused_attention,
         'inject_fused_mlp': not shared.args.no_inject_fused_mlp,
diff --git a/modules/ComputeDevice.py b/modules/ComputeDevice.py
new file mode 100644
index 00000000..3ffa768d
--- /dev/null
+++ b/modules/ComputeDevice.py
@@ -0,0 +1,234 @@
+import math
+import os
+import psutil
+import re
+import torch
+import unittest
+
+
+# from modules import shared
+import modules.shared as shared
+from modules.logging_colors import logger
+
+
+def has_gpu():
+    '''
+    Reports whether a GPU torch.device is available, according to which devices are present
+    on the machine. The device selected here is the one used in context for any tensor
+    operations done without providing an explicit device index or number. The device index
+    only applies to CUDA devices and not MPS.
+
+    Returns True for cuda and mps
+            False for cpu
+    '''
+    return get_gpu()[0]
+
+
+def gpu_dev():
+    '''
+    Selects the default compute device for GPU acceleration.
+
+    Returns a torch.device object for cuda, mps, or cpu
+    '''
+    return get_gpu()[1]
+
+
+def get_gpu():
+    '''
+    Checks for GPU acceleration with either cuda or mps, falling back to cpu.
+
+    This should only really need to be called once, but it will get called every time
+    we check to see if there is an active GPU device. When this gets moved into a ComputeDevice
+    class, we can take care of this as class variables and methods.
+
+    Returns a tuple (has_gpu, gpu_dev)
+        has_gpu: True if cuda or mps is available
+        gpu_dev: the device found for compute
+    '''
+    # We don't *HAVE* to set a local rank index for each compute device, but it doesn't
+    # hurt anything if we do. This is mostly for CUDA and distributed setup.
+    local_rank = get_local_rank()
+    if torch.cuda.is_available():
+        logger.info("Using CUDA GPU Acceleration for Torch Device")
+        # torch.cuda.set_device(local_rank)
+        return True, torch.device("cuda", local_rank)
+    elif torch.backends.mps.is_available():
+        logger.info("Using MPS GPU Acceleration for Torch Device")
+        return True, torch.device("mps", local_rank)
+    else:
+        logger.warning("CPU only! No GPU acceleration available. Possible performance impact.")
+        return False, torch.device("cpu", local_rank)
+
+
+def clear_gpu_cache():
+    '''
+    This clears the cache for the default torch device.
+    Less than optimal, but it should do for now.
+    '''
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    elif torch.backends.mps.is_available():
+        torch.mps.empty_cache()
+
+
+def get_local_rank():
+    '''
+    Get the local rank as assigned in the config or as an environment variable.
+    '''
+    try:
+        local_rank = shared.args.local_rank
+    except TypeError:
+        local_rank = int(os.getenv("LOCAL_RANK", "0"))
+    return local_rank
+
+
+class ComputeDevice:
+    '''
+    Keep a list of all instances so we can use class methods for operating on all of them
+    at once, like resetting, re-initializing, or anything else we might want to do.
+    '''
+    devices = []
+
+    def __init__(self, device_type=None):
+        if device_type and ':' in device_type:
+            self.device_type, self.local_rank = device_type.split(':')
+            self.local_rank = int(self.local_rank)
+        else:
+            self.device_type = device_type if device_type else self.select_device()
+            self.local_rank = self.get_local_rank()
+
+        self.device = torch.device(self.device_type, self.local_rank)
+        ComputeDevice.devices.append(self)
+
+        # Initialize memory attributes
+        self.system_memory = None
+        self.gpu_memory = None
+        self.cpu_memory = None
+        # Calculate memory
+        self.calculate_memory()
+
+    @classmethod
+    def clear_all_cache(cls):
+        '''
+        This frees the cache space used by every ComputeDevice instance we have created.
+        '''
+        for device in cls.devices:
+            device.clear_cache()
+
+    def clear_cache(self):
+        '''
+        This clears the cache for the torch device passed to us.
+        '''
+        if self.device_type == 'cuda':
+            torch.cuda.empty_cache()
+        elif self.device_type == 'mps':
+            torch.mps.empty_cache()
+
+        # Remove the device from the list
+        ComputeDevice.devices.remove(self)
+
+    def get_local_rank(self):
+        '''
+        Get the local rank as assigned in the config or as an environment variable.
+        '''
+        try:
+            local_rank = shared.args.local_rank
+        except TypeError:
+            local_rank = int(os.getenv("LOCAL_RANK", "0"))
+        return local_rank
+
+    def select_device(self):
+        '''
+        This will contain the logic to select the appropriate device (CUDA, MPS, CPU)
+
+        Default is CPU
+
+        Local rank is just an index of the torch device.
+
+        The statement:   torch.device('cuda:0')
+        Is identical to: torch.device('cuda', 0)
+
+        '''
+        if torch.cuda.is_available():
+            return 'cuda'
+        elif torch.backends.mps.is_available():
+            return 'mps'
+        else:
+            return 'cpu'
+
+    @classmethod
+    def calculate_memory(cls):
+        '''
+        Perform all memory calculations to determine the total system memory, total GPU memory,
+        and CPU memory available for use by the application. Some of these are adjusted by
+        amounts for reservations specified in the config files.
+        '''
+        cls.system_memory = math.floor(psutil.virtual_memory().total / (1024 * 1024))
+
+        # Check for MPS, CUDA, or CPU and calculate total memory accordingly
+        if torch.backends.mps.is_available():
+            cls.gpu_memory = [cls.system_memory]
+        elif torch.cuda.is_available():
+            cls.gpu_memory = [math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024)) for i in range(torch.cuda.device_count())]
+        else:
+            cls.gpu_memory = [cls.system_memory]
+
+        # Calculate default reserved GPU memory
+        cls.default_gpu_mem = []
+        if shared.args.gpu_memory is not None and len(shared.args.gpu_memory) > 0:
+            for i in shared.args.gpu_memory:
+                if 'mib' in i.lower():
+                    cls.default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)))
+                else:
+                    cls.default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)) * 1000)
+        while len(cls.default_gpu_mem) < len(cls.gpu_memory):
+            cls.default_gpu_mem.append(0)
+
+        # Calculate default reserved CPU memory
+        if shared.args.cpu_memory is not None:
+            cls.cpu_reserved_memory = int(re.sub('[a-zA-Z ]', '', shared.args.cpu_memory))
+        else:
+            cls.cpu_reserved_memory = 0
+
+        # Calculate the total available memory for the application
+        cls.total_mem = [gm - dgm for gm, dgm in zip(cls.gpu_memory, cls.default_gpu_mem)]
+        cls.total_mem.append(cls.system_memory - cls.cpu_reserved_memory)
+
+
+
+
+# Unit testing for this class.
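+# The helpers above are what the rest of this patch calls into: model loaders use
+# gpu_dev() to pick the torch.device to move models onto, has_gpu() to decide when to
+# fall back to CPU mode, and clear_gpu_cache() in place of a bare torch.cuda.empty_cache().
+# Typical use of the class itself, as exercised by the tests below, looks like:
+#
+#     dev = ComputeDevice()            # auto-selects cuda, mps, or cpu
+#     tensor = torch.zeros(1, device=dev.device)
+#     ComputeDevice.clear_all_cache()  # free cached GPU memory on all tracked devices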
+class TestComputeDevice(unittest.TestCase): + def setUp(self): + self.device = ComputeDevice('cpu') + + def test_device_type(self): + self.assertEqual(self.device.device_type, 'cpu') + + def test_local_rank(self): + self.assertEqual(self.device.local_rank, 0) + + def test_device(self): + self.assertEqual(self.device.device.type, 'cpu') + + def test_memory_calculation(self): + self.assertIsNotNone(self.device.system_memory) + self.assertIsNotNone(self.device.gpu_memory) + self.assertIsNotNone(self.device.cpu_memory) + + def test_clear_cache(self): + # This is a bit tricky to test as it doesn't return anything + # But at least we can check it doesn't raise an error + try: + self.device.clear_cache() + except Exception as e: + self.fail(f"clear_cache raised an exception: {e}") + + def test_clear_all_cache(self): + # Similar to test_clear_cache + try: + ComputeDevice.clear_all_cache() + except Exception as e: + self.fail(f"clear_all_cache raised an exception: {e}") + +# If this is run directly from the command line, rather than imported, it willr +# run the unit tests +if __name__ == '__main__': + unittest.main() diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py index ddc5f9a5..84833302 100644 --- a/modules/GPTQ_loader.py +++ b/modules/GPTQ_loader.py @@ -10,6 +10,7 @@ import modules.shared as shared from modules.logging_colors import logger +from modules.ComputeDevice import gpu_dev sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa"))) @@ -171,10 +172,10 @@ def load_quantized(model_name): else: pre_layer = shared.args.pre_layer - model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, pre_layer) + GPTQ_model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, pre_layer) else: threshold = False if model_type == 'gptj' else 128 - model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, kernel_switch_threshold=threshold) + GPTQ_model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, kernel_switch_threshold=threshold) # accelerate offload (doesn't work properly) if shared.args.gpu_memory or torch.cuda.device_count() > 1: @@ -187,15 +188,14 @@ def load_quantized(model_name): max_memory['cpu'] = f'{max_cpu_memory}GiB' if not re.match('.*ib$', max_cpu_memory.lower()) else max_cpu_memory else: - max_memory = accelerate.utils.get_balanced_memory(model) + max_memory = accelerate.utils.get_balanced_memory(GPTQ_model) - device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"]) + device_map = accelerate.infer_auto_device_map(GPTQ_model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"]) logger.info("Using the following device map for the quantized model:", device_map) # https://huggingface.co/docs/accelerate/package_reference/big_modeling#accelerate.dispatch_model - model = accelerate.dispatch_model(model, device_map=device_map, offload_buffers=True) + GPTQ_model = accelerate.dispatch_model(GPTQ_model, device_map=device_map, offload_buffers=True) - # No offload - elif not shared.args.cpu: - model = model.to(torch.device('cuda:0')) + gpu = gpu_dev() + GPTQ_model = GPTQ_model.to(gpu) - return model + return GPTQ_model diff --git a/modules/RWKV.py b/modules/RWKV.py index 35d69986..7ed18519 100644 --- a/modules/RWKV.py +++ b/modules/RWKV.py @@ -7,6 +7,7 @@ import modules.shared as shared from modules.callbacks import Iteratorize +from modules.ComputeDevice 
import gpu_dev np.set_printoptions(precision=4, suppress=True, linewidth=200) diff --git a/modules/callbacks.py b/modules/callbacks.py index 1fa95e47..42e61a2e 100644 --- a/modules/callbacks.py +++ b/modules/callbacks.py @@ -7,6 +7,7 @@ import transformers import modules.shared as shared +from modules.ComputeDevice import clear_gpu_cache class _StopEverythingStoppingCriteria(transformers.StoppingCriteria): @@ -90,5 +91,4 @@ def __exit__(self, exc_type, exc_val, exc_tb): def clear_torch_cache(): gc.collect() - if not shared.args.cpu: - torch.cuda.empty_cache() + clear_gpu_cache() \ No newline at end of file diff --git a/modules/chat.py b/modules/chat.py index d2423555..f684768b 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -175,10 +175,8 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess # Preparing the input if not any((regenerate, _continue)): - text, visible_text = apply_extensions('input_hijack', text, visible_text) - if visible_text is None: - visible_text = text - + visible_text = text + text, visible_text = apply_extensions('chat_input', text, visible_text, state) text = apply_extensions('input', text, state) # *Is typing...* diff --git a/modules/deepspeed_parameters.py b/modules/deepspeed_parameters.py index 9116f579..f170a385 100644 --- a/modules/deepspeed_parameters.py +++ b/modules/deepspeed_parameters.py @@ -1,6 +1,6 @@ def generate_ds_config(ds_bf16, train_batch_size, nvme_offload_dir): ''' - DeepSpeed configration + DeepSpeed configuration https://huggingface.co/docs/transformers/main_classes/deepspeed ''' diff --git a/modules/extensions.py b/modules/extensions.py index faf6cf6d..76b6be8b 100644 --- a/modules/extensions.py +++ b/modules/extensions.py @@ -1,13 +1,12 @@ import traceback from functools import partial +from inspect import signature import gradio as gr import extensions import modules.shared as shared from modules.logging_colors import logger -from inspect import signature - state = {} available_extensions = [] @@ -66,15 +65,11 @@ def _apply_string_extensions(function_name, text, state): return text -# Input hijack of extensions -def _apply_input_hijack(text, visible_text): +# Extension functions that map string -> string +def _apply_chat_input_extensions(text, visible_text, state): for extension, _ in iterator(): - if hasattr(extension, 'input_hijack') and extension.input_hijack['state']: - extension.input_hijack['state'] = False - if callable(extension.input_hijack['value']): - text, visible_text = extension.input_hijack['value'](text, visible_text) - else: - text, visible_text = extension.input_hijack['value'] + if hasattr(extension, 'chat_input_modifier'): + text, visible_text = extension.chat_input_modifier(text, visible_text, state) return text, visible_text @@ -120,7 +115,11 @@ def _apply_tokenizer_extensions(function_name, state, prompt, input_ids, input_e def _apply_logits_processor_extensions(function_name, processor_list, input_ids): for extension, _ in iterator(): if hasattr(extension, function_name): - getattr(extension, function_name)(processor_list, input_ids) + result = getattr(extension, function_name)(processor_list, input_ids) + if type(result) is list: + processor_list = result + + return processor_list # Get prompt length in tokens after applying extension functions which override the default tokenizer output @@ -187,12 +186,12 @@ def create_extensions_tabs(): EXTENSION_MAP = { "input": partial(_apply_string_extensions, "input_modifier"), "output": partial(_apply_string_extensions, "output_modifier"), + 
"chat_input": _apply_chat_input_extensions, "state": _apply_state_modifier_extensions, "history": _apply_history_modifier_extensions, "bot_prefix": partial(_apply_string_extensions, "bot_prefix_modifier"), "tokenizer": partial(_apply_tokenizer_extensions, "tokenizer_modifier"), 'logits_processor': partial(_apply_logits_processor_extensions, 'logits_processor_modifier'), - "input_hijack": _apply_input_hijack, "custom_generate_chat_prompt": _apply_custom_generate_chat_prompt, "custom_generate_reply": _apply_custom_generate_reply, "tokenized_length": _apply_custom_tokenized_length, diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py index 94d893c4..472c45fb 100644 --- a/modules/llamacpp_hf.py +++ b/modules/llamacpp_hf.py @@ -10,6 +10,7 @@ from modules import shared from modules.logging_colors import logger +from modules.ComputeDevice import get_gpu class LlamacppHF(PreTrainedModel): @@ -30,7 +31,7 @@ def prepare_inputs_for_generation(self, input_ids, **kwargs): @property def device(self) -> torch.device: - return torch.device(0) + return get_gpu() def __call__(self, *args, **kwargs): # TODO: Some decoding methods (such as Contrastive Search) may not work at this time @@ -99,6 +100,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P 'n_gpu_layers': shared.args.n_gpu_layers, 'rope_freq_base': 10000 * shared.args.alpha_value ** (64/63.), 'rope_freq_scale': 1.0 / shared.args.compress_pos_emb, + 'n_gqa': shared.args.n_gqa or None, + 'rms_norm_eps': shared.args.rms_norm_eps or None, 'logits_all': True, } diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index 180b0f37..d8d41cdd 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -25,8 +25,8 @@ class LlamaCppModel: def __init__(self): self.initialized = False - def __del__(self): - self.model.__del__() +# def __del__(self): +# self.model.__del__() @classmethod def from_pretrained(self, path): @@ -53,6 +53,8 @@ def from_pretrained(self, path): 'n_gpu_layers': shared.args.n_gpu_layers, 'rope_freq_base': 10000 * shared.args.alpha_value ** (64/63.), 'rope_freq_scale': 1.0 / shared.args.compress_pos_emb, + 'n_gqa': shared.args.n_gqa or None, + 'rms_norm_eps': shared.args.rms_norm_eps or None, } result.model = Llama(**params) diff --git a/modules/loaders.py b/modules/loaders.py index b760128f..c55cf0ff 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -30,6 +30,8 @@ ], 'llama.cpp': [ 'n_ctx', + 'n_gqa', + 'rms_norm_eps', 'n_gpu_layers', 'n_batch', 'threads', @@ -42,6 +44,8 @@ ], 'llamacpp_HF': [ 'n_ctx', + 'n_gqa', + 'rms_norm_eps', 'n_gpu_layers', 'n_batch', 'threads', diff --git a/modules/models.py b/modules/models.py index 232d5fa6..449ab672 100644 --- a/modules/models.py +++ b/modules/models.py @@ -21,6 +21,7 @@ from modules import llama_attn_hijack, sampler_hijack from modules.logging_colors import logger from modules.models_settings import infer_loader +from modules.ComputeDevice import get_gpu, has_gpu transformers.logging.set_verbosity_error() @@ -35,9 +36,8 @@ from modules.deepspeed_parameters import generate_ds_config # Distributed setup - local_rank = shared.args.local_rank if shared.args.local_rank is not None else int(os.getenv("LOCAL_RANK", "0")) + gpu_dev = get_gpu() world_size = int(os.getenv("WORLD_SIZE", "1")) - torch.cuda.set_device(local_rank) deepspeed.init_distributed() ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir) dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the 
Transformers integration @@ -78,24 +78,24 @@ def load_model(model_name, loader=None): shared.args.loader = loader output = load_func_map[loader](model_name) if type(output) is tuple: - model, tokenizer = output + lcl_model, lcl_tokenizer = output else: - model = output - if model is None: + lcl_model = output + if lcl_model is None: return None, None else: - tokenizer = load_tokenizer(model_name, model) + lcl_tokenizer = load_tokenizer(model_name, lcl_tokenizer) # Hijack attention with xformers if any((shared.args.xformers, shared.args.sdp_attention)): llama_attn_hijack.hijack_llama_attention() logger.info(f"Loaded the model in {(time.time()-t0):.2f} seconds.\n") - return model, tokenizer + return lcl_model, lcl_tokenizer -def load_tokenizer(model_name, model): - tokenizer = None +def load_tokenizer(model_name, tokenizer): + #tokenizer = None path_to_model = Path(f"{shared.args.model_dir}/{model_name}/") if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists(): tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/")) @@ -147,11 +147,7 @@ def huggingface_loader(model_name): # Load the model in simple 16-bit mode by default if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None]): model = LoaderClass.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}"), low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16, trust_remote_code=shared.args.trust_remote_code) - if torch.backends.mps.is_available(): - device = torch.device('mps') - model = model.to(device) - else: - model = model.cuda() + model = model.to() # DeepSpeed ZeRO-3 elif shared.args.deepspeed: @@ -167,8 +163,7 @@ def huggingface_loader(model_name): "trust_remote_code": shared.args.trust_remote_code } - if not any((shared.args.cpu, torch.cuda.is_available(), torch.backends.mps.is_available())): - logger.warning("torch.cuda.is_available() returned False. This means that no GPU has been detected. 
Falling back to CPU mode.") + if not has_gpu(): shared.args.cpu = True if shared.args.cpu: @@ -250,7 +245,8 @@ def flexgen_loader(model_name): def RWKV_loader(model_name): from modules.RWKV import RWKVModel, RWKVTokenizer - model = RWKVModel.from_pretrained(Path(f'{shared.args.model_dir}/{model_name}'), dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16", device="cpu" if shared.args.cpu else "cuda") + gpu_dev = get_gpu() + model = RWKVModel.from_pretrained(Path(f'{shared.args.model_dir}/{model_name}'), dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16", device=gpu_dev) tokenizer = RWKVTokenizer.from_pretrained(Path(shared.args.model_dir)) return model, tokenizer diff --git a/modules/models_settings.py b/modules/models_settings.py index 3f37e48d..9319582e 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -30,8 +30,6 @@ def infer_loader(model_name): loader = 'llama.cpp' elif re.match('.*rwkv.*\.pth', model_name.lower()): loader = 'RWKV' - elif shared.args.flexgen: - loader = 'FlexGen' else: loader = 'Transformers' diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py index 0a86b4fd..08ab4826 100644 --- a/modules/sampler_hijack.py +++ b/modules/sampler_hijack.py @@ -9,6 +9,7 @@ LogitsProcessorList, TemperatureLogitsWarper ) +from modules.ComputeDevice import get_gpu class TailFreeLogitsWarper(LogitsWarper): @@ -106,7 +107,8 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to # Normalize the probabilities of the remaining words prob_topk = torch.softmax(sorted_logits, dim=0) - prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True).to('cuda') + gpu_dev = get_gpu() + prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True).to(gpu_dev) observed_surprise = -math.log2(prob_topk[prev_i]) self.e = observed_surprise - self.mirostat_tau diff --git a/modules/shared.py b/modules/shared.py index f0a426a0..8558bbc0 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -31,8 +31,11 @@ # For restarting the interface need_restart = False +# Graceful Shutdown +run_server = True + settings = { - 'dark_theme': False, + 'dark_theme': True, 'autoload_model': False, 'max_new_tokens': 200, 'max_new_tokens_min': 1, @@ -96,7 +99,7 @@ def str2bool(v): parser.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.') # Model loader -parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, flexgen') +parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv') # Accelerate/transformers parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.') @@ -128,6 +131,8 @@ def str2bool(v): parser.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.') parser.add_argument('--n_ctx', type=int, default=2048, help='Size of the prompt context.') parser.add_argument('--llama_cpp_seed', type=int, default=0, help='Seed for llama-cpp models. Default 0 (random)') +parser.add_argument('--n_gqa', type=int, default=0, help='grouped-query attention. 
Must be 8 for llama2 70b.') +parser.add_argument('--rms_norm_eps', type=float, default=0, help='Must be 1e-5 for llama2 70b.') # GPTQ parser.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.') @@ -152,14 +157,6 @@ def str2bool(v): # ExLlama parser.add_argument('--gpu-split', type=str, help="Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. 20,7,7") parser.add_argument('--max_seq_len', type=int, default=2048, help="Maximum sequence length.") -parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.") -parser.add_argument('--alpha_value', type=int, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Same as above. Use either this or compress_pos_emb, not both.") - -# FlexGen -parser.add_argument('--flexgen', action='store_true', help='DEPRECATED') -parser.add_argument('--percent', type=int, nargs="+", default=[0, 100, 100, 0, 100, 0], help='FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0).') -parser.add_argument("--compress-weight", action="store_true", help="FlexGen: activate weight compression.") -parser.add_argument("--pin-weight", type=str2bool, nargs="?", const=True, default=True, help="FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%%).") # DeepSpeed parser.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.') @@ -170,6 +167,10 @@ def str2bool(v): parser.add_argument('--rwkv-strategy', type=str, default=None, help='RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8".') parser.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile the CUDA kernel for better performance.') +# RoPE +parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.") +parser.add_argument('--alpha_value', type=int, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both.") + # Gradio parser.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.') parser.add_argument('--listen-host', type=str, help='The hostname that the server will use.') @@ -198,9 +199,6 @@ def str2bool(v): if args.gptq_for_llama: logger.warning('--gptq-for-llama has been deprecated and will be removed soon. Use --loader gptq-for-llama instead.') args.loader = 'gptq-for-llama' -if args.flexgen: - logger.warning('--flexgen has been deprecated and will be removed soon. 
Use --loader flexgen instead.') - args.loader = 'FlexGen' # Security warnings if args.trust_remote_code: diff --git a/modules/text_generation.py b/modules/text_generation.py index d3939d3f..c875c193 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -19,7 +19,8 @@ from modules.extensions import apply_extensions from modules.html_generator import generate_4chan_html, generate_basic_html from modules.logging_colors import logger -from modules.models import clear_torch_cache, local_rank +from modules.models import clear_torch_cache +from modules.ComputeDevice import get_gpu def generate_reply(*args, **kwargs): @@ -126,8 +127,6 @@ def set_manual_seed(seed): seed = random.randint(1, 2**31) torch.manual_seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(seed) return seed @@ -182,8 +181,6 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False): if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel']: generate_func = generate_reply_custom - elif shared.args.flexgen: - generate_func = generate_reply_flexgen else: generate_func = generate_reply_HF @@ -250,7 +247,6 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings # Encode the input input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state)) output = input_ids[0] - cuda = not any((shared.args.cpu, shared.args.deepspeed)) # Add the encoded tokens to generate_params question, input_ids, inputs_embeds = apply_extensions('tokenizer', state, question, input_ids, None) @@ -281,8 +277,7 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings if not state['stream']: with torch.no_grad(): output = shared.model.generate(**generate_params)[0] - if cuda: - output = output.cuda() + output = output.to() yield get_reply_from_output_ids(output, input_ids, original_question, state, is_chat=is_chat) @@ -339,66 +334,3 @@ def generate_reply_custom(question, original_question, seed, state, stopping_str new_tokens = len(encode(original_question + reply)[0]) - original_tokens print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') return - - -def generate_reply_flexgen(question, original_question, seed, state, stopping_strings=None, is_chat=False): - generate_params = {} - for k in ['max_new_tokens', 'do_sample', 'temperature']: - generate_params[k] = state[k] - - if state['stream']: - generate_params['max_new_tokens'] = 8 - - # Encode the input - input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state)) - output = input_ids[0] - - # Find the eos tokens - eos_token_ids = [shared.tokenizer.eos_token_id] if shared.tokenizer.eos_token_id is not None else [] - if not state['ban_eos_token']: - generate_params['stop'] = eos_token_ids[-1] - - # Add the encoded tokens to generate_params - question, input_ids, inputs_embeds = apply_extensions('tokenizer', state, question, input_ids, None) - original_input_ids = input_ids - generate_params.update({'inputs': input_ids}) - if inputs_embeds is not None: - generate_params.update({'inputs_embeds': inputs_embeds}) - - t0 = time.time() - try: - if not is_chat: - yield '' - - # Generate the entire reply at once. 
- if not state['stream']: - with torch.no_grad(): - output = shared.model.generate(**generate_params)[0] - - yield get_reply_from_output_ids(output, input_ids, original_question, state, is_chat=is_chat) - - # Stream the output naively for FlexGen since it doesn't support 'stopping_criteria' - else: - for i in range(state['max_new_tokens'] // 8 + 1): - if shared.stop_everything: - break - - clear_torch_cache() - with torch.no_grad(): - output = shared.model.generate(**generate_params)[0] - - if np.count_nonzero(np.isin(input_ids[0], eos_token_ids)) < np.count_nonzero(np.isin(output, eos_token_ids)): - break - - yield get_reply_from_output_ids(output, original_input_ids, original_question, state) - input_ids = np.reshape(output, (1, output.shape[0])) - generate_params.update({'inputs': input_ids}) - - except Exception: - traceback.print_exc() - finally: - t1 = time.time() - original_tokens = len(original_input_ids[0]) - new_tokens = len(output) - (original_tokens if not shared.is_seq2seq else 0) - print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') - return diff --git a/modules/training.py b/modules/training.py index 1f8e5e5e..c98fded2 100644 --- a/modules/training.py +++ b/modules/training.py @@ -445,9 +445,9 @@ def tokenize(prompt, append_eos_token=False): def generate_prompt(data_point: dict[str, str]): for options, data in format_data.items(): - if set(options.split(',')) == set(x[0] for x in data_point.items() if (x[1] is not None and len(x[1].strip()) > 0)): + if set(options.split(',')) == set(x[0] for x in data_point.items() if (type(x[1]) is str and len(x[1].strip()) > 0)): for key, val in data_point.items(): - if val is not None: + if type(val) is str: data = data.replace(f'%{key}%', val) return data raise RuntimeError(f'Data-point "{data_point}" has no keyset match within format "{list(format_data.keys())}"') diff --git a/modules/ui.py b/modules/ui.py index 704be925..d9b3a131 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -61,6 +61,8 @@ def list_model_elements(): 'mlock', 'n_gpu_layers', 'n_ctx', + 'n_gqa', + 'rms_norm_eps', 'llama_cpp_seed', 'gpu_split', 'max_seq_len', @@ -161,7 +163,10 @@ def apply_interface_values(state, use_persistent=False): class ToolButton(gr.Button, gr.components.IOComponent): - """Small button with single emoji as text, fits inside gradio forms""" + """ + Small button with single emoji as text, fits inside gradio forms + Copied from https://github.com/AUTOMATIC1111/stable-diffusion-webui + """ def __init__(self, **kwargs): super().__init__(**kwargs) @@ -171,6 +176,9 @@ def get_block_name(self): def create_refresh_button(refresh_component, refresh_method, refreshed_args, elem_class): + """ + Copied from https://github.com/AUTOMATIC1111/stable-diffusion-webui + """ def refresh(): refresh_method() args = refreshed_args() if callable(refreshed_args) else refreshed_args diff --git a/modules/utils.py b/modules/utils.py index e257de2d..9ae5dc86 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -71,10 +71,7 @@ def natural_keys(text): def get_available_models(): - if shared.args.flexgen: - return sorted([re.sub('-np$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if item.name.endswith('-np')], key=natural_keys) - else: - return sorted([re.sub('.pth$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json', '.yaml'))], key=natural_keys) + return 
sorted([re.sub('.pth$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json', '.yaml'))], key=natural_keys)
 
 
 def get_available_presets():
diff --git a/oobainst b/oobainst
new file mode 100644
index 00000000..72837766
--- /dev/null
+++ b/oobainst
@@ -0,0 +1,101 @@
+#!/bin/bash
+
+# Load default environment variables if the defaults file exists. Should be
+# part of the installation package, but later copied to .env.local if the
+# user would like to customize locations, names, etc. for their environment.
+# Load user overrides for environment variables. Set reasonable defaults
+# if none are set. Source in the oobabooga .env.local file
+[ -e "$PWD/.env.defaults" ] && source $PWD/.env.defaults
+[ -e "$PWD/.env.local" ] && source $PWD/.env.local
+[ -e "$PWD/text-generation-webui/.env.local" ] && \
+    source $PWD/text-generation-webui/.env.local
+
+
+_confirm_yn(){
+    # This is a function to confirm a Yes or No answer from the user
+    # and determine if they wish to continue or not. On a negative response,
+    # this will exit with a return code of 1.
+    # TODO Might be nice to pass different return codes as a second parameter.
+    echo -e "$1"
+
+    echo -n "Do you wish to continue? [Y|N]: "
+    read -n 1 -r response
+    echo ""
+
+    if [[ ! $response =~ ^[Yy]$ ]]; then
+        echo -e "Exiting.\n"
+        exit 1
+    fi
+}
+
+# M Series or Intel - uname and arch give incorrect responses if run
+# inside Rosetta or other Intel environment like VSCode, sysctl better.
+KERNEL_ARCH=$(sysctl -n kern.version)
+case "${KERNEL_ARCH}" in
+    *X86_64*) OS_ARCH="x86_64";;
+    *ARM64*) OS_ARCH="arm64";;
+    *) echo "Unknown system architecture: ${KERNEL_ARCH}! This script runs only on x86_64 or arm64" && exit
+esac
+
+# Check to see if base CPU architecture is consistent with current
+# environment. Give the user a chance to proceed or not.
+if [ "$OS_ARCH" != "$(uname -m)" ]; then
+    _confirm_yn '\nWARNING: The kernel string, \"arch\" and \"uname -m\" do not agree.\n\nIt seems you are running this in Rosetta, VSCode Terminal, or Intel virtual\nmachine with incorrect or conflicting architecture. This \nmay result in mixed or unintended binary architectures when compiling\nfrom source code.\n'
+fi
+
+
+# Begin the installation.
+if [ -e ${HOME}/.conda ]; then
+    _confirm_yn
+
+fi
+
+curl https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh -o miniconda.sh
+
+# (if necessary) install git and conda into a contained environment
+# download miniconda
+if [ ! -e "${CONDA_PYTHON_EXE}" ]; then
+    MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-MacOSX-${OS_ARCH}.sh"
+    echo "Downloading Miniconda from $MINICONDA_URL to $CONDA_PREFIX/miniconda_installer.sh"
+
+    mkdir -p "$CONDA_PREFIX"
+    curl -Lk "$MINICONDA_URL" > "$CONDA_PREFIX/miniconda_installer.sh"
+
+    chmod u+x "$CONDA_PREFIX/miniconda_installer.sh"
+    bash "$CONDA_PREFIX/miniconda_installer.sh" -b -p $CONDA_ROOT
+
+    # test the conda binary
+    echo "Miniconda version:"
+    "$CONDA_ROOT/bin/conda" --version
+fi
+
+# create the installer env
+if [ ! -e "$CONDA_PREFIX" ]; then
+    "$CONDA_ROOT/bin/conda" create -y -k --prefix "$CONDA_PREFIX" python=3.10
+fi
+
+# check if conda environment was actually created
+if [ ! -e "$CONDA_PREFIX/bin/python" ]; then
+    echo "Conda environment is empty."
+    exit
+fi
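+
+# The next block is about isolation: unsetting PYTHONPATH/PYTHONHOME and setting
+# PYTHONNOUSERSITE keep user-level site-packages out of the interpreter path, and
+# CUDA_PATH/CUDA_HOME are simply pointed at the conda prefix (effectively a no-op on
+# macOS, which has no CUDA toolkit) before the env is activated.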
+
+# environment isolation
+unset PYTHONPATH
+unset PYTHONHOME
+
+export PYTHONNOUSERSITE=1
+export CUDA_PATH="$CONDA_PREFIX"
+export CUDA_HOME="$CUDA_PATH"
+
+# activate installer env
+source "$CONDA_ROOT/etc/profile.d/conda.sh" # otherwise conda complains about 'shell not initialized' (needed when running in a script)
+conda activate "$CONDA_DEFAULT_ENV"
+
+conda info --envs
+env | sort | grep -i pyth
+
+exit
+
+# setup installer env
+python ./webui.py
diff --git a/oobastart b/oobastart
new file mode 100755
index 00000000..7c780540
--- /dev/null
+++ b/oobastart
@@ -0,0 +1,94 @@
+#!/bin/bash
+
+# Load user overrides for environment variables. Set reasonable defaults
+# if none are set. Source in the oobabooga .env.local file
+
+# These are set to sane defaults if they are not set in the runtime
+# environment based on the output of "conda info -s" and assuming the
+# default installer was used. If someone installed manually and wishes to
+# use the "textgen" venv, they may set the environment variables in the
+# .env.local file in the text-generation-webui installation directory.
+export CONDA_DEFAULT_ENV="${CONDA_DEFAULT_ENV:=textgen}"
+export CONDA_EXE="${CONDA_EXE:=/Users/mps/projects/AI-PROJECTS/oobabooga_macos/installer_files/conda/bin/conda}"
+export CONDA_PREFIX="${CONDA_PREFIX:=/Users/mps/projects/AI-PROJECTS/oobabooga_macos/installer_files/conda/envs/textgen}"
+export CONDA_PROMPT_MODIFIER="${CONDA_PROMPT_MODIFIER:=(textgen) }"
+export CONDA_PYTHON_EXE="${CONDA_PYTHON_EXE:=/Users/mps/projects/AI-PROJECTS/oobabooga_macos/installer_files/conda/bin/python}"
+export CONDA_ROOT="${CONDA_ROOT:=/Users/mps/projects/AI-PROJECTS/oobabooga_macos/installer_files/conda}"
+
+INSTALL_DIR="$(pwd)/installer_files"
+# CONDA_ROOT_PREFIX="$(pwd)/installer_files/conda"
+# INSTALL_ENV_DIR="$(pwd)/installer_files/env"
+# conda_exists="F"
+MINICONDA_DOWNLOAD_URL="https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-MacOSX-${OS_ARCH}.sh"
+
+
+# M Series or Intel - uname and arch give incorrect responses if run
+# inside Rosetta or other Intel environment like VSCode, sysctl better.
+KERNEL_ARCH=$(sysctl -n kern.version)
+case "${KERNEL_ARCH}" in
+    *X86_64*) OS_ARCH="x86_64";;
+    *ARM64*) OS_ARCH="arm64";;
+    *) echo "Unknown system architecture: $KERNEL_ARCH! This script runs only on x86_64 or arm64" && exit
+esac
+
+# Check to see if base CPU architecture is consistent with current
+# environment. Give the user a chance to proceed or not.
+if [ "$OS_ARCH" != "$(uname -m)" ]; then
+    echo -e "\nWARNING: The kernel string, \"arch\" and \"uname -m\" do not agree.\n\nIt seems you are running this in Rosetta, VSCode Terminal, or Intel virtual\nmachine. This may result in mixed or unintended binary architectures when\ncompiling from source code.\n"
+
+    echo -n "Do you wish to continue? [Y|N]: "
+    read -n 1 -r response
+    echo ""
+
+    if [[ ! $response =~ ^[Yy]$ ]]; then
+        echo -e "Exiting.\n"
+        exit 1
+    fi
+fi
+
+# figure out whether git and conda need to be installed
+# if [ ]; then conda_exists="T"; fi
+
+# (if necessary) install git and conda into a contained environment
+# download miniconda
+if [ ! 
-e "${CONDA_PYTHON_EXE}" ]; then + echo "Downloading Miniconda from $MINICONDA_DOWNLOAD_URL to $INSTALL_DIR/miniconda_installer.sh" + + mkdir -p "$INSTALL_DIR" + curl -Lk "$MINICONDA_DOWNLOAD_URL" > "$INSTALL_DIR/miniconda_installer.sh" + + chmod u+x "$INSTALL_DIR/miniconda_installer.sh" + bash "$INSTALL_DIR/miniconda_installer.sh" -b -p $CONDA_ROOT + + # test the conda binary + echo "Miniconda version:" + "$CONDA_ROOT/bin/conda" --version +fi + +# create the installer env +if [ ! -e "$CONDA_PREFIX" ]; then + "$CONDA_ROOT/bin/conda" create -y -k --prefix "$CONDA_PREFIX" python=3.10 +fi + +# check if conda environment was actually created +if [ ! -e "$CONDA_PREFIX/bin/python" ]; then + echo "Conda environment is empty." + exit +fi + +# environment isolation +unset PYTHONPATH +unset PYTHONHOME + +export PYTHONNOUSERSITE=1 +export CUDA_PATH="$CONDA_PREFIX" +export CUDA_HOME="$CUDA_PATH" + +# activate installer env +source "$CONDA_ROOT/etc/profile.d/conda.sh" # otherwise conda complains about 'shell not initialized' (needed when running in a script) +conda activate "$CONDA_DEFAULT_ENV" + +conda info --envs +env | sort | grep -i pyth + +# setup installer env +python webui.py diff --git a/server.py b/server.py index 7231dfef..154f9bc8 100644 --- a/server.py +++ b/server.py @@ -1,19 +1,24 @@ -import os -import warnings - -from modules.logging_colors import logger -from modules.block_requests import OpenMonkeyPatch, RequestBlocker - -os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False' -os.environ['BITSANDBYTES_NOWELCOME'] = '1' -warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated') - -with RequestBlocker(): - import gradio as gr - import matplotlib -matplotlib.use('Agg') # This fixes LaTeX rendering on some systems - +from functools import partial +from pathlib import Path +from threading import Lock +from PIL import Image +from modules import chat, loaders, presets, shared, training, ui, utils +from modules.block_requests import OpenMonkeyPatch, RequestBlocker +from modules.extensions import apply_extensions +from modules.github import clone_or_pull_repository +from modules.html_generator import chat_html_wrapper +from modules.logging_colors import logger +from modules.LoRA import add_lora_to_model +from modules.models import load_model, unload_model +from modules.models_settings import (apply_model_settings_to_state, + get_model_settings_from_yamls, + save_model_settings, + update_model_parameters) +from modules.text_generation import (generate_reply_wrapper, + get_encoded_length, stop_everything_event) +from modules.utils import gradio +import cProfile import importlib import json import math @@ -22,34 +27,21 @@ import sys import time import traceback -from functools import partial -from pathlib import Path -from threading import Lock - +import warnings import psutil import torch import yaml -from PIL import Image - import modules.extensions as extensions_module -from modules import chat, loaders, presets, shared, training, ui, utils -from modules.extensions import apply_extensions -from modules.github import clone_or_pull_repository -from modules.html_generator import chat_html_wrapper -from modules.LoRA import add_lora_to_model -from modules.models import load_model, unload_model -from modules.models_settings import ( - apply_model_settings_to_state, - get_model_settings_from_yamls, - save_model_settings, - update_model_parameters -) -from modules.text_generation import ( - generate_reply_wrapper, - get_encoded_length, - stop_everything_event -) -from 
modules.utils import gradio + +os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False' +os.environ['BITSANDBYTES_NOWELCOME'] = '1' +warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated') + +with RequestBlocker(): + import gradio as gr + +matplotlib.use('Agg') # This fixes LaTeX rendering on some systems + def load_model_wrapper(selected_model, loader, autoload=False): @@ -215,10 +207,14 @@ def create_model_menus(): shared.gradio['transformers_info'] = gr.Markdown('load-in-4bit params:') shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype) shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type) - shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads) - shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch) + shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=1024, value=shared.args.n_gpu_layers) shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=16384, step=256, label="n_ctx", value=shared.args.n_ctx) + shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads) + shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch) + shared.gradio['n_gqa'] = gr.Slider(minimum=0, maximum=16, step=1, label="n_gqa", value=shared.args.n_gqa, info='grouped-query attention. Must be 8 for llama2 70b.') + shared.gradio['rms_norm_eps'] = gr.Slider(minimum=0, maximum=1e-5, step=1e-6, label="rms_norm_eps", value=shared.args.n_gqa, info='5e-6 (0.012394 for those without a calculator handy) is a good value for llama2 70b.') + shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=str(shared.args.wbits) if shared.args.wbits > 0 else "None") shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=str(shared.args.groupsize) if shared.args.groupsize > 0 else "None") shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None", "llama", "opt", "gptj"], value=shared.args.model_type or "None") @@ -246,6 +242,7 @@ def create_model_menus(): shared.gradio['low_vram'] = gr.Checkbox(label="low-vram", value=shared.args.low_vram) shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed) +# shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17') shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.') shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa is currently 2x faster than AutoGPTQ on some systems. It is installed by default with the one-click installers. 
Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).') shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).') @@ -318,7 +315,7 @@ def create_settings_menus(default_preset): with gr.Row(): with gr.Column(): with gr.Row(): - shared.gradio['preset_menu'] = gr.Dropdown(choices=utils.get_available_presets(), value=default_preset if not shared.args.flexgen else 'Naive', label='Generation parameters preset', elem_classes='slim-dropdown') + shared.gradio['preset_menu'] = gr.Dropdown(choices=utils.get_available_presets(), value=default_preset, label='Generation parameters preset', elem_classes='slim-dropdown') ui.create_refresh_button(shared.gradio['preset_menu'], lambda: None, lambda: {'choices': utils.get_available_presets()}, 'refresh-button') shared.gradio['save_preset'] = gr.Button('💾', elem_classes='refresh-button') shared.gradio['delete_preset'] = gr.Button('🗑️', elem_classes='refresh-button') @@ -373,7 +370,6 @@ def create_settings_menus(default_preset): 1) Midnight Enigma 2) Yara 3) Shortwave - 4) Kobold-Godlike ### Temperature Primary factor to control randomness of outputs. 0 = deterministic (only the most likely token is used). Higher value = more randomness. @@ -566,6 +562,9 @@ def set_interface_arguments(interface_mode, extensions, bool_active): shared.need_restart = True +def shutdown_server(interface_mode, extensions, bool_active): + stop_everything_event() + shared.run_server = False def create_interface(): @@ -591,6 +590,21 @@ def create_interface(): if shared.args.extensions is not None and len(shared.args.extensions) > 0: extensions_module.load_extensions() + # Forcing some events to be triggered on page load + shared.persistent_interface_state.update({ + 'loader': shared.args.loader or 'Transformers', + }) + + if shared.is_chat(): + shared.persistent_interface_state.update({ + 'mode': shared.settings['mode'], + 'character_menu': shared.args.character or shared.settings['character'], + 'instruction_template': shared.settings['instruction_template'] + }) + + if Path("cache/pfp_character.png").exists(): + Path("cache/pfp_character.png").unlink() + # css/js strings css = ui.css if not shared.is_chat() else ui.css + ui.chat_css js = ui.main_js if not shared.is_chat() else ui.main_js + ui.chat_js @@ -838,11 +852,18 @@ def create_interface(): extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.') extension_status = gr.Markdown() + shared.gradio['stop_server'] = gr.Button("Shutrown the Server", elem_classes="small-button", variant="primary") + extension_name.submit( clone_or_pull_repository, extension_name, extension_status, show_progress=False).then( lambda: gr.update(choices=utils.get_available_extensions(), value=shared.args.extensions), None, gradio('extensions_menu')) + # Stop Server + shared.gradio['stop_server'].click( + shutdown_server, gradio('interface_modes_menu', 'extensions_menu', 'bool_menu'), None).then( + lambda: None, None, None, _js='() => {document.body.innerHTML=\'

Shutting Down Server

\'; setTimeout(function(){location.reload()},2500); return []}') + # Reset interface event shared.gradio['reset_interface'].click( set_interface_arguments, gradio('interface_modes_menu', 'extensions_menu', 'bool_menu'), None).then( @@ -1052,11 +1073,11 @@ def create_interface(): create_file_saving_event_handlers() - shared.gradio['interface'].load(lambda: None, None, None, _js=f"() => {{{js}}}") - shared.gradio['interface'].load(partial(ui.apply_interface_values, {}, use_persistent=True), None, gradio(ui.list_interface_input_elements()), show_progress=False) if shared.settings['dark_theme']: shared.gradio['interface'].load(lambda: None, None, None, _js="() => document.getElementsByTagName('body')[0].classList.add('dark')") + shared.gradio['interface'].load(lambda: None, None, None, _js=f"() => {{{js}}}") + shared.gradio['interface'].load(partial(ui.apply_interface_values, {}, use_persistent=True), None, gradio(ui.list_interface_input_elements()), show_progress=False) if shared.is_chat(): shared.gradio['interface'].load(chat.redraw_html, shared.reload_inputs, gradio('display')) @@ -1075,7 +1096,7 @@ def create_interface(): shared.gradio['interface'].launch(prevent_thread_lock=True, share=shared.args.share, server_port=shared.args.listen_port, inbrowser=shared.args.auto_launch, auth=auth) -if __name__ == "__main__": +def main(): # Loading custom settings settings_file = None if shared.args.settings is not None and Path(shared.args.settings).exists(): @@ -1102,6 +1123,8 @@ def create_interface(): 'skip_special_tokens': shared.settings['skip_special_tokens'], 'custom_stopping_strings': shared.settings['custom_stopping_strings'], 'truncation_length': shared.settings['truncation_length'], + 'n_gqa': 0, + 'rms_norm_eps': 0, } shared.model_config.move_to_end('.*', last=False) # Move to the beginning @@ -1152,26 +1175,11 @@ def create_interface(): if shared.args.lora: add_lora_to_model(shared.args.lora) - # Forcing some events to be triggered on page load - shared.persistent_interface_state.update({ - 'loader': shared.args.loader or 'Transformers', - }) - - if shared.is_chat(): - shared.persistent_interface_state.update({ - 'mode': shared.settings['mode'], - 'character_menu': shared.args.character or shared.settings['character'], - 'instruction_template': shared.settings['instruction_template'] - }) - - if Path("cache/pfp_character.png").exists(): - Path("cache/pfp_character.png").unlink() - shared.generation_lock = Lock() # Launch the web UI create_interface() - while True: + while shared.run_server: time.sleep(0.5) if shared.need_restart: shared.need_restart = False @@ -1179,3 +1187,11 @@ def create_interface(): shared.gradio['interface'].close() time.sleep(0.5) create_interface() + +if __name__ == "__main__": + if os.getenv('DEBUG_PROF') == "1": + logger.info(f"Profiling activated sending information to output.prof") + cProfile.run('main()', "output.prof") + else: + main() + diff --git a/settings-template.yaml b/settings-template.yaml index de2c73d3..62351e54 100644 --- a/settings-template.yaml +++ b/settings-template.yaml @@ -1,4 +1,4 @@ -dark_theme: false +dark_theme: True autoload_model: false max_new_tokens: 200 max_new_tokens_min: 1 From dfe7752c2529f3cd25f115204a5fed62de9e9cdd Mon Sep 17 00:00:00 2001 From: M S Date: Mon, 28 Aug 2023 11:28:34 -0500 Subject: [PATCH 04/13] Current point in time. 
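This commit renames gpu_available() to has_gpu() in modules/ComputeDevice.py and corrects the cProfile output filename. The full module is not reproduced in this patch; the snippet below is only a rough sketch of the kind of device selection the class is meant to centralize (MPS on Apple Silicon, then CUDA, then CPU as the fallback). It is written against the public torch API, the helper name pick_torch_device() is hypothetical, and it does not reproduce the module's actual code.

```python
import torch

def pick_torch_device() -> torch.device:
    """Illustrative only: prefer Apple's MPS backend, then CUDA, then fall back to CPU."""
    mps = getattr(torch.backends, "mps", None)
    if mps is not None and torch.backends.mps.is_available():
        return torch.device("mps")
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")

# Example: allocate a tensor on whichever device was detected.
device = pick_torch_device()
x = torch.ones(4, device=device)
```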
--- modules/ComputeDevice.py | 2 +- server.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/modules/ComputeDevice.py b/modules/ComputeDevice.py index 29b9468a..3ffa768d 100644 --- a/modules/ComputeDevice.py +++ b/modules/ComputeDevice.py @@ -11,7 +11,7 @@ from modules.logging_colors import logger -def gpu_available(): +def has_gpu(): ''' Sets and returns the default torch.device object according to which devices are available on the machine. This will be the device used in context for any tensor operations done diff --git a/server.py b/server.py index 5ae1e039..aac40913 100644 --- a/server.py +++ b/server.py @@ -11,6 +11,12 @@ from modules.logging_colors import logger from modules.LoRA import add_lora_to_model from modules.models import load_model, unload_model +from modules.models_settings import (apply_model_settings_to_state, + get_model_settings_from_yamls, + save_model_settings, + update_model_parameters) +from modules.text_generation import (generate_reply_wrapper, + get_encoded_length, stop_everything_event) from modules.models_settings import (apply_model_settings_to_state, get_model_settings_from_yamls, save_model_settings, @@ -1191,7 +1197,6 @@ def main(): if __name__ == "__main__": if os.getenv('DEBUG_PROF') == "1": logger.info(f"Profiling activated sending information to output.prof") - cProfile.run('main()', "outout.prof") + cProfile.run('main()', "output.prof") else: main() - From 757ba48068fdeb74bb039877b54e6f2e9d38258c Mon Sep 17 00:00:00 2001 From: M S Date: Mon, 28 Aug 2023 12:54:01 -0500 Subject: [PATCH 05/13] missed a change --- server.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/server.py b/server.py index 710e70b4..293c0132 100644 --- a/server.py +++ b/server.py @@ -11,12 +11,6 @@ from modules.logging_colors import logger from modules.LoRA import add_lora_to_model from modules.models import load_model, unload_model -from modules.models_settings import (apply_model_settings_to_state, - get_model_settings_from_yamls, - save_model_settings, - update_model_parameters) -from modules.text_generation import (generate_reply_wrapper, - get_encoded_length, stop_everything_event) from modules.models_settings import (apply_model_settings_to_state, get_model_settings_from_yamls, save_model_settings, From 9850dd4a04a94ac2d85bfe507b106ee9b8ba4353 Mon Sep 17 00:00:00 2001 From: M S Date: Mon, 28 Aug 2023 13:14:46 -0500 Subject: [PATCH 06/13] Updated files and README --- .gitignore | 5 +++++ README.md | 28 ++++++++++++++++++++++++---- server.py | 2 +- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 716d894e..1093ff5a 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,8 @@ Thumbs.db *.swp .*.un~ *.prof + +# Coming soon +.env.local +oobainst +.env.local diff --git a/README.md b/README.md index a703b01d..ab1df857 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,26 @@ -# OLD VERSION - 1.3.1 Patched for macOS and Apple Silicon +# MERGED 1.5 Version. THIS IS A DEVELOPMENT VERSION -Patched and working with macOS and Apple Silicon M1/M2 GPU now. +This is a development version and I have not added many changes I had planned. Please feel free to use at your own risk as there may be bugs not yet found. + +Items Added to this version. + * "Stop Server" under the sessions tab. Use with caution in multi-user setups; this will probably be disabled in multi-user mode, but it offers a cleaner shutdown than just killing the server process. 
+ * Added a Python class for handling diverse GPU/compute devices such as CUDA, CPU, or MPS. The code was changed to use a single "torch device" once it is set initially, and it will fall back to CPU. + +Items working and tested on macOS + * More support for Apple Silicon M1/M2 processors. + * Working with the new llama-cpp-python 0.1.81 + * Works with LLaMa2 models + * These GGML models will need conversion to GGUF format if using llama-cpp-python 0.1.81. + * Earlier versions of llama-cpp-python still work + * Testing of library dependencies has not concluded; the build instructions for oobabooga-macOS will be updated once it has. + * Still mainly supporting GGML, and now GGUF (GG-Universal Format) files. You will have to convert your GGML files to GGUF format. + +Removed from this version + * Continued the work already started on removing FlexGen from the repo. + * Removed Docker - if someone wants to help maintain it for macOS, let me know. + * Slowly removing information on CUDA, as it is not relevant to macOS. + + **Updated Installation Instructions** for libraries in the [oobabooga-macOS Quickstart](https://github.com/unixwzrd/oobabooga-macOS/blob/main/macOS_Apple_Silicon_QuickStart.md) and the longer [Building Apple Silicon Support](https://github.com/unixwzrd/oobabooga-macOS/blob/main/macOS_Apple_Silicon_QuickStart.md) GGML support is in this release, and has not been extensively tested. From the look of upstream commits, there are some changes which must be made before this will work with Llama2 models. @@ -13,7 +33,7 @@ Otherwise, use these instructions I have on putting together the macOS Python en I will be updating this README file with new information specifically regarding macOS and Apple Silicon. -I would like to work closely with the oobaboogs team and try to implement simkilar solutions so the web UI can have a similar look and feel. +I would like to work closely with the oobabooga team and try to implement similar solutions so the web UI can have a similar look and feel. Maintaining and improving support for macOS and Apple Silicon in this project has required significant research, debugging, and development effort. If you find my contributions helpful and want to show your appreciation, you can Buy Me a Coffee, sponsor this project, or consider me for job opportunities. @@ -21,7 +41,7 @@ While the focus of this branch is to enhance macOS and Apple Silicon support, I Anyone who would like to assist with supporting Apple Silicon, let me know. There is much to do and I can only do so much by myself. -- [OLD VERSION - 1.3.1 Patched for macOS and Apple Silicon](#old-version---131-patched-for-macos-and-apple-silicon) +- [MERGED 1.5 Version. THIS IS A DEVELOPMENT VERSION](#merged-15-version--this-is-a-development-version) - [Features](#features) - [Installation](#installation) - [Downloading models](#downloading-models) diff --git a/server.py b/server.py index 293c0132..c2a0991c 100644 --- a/server.py +++ b/server.py @@ -213,7 +213,7 @@ def create_model_menus(): shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads) shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch) shared.gradio['n_gqa'] = gr.Slider(minimum=0, maximum=16, step=1, label="n_gqa", value=shared.args.n_gqa, info='grouped-query attention. 
Must be 8 for llama2 70b.') - shared.gradio['rms_norm_eps'] = gr.Slider(minimum=0, maximum=1e-5, step=1e-6, label="rms_norm_eps", value=shared.args.n_gqa, info='5e-6 (0.012394 for those without a calculator handy) is a good value for llama2 70b.') + shared.gradio['rms_norm_eps'] = gr.Slider(minimum=0, maximum=1e-5, step=1e-6, label="rms_norm_eps", value=shared.args.rms_norm_eps, info='5e-6 (0.000005 for those not familiar with teh notation) is a good value for llama2 70b.') shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=str(shared.args.wbits) if shared.args.wbits > 0 else "None") shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=str(shared.args.groupsize) if shared.args.groupsize > 0 else "None") From 1f43fa97b0e0daf512548d266c97339c8aa96977 Mon Sep 17 00:00:00 2001 From: M S Date: Mon, 28 Aug 2023 13:21:44 -0500 Subject: [PATCH 07/13] Committing changes before applying stash --- .env.local | 20 +++++++++--------- oobainst | 60 +++++++++++++++++------------------------------------- 2 files changed, 29 insertions(+), 51 deletions(-) diff --git a/.env.local b/.env.local index 86b8e4ba..66608c35 100644 --- a/.env.local +++ b/.env.local @@ -1,5 +1,4 @@ #/bin/bash - # This is for overriding the setup_*.sh, webui.py, server.py, and other # oogabooga environment variables. If they are not set here, they will # default to what the one-click-installers use as their defaults. @@ -26,15 +25,20 @@ export OOBABOOGA_OPTS="--chat --verbose " # export CONDA_DEFAULT_ENV="${CONDA_DEFAULT_ENV:=base}" export CONDA_OOBABOOGA_ENV="${CONDA_OOBABOOGA_DEFAULT_ENV:=textgen.00}" -export CONDA_EXE="${CONDA_EXE:=/Users/mps/miniconda3/bin/conda}" -export CONDA_PREFIX="${CONDA_PREFIX:=/Users/mps/miniconda3}" +export CONDA_EXE="${CONDA_EXE:=/Users/unixwzrd/miniconda3/bin/conda}" +export CONDA_PREFIX="${CONDA_PREFIX:=/Users/unixwzrd/miniconda3}" export CONDA_PROMPT_MODIFIER="${CONDA_PROMPT_MODIFIER:=(base) }" -export CONDA_PYTHON_EXE="${CONDA_PYTHON_EXE:=/Users/mps/miniconda3/bin/python}" -export CONDA_ROOT="${CONDA_ROOT:=/Users/mps/miniconda3}"#/bin/bash +export CONDA_PYTHON_EXE="${CONDA_PYTHON_EXE:=/Users/unixwzrd/miniconda3/bin/python}" +export CONDA_ROOT="${CONDA_ROOT:=/Users/unixwzrd/miniconda3}" # This is for overriding the setup_*.sh, webui.py, server.py, and other # oogabooga environment variables. If they are not set here, they will # default to what the one-click-installers use as their defaults. +# +# Set the default Conda environment and the environment for the Web GUI +# +export CONDA_DEFAULT_ENV="base" +export CONDA_OOBABOOGA_ENV="textgen" OOBABOOGA_BASE="oobabooga_macos" @@ -73,13 +77,9 @@ export OOBABOOGA_OPTS="--chat --verbose " # grep '^CONDA' | \ # awk -F': ' '{print "export " $1 "=\"${"$1":="$2"}\""}' -export CONDA_ROOT="/Users/mps/miniconda3" +export CONDA_ROOT="/Users/unixwzrd/miniconda3" export CONDA_PREFIX="${CONDA_ROOT}" # export CONDA_PROMPT_MODIFIER=" (base)" export CONDA_PYTHON_EXE="${CONDA_ROOT}/bin/python" export CONDA_EXE="${CONDA_DEFAULT_ENV}/bin/conda" -# Set the default Conda environment and the environment for the Web GUI -# -export CONDA_DEFAULT_ENV="base" -export CONDA_OOBABOOGA_ENV="textgen" diff --git a/oobainst b/oobainst index 72837766..2be9d564 100644 --- a/oobainst +++ b/oobainst @@ -39,54 +39,32 @@ esac # Check to see if base CPU architecture is consistent with current # environment, Give the user a chance to proceed or not. 
+# if [ "$OS_ARCH" != "$(uname -m)" ]; then _confirm_yn '\nWARNING: The kernel string, \"arch\" and \"uname -m\" do not agree.\n\nIt seems you are running this in Rosetta, VSCode Terminal, or Intel virtual\nmachine with incorrect or conflicting architecture. This \nmay result in mixed or unintended binary architectures when compiling\nfrom source code.\n' fi - -# Begin th einstallation. +# Installs Miniconda3 py311_23.5.2-0 +# +# -b run install in batch mode (without manual intervention), +# it is expected the license terms (if any) are agreed upon +# -f no error if install prefix already exists +# -h print this help message and exit +# -p PREFIX install prefix, defaults to /Users/unixwzrd/miniconda3, must not contain spaces. +# -s skip running pre/post-link/install scripts +# -u update an existing installation +# -t run package tests after installation (may install conda-build) +# if [ -e ${HOME}/.conda ]; then - _confirm_yn - + _confirm_yn 'It looks like you have an existing conda installation. Continuing this may\naffect your existing venvs.\n\nDo you wish to cintunue thi sinstall with Conda?' || \ + ( + echo -e "CONDA INSTALLS HERE" +# curl https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh \ +# ${CONDA_OPTS} -o miniconda.sh + sh miniconda.sh + ) fi -curl https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh -o miniconda.sh - -# (if necessary) install git and conda into a contained environment -# download miniconda -if [ ! -e "${CONDA_PYTHON_EXE}" ]; then - MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-MacOSX-${OS_ARCH}.sh" - echo "Downloading Miniconda from $CONDA_PREFIX to $INSTALL_DIR/miniconda_installer.sh" - - mkdir -p "$CONDA_PREFIX" - curl -Lk "$MINICONDA_URL" > "$CONDA_PREFIX/miniconda_installer.sh" - - chmod u+x "$CONDA_PREFIX/miniconda_installer.sh" - bash "$INSTALL_DIR/miniconda_installer.sh" -b -p $CONDA_ROOT - - # test the conda binary - echo "Miniconda version:" - "$CONDA_ROOT/bin/conda" --version -fi - -# create the installer env -if [ ! -e "$CONDA_PREFIX" ]; then - "$CONDA_ROOT/bin/conda" create -y -k --prefix "$CONDA_PREFIX" python=3.10 -fi - -# check if conda environment was actually created -if [ ! -e "$CONDA_PREFIX/bin/python" ]; then - echo "Conda environment is empty." - exit -fi - -# environment isolation -unset PYTHONPATH -unset PYTHONHOME - -export PYTHONNOUSERSITE=1 -export CUDA_PATH="$CONDA_PREFIX" -export CUDA_HOME="$CUDA_PATH" # activate installer env source "$CONDA_ROOT/etc/profile.d/conda.sh" # otherwise conda complains about 'shell not initialized' (needed when running in a script) From ac6f61610d8246853874f8558d6b3e04aa61a94a Mon Sep 17 00:00:00 2001 From: M S Date: Mon, 28 Aug 2023 13:42:56 -0500 Subject: [PATCH 08/13] .gitignore.. whatever. 
--- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 1093ff5a..1fc67e2d 100644 --- a/.gitignore +++ b/.gitignore @@ -42,4 +42,7 @@ Thumbs.db # Coming soon .env.local oobainst +<<<<<<< Updated upstream .env.local +======= +>>>>>>> Stashed changes From 2d963d7700c9ee87665ceab31384c836568e4608 Mon Sep 17 00:00:00 2001 From: M S Date: Tue, 29 Aug 2023 04:22:08 -0500 Subject: [PATCH 09/13] Update the requirements.tst for llama.cpp and gguf --- .gitignore | 7 ++----- requirements.txt | 5 ++++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 1fc67e2d..5a017b3d 100644 --- a/.gitignore +++ b/.gitignore @@ -41,8 +41,5 @@ Thumbs.db # Coming soon .env.local -oobainst -<<<<<<< Updated upstream -.env.local -======= ->>>>>>> Stashed changes +oobstart +oobainst \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 15efbdd7..692046f5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,4 +23,7 @@ tqdm # PEFT git+https://github.com/huggingface/peft@03eb378eb914fbee709ff7c86ba5b1d033b89524 bitsandbytes==0.41.0 -llama-cpp-python==0.1.77 \ No newline at end of file + +# GGML/GGUF and llama.cpp Python API +gguf +llama-cpp-python==0.1.81 \ No newline at end of file From d2b772f26b887b4fee8aaf845f0ffb1e0e5cad20 Mon Sep 17 00:00:00 2001 From: M S Date: Tue, 29 Aug 2023 07:47:47 -0500 Subject: [PATCH 10/13] Update to requitements for ctransformers --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 692046f5..2e2fbb14 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ accelerate==0.21.0 colorama +ctransformers datasets einops fastapi==0.100.0 From c8d0f2836058b20b36a87b9164c940f96e4c4a8f Mon Sep 17 00:00:00 2001 From: M S Date: Fri, 1 Sep 2023 19:57:44 -0500 Subject: [PATCH 11/13] Update requirements.txt bumped llama-cpp-python --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 15efbdd7..eb341794 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,4 +23,4 @@ tqdm # PEFT git+https://github.com/huggingface/peft@03eb378eb914fbee709ff7c86ba5b1d033b89524 bitsandbytes==0.41.0 -llama-cpp-python==0.1.77 \ No newline at end of file +llama-cpp-python==0.1.78 From 25d0d618bea88b72cbaa10608d62118c031b00c4 Mon Sep 17 00:00:00 2001 From: M S Date: Sat, 2 Sep 2023 04:29:03 -0500 Subject: [PATCH 12/13] Handle GGML files for model loading --- README.md | 4 ++-- modules/models_settings.py | 2 ++ server.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ab1df857..df39f57e 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# MERGED 1.5 Version. THIS IS A DEVELOPMENT VERSION +# MERGED 1.5 Version. macOS DEVELOPMENT VERSION This is a development version and I have not added many changes I had planned. Please feel free to use at your own risk as there may be bugs not yet found. @@ -41,7 +41,7 @@ While the focus of this branch is to enhance macOS and Apple Silicon support, I Anyone who would like to assist with supporting Apple Silicon, let me know. There is much to do and I can only do so much by myself. -- [MERGED 1.5 Version. THIS IS A DEVELOPMENT VERSION](#merged-15-version--this-is-a-development-version) +- [MERGED 1.5 Version. 
macOS DEVELOPMENT VERSION](#merged-15-version--macos-development-version) - [Features](#features) - [Installation](#installation) - [Downloading models](#downloading-models) diff --git a/modules/models_settings.py b/modules/models_settings.py index 9319582e..35fdf0ce 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -26,6 +26,8 @@ def infer_loader(model_name): loader = 'AutoGPTQ' elif len(list(path_to_model.glob('*ggml*.bin'))) > 0: loader = 'llama.cpp' + elif re.match('.*gguf.*\.bin', model_name.lower()): + loader = 'llama.cpp' elif re.match('.*ggml.*\.bin', model_name.lower()): loader = 'llama.cpp' elif re.match('.*rwkv.*\.pth', model_name.lower()): diff --git a/server.py b/server.py index c2a0991c..cad0dd71 100644 --- a/server.py +++ b/server.py @@ -213,7 +213,7 @@ def create_model_menus(): shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads) shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch) shared.gradio['n_gqa'] = gr.Slider(minimum=0, maximum=16, step=1, label="n_gqa", value=shared.args.n_gqa, info='grouped-query attention. Must be 8 for llama2 70b.') - shared.gradio['rms_norm_eps'] = gr.Slider(minimum=0, maximum=1e-5, step=1e-6, label="rms_norm_eps", value=shared.args.rms_norm_eps, info='5e-6 (0.000005 for those not familiar with teh notation) is a good value for llama2 70b.') + shared.gradio['rms_norm_eps'] = gr.Slider(minimum=0, maximum=1e-5, step=1e-6, label="rms_norm_eps", value=shared.args.rms_norm_eps, info='5e-6 (0.000005 for those unfamiliar with the notation) is a good value for llama2 70b.') shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=str(shared.args.wbits) if shared.args.wbits > 0 else "None") shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=str(shared.args.groupsize) if shared.args.groupsize > 0 else "None") From ca81a5defd2f9b62a3cab02237a732d22d6382f5 Mon Sep 17 00:00:00 2001 From: M S Date: Fri, 8 Sep 2023 13:14:37 -0500 Subject: [PATCH 13/13] chaekcpint --- README.md | 4 ++-- modules/models_settings.py | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index df39f57e..64135b07 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# MERGED 1.5 Version. macOS DEVELOPMENT VERSION +# MERGED 1.5 Version. macOS TEST VERSION This is a development version and I have not added many changes I had planned. Please feel free to use at your own risk as there may be bugs not yet found. @@ -41,7 +41,7 @@ While the focus of this branch is to enhance macOS and Apple Silicon support, I Anyone who would like to assist with supporting Apple Silicon, let me know. There is much to do and I can only do so much by myself. -- [MERGED 1.5 Version. macOS DEVELOPMENT VERSION](#merged-15-version--macos-development-version) +- [MERGED 1.5 Version. 
macOS TEST VERSION](#merged-15-version--macos-test-version) - [Features](#features) - [Installation](#installation) - [Downloading models](#downloading-models) diff --git a/modules/models_settings.py b/modules/models_settings.py index 35fdf0ce..36788073 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -26,9 +26,11 @@ def infer_loader(model_name): loader = 'AutoGPTQ' elif len(list(path_to_model.glob('*ggml*.bin'))) > 0: loader = 'llama.cpp' - elif re.match('.*gguf.*\.bin', model_name.lower()): + elif re.match('.*\.gguf', model_name.lower()): loader = 'llama.cpp' - elif re.match('.*ggml.*\.bin', model_name.lower()): + elif re.match('.*gguf.*\.bin', model_name.lower()): + loader = 'llama.cpp' + elif re.match('.*ggml.*\.bin', model_name.lower()): loader = 'llama.cpp' elif re.match('.*rwkv.*\.pth', model_name.lower()): loader = 'RWKV'
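
As a quick sanity check of the infer_loader() regexes above (using the .*\.gguf pattern in its corrected form), the sketch below mirrors the elif chain in a simplified way. The helper name and the model file names are purely hypothetical illustrations, not code or files from this repository.

```python
import re

def infer_llamacpp_or_rwkv(model_name: str):
    """Simplified mirror of the elif chain in infer_loader(), for illustration only."""
    name = model_name.lower()
    if re.match(r'.*\.gguf', name) or re.match(r'.*gguf.*\.bin', name) or re.match(r'.*ggml.*\.bin', name):
        return 'llama.cpp'
    if re.match(r'.*rwkv.*\.pth', name):
        return 'RWKV'
    return None  # fall through to the remaining loader checks

# Hypothetical file names, not files shipped with this repository.
for name in ['llama-2-13b.Q4_K_M.gguf', 'llama-2-13b-GGUF.q4_0.bin',
             'wizard-7b.ggmlv3.q4_0.bin', 'rwkv-4-pile.pth', 'model.safetensors']:
    print(f'{name} -> {infer_llamacpp_or_rwkv(name)}')
```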