
Commit 9817770: Adding base only inferencing (#87)

1 parent fa4b95b

15 files changed: +151 / -24 lines

configs/llama-v2-7b/README.md

Lines changed: 7 additions & 4 deletions
@@ -12,12 +12,15 @@ Since we ware using WSL environment and is shared you need to manually acitvate
 conda activate [conda-env-name]
 ```

-### Model fine-tuning and inferencing
-
-Once the workspace is opened in a dev container, open a terminal (the default path is project root), then run the command below to fine tune a LLM on the selected dataset.
+### Base model fine-tuning only
+To try just the base model without fine-tuning, you can run this command after activating conda.

 ```bash
-python finetuning/invoke_olive.py
+cd inference
+
+# The web browser interface allows you to adjust a few parameters such as max new token length, temperature, and so on.
+# Open the link (e.g. http://127.0.0.1:7860) manually in a browser after Gradio initiates the connection.
+python gradio_chat.py --baseonly
 ```

 Checkpoints and final model will be saved in `models` folder.
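The `--baseonly` switch used in this command is introduced in `gradio_chat.py` later in this commit as a standard argparse `store_true` flag. As an illustration only (not repository code), a minimal sketch of how such a flag behaves:

```python
import argparse

# Minimal sketch of a store_true switch like --baseonly.
parser = argparse.ArgumentParser(description='Check model usage.')
parser.add_argument('--baseonly', action='store_true',
                    help='A boolean switch to indicate base only mode')

print(parser.parse_args([]).baseonly)               # False: load the fine-tuned adapter if present
print(parser.parse_args(['--baseonly']).baseonly)   # True: run the base model only
```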

configs/llama-v2-7b/inference/gradio_chat.py

Lines changed: 19 additions & 2 deletions
@@ -1,10 +1,22 @@
 # Import necessary libraries
 from threading import Thread
+import argparse
+import os
 import torch
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TextIteratorStreamer
 from utils import check_adapter_path, load_model, load_peft_model, load_tokenizer, get_device

+# Create the parser
+parser = argparse.ArgumentParser(description='Check model usage.')
+
+# Add the arguments
+parser.add_argument('--baseonly', action='store_true',
+                    help='A boolean switch to indicate base only mode')
+
+# Execute the parse_args() method
+args = parser.parse_args()
+
 # Define model and adapter paths, data type, and quantization type
 model_name = "../model-cache/meta-llama/llama-2-7b"
 adapters_name = "../models/qlora/qlora/gpu-cpu_model/adapter" # Ensure this path is correctly set before running
@@ -22,15 +34,20 @@

 model = load_model(model_name, torch_dtype, quant_type)
 model.resize_token_embeddings(len(tokenizer))
-model = load_peft_model(model, adapters_name)
+
+usingAdapter = False
+if os.path.exists(adapters_name) and not args.baseonly:
+    usingAdapter = True
+    model = load_peft_model(model, adapters_name)
+
 device = get_device()

 print(f"Model {model_name} loaded successfully on {device}")

 # Function to run the text generation process
 def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
     template = "<prompt_template>"
-    model_inputs = tokenizer(template.format(user_text), return_tensors="pt")
+    model_inputs = tokenizer(template.format(user_text) if usingAdapter else user_text, return_tensors="pt")
     model_inputs = model_inputs.to(device)

     # Generate text in a separate thread
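The hunk above ends at the `# Generate text in a separate thread` context line, so the body of `run_generation` is not part of this diff. Since the file already imports `Thread` and `TextIteratorStreamer`, a typical continuation would look roughly like the sketch below; this is an assumption about the surrounding code, not the commit's content:

```python
    # Sketch (assumed continuation): stream tokens from model.generate on a background thread.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10.0)
    generate_kwargs = dict(
        **model_inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        temperature=float(temperature),
        top_k=top_k,
    )
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    # Yield partial output so the Gradio UI updates as tokens arrive.
    model_output = ""
    for new_text in streamer:
        model_output += new_text
        yield model_output
```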

configs/llama-v2-7b/inference/utils.py

Lines changed: 1 addition & 2 deletions
@@ -14,8 +14,7 @@ def get_device_map():
         print("More than one GPU found. Setting device_map to use CUDA device 0.")
         return 'cuda:0'
     else:
-        print("Using the available device (CUDA device 0).")
-        return 'cuda'
+        return 'auto'

 def check_adapter_path(adapters_name):
     """

configs/mistral-7b/README.md

Lines changed: 11 additions & 0 deletions
@@ -12,6 +12,17 @@ Since we ware using WSL environment and is shared you need to manually acitvate
 conda activate [conda-env-name]
 ```

+### Base model fine-tuning only
+To try just the base model without fine-tuning, you can run this command after activating conda.
+
+```bash
+cd inference
+
+# The web browser interface allows you to adjust a few parameters such as max new token length, temperature, and so on.
+# Open the link (e.g. http://127.0.0.1:7860) manually in a browser after Gradio initiates the connection.
+python gradio_chat.py --baseonly
+```
+
 ### Model fine-tuning and inferencing

 Once the workspace is opened in a dev container, open a terminal (the default path is project root), then run the command below to fine tune a LLM on the selected dataset.

configs/mistral-7b/inference/gradio_chat.py

Lines changed: 19 additions & 2 deletions
@@ -1,10 +1,22 @@
 # Import necessary libraries
 from threading import Thread
+import argparse
+import os
 import torch
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TextIteratorStreamer
 from utils import check_adapter_path, load_model, load_peft_model, load_tokenizer, get_device

+# Create the parser
+parser = argparse.ArgumentParser(description='Check model usage.')
+
+# Add the arguments
+parser.add_argument('--baseonly', action='store_true',
+                    help='A boolean switch to indicate base only mode')
+
+# Execute the parse_args() method
+args = parser.parse_args()
+
 # Define model and adapter paths, data type, and quantization type
 model_name = "../model-cache/mistralai/Mistral-7B"
 adapters_name = "../models/qlora/qlora/gpu-cpu_model/adapter" # Ensure this path is correctly set before running
@@ -22,15 +34,20 @@

 model = load_model(model_name, torch_dtype, quant_type)
 model.resize_token_embeddings(len(tokenizer))
-model = load_peft_model(model, adapters_name)
+
+usingAdapter = False
+if os.path.exists(adapters_name) and not args.baseonly:
+    usingAdapter = True
+    model = load_peft_model(model, adapters_name)
+
 device = get_device()

 print(f"Model {model_name} loaded successfully on {device}")

 # Function to run the text generation process
 def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
     template = "<prompt_template>"
-    model_inputs = tokenizer(template.format(user_text), return_tensors="pt")
+    model_inputs = tokenizer(template.format(user_text) if usingAdapter else user_text, return_tensors="pt")
     model_inputs = model_inputs.to(device)

     # Generate text in a separate thread
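`load_peft_model`, called above when an adapter directory exists and `--baseonly` is not given, lives in the accompanying `utils.py` and is not shown in this commit. Presumably it wraps PEFT's adapter loading, roughly like this hypothetical sketch:

```python
from peft import PeftModel

# Hypothetical sketch: attach a saved QLoRA adapter to an already-loaded base model.
def load_peft_model_sketch(model, adapters_name: str):
    model = PeftModel.from_pretrained(model, adapters_name)
    model.eval()
    return model
```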

configs/mistral-7b/inference/utils.py

Lines changed: 1 addition & 2 deletions
@@ -14,8 +14,7 @@ def get_device_map():
         print("More than one GPU found. Setting device_map to use CUDA device 0.")
         return 'cuda:0'
     else:
-        print("Using the available device (CUDA device 0).")
-        return 'cuda'
+        return 'auto'

 def check_adapter_path(adapters_name):
     """

configs/phi-1_5/README.md

Lines changed: 11 additions & 0 deletions
@@ -12,6 +12,17 @@ Since we ware using WSL environment and is shared you need to manually acitvate
 conda activate [conda-env-name]
 ```

+### Base model fine-tuning only
+To try just the base model without fine-tuning, you can run this command after activating conda.
+
+```bash
+cd inference
+
+# The web browser interface allows you to adjust a few parameters such as max new token length, temperature, and so on.
+# Open the link (e.g. http://127.0.0.1:7860) manually in a browser after Gradio initiates the connection.
+python gradio_chat.py --baseonly
+```
+
 ### Model fine-tuning and inferencing

 Once the workspace is opened in a dev container, open a terminal (the default path is project root), then run the command below to fine tune a LLM on the selected dataset.

configs/phi-1_5/inference/gradio_chat.py

Lines changed: 19 additions & 2 deletions
@@ -1,10 +1,22 @@
 # Import necessary libraries
 from threading import Thread
+import argparse
+import os
 import torch
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TextIteratorStreamer
 from utils import check_adapter_path, load_model, load_peft_model, load_tokenizer, get_device

+# Create the parser
+parser = argparse.ArgumentParser(description='Check model usage.')
+
+# Add the arguments
+parser.add_argument('--baseonly', action='store_true',
+                    help='A boolean switch to indicate base only mode')
+
+# Execute the parse_args() method
+args = parser.parse_args()
+
 # Define model and adapter paths, data type, and quantization type
 model_name = "../model-cache/microsoft/phi-1_5"
 adapters_name = "../models/qlora/qlora/gpu-cpu_model/adapter" # Ensure this path is correctly set before running
@@ -22,15 +34,20 @@

 model = load_model(model_name, torch_dtype, quant_type)
 model.resize_token_embeddings(len(tokenizer))
-model = load_peft_model(model, adapters_name)
+
+usingAdapter = False
+if os.path.exists(adapters_name) and not args.baseonly:
+    usingAdapter = True
+    model = load_peft_model(model, adapters_name)
+
 device = get_device()

 print(f"Model {model_name} loaded successfully on {device}")

 # Function to run the text generation process
 def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
     template = "<prompt_template>"
-    model_inputs = tokenizer(template.format(user_text), return_tensors="pt")
+    model_inputs = tokenizer(template.format(user_text) if usingAdapter else user_text, return_tensors="pt")
     model_inputs = model_inputs.to(device)

     # Generate text in a separate thread

configs/phi-1_5/inference/utils.py

Lines changed: 1 addition & 2 deletions
@@ -14,8 +14,7 @@ def get_device_map():
         print("More than one GPU found. Setting device_map to use CUDA device 0.")
         return 'cuda:0'
     else:
-        print("Using the available device (CUDA device 0).")
-        return 'cuda'
+        return 'auto'

 def check_adapter_path(adapters_name):
     """

configs/phi-2/README.md

Lines changed: 11 additions & 0 deletions
@@ -12,6 +12,17 @@ Since we ware using WSL environment and is shared you need to manually acitvate
 conda activate [conda-env-name]
 ```

+### Base model fine-tuning only
+To try just the base model without fine-tuning, you can run this command after activating conda.
+
+```bash
+cd inference
+
+# The web browser interface allows you to adjust a few parameters such as max new token length, temperature, and so on.
+# Open the link (e.g. http://127.0.0.1:7860) manually in a browser after Gradio initiates the connection.
+python gradio_chat.py --baseonly
+```
+
 ### Model fine-tuning and inferencing

 Once the workspace is opened in a dev container, open a terminal (the default path is project root), then run the command below to fine tune a LLM on the selected dataset.
