
Commit 9817770: Adding base only inferencing (#87)

1 parent fa4b95b

15 files changed: +151 / -24 lines

configs/llama-v2-7b/README.md

Lines changed: 7 additions & 4 deletions
@@ -12,12 +12,15 @@ Since we ware using WSL environment and is shared you need to manually acitvate
 conda activate [conda-env-name]
 ```

-### Model fine-tuning and inferencing
-
-Once the workspace is opened in a dev container, open a terminal (the default path is project root), then run the command below to fine tune a LLM on the selected dataset.
+### Base model fine-tuning only
+To try just the base model without fine-tuning, you can run this command after activating conda.

 ```bash
-python finetuning/invoke_olive.py
+cd inference
+
+# The web browser interface allows you to adjust a few parameters such as max new token length, temperature, and so on.
+# Open the link (e.g. http://127.0.0.1:7860) manually in a browser after Gradio initiates the connection.
+python gradio_chat.py --baseonly
 ```

 Checkpoints and final model will be saved in `models` folder.
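The `--baseonly` switch used in this command is introduced in `gradio_chat.py` later in this commit as a standard argparse `store_true` flag. As an illustration only (not repository code), a minimal sketch of how such a flag behaves:

```python
import argparse

# Minimal sketch of a store_true switch like --baseonly.
parser = argparse.ArgumentParser(description='Check model usage.')
parser.add_argument('--baseonly', action='store_true',
                    help='A boolean switch to indicate base only mode')

print(parser.parse_args([]).baseonly)               # False: load the fine-tuned adapter if present
print(parser.parse_args(['--baseonly']).baseonly)   # True: run the base model only
```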

configs/llama-v2-7b/inference/gradio_chat.py

Lines changed: 19 additions & 2 deletions
@@ -1,10 +1,22 @@
 # Import necessary libraries
 from threading import Thread
+import argparse
+import os
 import torch
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TextIteratorStreamer
 from utils import check_adapter_path, load_model, load_peft_model, load_tokenizer, get_device

+# Create the parser
+parser = argparse.ArgumentParser(description='Check model usage.')
+
+# Add the arguments
+parser.add_argument('--baseonly', action='store_true',
+                    help='A boolean switch to indicate base only mode')
+
+# Execute the parse_args() method
+args = parser.parse_args()
+
 # Define model and adapter paths, data type, and quantization type
 model_name = "../model-cache/meta-llama/llama-2-7b"
 adapters_name = "../models/qlora/qlora/gpu-cpu_model/adapter" # Ensure this path is correctly set before running
@@ -22,15 +34,20 @@

 model = load_model(model_name, torch_dtype, quant_type)
 model.resize_token_embeddings(len(tokenizer))
-model = load_peft_model(model, adapters_name)
+
+usingAdapter = False
+if os.path.exists(adapters_name) and not args.baseonly:
+    usingAdapter = True
+    model = load_peft_model(model, adapters_name)
+
 device = get_device()

 print(f"Model {model_name} loaded successfully on {device}")

 # Function to run the text generation process
 def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
     template = "<prompt_template>"
-    model_inputs = tokenizer(template.format(user_text), return_tensors="pt")
+    model_inputs = tokenizer(template.format(user_text) if usingAdapter else user_text, return_tensors="pt")
     model_inputs = model_inputs.to(device)

     # Generate text in a separate thread
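The hunk above ends at the `# Generate text in a separate thread` context line, so the body of `run_generation` is not part of this diff. Since the file already imports `Thread` and `TextIteratorStreamer`, a typical continuation would look roughly like the sketch below; this is an assumption about the surrounding code, not the commit's content:

```python
    # Sketch (assumed continuation): stream tokens from model.generate on a background thread.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10.0)
    generate_kwargs = dict(
        **model_inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        temperature=float(temperature),
        top_k=top_k,
    )
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    # Yield partial output so the Gradio UI updates as tokens arrive.
    model_output = ""
    for new_text in streamer:
        model_output += new_text
        yield model_output
```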

configs/llama-v2-7b/inference/utils.py

Lines changed: 1 addition & 2 deletions
@@ -14,8 +14,7 @@ def get_device_map():
         print("More than one GPU found. Setting device_map to use CUDA device 0.")
         return 'cuda:0'
     else:
-        print("Using the available device (CUDA device 0).")
-        return 'cuda'
+        return 'auto'

 def check_adapter_path(adapters_name):
     """

configs/mistral-7b/README.md

Lines changed: 11 additions & 0 deletions
@@ -12,6 +12,17 @@ Since we ware using WSL environment and is shared you need to manually acitvate
 conda activate [conda-env-name]
 ```

+### Base model fine-tuning only
+To try just the base model without fine-tuning, you can run this command after activating conda.
+
+```bash
+cd inference
+
+# The web browser interface allows you to adjust a few parameters such as max new token length, temperature, and so on.
+# Open the link (e.g. http://127.0.0.1:7860) manually in a browser after Gradio initiates the connection.
+python gradio_chat.py --baseonly
+```
+
 ### Model fine-tuning and inferencing

 Once the workspace is opened in a dev container, open a terminal (the default path is project root), then run the command below to fine tune a LLM on the selected dataset.

configs/mistral-7b/inference/gradio_chat.py

Lines changed: 19 additions & 2 deletions
@@ -1,10 +1,22 @@
 # Import necessary libraries
 from threading import Thread
+import argparse
+import os
 import torch
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TextIteratorStreamer
 from utils import check_adapter_path, load_model, load_peft_model, load_tokenizer, get_device

+# Create the parser
+parser = argparse.ArgumentParser(description='Check model usage.')
+
+# Add the arguments
+parser.add_argument('--baseonly', action='store_true',
+                    help='A boolean switch to indicate base only mode')
+
+# Execute the parse_args() method
+args = parser.parse_args()
+
 # Define model and adapter paths, data type, and quantization type
 model_name = "../model-cache/mistralai/Mistral-7B"
 adapters_name = "../models/qlora/qlora/gpu-cpu_model/adapter" # Ensure this path is correctly set before running
@@ -22,15 +34,20 @@

 model = load_model(model_name, torch_dtype, quant_type)
 model.resize_token_embeddings(len(tokenizer))
-model = load_peft_model(model, adapters_name)
+
+usingAdapter = False
+if os.path.exists(adapters_name) and not args.baseonly:
+    usingAdapter = True
+    model = load_peft_model(model, adapters_name)
+
 device = get_device()

 print(f"Model {model_name} loaded successfully on {device}")

 # Function to run the text generation process
 def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
     template = "<prompt_template>"
-    model_inputs = tokenizer(template.format(user_text), return_tensors="pt")
+    model_inputs = tokenizer(template.format(user_text) if usingAdapter else user_text, return_tensors="pt")
     model_inputs = model_inputs.to(device)

     # Generate text in a separate thread
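`load_peft_model`, called above when an adapter directory exists and `--baseonly` is not given, lives in the accompanying `utils.py` and is not shown in this commit. Presumably it wraps PEFT's adapter loading, roughly like this hypothetical sketch:

```python
from peft import PeftModel

# Hypothetical sketch: attach a saved QLoRA adapter to an already-loaded base model.
def load_peft_model_sketch(model, adapters_name: str):
    model = PeftModel.from_pretrained(model, adapters_name)
    model.eval()
    return model
```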

configs/mistral-7b/inference/utils.py

Lines changed: 1 addition & 2 deletions
@@ -14,8 +14,7 @@ def get_device_map():
         print("More than one GPU found. Setting device_map to use CUDA device 0.")
         return 'cuda:0'
     else:
-        print("Using the available device (CUDA device 0).")
-        return 'cuda'
+        return 'auto'

 def check_adapter_path(adapters_name):
     """

configs/phi-1_5/README.md

Lines changed: 11 additions & 0 deletions
@@ -12,6 +12,17 @@ Since we ware using WSL environment and is shared you need to manually acitvate
 conda activate [conda-env-name]
 ```

+### Base model fine-tuning only
+To try just the base model without fine-tuning, you can run this command after activating conda.
+
+```bash
+cd inference
+
+# The web browser interface allows you to adjust a few parameters such as max new token length, temperature, and so on.
+# Open the link (e.g. http://127.0.0.1:7860) manually in a browser after Gradio initiates the connection.
+python gradio_chat.py --baseonly
+```
+
 ### Model fine-tuning and inferencing

 Once the workspace is opened in a dev container, open a terminal (the default path is project root), then run the command below to fine tune a LLM on the selected dataset.

configs/phi-1_5/inference/gradio_chat.py

Lines changed: 19 additions & 2 deletions
@@ -1,10 +1,22 @@
 # Import necessary libraries
 from threading import Thread
+import argparse
+import os
 import torch
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TextIteratorStreamer
 from utils import check_adapter_path, load_model, load_peft_model, load_tokenizer, get_device

+# Create the parser
+parser = argparse.ArgumentParser(description='Check model usage.')
+
+# Add the arguments
+parser.add_argument('--baseonly', action='store_true',
+                    help='A boolean switch to indicate base only mode')
+
+# Execute the parse_args() method
+args = parser.parse_args()
+
 # Define model and adapter paths, data type, and quantization type
 model_name = "../model-cache/microsoft/phi-1_5"
 adapters_name = "../models/qlora/qlora/gpu-cpu_model/adapter" # Ensure this path is correctly set before running
@@ -22,15 +34,20 @@

 model = load_model(model_name, torch_dtype, quant_type)
 model.resize_token_embeddings(len(tokenizer))
-model = load_peft_model(model, adapters_name)
+
+usingAdapter = False
+if os.path.exists(adapters_name) and not args.baseonly:
+    usingAdapter = True
+    model = load_peft_model(model, adapters_name)
+
 device = get_device()

 print(f"Model {model_name} loaded successfully on {device}")

 # Function to run the text generation process
 def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
     template = "<prompt_template>"
-    model_inputs = tokenizer(template.format(user_text), return_tensors="pt")
+    model_inputs = tokenizer(template.format(user_text) if usingAdapter else user_text, return_tensors="pt")
     model_inputs = model_inputs.to(device)

     # Generate text in a separate thread

configs/phi-1_5/inference/utils.py

Lines changed: 1 addition & 2 deletions
@@ -14,8 +14,7 @@ def get_device_map():
         print("More than one GPU found. Setting device_map to use CUDA device 0.")
         return 'cuda:0'
     else:
-        print("Using the available device (CUDA device 0).")
-        return 'cuda'
+        return 'auto'

 def check_adapter_path(adapters_name):
     """

configs/phi-2/README.md

Lines changed: 11 additions & 0 deletions
@@ -12,6 +12,17 @@ Since we ware using WSL environment and is shared you need to manually acitvate
 conda activate [conda-env-name]
 ```

+### Base model fine-tuning only
+To try just the base model without fine-tuning, you can run this command after activating conda.
+
+```bash
+cd inference
+
+# The web browser interface allows you to adjust a few parameters such as max new token length, temperature, and so on.
+# Open the link (e.g. http://127.0.0.1:7860) manually in a browser after Gradio initiates the connection.
+python gradio_chat.py --baseonly
+```
+
 ### Model fine-tuning and inferencing

 Once the workspace is opened in a dev container, open a terminal (the default path is project root), then run the command below to fine tune a LLM on the selected dataset.
