Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Recreated settings changes - Adds several options for llamacpp and ollama #1703

Merged
merged 5 commits into from
Mar 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions private_gpt/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""private-gpt."""

import logging
import os

Expand Down
29 changes: 25 additions & 4 deletions private_gpt/components/llm/llm_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,23 @@ def __init__(self, settings: Settings) -> None:
) from e

prompt_style = get_prompt_style(settings.llamacpp.prompt_style)

settings_kwargs = {
"tfs_z": settings.llamacpp.tfs_z, # ollama and llama-cpp
"top_k": settings.llamacpp.top_k, # ollama and llama-cpp
"top_p": settings.llamacpp.top_p, # ollama and llama-cpp
"repeat_penalty": settings.llamacpp.repeat_penalty, # ollama llama-cpp
"n_gpu_layers": -1,
"offload_kqv": True,
}
self.llm = LlamaCPP(
model_path=str(models_path / settings.llamacpp.llm_hf_model_file),
temperature=0.1,
temperature=settings.llm.temperature,
max_new_tokens=settings.llm.max_new_tokens,
context_window=settings.llm.context_window,
generate_kwargs={},
callback_manager=LlamaIndexSettings.callback_manager,
# All to GPU
model_kwargs={"n_gpu_layers": -1, "offload_kqv": True},
model_kwargs=settings_kwargs,
# transform inputs into Llama2 format
messages_to_prompt=prompt_style.messages_to_prompt,
completion_to_prompt=prompt_style.completion_to_prompt,
Expand Down Expand Up @@ -108,8 +115,22 @@ def __init__(self, settings: Settings) -> None:
) from e

ollama_settings = settings.ollama

settings_kwargs = {
"tfs_z": ollama_settings.tfs_z, # ollama and llama-cpp
"num_predict": ollama_settings.num_predict, # ollama only
"top_k": ollama_settings.top_k, # ollama and llama-cpp
"top_p": ollama_settings.top_p, # ollama and llama-cpp
"repeat_last_n": ollama_settings.repeat_last_n, # ollama
"repeat_penalty": ollama_settings.repeat_penalty, # ollama llama-cpp
}

self.llm = Ollama(
model=ollama_settings.llm_model, base_url=ollama_settings.api_base
model=ollama_settings.llm_model,
base_url=ollama_settings.api_base,
temperature=settings.llm.temperature,
context_window=settings.llm.context_window,
additional_kwargs=settings_kwargs,
)
case "mock":
self.llm = MockLLM()
8 changes: 5 additions & 3 deletions private_gpt/components/vector_store/vector_store_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,11 @@ def get_retriever(
index=index,
similarity_top_k=similarity_top_k,
doc_ids=context_filter.docs_ids if context_filter else None,
filters=_doc_id_metadata_filter(context_filter)
if self.settings.vectorstore.database != "qdrant"
else None,
filters=(
_doc_id_metadata_filter(context_filter)
if self.settings.vectorstore.database != "qdrant"
else None
),
)

def close(self) -> None:
Expand Down
1 change: 1 addition & 0 deletions private_gpt/launcher.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""FastAPI app creation, logger configuration and main API routes."""

import logging

from fastapi import Depends, FastAPI, Request
Expand Down
1 change: 1 addition & 0 deletions private_gpt/server/utils/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* https://fastapi.tiangolo.com/tutorial/security/
* https://fastapi.tiangolo.com/tutorial/dependencies/dependencies-in-path-operation-decorators/
"""

# mypy: ignore-errors
# Disabled mypy error: All conditional function variants must have identical signatures
# We are changing the implementation of the authenticated method, based on
Expand Down
45 changes: 45 additions & 0 deletions private_gpt/settings/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,10 @@ class LLMSettings(BaseModel):
"like `HuggingFaceH4/zephyr-7b-beta`. If not set, will load a tokenizer matching "
"gpt-3.5-turbo LLM.",
)
temperature: float = Field(
0.1,
description="The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual.",
)


class VectorstoreSettings(BaseModel):
Expand All @@ -119,6 +123,23 @@ class LlamaCPPSettings(BaseModel):
),
)

tfs_z: float = Field(
1.0,
description="Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.",
)
top_k: int = Field(
40,
description="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)",
)
top_p: float = Field(
0.9,
description="Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)",
)
repeat_penalty: float = Field(
1.1,
description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
)


class HuggingFaceSettings(BaseModel):
embedding_hf_model_name: str = Field(
Expand Down Expand Up @@ -184,6 +205,30 @@ class OllamaSettings(BaseModel):
None,
description="Model to use. Example: 'nomic-embed-text'.",
)
tfs_z: float = Field(
1.0,
description="Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.",
)
num_predict: int = Field(
None,
description="Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context)",
)
top_k: int = Field(
40,
description="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)",
)
top_p: float = Field(
0.9,
description="Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)",
)
repeat_last_n: int = Field(
64,
description="Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)",
)
repeat_penalty: float = Field(
1.1,
description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
)


class UISettings(BaseModel):
Expand Down
1 change: 1 addition & 0 deletions private_gpt/ui/ui.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""This file should be imported only and only if you want to run the UI locally."""

import itertools
import logging
import time
Expand Down
7 changes: 6 additions & 1 deletion settings-ollama.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ llm:
mode: ollama
max_new_tokens: 512
context_window: 3900
temperature: 0.1 # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)

embedding:
mode: ollama
Expand All @@ -13,10 +14,14 @@ ollama:
llm_model: mistral
embedding_model: nomic-embed-text
api_base: http://localhost:11434
tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
repeat_last_n: 64 # Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
repeat_penalty: 1.2 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)

vectorstore:
database: qdrant

qdrant:
path: local_data/private_gpt/qdrant

5 changes: 5 additions & 0 deletions settings.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,16 @@ llm:
# Should be matching the selected model
max_new_tokens: 512
context_window: 3900
temperature: 0.1 # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)

llamacpp:
prompt_style: "mistral"
llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
llm_hf_model_file: mistral-7b-instruct-v0.2.Q4_K_M.gguf
tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting
top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
top_p: 1.0 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
repeat_penalty: 1.1 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)

embedding:
# Should be matching the value above in most cases
Expand Down
1 change: 1 addition & 0 deletions tests/server/utils/test_simple_auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
is currently architecture (it is hard to patch the `settings` and the app while
the tests are directly importing them).
"""

from typing import Annotated

import pytest
Expand Down
Loading