## Example of querying the Ollama inference server with Mistral-7B, LangChain and a custom prompt

### Set the Inference server url (replace with your own address)

In [1]:
# Base URL of the Ollama inference server. API paths such as '/api/pull' are appended
# directly to this value, so do not end it with a trailing slash.
# Replace the <project> and <cluster fqdn> placeholders with your own route.
inference_server_url = "https://ollama-<project>.apps.<cluster fqdn>"

## Preparation

### Install and load requirements

In [3]:
!pip install -q langchain==0.1.14 requests
!pip install httpx[http2]



In [4]:
# SSL verification bypass: patch the installed langchain_community Ollama client in place so
# that its `requests.post(` call receives `verify=False` as an extra keyword argument.
# - `:a;N;$!ba;` makes sed accumulate the whole file in its pattern space, which lets the
#   substitution match across the newline between `requests.post(` and `url`.
# - Once patched, the search pattern no longer matches, so re-running this cell is a no-op.
# - The path is hard-coded for Python 3.9 under /opt/app-root; adjust it for other environments.
# - This line sits before the `langchain_community.llms` import below, presumably so that the
#   patched file is the one that gets loaded.
# NOTE(review): this disables TLS certificate checks for every Ollama call made through
# LangChain — acceptable for a lab with self-signed certificates, not for production.
!sed -i ':a;N;$!ba;s/response = requests.post(\n            url/response = requests.post(\n            verify=False,\n            url/g' /opt/app-root/lib64/python3.9/site-packages/langchain_community/llms/ollama.py
# Print the patched call site so the substitution can be checked visually.
!cat /opt/app-root/lib64/python3.9/site-packages/langchain_community/llms/ollama.py | grep -A2 -B1 'response = req'

# Imports
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain_community.llms import Ollama
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate

import requests

import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

import httpx



        response = requests.post(
            verify=False,
            url=api_url,


## Initiate model download

In [5]:
# Ask the Ollama server to pull (download) the model so it is available locally.
# The server answers with a stream of status messages while the pull is running.
cookies = {
}
headers = {
    'Content-Type': 'application/json',
}
data = {
    'name': 'mistral'
}
url = inference_server_url+'/api/pull'
print('Performing model pull on ' + url)

# The `with` block releases the streamed connection even if the loop is interrupted.
# verify=False skips TLS certificate verification for this request.
with requests.post(url,
                   headers=headers,
                   cookies=cookies,
                   json=data,
                   stream=True,
                   verify=False) as response:
    if response.status_code == 200:
        print("Streaming response:")
        # Iterate over the response content in chunks
        count = 0
        for chunk in response.iter_content(chunk_size=1024):  # Adjust chunk size as needed
            if not chunk:  # filter out keep-alive new chunks
                continue
            # A fixed-size chunk can end in the middle of a multi-byte UTF-8 character;
            # errors="replace" keeps the progress display from crashing in that case.
            decoded_chunk = chunk.decode("utf-8", errors="replace")
            count += 1
            # Throttle the output: always show the first 9 chunks and every 50th chunk;
            # any other chunk is shown only if it does NOT contain "completed"
            # (i.e. it is not a download-progress update).
            if count < 10 or count % 50 == 0 or "completed" not in decoded_chunk.lower():
                print(decoded_chunk)
    else:
        print(f"Request failed with status code: {response.status_code}")

Performing model pull on https://ollama-test-ai-llm-ollama-cpu-model.apps.rdlopenshiftvm.rdlabo.local/api/pull
Streaming response:
{"status":"pulling manifest"}

{"status":"pulling ff82381e2bea","digest":"sha256:ff82381e2bea77d91c1b824c7afb83f6fb73e9f7de9dda631bcdbca564aa5435","total":4113289152,"completed":4113289152}
{"status":"pulling 43070e2d4e53","digest":"sha256:43070e2d4e532684de521b885f385d0841030efa2b1a20bafb76133a5e1379c1","total":11356,"completed":11356}
{"status":"pulling c43332387573","digest":"sha256:c43332387573e98fdfad4a606171279955b53d891ba2500552c2984a6560ffb4","total":67,"completed":67}
{"status":"pulling ed11eda7790d","digest":"sha256:ed11eda7790d05b49395598a42b155812b17e263214292f7b87d15e14003d337","total":30,"completed":30}
{"status":"pulling 42347cd80dc8","digest":"sha256:42347cd80dc868877d2807869c0e9c90034392b2f1f001cae1563488021e2e19","total":485,"completed":485}

{"status":"verifying sha256 digest"}

{"status":"writing manifest"}

{"status":"removing any unuse

### Listing models

In [6]:
import json

# Query the server's /api/tags endpoint and pretty-print the JSON it returns.
url = inference_server_url+'/api/tags'
print('Listing available local models on ' + url)
response = requests.get(url, verify=False)  # verify=False: skip TLS certificate verification

if response.status_code != 200:
    print(f"Failed to retrieve data. Status code: {response.status_code}")
else:
    # Parse the body as JSON, then re-serialise it with 4-space indentation for readability.
    json_response = response.json()
    print(json.dumps(json_response, indent=4))

Listing available local models on https://ollama-test-ai-llm-ollama-cpu-model.apps.rdlopenshiftvm.rdlabo.local/api/tags
{
    "models": [
        {
            "name": "mistral:latest",
            "model": "mistral:latest",
            "modified_at": "2024-07-11T11:27:52.821212724Z",
            "size": 4113301090,
            "digest": "2ae6f6dd7a3dd734790bbbf58b8909a606e0e7e97e94b7604e0aa7ae4490e6d8",
            "details": {
                "parent_model": "",
                "format": "gguf",
                "family": "llama",
                "families": [
                    "llama"
                ],
                "parameter_size": "7.2B",
                "quantization_level": "Q4_0"
            }
        }
    ]
}


#### Create the LLM instance

In [7]:
# LLM client: a LangChain Ollama wrapper bound to the inference server configured above.
llm = Ollama(
    # Where to send requests and which model to use
    base_url=inference_server_url,
    model="mistral",
    # Generation settings
    temperature=0.01,
    top_p=0.92,
    repeat_penalty=1.03,
    num_predict=512,
    # Callback handler that writes the streamed output to stdout
    callbacks=[StreamingStdOutCallbackHandler()],
)

#### Create the Prompt

In [8]:
# Prompt text: a system message wrapped in <<SYS>> tags inside an [INST] block, followed by
# the running conversation. {history} and {input} are the two placeholders declared as
# input_variables on the PromptTemplate below.
template="""<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always be as helpful as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>

Current conversation:
{history}
Human: {input}
AI:
[/INST]
"""
# Wrap the raw text in a PromptTemplate so it can be passed to a chain via prompt=...
PROMPT = PromptTemplate(input_variables=["history", "input"], template=template)

#### And finally some memory for the conversation

In [9]:
# Conversation memory object; it is handed to the chains below through their memory=... argument.
memory=ConversationBufferMemory()

## First example, fully verbose mode

#### Create the Chain using the different components

In [10]:
# verbose=True is deliberate here: the chain prints the fully formatted prompt,
# including the history taken from the buffer memory, before each answer.
conversation = ConversationChain(
    llm=llm,
    memory=memory,
    prompt=PROMPT,
    verbose=True,
)

#### Let's talk...

In [11]:
# Opening question of the first conversation.
# The trailing ';' suppresses the notebook's echo of predict()'s return value.
first_input = "Describe Paris in 100 words or less."
conversation.predict(input=first_input);



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3m<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always be as helpful as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>

Current conversation:

Human: Describe Paris in 100 words or less.
AI:
[/INST]
[0m
 Paris, the City of Light, is a vibrant metropolis renowned for its rich history, art, and culture. Known as the global center of fashion, gastronomy, and philosophy, it's home to iconic landmarks like the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral. Its romantic ambiance is accentuate

In [12]:
# Follow-up question that does not name the city: it only makes sense together with the
# previous exchange, which showcases the conversation memory.
second_input = "Is there a river?"
conversation.predict(input=second_input);



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3m<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always be as helpful as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>

Current conversation:
Human: Describe Paris in 100 words or less.
AI:  Paris, the City of Light, is a vibrant metropolis renowned for its rich history, art, and culture. Known as the global center of fashion, gastronomy, and philosophy, it's home to iconic landmarks like the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral. Its romantic ambiance is accentuated by the Seine

## Second example, no verbosity

In [13]:
# Verbose mode is set to False here, so the chain does not print the formatted prompt and history.
# Note: this chain is given the same `memory` object as the first chain.
conversation2 = ConversationChain(llm=llm,
                                 prompt=PROMPT,
                                 verbose=False,
                                 memory=memory
                                )

In [14]:
# Clear the previous conversation first. Both chains were built with this same `memory`
# object, so this resets the history for `conversation` as well as `conversation2`.
memory.clear()

In [15]:
# Opening question of the second (non-verbose) conversation.
# The trailing ';' suppresses the notebook's echo of predict()'s return value.
first_input = "Can you describe London in 100 words?"
conversation2.predict(input=first_input);

 London, the capital city of England and the United Kingdom, is a vibrant, historic, and cosmopolitan metropolis. Known for its iconic landmarks like the Tower of London, Buckingham Palace, and the London Eye, it's a city steeped in rich history. The River Thames, flowing through its heart, offers stunning views of the city's architecture, from modern skyscrapers to ancient buildings. London's cultural scene is diverse and dynamic, with world-class museums, galleries, and theaters. It's a global hub for finance, arts, fashion, and media, attracting millions of visitors each year. Despite its bustling urban landscape, London also boasts numerous parks and green spaces, providing a tranquil escape from the city's hustle and bustle.

In [16]:
# Follow-up question that does not name the city: it only makes sense together with the
# previous exchange, which showcases the conversation memory.
second_input = "Is there a river?"
conversation2.predict(input=second_input);

 Yes, the River Thames flows through London. It is one of the most famous rivers in Europe and plays a significant role in the city's history and daily life.