In [None]:
!pip install scikit-build # needed to run setup.py
!pip uninstall -y llama-cpp-python
!cd ../.. && python3 setup.py install

In [None]:
!apt install -y graphviz
!pip install graphviz

In [1]:
import graphviz
from IPython.display import display, HTML

def wrap_graphviz(dot):
    """Render DOT statements to SVG and wrap the markup for notebook display.

    Args:
        dot: DOT statements, handed to `graphviz.Digraph` as its `body`.

    Returns:
        An `IPython.display.HTML` object holding the UTF-8 decoded SVG output.
    """
    graph = graphviz.Digraph(body=dot)
    svg_markup = graph.pipe(format='svg').decode("utf-8")
    return HTML(svg_markup)

In [2]:
import os

from llama_cpp import Llama

# Location of the quantised model weights. Set the LLAMA_MODEL_PATH environment
# variable to run the notebook on another machine without editing this cell.
MODEL_PATH = os.environ.get("LLAMA_MODEL_PATH", "/workspace/llama.cpp/models/7B/ggml-model-q4_0.bin")

# NOTE(review): `logits_all=True` is presumably required so that completions can
# return log-probabilities (used by `TokenChoiceTree.eval`) — confirm against the
# llama-cpp-python build in use.
llm = Llama(model_path=MODEL_PATH, logits_all=True, verbose=False)

llama.cpp: loading model from /workspace/llama.cpp/models/7B/ggml-model-q4_0.bin
llama_model_load_internal: format     = ggjt v1 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =  68.20 KB
llama_model_load_internal: mem required  = 5809.33 MB (+ 1026.00 MB per state)
llama_init_from_file: kv self size  =  256.00 MB


# Given a list of strings, return the most likely completion

In [3]:
import numpy
from typing import List

class TokenChoiceTree:
    """Prefix tree of token sequences used to score candidate completions.

    Every candidate text is tokenized and inserted into the tree, so candidates
    sharing a prefix share nodes. `eval` then walks the tree and, for each node
    that has children, asks the model for the distribution of the next token and
    stores the probability of each child. A candidate's score (`prob`) is the
    geometric mean of the token probabilities along its path.
    """

    def __init__(self, llm, token=None, depth=0, parent=None):
        """Create a node.

        Args:
            llm: model object providing `tokenize`, `detokenize` and `__call__`.
            token: token id held by this node, or None for the root.
            depth: number of tokens between the root and this node (0 for the root).
            parent: parent node, or None for the root.
        """
        self.llm = llm
        self.token = token
        self.depth = depth
        self.parent = parent # only used for GraphViz
        self.children = {} # token id -> child node
        self.proba = None # probability of this token to be produced given its parent. None for the root.
        self.cumul = 1. # probability of this path being taken (product of `proba`). "1." for the root.

    def __add(self, sequence:List[int]):
        """Insert a token sequence below this node and return its last node.

        Raises:
            IndexError: if `sequence` is empty.
        """
        if len(sequence) == 0:
            raise IndexError("cannot add an empty token sequence")
        node = self
        for tok in sequence:
            child = node.children.get(tok)
            if child is None:
                child = TokenChoiceTree(self.llm, token=tok, depth=node.depth+1, parent=node)
                node.children[tok] = child
            node = child
        return node

    def add(self, text:str):
        """Tokenize `text`, insert it in the tree and return the leaf reached.

        The first token returned by `llm.tokenize` is dropped before insertion.
        NOTE(review): presumably that token is the BOS marker — confirm.
        """
        return self.__add(self.llm.tokenize(text)[1:])

    def decode(self):
        """Return this node's token as text ('' for the root); undecodable bytes are ignored."""
        return '' if self.token is None else self.llm.detokenize([self.token]).decode("utf-8", errors="ignore")

    def eval(self, prompt):
        """Fill in `proba` and `cumul` for every node below this one.

        Args:
            prompt: text preceding this node's token; this node's own token text
                is appended before querying the model.
        """
        if self.token is not None:
            prompt += self.decode()

        # A leaf has no child to score: skip the (expensive) model call.
        if not self.children:
            return

        # NOTE(review): `logprobs=-1, full_logprobs=True` presumably returns the
        # log-probabilities of the whole vocabulary indexed by token id — confirm
        # against the llama-cpp-python build in use.
        output = self.llm(prompt, max_tokens=1, logprobs=-1, full_logprobs=True)
        probs = numpy.exp(output['choices'][0]['logprobs'][0])

        for tree in self.children.values():
            tree.proba = probs[tree.token]
            tree.cumul = self.cumul * tree.proba
            tree.eval(prompt)

    def prob(self):
        """Return the geometric mean of the token probabilities on the path to this node.

        Returns None for the root. Taking the `depth`-th root of `cumul` keeps
        longer sequences from being penalised only for their length.
        """
        if self.depth == 0 or self.cumul is None:
            return None
        return numpy.power(self.cumul, 1./self.depth)

    @classmethod
    def run(cls, llm, prompt, texts):
        """Build a tree from `texts`, evaluate it after `prompt`, and score each text.

        Returns:
            A tuple `(tree, probs)` where `probs[i]` is `prob()` of the leaf of `texts[i]`.
        """
        tree = cls(llm)
        leaves = [ tree.add(t) for t in texts ]
        tree.eval(prompt)
        return (tree, [ l.prob() for l in leaves ])

    @classmethod
    def choose(cls, llm, prompt, texts):
        """Return the index in `texts` of the candidate with the highest score."""
        return numpy.argmax(cls.run(llm, prompt, texts)[1])

    def depthfirst(self):
        """Yield this node, then every descendant, in depth-first pre-order."""
        yield self
        for c in self.children.values():
            yield from c.depthfirst()

    @staticmethod
    def _dot_escape(text):
        """Escape backslashes and double quotes so `text` can sit inside a quoted DOT string."""
        return text.replace('\\', '\\\\').replace('"', '\\"')

    def toGraphViz(self):
        """Return DOT statements describing the subtree rooted at this node.

        Each node is numbered in traversal order (stored in its `id` attribute)
        and labelled with its token text and score; the root is labelled "ROOT".
        """
        cnt = 0
        dotstr = ""
        # Pre-order traversal guarantees a parent receives its id before its children.
        for t in self.depthfirst():
            t.id = cnt
            cnt += 1
            if t.parent is not None:
                # The number shown is prob() multiplied by 1e6 (10e12 / 10e6),
                # truncated to 7 decimal places.
                label = f"{self._dot_escape(t.decode())}\\n{int(t.prob()*10e12)/10e6}"
            else:
                label = "ROOT"
            dotstr += f'n_{t.id} [label="{label}"];'
            if t.parent is not None:
                dotstr += f'n_{t.parent.id} -> n_{t.id};'
        return dotstr

In [4]:
# Inputs shared by the two demonstrations below.
planets_prompt = "Q: Name the first three planets in the solar system? A:"
planets_texts = [
    b"1) Mercury, 2) Venus, and 3) Earth.",
    b"1) Jupiter, 2) Mars, and 3) Earth.",
    b"1) Mercury, 2) Uranus, and 3) Earth.",
]

# `TokenChoiceTree.run` gives back the tree together with one score per candidate text
run_result = TokenChoiceTree.run(llm, prompt=planets_prompt, texts=planets_texts)
print("RUN:    " + str(run_result))

# `TokenChoiceTree.choose` gives back the index of the highest-scoring candidate text
chosen_idx = TokenChoiceTree.choose(llm, prompt=planets_prompt, texts=planets_texts)
print("CHOOSE: " + str(chosen_idx))

RUN:    (<__main__.TokenChoiceTree object at 0x7f4dac55d810>, [0.3645556520007401, 0.20174114780655208, 0.17524178480939925])
CHOOSE: 0


# Let LLama decide when to think and when to answer

This takes ReAct's prompt structure but increases control over the prompt sequence.
This can be used to control more finely the completion:
 - choose between prescribed sequences of tokens to progress in the sequence of prompts
 - perform beam-search (or other search algo) with specific parameters based on current part of the prompt sequence

In [5]:
import sys
# Stream that `QNA_thoughts` writes the prompt and every completion to.
out_stream = sys.stdout

# Prompt template filled in by `QNA_thoughts` through `str.format`.
# Placeholders: {max_thoughts} (shown inside the `Thought[...]` format line) and
# {question} (the question to answer). The text ends right after the
# "Question: ..." line so that the next statement can be appended to it.
prompt_QNA_thoughts = """\
You are using an interractive questionnaire.
You have been given a question, think about it, then answer.

Use the following format after the start prompt:
```
Question: the input question you must answer
Thought[{max_thoughts}]: think about the question and the possible answer
Answer: use your thoughts to formulate the correct answer
```
Use as many `Thought` as you need to provide the correct answer. Each statement ends with a new line.

Start:
Question: {question}
"""

def QNA_thoughts(question, max_thoughts=5, completion_kwargs=None):
    """Answer `question` by alternating model "thoughts" and a final answer.

    The prompt is built from `prompt_QNA_thoughts`. The first statement is always
    `Thought[1]` (when `max_thoughts` allows at least one thought). Before every
    later statement `TokenChoiceTree.choose` decides whether the model prefers
    another `Thought[i]: ` or `Answer: `. Once `max_thoughts` thoughts have been
    written the answer is forced. Each statement is completed up to the first
    newline, appended to the prompt and echoed to `out_stream`.

    Args:
        question: the question inserted into the prompt template.
        max_thoughts: maximum number of thoughts before the answer is forced;
            with a value below 1 the answer is produced immediately.
        completion_kwargs: two keyword-argument dicts passed to the model call,
            index 0 for thoughts and index 1 for the answer. Defaults to two
            empty dicts.

    Returns:
        None. The transcript is written to `out_stream`.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    if completion_kwargs is None:
        completion_kwargs = [{}, {}]

    prompt = prompt_QNA_thoughts.format(question=question, max_thoughts=max_thoughts)
    out_stream.write(prompt)

    thought_idx = 1
    while True:
        if thought_idx > max_thoughts:
            # Thought budget exhausted (or no thought allowed at all): force the answer.
            answering = True
        elif thought_idx == 1:
            # The first statement is always a thought.
            answering = False
        else:
            # Let the model pick which of the two prefixes is the more likely continuation.
            choices = [ bytes(f"Thought[{thought_idx}]: ", "utf-8"), b"Answer: "]
            answering = TokenChoiceTree.choose(llm, prompt, choices) == 1

        prefix = "Answer: " if answering else f"Thought[{thought_idx}]: "
        out_stream.write(prefix)
        prompt += prefix

        out = llm(prompt, stop='\n', **completion_kwargs[1 if answering else 0])
        text = out['choices'][0]['text'].strip()
        prompt += text + '\n'
        out_stream.write(text+'\n')

        if answering:
            break
        thought_idx += 1

In [6]:
# `completion_kwargs[0]` is used for every Thought completion and
# `completion_kwargs[1]` for the Answer completion: here thoughts are capped at
# 10 tokens with temperature 0.2, the answer at 50 tokens with temperature 0.8.
QNA_thoughts(
    question="What are the first three planets in the solar system?",
    max_thoughts=10,
    completion_kwargs=[
      { 'max_tokens' : 10, 'temperature' : 0.2 },
      { 'max_tokens' : 50, 'temperature' : 0.8 }
  ]
)

You are using an interractive questionnaire.
You have been given a question, think about it, then answer.

Use the following format after the start prompt:
```
Question: the input question you must answer
Thought[10]: think about the question and the possible answer
Answer: use your thoughts to formulate the correct answer
```
Use as many `Thought` as you need to provide the correct answer. Each statement ends with a new line.

Start:
Question: What are the first three planets in the solar system?
Thought[1]: Mercury, Venus and Earth
Thought[2]: 1) Mercury is closest to the Sun.
Thought[3]: 2) Venus is the hottest planet
Thought[4]: 3) Earth is the largest planet in the solar
Thought[5]: 4) Mars has two moons
Thought[6]: 5) Jupiter is the biggest planet
Thought[7]: 6) Saturn has rings
Thought[8]: 7) Uranus is tilted on
Thought[9]: 8) Neptune is the coldest planet
Thought[10]: 9) Pluto is a dwarf planet
Answer: 2) Venus is the hottest planet, 3) Earth is the largest planet in the solar 

In [7]:
# Same helper with a smaller thought budget: thoughts use temperature 0.0 and at
# most 10 tokens, the answer uses temperature 0.2 and at most 30 tokens.
QNA_thoughts(
    question="What is a compiler?",
    max_thoughts=5,
    completion_kwargs=[
        { 'max_tokens' : 10, 'temperature' : 0.0 },
        { 'max_tokens' : 30, 'temperature' : 0.2 }
    ]
)

You are using an interractive questionnaire.
You have been given a question, think about it, then answer.

Use the following format after the start prompt:
```
Question: the input question you must answer
Thought[5]: think about the question and the possible answer
Answer: use your thoughts to formulate the correct answer
```
Use as many `Thought` as you need to provide the correct answer. Each statement ends with a new line.

Start:
Question: What is a compiler?
Thought[1]: A compiler is an application that translates source code
Thought[2]: into machine language instructions
Thought[3]: 
Thought[4]: 
Thought[5]: 
Answer: 


# Let LLama select a command from a list

In [8]:
import itertools

class CommandSelector:
    """Pick the most likely command, among a fixed list, to follow a prompt.

    Commands are registered from templates with `add`, converted to bytes with
    `finalyse`, and scored against a formatted prompt with `run`.
    """

    def __init__(self, llm, prompt):
        """Store the model and the prompt template.

        Args:
            llm: model object used for both the free completion and the scoring.
            prompt: `str.format` template filled with the keyword arguments of `run`.
        """
        self.llm = llm
        self.prompt = prompt
        self.commands = []

    def add(self, template, *options):
        """Register one command per combination of `options`.

        Args:
            template: `str.format` template with one positional field per option list.
            *options: iterables of values; their Cartesian product is expanded.
                Without options the template is registered once as is.

        Returns:
            A tuple `(first, added)`: the index of the first new command and the
            list of commands just added.
        """
        first = len(self.commands)
        self.commands += [ template.format(*opt) for opt in itertools.product(*options) ]
        return (first, self.commands[first:])

    def finalyse(self):
        """Encode every registered command to UTF-8 bytes.

        Already encoded commands are left untouched, so the method can safely be
        called several times, including after further calls to `add`.
        """
        self.commands = [ c if isinstance(c, bytes) else bytes(c, "utf-8") for c in self.commands ]

    def run(self, **inputs):
        """Score every command after the formatted prompt.

        Args:
            **inputs: values for the fields of the prompt template.

        Returns:
            A tuple `(prob, command, tree, freestyle)`: the score of each command,
            the best-scoring command (bytes), the evaluated `TokenChoiceTree`, and
            the model's unconstrained completion of the prompt up to the first newline.

        Raises:
            ValueError: if no command has been registered.
        """
        if not self.commands:
            raise ValueError("no command registered: call `add` before `run`")
        # Make sure commands added after the last `finalyse` are encoded too.
        self.finalyse()
        prompt = self.prompt.format(**inputs)
        # Use the model given to this selector rather than a global one.
        freestyle = self.llm(prompt, stop=['\n'])['choices'][0]['text']
        (tree,prob) = TokenChoiceTree.run(self.llm, prompt=prompt, texts=self.commands)
        idx = numpy.argmax(prob)
        return ( prob, self.commands[idx], tree, freestyle )

## Example: Home-automation

- Multiple listening devices
- A speech-to-text model listens for queries
- The query is sent to LLama with contextual information
- LLama decides what to do from a list of commands

In [10]:
# Build the selector: the prompt template takes a `room` and a `query`, and the
# model is asked what should follow "Command: ".
CS = CommandSelector(llm=llm, prompt="Room: {room}\nQuery: {query}\nCommand: ")
# Each `add` expands its template with every combination of the option lists.
CS.add("turn all lights off")
CS.add("turn temperature {} by {}", ["up","down"], [1,2,3,5,10])
CS.add("switch {} {} lights", ["on","off"], ["livingroom","kitchen","bedroom","office"])
CS.add("switch {} {} fan", ["on","off"], ["livingroom","bedroom","office"])
# Convert the registered commands to bytes before scoring them.
CS.finalyse()

# Sample (room, query) pairs used to exercise the selector.
tests = [
  { 'room' : "kitchen", 'query' : "it is dark in here" },
  { 'room' : "kitchen", 'query' : "I am very cold" },
  { 'room' : "kitchen", 'query' : "It is a bit too warm in here" },
  { 'room' : "bedroom", 'query' : "I can't see" },
  { 'room' : "bedroom", 'query' : "I am going to bed" }
]

# For every sample, print the selected command next to the model's unconstrained
# completion, then draw the scored token tree.
for inputs in tests:
    ( prob, command, tree, freestyle ) = CS.run(**inputs)
    print(f"inputs.room={inputs['room']}")
    print(f"inputs.query={inputs['query']}")
    print(f"command={command}")
    print(f"freestyle={freestyle}")
    display(wrap_graphviz(tree.toGraphViz()))
    print("=======================================================================================")

inputs.room=kitchen
inputs.query=it is dark in here
command=b'switch on kitchen lights'
freestyle=007, do you expect me to talk? [Y/N]


inputs.room=kitchen
inputs.query=I am very cold
command=b'turn temperature up by 10'
freestyle=1. heat up


inputs.room=kitchen
inputs.query=It is a bit too warm in here
command=b'turn temperature down by 10'
freestyle=3.2


inputs.room=bedroom
inputs.query=I can't see
command=b'turn temperature up by 10'
freestyle=93218645403


inputs.room=bedroom
inputs.query=I am going to bed
command=b'switch off bedroom lights'
freestyle=19:32:54 [Bed Room]


