Source: https://github.com/conceptofmind/toolformer.git

In [3]:
!pip -q install accelerate
!pip -q install sentencepiece

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.3 MB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.3 MB[0m [31m7.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━[0m [32m0.9/1.3 MB[0m [31m8.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.3/1.3 MB[0m [31m10.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
#Tools
import copy
import calendar
import torch
import datetime
import time
from transformers import (AutoModelForSeq2SeqLM,
                          AutoTokenizer,
                          AutoModel,
                          T5ForConditionalGeneration,)
from typing import List
from operator import truediv, mul, add, sub

"""
retrieval
Uses Carptriever to retrieve sentences before the current context.
input_sentences - List[String], sentences to retrieve from
input_text - String, the input text (e.g. The dog's name is)
k - The number of sentences to retrieve
output - A list of strings, each string is the retrieved sentence, and the sentence after.
"""
def mean_pooling(token_embeddings: torch.Tensor, mask: torch.Tensor):
    """Average token embeddings over dim 1, ignoring masked-out positions.

    token_embeddings - Tensor of per-token vectors; assumed to be laid out as
                       (batch, tokens, hidden) - TODO confirm against the encoder
    mask - Tensor that is non-zero for real tokens and zero for padding; it is
           broadcast over the trailing dimension of token_embeddings
    output - Tensor with dim 1 reduced away: the sum of the unmasked token
             vectors divided by the number of unmasked tokens
    """
    # Positions where the mask is zero must not contribute to the sum.
    padding = ~mask.unsqueeze(-1).bool()
    summed = token_embeddings.masked_fill(padding, 0.0).sum(dim=1)
    # Trailing axis added so the per-row token count broadcasts over the vector.
    token_counts = mask.sum(dim=1).unsqueeze(-1)
    return summed / token_counts
class Retriever:
    """Sentence retriever built on the ``CarperAI/carptriever-1`` encoder.

    Sentences and the query are embedded with mean pooling and ranked by the
    dot product between the query embedding and each sentence embedding.
    """

    def __init__(self):
        # Prefer the GPU, but fall back to the CPU so the tool still works on
        # hosts without CUDA (an unconditional ``.cuda()`` raises there).
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = AutoModel.from_pretrained(
            "CarperAI/carptriever-1", add_pooling_layer=False
        ).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained("CarperAI/carptriever-1")

    def _embed(self, text: str) -> torch.Tensor:
        """Return the mean-pooled embedding of a single piece of text."""
        encoded = self.tokenizer(
            text, padding=True, truncation=True, return_tensors="pt"
        )
        # Move every tensor the tokenizer produced onto the model's device.
        encoded = {name: tensor.to(self.device) for name, tensor in encoded.items()}
        with torch.no_grad():
            outputs = self.model(**encoded)
        return mean_pooling(outputs[0], encoded["attention_mask"])

    def retrieval(
        self, input_sentences: List[str], input_text: str, k: int
    ) -> List[str]:
        """Return the k sentences most similar to ``input_text``.

        input_sentences - List[String], sentences to retrieve from
        input_text - String, the query text (e.g. "The dog's name is")
        k - The number of sentences to retrieve
        output - A list of strings ordered by descending score; each string is
                 a retrieved sentence, a space, and the sentence that follows
                 it in ``input_sentences``. If k exceeds the number of
                 sentences the input list is returned unchanged; if k is not
                 positive an empty list is returned.
        """
        if k > len(input_sentences):
            # I'd error but LMs do stupid stuff sometimes
            return input_sentences
        if k <= 0:
            # Nothing was asked for; also avoids concatenating an empty list of
            # embeddings when there are no sentences at all.
            return []

        sentence_embeddings = torch.cat(
            [self._embed(sentence) for sentence in input_sentences], 0
        )
        query_embedding = self._embed(input_text)
        scores = (
            (query_embedding @ sentence_embeddings.transpose(0, 1)).cpu().tolist()[0]
        )

        # Each sentence is paired with its successor.
        # NOTE: the last sentence has no successor, so the query text is used as
        # its continuation (this matches the long-standing output of this tool).
        followers = list(input_sentences[1:]) + [input_text]
        # One stable sort keeps sentence, follower and score aligned.
        ranked = sorted(
            zip(input_sentences, followers, scores),
            key=lambda triple: triple[2],
            reverse=True,
        )
        return [sentence + " " + follower for sentence, follower, _ in ranked[:k]]

"""
Calendar
Uses Python's datetime and calendar libraries to retrieve the current date.
input - an optional datetime to format (the current date is used when omitted) => output - A string, the date written as a sentence.
"""
def Calendar(date=None):
    """Describe a date as an English sentence.

    date - optional datetime to describe; when omitted the current local
           date and time is read at call time
    output - A string such as "Today is Tuesday, January 2, 2024."
    """
    # A default of ``datetime.datetime.now()`` in the signature would be
    # evaluated only once, when the function is defined, and then go stale.
    if date is None:
        date = datetime.datetime.now()
    return f"Today is {calendar.day_name[date.weekday()]}, {calendar.month_name[date.month]} {date.day}, {date.year}."

"""
Machine Translation - NLLB-600M
Uses HuggingFace's transformers library to translate input query to English.
input_query - A string, the input query (e.g. "what is a dog?") => output - A string, the translated input query.
"""
def MT(input_query: str):
    """Translate ``input_query`` into English with NLLB-200 (distilled, 600M).

    input_query - A string, the text to translate
    output - A string, the first decoded sequence with special tokens removed
    """
    # NOTE: the tokenizer and model are loaded on every call, which dominates
    # the running time of this tool.
    model_name = "facebook/nllb-200-distilled-600M"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    encoded = tokenizer(input_query, return_tensors="pt")
    # Look the target-language token up through the vocabulary: the
    # ``lang_code_to_id`` mapping is deprecated on the NLLB tokenizers, while
    # ``convert_tokens_to_ids`` is available on every tokenizer version.
    english_token_id = tokenizer.convert_tokens_to_ids("eng_Latn")
    outputs = model.generate(**encoded, forced_bos_token_id=english_token_id)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

"""
Calculator
Calculates the result of a mathematical expression.
input_query - A string, the input query (e.g. "400/1400") => output - A float, the result of the calculation rounded to two decimal places
Adapted from: https://levelup.gitconnected.com/3-ways-to-write-a-calculator-in-python-61642f2e4a9a
"""
def Calculator(input_query: str):
    """Evaluate a flat arithmetic expression made of numbers and ``+ - * /``.

    input_query - A string such as "400/1400"; whitespace around numbers and
                  operators is ignored and decimal operands are accepted.
                  Parentheses are not supported.
    output - A float rounded to two decimal places, or None when the query
             contains neither a number nor an operator.
    Raises TypeError when an operator is missing an operand (e.g. "1+") and
    ZeroDivisionError on division by zero.
    """
    operators = {"+": add, "-": sub, "*": mul, "/": truediv}

    def evaluate(expression: str):
        expression = expression.strip()
        try:
            # Covers integers, decimals and a leading sign in one step.
            return float(expression)
        except ValueError:
            pass
        # "+"/"-" are tried before "*"/"/" so that they end up nearest the root
        # of the evaluation, giving multiplication and division higher
        # precedence.
        for symbol, operation in operators.items():
            # Split on the LAST occurrence so "-" and "/" associate to the
            # left: "10-2-3" is (10-2)-3, not 10-(2-3).
            left, found, right = expression.rpartition(symbol)
            if found:
                return operation(evaluate(left), evaluate(right))
        return None

    result = evaluate(input_query)
    # Round once at the end; rounding intermediate values loses precision
    # (e.g. "1/3*3" would come out as 0.99).
    return None if result is None else round(result, 2)

# Other Optional Tools
"""
SteamSHP
Uses HuggingFace's transformers library to generate text.
input_query - A string, the input query (e.g. "what is a dog?")
output - A list of strings, the generated text
"""
def SteamSHP(input_query: str):
    """Generate text for ``input_query`` with ``stanfordnlp/SteamSHP-flan-t5-large``.

    input_query - A string, the input query (e.g. "what is a dog?")
    output - A list of strings, the decoded generations (at most 10 new
             tokens each, special tokens removed)
    """
    model_name = "stanfordnlp/SteamSHP-flan-t5-large"
    # Use the GPU when there is one, otherwise run on the CPU instead of
    # failing on the device transfer.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
    x = tokenizer([input_query], return_tensors="pt").input_ids.to(device)
    y = model.generate(x, max_new_tokens=10)
    return tokenizer.batch_decode(y, skip_special_tokens=True)

In [2]:
if __name__ == "__main__":
    # Smoke-test each tool once, printing its result and the wall-clock time.
    demos = (
        ("Calendar", Calendar, ()),  # Outputs a string, the current date
        ("Calculator", Calculator, ("400/1400",)),  # For Optional Basic Calculator
        ("MT", MT, ("Un chien c'est quoi?",)),  # What is a dog?
        ("SteamSHP", SteamSHP, ("What is a dog?",)),  # Outputs a list with an answer
    )
    for label, tool, args in demos:
        began = time.time()
        print(tool(*args))
        elapsed = time.time() - began
        print(f'elapsed time ({label}): ', elapsed)
        print('---')

    # The retriever timing deliberately includes loading the model.
    began = time.time()
    retriever = Retriever()
    ret_strings = [
        'New Orleans is a city in the southeastern region of the United States, located in the state of Louisiana.',
        'It is situated along the Mississippi River and is known for its distinctive music, Creole cuisine, unique dialects, and its annual celebrations and festivals, most notably Mardi Gras.',
        'The historic heart of the city is the French Quarter, known for its French and Spanish Creole architecture and vibrant nightlife along Bourbon Street.',
        'New Orleans is also a region of French Louisiana, along with the Cajun Country.',
    ]
    ret_val = "location of New Orleans"
    print(retriever.retrieval(ret_strings, ret_val, 3))
    elapsed = time.time() - began
    print('elapsed time (retriever): ', elapsed)

Today is Tuesday, January 2, 2024.
elapsed time (Calendar):  0.007524728775024414
---
0.29
elapsed time (Calculator):  0.000431060791015625
---


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


What is a dog?
elapsed time (MT):  28.424312591552734
---


config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

['A dog is a dog']
elapsed time (SteamSHP):  85.46702289581299
---


config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/348 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

['New Orleans is also a region of French Louisiana, along with the Cajun Country. location of New Orleans', 'New Orleans is a city in the southeastern region of the United States, located in the state of Louisiana. It is situated along the Mississippi River and is known for its distinctive music, Creole cuisine, unique dialects, and its annual celebrations and festivals, most notably Mardi Gras.', 'The historic heart of the city is the French Quarter, known for its French and Spanish Creole architecture and vibrant nightlife along Bourbon Street. New Orleans is also a region of French Louisiana, along with the Cajun Country.']
elapsed time (retriever):  44.75029373168945
