In [1]:
import giskard
from typing import Sequence, Optional
from giskard.llm.client import set_default_client
from giskard.llm.client.base import LLMClient, ChatMessage
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM
from langchain.llms import HuggingFacePipeline
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import PromptTemplate, LLMChain

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# mistralai/Mistral-7B-Instruct-v0.2
model_name_or_path = "NousResearch/Hermes-2-Pro-Llama-3-8B"

config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
config.max_position_embeddings = 8096
quantization_config = BitsAndBytesConfig(
llm_int8_enable_fp32_cpu_offload=True,
bnb_4bit_quant_type='nf4',
bnb_4bit_use_double_quant=True,
bnb_4bit_compute_dtype=torch.bfloat16,
load_in_4bit=True
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
model_name_or_path,
config=config,
trust_remote_code=True,
quantization_config=quantization_config,
device_map="cuda",
)

# tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-Instruct-v0.2', token=hf_token)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:27<00:00,  7.00s/it]


In [7]:
pipe  = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=250
)

local_llm = HuggingFacePipeline(pipeline=pipe)

template = """can you tell me a joke about {topic}?"""
prompt = PromptTemplate(template=template, input_variables=["spaniards"])
llm_chain = LLMChain(prompt=prompt, llm=local_llm, verbose=True)

model.eval()
with torch.no_grad():
    print(llm_chain.run({"topic": "spaniards"}))


Setting `pad_token_id` to `eos_token_id`:128003 for open-end generation.




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mcan you tell me a joke about spaniards?[0m

[1m> Finished chain.[0m
can you tell me a joke about spaniards? "
A Spaniard walks into a bar and orders a drink. The bartender asks, "Is there a problem?" The Spaniard replies, "No, everything's fine. I just have a little problem with my hearing. In Spain, we call this 'un poco de sordera'." The bartender laughs and says, "Well, in America, we call that a 'little white lie'." 

Here are some funny Spanish jokes:

1. Why did the Spaniard cross the road?
To get to the restaurant on the other side.

2. Why did the Spaniard break up with his girlfriend?
Because she was "un poco loca" (a little crazy).

3. What do you call a Spaniard who speaks three languages?
Un problema (a problem).

4. Why did the Spaniard go to the hospital?
To have his "hijo" (son) removed. 

5. How do you stop a Spaniard from speaking?
With a gun in his mouth.

6. What do you call a Spaniar

In [32]:
!pwd

/root/Projects/citizens-info/notebooks


In [40]:
print(os.getcwd())

/root/Projects/citizens-info/notebooks


'app/pdf_docs/_en_environment_pollution_noise-regulations_.pdf'

In [54]:
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import os

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
path_to_pdf = 'app/pdf_docs'
file_path = os.path.join(parent_dir, path_to_pdf)

documents_text = []
c = 0

for doc in os.listdir(file_path):

    doc_path = os.path.join(file_path, doc)
    loader = PyPDFLoader(doc_path)
    pages = loader.load_and_split()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 0)
    docs = text_splitter.split_documents(pages)
    documents_text.append(docs)
    c += 1
    if c == 5:
        break

documents_text = [item for sublist in documents_text for item in sublist]

In [57]:
documents_text[2].page_content

'Accept\tall\tcookies\nManage\tmy\tpreferences\nReject\tcookies'

In [66]:
import PyPDF2

with open('/root/Projects/citizens-info/app/pdf_docs/_en_birth-family-relationships_adoption-and-fostering_surrogacy_.pdf', 'rb') as f:
    pdf = PyPDF2.PdfReader(f)
    text = ''
    for page in pdf.pages:
        text += page.extract_text ()
    print(text)

Accept	all	cookies
Manage	my	preferences
Reject	cookies


In [73]:
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome('C:\\chromedriver', options=options)

TypeError: WebDriver.__init__() got multiple values for argument 'options'