In [1]:
import torch
import transformers
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline

In [2]:
# Checkpoint id used for both the tokenizer and the model weights.
# The commented-out ids are alternatives; keep exactly one assignment active.
model_name = "ehartford/Wizard-Vicuna-13B-Uncensored"
# model_name = "TheBloke/wizard-vicuna-13B-GPTQ"
# model_name = "TheBloke/wizardLM-7B-HF"

tokenizer = LlamaTokenizer.from_pretrained(model_name)

# Load the causal LM with 8-bit loading requested, fp16 as the torch dtype,
# automatic device placement and reduced CPU memory use while loading.
model = LlamaForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    load_in_8bit=True,
    low_cpu_mem_usage=True,
)


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/tamizh/miniconda3/envs/langchain-llm/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /home/tamizh/miniconda3/envs/langchain-llm/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/tamizh/miniconda3/envs/langchain-llm/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
import torch

# Wrap the already-loaded model and tokenizer in a text-generation pipeline,
# then expose that pipeline to LangChain as an LLM.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # max_length bounds prompt tokens + generated tokens together, so long
    # retrieved contexts leave less room for the answer.
    max_length=1024,
    # Deterministic (greedy) decoding, stated explicitly. The previous
    # temperature=0 / top_p=0.95 settings are sampling-only knobs: they have no
    # effect unless do_sample=True, and temperature=0 is not a valid sampling
    # temperature, so they only produced warnings (or errors once sampling is on).
    do_sample=False,
    repetition_penalty=1.15,
)

local_llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
import os

from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader


from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

In [None]:
# Load the source text to index: a single local file, 'twitter_tos.txt'.
loader = TextLoader('twitter_tos.txt')
# Alternative: load every .txt file under ./new_articles/ instead.
# loader = DirectoryLoader('./new_articles/', glob="./*.txt", loader_cls=TextLoader)

# `documents` is whatever the loader returns; it is split into chunks in a later cell.
documents = loader.load()

In [12]:
# Split the loaded documents into overlapping chunks before embedding them.
# chunk_size / chunk_overlap are the splitter's own settings (1000 and 200 here).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

In [13]:
# Number of chunks produced by the splitter.
len(texts)

32

In [14]:
# Inspect a single chunk (index 3) to sanity-check the split.
texts[3]

Document(page_content='We reserve the right to remove Content that violates the User Agreement, including for example, copyright or trademark violations or other intellectual property misappropriation, impersonation, unlawful conduct, or harassment. Information regarding specific policies and the process for reporting or appealing violations can be found in our Help Center (https://help.twitter.com/en/rules-and-policies/twitter-report-violation#specific-violations and https://help.twitter.com/en/managing-your-account/suspended-twitter-accounts).\n\nIf you believe that your Content has been copied in a way that constitutes copyright infringement, please report this by visiting our Copyright reporting form (https://help.twitter.com/forms/dmca) or contacting our designated copyright agent at:\n\nTwitter, Inc.\nAttn: Copyright Agent\n1355 Market Street, Suite 900\nSan Francisco, CA 94103\nReports: https://help.twitter.com/forms/dmca\nEmail: copyright@twitter.com\nYour Rights and Grant of R

## HF Embeddings (Instructor / sentence-transformers)

In [16]:
from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings



# model_name="hkunlp/instructor-base" 
# model_kwargs={"device": "cpu"}

# model_name="hkunlp/instructor-xl" 
# model_kwargs={"device": "cuda"}


# Active embedding model: a sentence-transformers checkpoint run on the CPU.
# NOTE: this rebinds `model_name`, which earlier held the LLM checkpoint id.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {"device": "cpu"}

# NOTE: despite the variable name, this is a plain HuggingFaceEmbeddings
# instance, not HuggingFaceInstructEmbeddings.
instructor_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
)




## Create the VectorStore

In [48]:
# Embed the chunks and store them in a Chroma collection.
# persist_directory is the on-disk location Chroma uses for the collection.
persist_directory = 'db'

# Embedding model used to vectorise the chunks.
embedding = instructor_embeddings

vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

# Flush the collection to persist_directory explicitly. In a notebook the
# client object is not reliably destroyed, so without this call the embeddings
# may never reach disk even though a persist_directory was supplied.
# NOTE(review): re-running this cell adds the same chunks to the collection
# again — confirm whether 'db' should be cleared or reloaded between runs.
vectordb.persist()

In [49]:
# Ad-hoc retrieval check: run a similarity search for a sample question
# directly against the vector store (no LLM involved).
query = "Laws of which state, governs these Terms?"
docs = vectordb.similarity_search(query)

In [12]:
# Show the chunks returned by the similarity search.
docs

[Document(page_content='If you are a federal, state, or local government entity in the United States using the Services in your official capacity and legally unable to accept the controlling law, jurisdiction or venue clauses above, then those clauses do not apply to you. For such U.S. federal government entities, these Terms and any action related thereto will be governed by the laws of the United States of America (without reference to conflict of laws) and, in the absence of federal law and to the extent permitted under federal law, the laws of the State of California (excluding choice of law).\n\nIn the event that any provision of these Terms is held to be invalid or unenforceable, then that provision will be limited or eliminated to the minimum extent necessary, and the remaining provisions of these Terms will remain in full force and effect. Twitter’s failure to enforce any right or provision of these Terms will not be deemed a waiver of such right or provision.', metadata={'sour

## Make a Retriever

In [20]:
# Expose the vector store as a retriever for the QA chain.
# search_kwargs={"k": 3} presumably caps each lookup at three chunks — confirm against the retriever docs.
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

## Make a Chain

In [21]:
# Build the question-answering chain: the retriever supplies context chunks,
# the local LLM answers using the "stuff" chain type, and the source documents
# are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=local_llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

In [45]:
# Print the prompt template used by the chain's inner LLM chain.
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


In [28]:
## Cite sources

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap `text` to `width` columns while keeping its existing line breaks.

    The text is cut at each newline character, every piece is wrapped on its
    own with `textwrap.fill`, and the pieces are joined back with newlines.

    Args:
        text: The string to wrap.
        width: Maximum line width passed to `textwrap.fill` (default 110).

    Returns:
        The wrapped string.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

def process_llm_response(llm_response):
    """Print the chain's answer followed by the text of each source document.

    Args:
        llm_response: Mapping with a 'result' entry (the answer text) and a
            'source_documents' entry (an iterable of objects exposing
            `page_content`).
    """
    print("Answer:")
    wrapped_answer = wrap_text_preserve_newlines(llm_response['result'])
    print(wrapped_answer)

    print('Reference:')
    sources = llm_response["source_documents"]
    for position, source in enumerate(sources):
        # Index prefix and passage text share one output line.
        print(f'{position} - ', end='')
        print(source.page_content)

In [29]:
# full example
def get_answer(query):
    """Run `query` through the QA chain and print the answer with its sources."""
    process_llm_response(qa_chain(query))

In [27]:
# Example question answered end to end (retrieval + LLM).
get_answer("Does the TOS allow third party to create products similar to Twitter?")

Answer:

No, according to the TOS, Twitter reserves the right to use any feedback, comments, or suggestions provided by
users regarding the Services, including any ideas for improving the Services or creating new products.
Additionally, Twitter owns all intellectual property rights related to the Services, excluding content
provided by users. Therefore, it is unlikely that third parties would be able to create products similar to
Twitter without infringing upon Twitter's intellectual property rights.

Reference:
0 - 
The Services are protected by copyright, trademark, and other laws of both the United States and other countries. Nothing in the Terms gives you a right to use the Twitter name or any of the Twitter trademarks, logos, domain names, other distinctive brand features, and other proprietary rights. All right, title, and interest in and to the Services (excluding Content provided by users) are and will remain the exclusive property of Twitter and its licensors. Any feedback, co

In [49]:
# Second example question answered end to end (retrieval + LLM).
get_answer("What is permitted by the license to use Twitter services?")

According to the given context, using Twitter services allows users to submit, post or display content on or
through the services. The license grants users a personal, worldwide, royalty-free, non-assignable and non-
exclusive license to use the software provided by Twitter for their intended purposes. Users also grant
Twitter the right to curate, transform, and translate their content for promotional and publishing purposes.

Reference:
0.:
Twitter has an evolving set of rules for how ecosystem partners can interact with your Content on the Services. These rules exist to enable an open ecosystem with your rights in mind. You understand that we may modify or adapt your Content as it is distributed, syndicated, published, or broadcast by us and our partners and/or make changes to your Content in order to adapt the Content to different media.

You represent and warrant that you have, or have obtained, all rights, licenses, consents, permissions, power and/or authority necessary to grant 

In [54]:
# Exploration: call the chain directly to keep the raw response, then list the
# attributes available on the chain object.
query = "Does the TOS allow third party to create products similar to Twitter?"
llm_response = qa_chain(query)
dir(qa_chain)

['Config',
 '__abstractmethods__',
 '__annotations__',
 '__call__',
 '__class__',
 '__class_vars__',
 '__config__',
 '__custom_root_type__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__exclude_fields__',
 '__fields__',
 '__fields_set__',
 '__format__',
 '__ge__',
 '__get_validators__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__include_fields__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__json_encoder__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__post_root_validators__',
 '__pre_root_validators__',
 '__pretty__',
 '__private_attributes__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__repr_args__',
 '__repr_name__',
 '__repr_str__',
 '__rich_repr__',
 '__schema_cache__',
 '__setattr__',
 '__setstate__',
 '__signature__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__try_update_forward_refs__',
 '__validators__',
 '_abc_impl',
 '_acall',
 '_aget_docs',
 '_calculate_keys',
 '_call',
 '_c

In [56]:
# Inspect the sub-chain that combines the retrieved documents for the LLM.
qa_chain.combine_documents_chain

StuffDocumentsChain(memory=None, callbacks=None, callback_manager=None, verbose=False, input_key='input_documents', output_key='output_text', llm_chain=LLMChain(memory=None, callbacks=None, callback_manager=None, verbose=False, prompt=PromptTemplate(input_variables=['context', 'question'], output_parser=None, partial_variables={}, template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:", template_format='f-string', validate_template=True), llm=HuggingFacePipeline(cache=None, verbose=False, callbacks=None, callback_manager=None, pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7f22b879fdc0>, model_id='gpt2', model_kwargs=None, pipeline_kwargs=None), output_key='text'), document_prompt=PromptTemplate(input_variables=['page_content'], output_parser=None, partial_variables={}, template='

In [59]:
# NOTE(review): this only displays the bound method object; add () to actually call it.
vectordb.get

<bound method Chroma.get of <langchain.vectorstores.chroma.Chroma object at 0x7f2241249e70>>

## References:

* https://www.shruggingface.com/blog/langchain-cloudflare-qa-agent

* https://www.geoffreylitt.com/2023/01/29/fun-with-compositional-llms-querying-basketball-stats-with-gpt-3-statmuse-langchain.html

* https://github.com/hwchase17/langchain/blob/master/docs/modules/indexes/document_loaders/examples/json.ipynb

In [43]:
# VectorstoreIndexCreator is not imported by any other cell shown in this
# notebook, so on a fresh kernel this cell raised NameError. Import it here so
# the cell runs on its own.
from langchain.indexes import VectorstoreIndexCreator

# Alternative, higher-level way to build the index: the creator bundles the
# vector store class, the embedding model and a text splitter (no chunk overlap here).
index_creator = VectorstoreIndexCreator(
    vectorstore_cls=Chroma,
    embedding=embedding,
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
)

In [44]:
# Display the configured index creator.
index_creator

VectorstoreIndexCreator(vectorstore_cls=<class 'langchain.vectorstores.chroma.Chroma'>, embedding=HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={}), text_splitter=<langchain.text_splitter.RecursiveCharacterTextSplitter object at 0x7f224170c550>, vectorstore_kwargs={})