In [None]:
!pip install -qq pypdf faiss-gpu pandas SQLAlchemy langchain-text-splitters

In [6]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
from urllib.request import urlretrieve
from sqlalchemy import create_engine
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Persist prompt-response experiment results

In [32]:
# Open the SQLite store used to persist every prompt/response experiment and
# load what has been logged so far.
from sqlalchemy import inspect as sa_inspect  # aliased so it cannot shadow the stdlib `inspect`

engine = create_engine('sqlite:///prompts_history.db')
table_name = 'prompts'
columns = ["prompt", "response", "llm", "embedding_model", "relevant_documents", "datetime", "source"]

# Ask the database whether the table exists instead of checking for the file:
# a database file without the table would otherwise make the SELECT fail.
if sa_inspect(engine).has_table(table_name):
    # `table_name` is a constant defined just above, not user input.
    prompts_df = pd.read_sql(f"SELECT * FROM {table_name}", engine)
else:
    # First run: start from an empty frame that already has the expected schema.
    prompts_df = pd.DataFrame(columns=columns)
prompts_df

Unnamed: 0,prompt,response,llm,embedding_model,relevant_documents,datetime,source


# Prepare documents

## Load documents

In [14]:
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Collect every Markdown file below model_cards/ (the glob descends into
# sub-directories) and read each one with TextLoader.
loader = DirectoryLoader(
    'model_cards/',
    glob="**/*.md",
    loader_cls=TextLoader,
)
docs = loader.load()

# Number of documents loaded.
len(docs)

283

## Splitting

In [15]:
# Split the model cards into chunks. RecursiveCharacterTextSplitter is already
# imported in the notebook's import cell, so it is not imported again here.
# NOTE(review): chunk_size is well above the ~2000-character embedding limit
# described in the notes that follow this cell - confirm that this is intended.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n', '.'],
    chunk_overlap=2000,
    chunk_size=16000,
)

# The documents are read from disk again through the same loader before splitting.
docs_before_split = loader.load()
docs_after_split = text_splitter.split_documents(docs_before_split)

# Number of chunks produced.
len(docs_after_split)

285

Documents should be:

- large enough to contain enough information to answer a question, and
- small enough to fit into the LLM prompt: Mistral-7B-v0.1 input tokens limited to 4096 tokens
- small enough to fit into the embedding model: BAAI/bge-small-en-v1.5 accepts at most 512 input tokens (roughly 2000 characters, at about 4 characters per token).

In [16]:
def avg_doc_length(docs):
    """Return the average `page_content` length of `docs`, in whole characters.

    The average is floored (integer division). An empty collection yields 0
    instead of raising ZeroDivisionError.
    """
    if not docs:
        return 0
    return sum(len(doc.page_content) for doc in docs) // len(docs)


avg_char_before_split = avg_doc_length(docs_before_split)
avg_char_after_split = avg_doc_length(docs_after_split)

print(f'Before split, there were {len(docs_before_split)} documents loaded, with average characters equal to {avg_char_before_split}.')
print(f'After split, there were {len(docs_after_split)} documents (chunks), with average characters equal to {avg_char_after_split} (average chunk length).')

Before split, there were 283 documents loaded, with average characters equal to 2482.
After split, there were 285 documents (chunks), with average characters equal to 2491 (average chunk length).


# Text Embeddings

In [17]:
# Embedding model used for both indexing and querying: run on the CPU, with
# normalised output vectors.
# "sentence-transformers/all-MiniLM-l6-v2" is a lighter, faster alternative.
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    encode_kwargs={'normalize_embeddings': True},
    model_kwargs={'device': 'cpu'},
)

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# Sanity check: embed the text of the first chunk and report the vector's shape.
sample_embedding = np.array(
    huggingface_embeddings.embed_query(docs_after_split[0].page_content)
)
print("Size of the embedding: ", sample_embedding.shape)


Size of the embedding:  (384,)


# INDEXING

### FAISS as a vector store

In [21]:
# Embed every chunk and build the FAISS index that the retrieval steps search.
vectorstore = FAISS.from_documents(
    documents=docs_after_split,
    embedding=huggingface_embeddings,
)

In [97]:
# Question used to probe the index.
query = "What is the model that can be used for legal document classification?"

# Nearest chunks for the question, using the search's default result count.
# (`similarity_search_with_score` is the variant that also returns scores.)
relevant_documents = vectorstore.similarity_search(query)

# A second, much smaller index that contains only those hits.
vectorstore2 = FAISS.from_documents(relevant_documents, huggingface_embeddings)

In [127]:
# Display the first document returned by the similarity search.
relevant_documents[0]

Document(page_content='---\ntags:\n- text-classification\n- bert\n---\n\n# Model Card for bleurt-tiny-512 \n \n# Model Details\n \n## Model Description\n \nPytorch version of the original BLEURT models from ACL paper\n \n- **Developed by:** Elron Bandel, Thibault Sellam, Dipanjan Das and Ankur P. Parikh of Google Research\n- **Shared by [Optional]:** Elron Bandel\n- **Model type:** Text Classification \n- **Language(s) (NLP):** More information needed\n- **License:** More information needed \n- **Parent Model:** BERT\n- **Resources for more information:**\n     - [GitHub Repo](https://github.com/google-research/bleurt/tree/master)\n \t  - [Associated Paper](https://aclanthology.org/2020.acl-main.704/)\n    - [Blog Post](https://ai.googleblog.com/2020/05/evaluating-natural-language-generation.html)\n \t\n\n\n# Uses\n \n\n## Direct Use\nThis model can be used for the task of Text Classification \n \n## Downstream Use [Optional]\n \nMore information needed.\n \n## Out-of-Scope Use\n \nThe

In [33]:
# Log the retrieval-only experiment: one row per retrieved document, all
# sharing the same timestamp. No LLM was involved, so the response and model
# columns stay empty.
date = datetime.now()

# Build every new row first and append them in a single step; growing the
# frame with `.loc[len(df)]` row by row is slow and only correct while the
# index is a default 0..n-1 range.
new_rows = pd.DataFrame(
    [
        {
            "prompt": query,
            "response": "",
            "llm": "",
            "embedding_model": "",
            "relevant_documents": str(doc),
            "datetime": date,
            "source": str(doc.metadata["source"]),
        }
        for doc in relevant_documents
    ],
    columns=prompts_df.columns,
)

# Skip the concat when either side is empty, so no empty frame is concatenated.
if prompts_df.empty:
    prompts_df = new_rows
elif not new_rows.empty:
    prompts_df = pd.concat([prompts_df, new_rows], ignore_index=True)
prompts_df

Unnamed: 0,prompt,response,llm,embedding_model,relevant_documents,datetime,source
0,What is the model that can be used for legal d...,,,,page_content='---\ntags:\n- text-classificatio...,2024-06-19 14:29:28.161965,model_cards/Elron/bleurt-tiny-512/README.md
1,What is the model that can be used for legal d...,,,,page_content='---\ninference: false\nlicense: ...,2024-06-19 14:29:28.161965,model_cards/Fujitsu/AugCode/README.md
2,What is the model that can be used for legal d...,,,,page_content='---\nlanguage: en\nlicense: apac...,2024-06-19 14:29:28.161965,model_cards/Hate-speech-CNERG/bert-base-uncase...
3,What is the model that can be used for legal d...,,,,page_content='---\nlanguage:\n- da\nlicense: a...,2024-06-19 14:29:28.161965,model_cards/alexandrainst/da-ned-base/README.md


In [34]:
# Write the whole frame to the table named by `table_name`, replacing whatever
# the table held before (if_exists='replace'); the DataFrame index is not stored.
prompts_df.to_sql(table_name, engine, if_exists='replace', index=False)

4

In [35]:
# Report how many documents the search returned, then show the text of the first.
first_hit = relevant_documents[0]
print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query. Display the first one:\n')
print(first_hit.page_content)

There are 4 documents retrieved which are relevant to the query. Display the first one:

---
tags:
- text-classification
- bert
---

# Model Card for bleurt-tiny-512 
 
# Model Details
 
## Model Description
 
Pytorch version of the original BLEURT models from ACL paper
 
- **Developed by:** Elron Bandel, Thibault Sellam, Dipanjan Das and Ankur P. Parikh of Google Research
- **Shared by [Optional]:** Elron Bandel
- **Model type:** Text Classification 
- **Language(s) (NLP):** More information needed
- **License:** More information needed 
- **Parent Model:** BERT
- **Resources for more information:**
     - [GitHub Repo](https://github.com/google-research/bleurt/tree/master)
 	  - [Associated Paper](https://aclanthology.org/2020.acl-main.704/)
    - [Blog Post](https://ai.googleblog.com/2020/05/evaluating-natural-language-generation.html)
 	


# Uses
 

## Direct Use
This model can be used for the task of Text Classification 
 
## Downstream Use [Optional]
 
More information needed.
 


# RETRIEVAL

## Create a retriever interface using vector store

In [128]:
# Two plain similarity retrievers, each returning the 3 closest chunks:
#   retriever  - searches the index built from all chunks (vectorstore)
#   retriever2 - searches the index built from the earlier hits only (vectorstore2)
# Search types listed for as_retriever: 'similarity',
# 'similarity_score_threshold' (takes a "score_threshold" in search_kwargs), 'mmr'.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
retriever2 = vectorstore2.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [125]:
# Run the current query through the retriever over the full index and show the first hit.
# NOTE(review): this rebinds `docs`, which earlier held the documents loaded
# from model_cards/ - confirm that nothing run afterwards needs the originals.
docs = retriever.invoke(query)
docs[0]

Document(page_content='---\nlanguage: en\nlicense: apache-2.0\ntags:\n- generated_from_trainer\nmetrics:\n- accuracy\n- f1\nwidget:\n- text: The agent on the phone was very helpful and nice to me.\nbase_model: bert-base-uncased\nmodel-index:\n- name: bert-base-uncased-finetuned-surveyclassification\n  results: []\n---\n\n<!-- This model card has been generated automatically according to the information the Trainer had access to. You\nshould probably proofread and complete it, then remove this comment. -->\n\n# bert-base-uncased-finetuned-surveyclassification\n\nThis model is a fine-tuned version of [bert-base-uncased](https://huggingface.co/bert-base-uncased) on a custom survey dataset.\nIt achieves the following results on the evaluation set:\n- Loss: 0.2818\n- Accuracy: 0.9097\n- F1: 0.9097\n\n## Model description\n\nMore information needed\n\n#### Limitations and bias\n\nThis model is limited by its training dataset of survey results for a particular customer service domain. This ma

# GENERATION

In [104]:
from langchain_community.llms import HuggingFaceHub

# Baseline without retrieval: send the bare question to the hosted model.
# The API token is read from the environment so that no credential is stored
# in the notebook; a missing variable fails immediately with a KeyError.
hf = HuggingFaceHub(
    huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
    model_kwargs={"temperature":0.1, "max_length":500})

query = """What is the model that can be used for legal document classification?""" 
hf.invoke(query)

'What is the model that can be used for legal document classification? \nThe model that can be used for legal document classification is a supervised machine learning model, specifically a classification model. The model can be trained on a labeled dataset of legal documents, where each document is labeled with a specific category or class (e.g. contract, agreement, settlement, etc.). The model can then be used to classify new, unseen legal documents into one of the pre-defined categories.\n\nSome common classification models used for legal document classification include:\n\n1. Naive Bayes (NB)'

##  Open source LLM

In [105]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from huggingface_hub import login

# SECURITY: never embed an access token in the notebook. It is read from the
# HF_TOKEN environment variable; a missing variable fails with a KeyError.
# The token that used to be hard-coded here was committed in clear text and
# should be revoked.
login(token=os.environ["HF_TOKEN"])

# Load the text-generation model from a local snapshot directory.
# NOTE(review): the path below is an absolute, machine-specific cache
# location; the hub id "meta-llama/Meta-Llama-3-8B-Instruct" is the portable
# alternative - confirm which one should be kept.
hf = HuggingFacePipeline.from_model_id(
    model_id="/root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa",
    task="text-generation",
    pipeline_kwargs={"temperature": 0.01, "max_new_tokens": 300}
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.40it/s]
Device has 1 GPUs available. Provide device={deviceId} to `from_model_id` to use availableGPUs for execution. deviceId is -1 (default) for CPU and can be a positive integer associated with CUDA device id.


In [106]:
# Use whatever model `hf` currently refers to as the LLM for the chain
# (the HuggingFacePipeline from the preceding cell when run top to bottom),
# and ask it the bare question, without any retrieved context.
llm = hf 
llm.invoke(query)

'**\nA. Naive Bayes\nB. Decision Trees\nC. Support Vector Machines\nD. Random Forest\nAnswer: C. Support Vector Machines\nExplanation: Support Vector Machines (SVM) is a popular machine learning model that can be used for legal document classification. SVM is a supervised learning model that can be used for classification and regression tasks. It is particularly effective for high-dimensional data and can handle non-linear relationships between the features and the target variable. SVM has been widely used in various applications, including text classification, image classification, and bioinformatics.\n\nIn the context of legal document classification, SVM can be used to classify documents into different categories such as contracts, agreements, lawsuits, and so on. The model can be trained on a dataset of labeled documents, and then used to classify new, unseen documents. SVM is a robust and accurate model that can handle the complexity of legal documents, which often contain complex

# Combine the retrieval system and the LLM

In [107]:
# Instruction text given to the LLM. {context} and {question} are the two
# placeholders declared as input variables below.
prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

{context}

Question: {question}

Helpful Answer:
"""

# Wrap the text in a PromptTemplate so the chain can fill in both variables.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [129]:
# Question-answering chain: documents come from `retriever2` (the index built
# from the earlier hits), are inserted into PROMPT with the "stuff" chain
# type, and are returned together with the answer.
retrievalQA = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever2,
    chain_type="stuff",
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
)

## Use RetrievalQA invoke method to execute the chain

### Option 1  (not active for the moment)

### Option 2

In [130]:
# Ask the chain a new question about model-card transparency; the second line
# of the prompt spells out what "transparency" means.
query = (
    "Which of the retrieved model cards is the most transparent?\n"
    "Transparency means disclosing more information about the model"
)
result = retrievalQA.invoke({"query": query})

# Print only the generated answer.
print(result['result'])

The most transparent model card is the one for the "bleurt-tiny-512" model. It provides a detailed description of the model, its training data, and its evaluation metrics. It also includes information about the potential biases and limitations of the model, as well as recommendations for responsible use. Additionally, it provides a clear citation for the model and its associated paper. Overall, the "bleurt-tiny-512" model card is the most transparent because it provides a comprehensive overview of the model and its capabilities.


In [131]:
# Log the QA experiment: one row per source document the chain returned, each
# carrying the same prompt, answer, model names and timestamp.
date = datetime.now()

# Build every new row first and append them in a single step; growing the
# frame with `.loc[len(df)]` row by row is slow and only correct while the
# index is a default 0..n-1 range.
new_rows = pd.DataFrame(
    [
        {
            "prompt": query,
            "response": result['result'],
            "llm": "meta-llama/Meta-Llama-3-8B-Instruct",
            "embedding_model": "BAAI/bge-small-en-v1.5",
            "relevant_documents": str(doc),
            "datetime": date,
            "source": str(doc.metadata["source"]),
        }
        for doc in result['source_documents']
    ],
    columns=prompts_df.columns,
)

# Skip the concat when either side is empty, so no empty frame is concatenated.
if prompts_df.empty:
    prompts_df = new_rows
elif not new_rows.empty:
    prompts_df = pd.concat([prompts_df, new_rows], ignore_index=True)
prompts_df

Unnamed: 0,prompt,response,llm,embedding_model,relevant_documents,datetime,source
0,What is the model that can be used for legal d...,,,,page_content='---\ntags:\n- text-classificatio...,2024-06-19 14:29:28.161965,model_cards/Elron/bleurt-tiny-512/README.md
1,What is the model that can be used for legal d...,,,,page_content='---\ninference: false\nlicense: ...,2024-06-19 14:29:28.161965,model_cards/Fujitsu/AugCode/README.md
2,What is the model that can be used for legal d...,,,,page_content='---\nlanguage: en\nlicense: apac...,2024-06-19 14:29:28.161965,model_cards/Hate-speech-CNERG/bert-base-uncase...
3,What is the model that can be used for legal d...,,,,page_content='---\nlanguage:\n- da\nlicense: a...,2024-06-19 14:29:28.161965,model_cards/alexandrainst/da-ned-base/README.md
4,What is the model that can be used for legal d...,The model that can be used for legal document ...,meta-llama/Meta-Llama-3-8B-Instruct,BAAI/bge-small-en-v1.5,page_content='---\ntags:\n- text-classificatio...,2024-06-19 15:17:06.590810,model_cards/Elron/bleurt-tiny-512/README.md
5,What is the model that can be used for legal d...,The model that can be used for legal document ...,meta-llama/Meta-Llama-3-8B-Instruct,BAAI/bge-small-en-v1.5,page_content='---\ninference: false\nlicense: ...,2024-06-19 15:17:06.590810,model_cards/Fujitsu/AugCode/README.md
6,What is the model that can be used for legal d...,The model that can be used for legal document ...,meta-llama/Meta-Llama-3-8B-Instruct,BAAI/bge-small-en-v1.5,page_content='---\nlanguage: en\nlicense: apac...,2024-06-19 15:17:06.590810,model_cards/Hate-speech-CNERG/bert-base-uncase...
7,What is the model that can be used for legal d...,The model that can be used for legal document ...,meta-llama/Meta-Llama-3-8B-Instruct,BAAI/bge-small-en-v1.5,page_content='---\ntags:\n- text-classificatio...,2024-06-19 15:45:39.190186,model_cards/Elron/bleurt-tiny-512/README.md
8,What is the model that can be used for legal d...,The model that can be used for legal document ...,meta-llama/Meta-Llama-3-8B-Instruct,BAAI/bge-small-en-v1.5,page_content='---\ninference: false\nlicense: ...,2024-06-19 15:45:39.190186,model_cards/Fujitsu/AugCode/README.md
9,What is the model that can be used for legal d...,The model that can be used for legal document ...,meta-llama/Meta-Llama-3-8B-Instruct,BAAI/bge-small-en-v1.5,page_content='---\nlanguage: en\nlicense: apac...,2024-06-19 15:45:39.190186,model_cards/Hate-speech-CNERG/bert-base-uncase...


In [136]:
# Show the most recently logged row in full. Indexing from the end works for
# any table size, unlike a fixed row position.
prompts_df.iloc[-1].to_dict()

{'prompt': 'Which of the retrieved model cards is the most transparent?\nTransparency means disclosing more information about the model',
 'response': 'The most transparent model card is the one for the "bleurt-tiny-512" model. It provides a detailed description of the model, its training data, and its evaluation metrics. It also includes information about the potential biases and limitations of the model, as well as recommendations for responsible use. Additionally, it provides a clear citation for the model and its associated paper. Overall, the "bleurt-tiny-512" model card is the most transparent because it provides a comprehensive overview of the model and its capabilities.',
 'llm': 'meta-llama/Meta-Llama-3-8B-Instruct',
 'embedding_model': 'BAAI/bge-small-en-v1.5',
 'datetime': Timestamp('2024-06-20 09:37:23.804637'),
 'source': 'model_cards/Hate-speech-CNERG/bert-base-uncased-hatexplain-rationale-two/README.md'}

In [132]:
# Write the updated frame to the table named by `table_name`, replacing
# whatever the table held before (if_exists='replace'); the index is not stored.
prompts_df.to_sql(table_name, engine, if_exists='replace', index=False)

23

In [133]:
# Keep the documents the chain returned next to its answer (present because
# the chain was created with return_source_documents=True) and print their raw repr.
relevant_docs = result['source_documents']
print(relevant_docs)



In [134]:
# Print a one-line summary, then each source document with its file path and
# full text, separated by a rule. The summary is printed once, not after
# every document.
print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')
print("*" * 100)
for i, doc in enumerate(relevant_docs):
    print(f"Relevant Document #{i+1}:\nSource file: {doc.metadata['source']}, \nContent: {doc.page_content}")
    print("-"*100)

There are 3 documents retrieved which are relevant to the query.
****************************************************************************************************
Relevant Document #1:
Source file: model_cards/Elron/bleurt-tiny-512/README.md, 
Content: ---
tags:
- text-classification
- bert
---

# Model Card for bleurt-tiny-512 
 
# Model Details
 
## Model Description
 
Pytorch version of the original BLEURT models from ACL paper
 
- **Developed by:** Elron Bandel, Thibault Sellam, Dipanjan Das and Ankur P. Parikh of Google Research
- **Shared by [Optional]:** Elron Bandel
- **Model type:** Text Classification 
- **Language(s) (NLP):** More information needed
- **License:** More information needed 
- **Parent Model:** BERT
- **Resources for more information:**
     - [GitHub Repo](https://github.com/google-research/bleurt/tree/master)
 	  - [Associated Paper](https://aclanthology.org/2020.acl-main.704/)
    - [Blog Post](https://ai.googleblog.com/2020/05/evaluating-natural-language

# Step 2: split the best retrieved models into markdown sections


In [120]:
# Sample Markdown text (with level 1, 2 and 4 headings) used to try out the header splitter.
markdown_document = """# Intro \n\n    ## History \n\n Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. 
John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] 
\n\n Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files. 
\n\n ## Rise and divergence 
\n\n As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for 
\n\n additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. 
\n\n #### Standardization 
\n\n From 2012, a group of people, including Jeff Atwood and John MacFarlane, 
launched what Atwood characterised as a standardisation effort. 
\n\n ## Implementations \n\n Implementations of Markdown are available for over a dozen programming languages."""

# Heading markers to split on, each paired with the metadata key under which
# the heading text is recorded.
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]

# strip_headers=False keeps the heading lines inside each section's text.
markdown_splitter = MarkdownHeaderTextSplitter(
    strip_headers=False,
    headers_to_split_on=headers_to_split_on,
)
md_header_splits = markdown_splitter.split_text(markdown_document)

# Show the first section (content together with its header metadata).
print(md_header_splits[0])

page_content='# Intro  \n## History  \nMarkdown[9] is a lightweight markup language for creating formatted text using a plain-text editor.\nJohn Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]  \nMarkdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.' metadata={'Header 1': 'Intro', 'Header 2': 'History'}
