# Discord message FAQ summarization with Haystack

In [1]:
import os
from haystack.pipelines import Pipeline
from haystack.nodes import TextConverter
from haystack.nodes import PreProcessor
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import PromptNode, PromptTemplate, AnswerParser
from haystack.nodes import EmbeddingRetriever

In [4]:
# Registries keyed by experiment iteration: each one stores the component,
# prompt, system message or output that was used for that iteration.
retriever_dict = {}
file_converter_dict = {}
preprocessor_dict = {}
document_store_dict = {}
prompt_node_dict = {}
output_dict = {}
system_message_dict = {}
prompt_dict = {}

In [3]:
import logging

# Root logger stays at WARNING; only the "haystack" logger is lowered to INFO.
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)
logging.getLogger("haystack").setLevel(logging.INFO)

In [3]:
# Read the API key from the environment (None when the variable is unset).
openai_api_key = os.environ.get('openai_api_key')

## Embedding models

In [12]:
# Candidate sentence-embedding models, browsable at:
# https://huggingface.co/models?pipeline_tag=sentence-similarity&sort=downloads
embedding_models_list = [
    "sentence-transformers/all-mpnet-base-v2",
    "sentence-transformers/all-MiniLM-L6-v2",
    "sentence-transformers/multi-qa-mpnet-base-dot-v1",
]

# Load raw JSON

In [6]:
import json
def load_json(filename, filepath):
    """
    Load a JSON file from a directory path, e.g. one copied from Windows file explorer.
    Back slashes in the directory path are converted to forward slashes.

    Arguments:
    - filename (string): Name of the JSON file, including its extension.
    - filepath (raw string): Directory containing the file. Use the format r'<path>'.
      An empty string means the current working directory.

    Returns:
    - The deserialized JSON content.
    """
    # An empty directory must not turn into '/', which would point at the filesystem root.
    directory = f'{filepath}/'.replace('\\', '/') if filepath else ''
    # Decode explicitly as UTF-8 so the result does not depend on the platform's default encoding.
    with open(directory + filename, encoding='utf-8') as file:
        return json.load(file)

# Load the full Discord message export; the path is relative to the current
# working directory.
filepath = '../data/private/'
filename = 'Discord_all_messages.json'
messages = load_json(filename, filepath)

In [7]:
def filter_messages(messages):
    """
    Reduce each message to its 'id', 'author_id', 'content' and 'reference_id' fields.

    Arguments:
    - messages (iterable of dict): Each dict must contain all four keys;
      a missing key raises KeyError.

    Returns:
    - A new list of dicts holding only those four keys, in the input order.
    """
    kept_fields = ('id', 'author_id', 'content', 'reference_id')
    return [{field: entry[field] for field in kept_fields} for entry in messages]

# Keep only the id, author_id, content and reference_id fields of every loaded
# message, then print how many messages there are.
filtered_messages = filter_messages(messages)
print(len(filtered_messages))

2249


In [8]:
# Sanity check: show which fields the first filtered message carries.
filtered_messages[0].keys()

dict_keys(['id', 'author_id', 'content', 'reference_id'])

## Save filtered messages as JSON

In [9]:
from datetime import datetime
def save_to_json(obj, filename=None, description='Discord_messages_json', append_version=1,
    path='../data'
    ):
    """
    Save Python object as a JSON file.

    The file name (before '.json' is appended) is chosen as follows:
    - If `description` is truthy: '<description>_<YYYY-MM-DD_HHMM>'. In this case
      `filename` and `append_version` are ignored. Note that the default
      `description` is truthy, so pass `description=None` to use `filename`.
    - Else if `filename` is None: '<YYYY-MM-DD_HHMM>_outputs'.
    - Else: `filename`, with '_<YYYY-MM-DD_HHMMSS>' appended when `append_version` is truthy.

    Parameters:
    - obj: Python object to be saved (must be JSON-serializable).
    - filename (string or None): Root of the filename; only used when `description` is falsy.
    - description (string or None): Prefix for a timestamped filename.
    - append_version (bool): If true, append date and time to end of `filename`.
    - path (raw string or None): Use the format r'<path>'. Back slashes are converted to
      forward slashes. If None or empty, the file is saved in the current working directory.
    """
    now = datetime.now()
    if description:
        filename = f'{description}_{now.strftime("%Y-%m-%d_%H%M")}'
        append_version = False
    elif filename is None:
        filename = f'{now.strftime("%Y-%m-%d_%H%M")}_outputs'
        append_version = False
    # A missing path used to crash on `path+filename`; fall back to the working directory.
    directory = f'{path}/'.replace('\\', '/') if path else ''
    if append_version:
        filename += f'_{now.strftime("%Y-%m-%d_%H%M%S")}'
    filename += '.json'
    with open(directory + filename, 'w') as f:
        json.dump(obj, f)
    print(f'Object saved as JSON: {filename}')


# Write only the first 100 filtered messages to a small sample file. Because a
# description is given, save_to_json names the file '<description>_<timestamp>.json'.
description = 'truncated_Discord_messages'
save_path = '../data/testing_2023-11-24/'
save_to_json(filtered_messages[:100], description=description, path=save_path)

Object saved as JSON: truncated_Discord_messages_2023-11-24_1259.json


# Indexing pipeline

In [22]:
iteration = 7

# File converter docs: https://docs.haystack.deepset.ai/docs/file_converters
file_converter = TextConverter()
file_converter_dict[iteration] = file_converter

# PreProcessor docs: https://docs.haystack.deepset.ai/docs/preprocessor
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    remove_substrings=None,
    # Unit for splitting the document: "word", "sentence" or "passage"; None disables splitting.
    split_by='passage',
    split_length=100,
    split_respect_sentence_boundary=False,
    split_overlap=0,
    max_chars_check=500000,
)
preprocessor_dict[iteration] = preprocessor

In [23]:
from datetime import datetime

def append_timestamp(string, ext=None):
    """
    Return `string` followed by '_<YYYY-MM-DD_HHMM>' for the current local time.

    Arguments:
    - string (string): Text to which the timestamp is appended.
    - ext (string or None): If truthy, '.<ext>' is appended after the timestamp.
    """
    stamped = f'{string}_{datetime.now().strftime("%Y-%m-%d_%H%M")}'
    return f'{stamped}.{ext}' if ext else stamped

def create_timestamp():
    """
    Build a 'YYYY-MM-DD_HHMM' string for the current local time, print it and return it.
    """
    stamp = f'{datetime.now():%Y-%m-%d_%H%M}'
    print(f'Created timestamp string: {stamp}')
    return stamp

timestamp_string = create_timestamp()

# The timestamp is appended directly to each base name, with no separator.
index_filename = f'journal_article_index{timestamp_string}'
config_filename = f'journal_article_config{timestamp_string}'
faiss_filename = f'faiss_document_store{timestamp_string}'
path = '../data/testing_2023-11-24/'

# FAISSDocumentStore reference:
# https://docs.haystack.deepset.ai/reference/document-store-api#faissdocumentstore
document_store = FAISSDocumentStore(
    sql_url=f"sqlite:///{path}{faiss_filename}.db",
    # embedding_dim=384,
    faiss_index_factory_str="Flat",
)
document_store_dict[iteration] = document_store

Created timestamp string: 2023-11-24_1410


In [24]:
# EmbeddingRetriever reference: https://docs.haystack.deepset.ai/reference/retriever-api
# Uses the first entry of embedding_models_list as the embedding model.
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model=embedding_models_list[0],
    model_format="sentence_transformers",
    top_k=100,
)
retriever_dict[iteration] = retriever

INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0
INFO - haystack.nodes.retriever.dense -  Init retriever using embeddings of model sentence-transformers/all-mpnet-base-v2


In [26]:
def run_pipeline(data_filename, data_path):
    """
    Build the indexing pipeline, run it on one file and report what was stored.

    The pipeline chains the notebook-level `file_converter`, `preprocessor`,
    `retriever` and `document_store` objects, in that order.

    Arguments:
    - data_filename (string): Name of the file to index.
    - data_path (string or None): Directory containing the file; a falsy value means '.'.

    Returns:
    - The Pipeline object that was run.
    """
    p = Pipeline()
    p.add_node(component=file_converter, name="FileConverter", inputs=["File"])
    p.add_node(component=preprocessor, name="PreProcessor", inputs=["FileConverter"])
    p.add_node(component=retriever, name="Retriever", inputs=["PreProcessor"])
    p.add_node(component=document_store, name="DocumentStore", inputs=["Retriever"])
    p.run(file_paths=[f"{data_path if data_path else '.'}/{data_filename}"])
    # Fetch the documents once and reuse them for both report lines.
    documents = p.get_document_store().get_all_documents()
    print(f'Number of documents: {len(documents)}')
    if documents:
        print(f'Document content type: {type(documents[0].content)}')
    else:
        # Nothing was indexed; avoid an IndexError on documents[0].
        print('Document content type: n/a (no documents were indexed)')
    return p

# Index the truncated sample file. The filename, including its timestamp, is
# hard-coded, so it must match a file that already exists in data_path.
data_path = '../data/testing_2023-11-24'
data_filename = 'truncated_Discord_messages_2023-11-24_1259.json'

run_pipeline(data_filename, data_path)

Converting files: 100%|██████████| 1/1 [00:00<00:00, 39.11it/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 87.13docs/s]




Batches:   0%|          | 0/1 [00:04<?, ?it/s]

Writing Documents: 10000it [00:00, 140012.22it/s]       

Number of documents: 1
Document content type: <class 'str'>





In [27]:
# Update the embeddings of the stored documents using the retriever, then save
# the document store to the given index and config paths.
# NOTE(review): if `path` still ends with '/' (as assigned in the document-store
# cell), these f-strings produce a double slash in both paths — confirm that is
# acceptable on the target platform.
document_store.update_embeddings(retriever)
document_store.save(index_path=f'{path}/{index_filename}', config_path=f'{path}/{config_filename}')

INFO - haystack.document_stores.faiss -  Updating embeddings for 1 docs...
Updating Embedding:   0%|          | 0/1 [00:00<?, ? docs/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Documents Processed: 10000 docs [00:01, 8073.55 docs/s]      


# Summarization pipeline

In [5]:
# Candidate model names for the PromptNode. The order matters: later cells pick a
# model by index (e.g. models_list[0]).
models_list = [
    'gpt-3.5-turbo-16k',
    'gpt-3.5-turbo-16k-0613',
    'gpt-3.5-turbo-instruct',
    'gpt-3.5-turbo-instruct-0914',
    'gpt-4',
    'gpt-4-0314',
    'gpt-4-0613',
    'gpt-4-1106-preview',
    'gpt-3.5-turbo',
    'gpt-3.5-turbo-0301',
    'gpt-3.5-turbo-0613',
    'gpt-3.5-turbo-1106',
]

In [10]:


def load_document_store(index_filename, config_filename, path):
    """
    Load a previously saved FAISSDocumentStore and print a short summary of it.

    Arguments:
    - index_filename (string): Name of the saved index file.
    - config_filename (string): Name of the saved config file.
    - path (string): Directory containing both files.

    Returns:
    - The loaded FAISSDocumentStore.
    """
    saved_document_store = FAISSDocumentStore.load(
        index_path=f'{path}/{index_filename}', config_path=f'{path}/{config_filename}'
    )
    print(f'Loading document store from {path}/{index_filename} and {path}/{config_filename}.')
    # Fetch the documents once and reuse them for both summary lines.
    documents = saved_document_store.get_all_documents()
    if documents:
        print(f'Document content data type: {type(documents[0].content)}')
    else:
        # An empty store would otherwise raise IndexError on documents[0].
        print('Document content data type: n/a (document store is empty)')
    print(f'Number of documents: {len(documents)}')
    return saved_document_store

# Timestamp of the saved store to reload. To reuse the store created earlier in
# this session, use `file_timestamp = timestamp_string` instead.
file_timestamp = '2023-11-24_1410'
path = '../data/testing_2023-11-24/'
index_filename = f'journal_article_index{file_timestamp}'
config_filename = f'journal_article_config{file_timestamp}'
faiss_filename = f'faiss_document_store{file_timestamp}'

# From here on, `document_store` refers to the reloaded store.
saved_document_store = load_document_store(index_filename, config_filename, path)
document_store = saved_document_store

Loading document store from ../data/testing_2023-11-24//journal_article_index2023-11-24_1410 and ../data/testing_2023-11-24//journal_article_config2023-11-24_1410.
Document content data type: <class 'str'>
Number of documents: 1


In [33]:
# Configuration for this summarization run. `iteration` is the key under which the
# prompt, prompt node and system message are stored in their registries below.
iteration = 7.3
model_name = models_list[0]
max_length = 3000
# Instruction text describing the input message schema and the requested FAQ
# output format. It is stored in `system_message_dict` at the end of this cell.
# NOTE(review): "message write" in the text below looks like a typo for "message
# writer"; left unchanged because the string is runtime input to the model.
system_message = """
The following is a JSON array of objects containing a sequence of Discord messages. 
The messages have the following attributes:
- id: The ID of the message.
- author_id: The ID of the message write.
- content: The content of the message.
- reference_id: The ID of the parent message. If the message is not a reply, this will be null.
    Otherwise, this value indicates that the message is a reply to the message with the specified ID.

Summarize the messages to create a Frequently Asked Questions document. Return the output as a JSON array 
where each element is a question-answer pair. For example:
[
    {
        "question": "Question 1",
        "answer": "Answer 1"
    },
    {
        "question": "Question 2",
        "answer": "Answer 2"
    }
]
"""
# Template with a {query} placeholder followed by the joined documents.
prompt = PromptTemplate( # https://docs.haystack.deepset.ai/docs/prompt_node#prompttemplates
    prompt='{query}\n\n Messages: {join(documents)} \n\nSummary: '
)
prompt_dict[iteration] = prompt

# NOTE(review): the prompt asks for a JSON *array*, while `response_format` below
# requests a "json_object" — confirm the chosen model and API accept this combination.
prompt_node = PromptNode( # https://docs.haystack.deepset.ai/reference/prompt-node-api # https://docs.haystack.deepset.ai/docs/prompt_node#in-a-pipeline
    model_name, api_key=openai_api_key, 
    max_length=max_length, # The maximum number of tokens the generated text output can have,
    default_prompt_template=prompt,
    model_kwargs={
        "temperature": 0,
        "response_format": { "type": "json_object" }
        }
    )
prompt_node_dict[iteration] = prompt_node
system_message_dict[iteration] = system_message

# retriever = EmbeddingRetriever( # https://docs.haystack.deepset.ai/reference/retriever-api
#     document_store=document_store,
#     embedding_model=embedding_models_list[0],
#     model_format="sentence_transformers",
#     top_k = 100
# )
# retriever_dict[iteration] = retriever


In [34]:
def run_summarization(system_message, document_store=None, retriever=None, prompt_node=prompt_node, use_retriever=False, top_k=100):
    """
    Run a summarization pipeline with `system_message` as the query and print the first result.

    Arguments:
    - system_message (string): Passed to the pipeline as the query.
    - document_store: Required when `use_retriever` is False; all of its documents
      are passed to the prompt node.
    - retriever: Required when `use_retriever` is True; it feeds the prompt node.
    - prompt_node: Node that generates the summary. The default is the `prompt_node`
      object that existed when this function was defined.
    - use_retriever (bool): Choose between the retriever and the full document store.
    - top_k (int): `top_k` passed to the retriever node; only used when `use_retriever` is True.

    Returns:
    - The value returned by the pipeline's `run` call.

    Raises:
    - ValueError: If the component required by the chosen mode is None.
    """
    # Fail early with a clear message instead of an AttributeError deep inside the pipeline.
    if use_retriever and retriever is None:
        raise ValueError('use_retriever=True requires a retriever.')
    if not use_retriever and document_store is None:
        raise ValueError('use_retriever=False requires a document_store.')

    summarize_pipeline = Pipeline()
    if use_retriever:
        print('Using retriever')
        summarize_pipeline.add_node(component=retriever, name="RetrieverNode", inputs=["Query"])
        summarize_pipeline.add_node(component=prompt_node, name="PromptNode", inputs=["RetrieverNode"])
        output = summarize_pipeline.run(query=system_message, params={"RetrieverNode": {"top_k": top_k}})
    else:
        print('Not using retriever; using DocumentStore')
        summarize_pipeline.add_node(component=prompt_node, name="PromptNode", inputs=["Query"])
        output = summarize_pipeline.run(query=system_message, documents=document_store.get_all_documents())
    # Print None rather than raising IndexError when the model returned no results.
    results = output['results']
    first_result = results[0] if results else None
    print(f"\n*Model output*: \n{first_result}")
    return output


# Summarize every document in the reloaded store (no retriever) and record the
# pipeline output under the current iteration key.
output = run_summarization(
    system_message, 
    document_store=document_store, 
    retriever=None, 
    prompt_node=prompt_node, use_retriever=False
    )
output_dict[iteration] = output


Not using retriever; using DocumentStore

*Model output*: 
[
    {
        "question": "When embedded with FastAPI, is there a different port that Solara loads or is it the same port as uvicorn?",
        "answer": "Solara loads on the same port as uvicorn."
    },
    {
        "question": "Why does Solara not load when deployed as a Docker image?",
        "answer": "There may be an issue with websocket requesting jupyter widgets. Make sure you use the same browser and check the docker container logs for any errors."
    },
    {
        "question": "Is there a reverse proxy in between Solara and the server?",
        "answer": "Yes, there is a reverse proxy in production. Make sure to set up firewall rules as well."
    },
    {
        "question": "How can I make InputText multiline in Solara?",
        "answer": "There seems to be an issue with making InputText multiline. CSS class may not work. Further assistance is recommended."
    },
    {
        "question": "Why am I getting

# *End of Page*