In [3]:
from unstructured.partition.pdf import partition_pdf
from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough,RunnableLambda

from langchain_postgres.vectorstores import PGVector
from database import COLLECTION_NAME, CONNECTION_STRING
from langchain_community.storage import RedisStore
from langchain.schema.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from pathlib import Path
from IPython.display import display, HTML, Markdown
from base64 import b64decode
import os, hashlib, shutil, uuid, json, time
import torch, redis, streamlit as st
import logging
# Initialize Redis client
client = redis.Redis(host="localhost", port=6379, db=0)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from dotenv import load_dotenv
load_dotenv()

True

In [5]:
FILE_PATH = Path("MSF-Churn-Project.pdf") 

### Data Loading

In [6]:
def data_loading(file_path=None):
    """Partition a PDF into title-based chunks using ``unstructured``.

    Args:
        file_path: Location of the PDF to parse (``str`` or ``pathlib.Path``).
            When omitted, the notebook-level ``FILE_PATH`` is used, so
            existing zero-argument calls behave as before.

    Returns:
        The list of elements produced by ``partition_pdf``.
    """
    source = FILE_PATH if file_path is None else file_path

    raw_pdf_elements = partition_pdf(
        # partition_pdf documents `filename` as a string, so normalise Path objects.
        filename=str(source),
        # Requested so table elements can expose `metadata.text_as_html`
        # (per the unstructured docs).
        infer_table_structure=True,
        strategy="hi_res",
        # Image blocks are requested in the element payload rather than on disk.
        extract_image_block_types=["Image"],
        extract_image_block_to_payload=True,
        # Chunking: split on titles, with the size limits below.
        chunking_strategy="by_title",
        max_characters=10000,
        new_after_n_chars=5000,
        combine_text_under_n_chars=2000,
        # NOTE(review): `mode` and `image_output_dir_path` do not appear to be
        # documented partition_pdf parameters in current unstructured releases;
        # presumably they are swallowed by **kwargs. Kept unchanged; confirm.
        mode='elements',
        image_output_dir_path="data/",
    )
    return raw_pdf_elements

In [7]:
pdf_elements = data_loading()

The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [8]:
pdf_elements

[<unstructured.documents.elements.CompositeElement at 0x342fc9bb0>,
 <unstructured.documents.elements.CompositeElement at 0x31660b9e0>,
 <unstructured.documents.elements.CompositeElement at 0x317db5c40>,
 <unstructured.documents.elements.CompositeElement at 0x309b63320>,
 <unstructured.documents.elements.CompositeElement at 0x3087b23f0>,
 <unstructured.documents.elements.CompositeElement at 0x3087e7cb0>]

In [9]:
# Split the partitioned elements by kind. The match is done on the string form
# of each element's type, so any class whose qualified name contains the
# substring is selected.
# HTML rendering of every element whose type name contains 'Table'.
tables = [element.metadata.text_as_html for element in pdf_elements if 'Table' in str(type(element))]
# Plain text of every element whose type name contains 'CompositeElement'.
text = [element.text for element in pdf_elements if 'CompositeElement' in str(type(element))]

In [10]:
tables

[]

In [11]:
text

['CHURN PREDICTION AND OPTIMISATION– 8 JAN 2025\n\nOPTIMISATION– 8 JAN 2025\n\nAGENDA\n\n1. Introduction of project team (MSF, ESMT)\n\n2. Introduction to MSF – Flaminia\n\n3. Introduction to Fundraising Loyalty Management in MSF – Nicole 4. Overview of project scope – Zankar\n\n5. Project cadence and next steps – Rania\n\n6. Q&A\n\nLE MEDECINS SANS FRONTIERES ARZTE OHNE GRENZEN eV.\n\nPROJECT TEAM - INTRODUCTION\n\nMSF\n\nName Role Flaminia Sabrie Head of Strategy & Organisational Development Department Nicole Officer Loyalty Huwendiek Management (FR) Markus Kopf Head of IT Strategy Unit Zankar Koli Data Engineering & Analytics Specialist Rania Aboueid IT Project Support Officer\n\nESMT\n\nName Role Dr. Vlada MAAI - Analytics Pleshcheva Consulting Project Coordinator Djordje Pevcevic Mentor Giovanna Student Mariotto Whitaker Cavalcanti Stanislas Student Koralewski Youssef Ouidani Student Zakaria Belehri Student\n\nABOUT MSF\n\nWhen was MSF founded?\n\nMédecins Sans Frontières (MSF) wa

### Summarize the Data

In [12]:
# Summarize extracted text and tables using LLM
def summarize_text_and_tables(text, tables):
    """Summarize every text chunk and every table with the chat model.

    Args:
        text: List of text chunks to summarize.
        tables: List of table chunks to summarize (may be empty).

    Returns:
        A dict with two keys, ``"text"`` and ``"table"``, each holding the
        list of summaries produced by the chain for the corresponding input.
    """
    logging.info("Ready to summarize data with LLM")
    prompt_text = """You are an assistant tasked with summarizing text and tables. \
    
                    You are to give a concise summary of the table or text and do nothing else. 
                    Table or text chunk: {element} """
    prompt = ChatPromptTemplate.from_template(prompt_text)
    model = ChatOpenAI(temperature=0.6, model="gpt-4o-mini")
    summarize_chain = {"element": RunnablePassthrough()} | prompt | model | StrOutputParser()

    # Both batches share the same concurrency cap.
    batch_config = {"max_concurrency": 5}
    summaries = {
        "text": summarize_chain.batch(text, batch_config),
        "table": summarize_chain.batch(tables, batch_config),
    }

    # Log completion only after both batches have actually run; previously this
    # message was emitted before any summarization had started.
    logging.info("%s done with summarization", model)
    return summaries

In [13]:
data_summary = summarize_text_and_tables(text, tables)

In [14]:
data_summary

{'text': ['The document outlines the agenda for a project meeting on Churn Prediction and Optimisation scheduled for January 8, 2025, involving the Médecins Sans Frontières (MSF) and ESMT teams. The agenda includes introductions of team members, an overview of MSF and its fundraising loyalty management, project scope, cadence, next steps, and a Q&A session. MSF, founded in 1971, is an international humanitarian organization focused on providing medical aid in crisis situations, operating in over 70 countries with a commitment to healthcare access for all. The project team consists of members from both MSF and ESMT, with various roles outlined for each individual.',
  "The MSF Charter outlines the principles and commitments of Médecins Sans Frontières (MSF), emphasizing their provision of assistance to those in need without discrimination, adherence to neutrality and impartiality, independence from political and religious influences, and acceptance of risks without compensation claims. 

In [15]:
text_summary = data_summary['text']

In [16]:
tables_summary = data_summary['table']

### Initialize Retriever

Here I create a RedisStore (the document store that holds the raw chunks) and a PGVector instance (backed by my PostgreSQL database) using OpenAIEmbeddings. By calling initialize_retriever(), I connect to that vector-enabled Postgres DB (and auto-install its vector extension), register my “my_documents” collection, and return a MultiVectorRetriever that lets me efficiently look up similar documents by embedding.

In [19]:
def initialize_retriever():
    """Build the multi-vector retriever used by the rest of the notebook.

    Uses the module-level Redis ``client`` for the document store and the
    ``COLLECTION_NAME`` / ``CONNECTION_STRING`` settings for the PGVector
    vector store.

    Returns:
        A ``MultiVectorRetriever`` wired to the PGVector vector store and the
        Redis-backed document store, linking the two through ``"doc_id"``.
    """
    # Metadata key that links a vector-store entry to its docstore entry.
    id_key = "doc_id"

    store = RedisStore(client=client)
    vectorstore = PGVector(
        embeddings=OpenAIEmbeddings(),
        collection_name=COLLECTION_NAME,
        connection=CONNECTION_STRING,
        use_jsonb=True,
    )

    # Use the variable instead of repeating the literal, so the key is defined once.
    retrieval_loader = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)
    return retrieval_loader

In [21]:
load_retriever = initialize_retriever()

### Add Summary to vectorstore & Raw data to RedisStore

In [24]:
# Store text, tables, and their summaries in the retriever

def store_docs_in_retriever(text, text_summary, table, table_summary, retriever):
    """Index summaries in the vector store and raw chunks in the docstore.

    Args:
        text: Raw text chunks.
        text_summary: One summary per entry of ``text``, in the same order.
        table: Raw table chunks.
        table_summary: One summary per entry of ``table``, in the same order.
        retriever: Object exposing ``vectorstore.add_documents`` and
            ``docstore.mset``.

    Returns:
        The same ``retriever`` that was passed in.

    Raises:
        ValueError: If a non-empty summary list does not have the same length
            as its list of raw documents.
    """

    def add_documents_to_retriever(documents, summaries, retriever, id_key="doc_id"):
        """Store one kind of content and return the generated document ids.

        Returns an empty list when there are no summaries to store.
        """
        if not summaries:
            return []

        # Each summary must map to exactly one raw document; otherwise ids
        # would be mis-paired or indexing would fail part-way through.
        if len(documents) != len(summaries):
            raise ValueError(
                f"Got {len(documents)} documents but {len(summaries)} summaries; "
                "each document needs exactly one summary."
            )

        doc_ids = [str(uuid.uuid4()) for _ in documents]
        # The id stored under `id_key` links each summary to its raw chunk.
        summary_docs = [
            Document(page_content=summary, metadata={id_key: doc_id})
            for doc_id, summary in zip(doc_ids, summaries)
        ]

        retriever.vectorstore.add_documents(summary_docs, ids=doc_ids)
        retriever.docstore.mset(list(zip(doc_ids, documents)))
        return doc_ids

    # Add text and table summaries to the retriever.
    add_documents_to_retriever(text, text_summary, retriever)
    add_documents_to_retriever(table, table_summary, retriever)
    return retriever

In [25]:
retriever  = store_docs_in_retriever(text, text_summary, tables,  tables_summary, load_retriever)

In [27]:
query = "What are the key factors contributing to donor churn and the proposed strategies to improve retention in the MSF Churn Prediction and Optimisation report?"

In [28]:
docs = retriever.invoke(query)

In [29]:
docs

[b'PROJECT SCOPE (1/3)\n\nResearch Title:\n\n"Optimizing Donor Retention and Lifetime Value Strategies: Predictive Analytics and Segmentation for Enhanced Donor Engagement"\n\nResearch Objective:\n\nTo develop data-driven strategies for reducing donor churn and identifying new donor segments by leveraging predictive analytics, market segmentation, and effective data visualization.\n\nLE MEDECINS SANS FRONTIERES ARZTE OHNE GRENZEN eV.\n\nPROJECT SCOPE (2/3) Research Questions:\n\n1. What are the key factors contributing to donor attrition, and how can predictive models help in identifying probable donors?\n\n2. How can the existing donor data be segmented to better understand future donor lifetime value and enhance targeted communication and improve donor retention?\n\n3. How can the findings from donor behavior analysis be effectively visualized to support decision-making and stakeholder engagement?\n\n4. Which external data sources can be utilized to better predict the donor lifetime 

### RAG Pipeline

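Parse the retriever output: the Redis docstore returns the raw chunks as bytes, so decode them to strings before passing them to the prompt.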

In [30]:
def parse_retriver_output(data):
    """Return the items of ``data`` as a list, decoding ``bytes`` items as UTF-8.

    Items that are not ``bytes`` are passed through unchanged, in order.
    """
    return [
        item.decode("utf-8") if isinstance(item, bytes) else item
        for item in data
    ]

Chat with the LLM using retrieved context

In [31]:
def chat_with_llm():
    """Assemble the retrieval-augmented question-answering chain.

    The chain reads the module-level ``retriever``. Given a question, it
    fetches and parses the matching chunks, then adds the model's answer
    under ``"response"`` next to ``"context"`` and ``"question"``.

    Returns:
        The composed runnable chain.
    """
    llm = ChatOpenAI(temperature=0.6, model="gpt-4o-mini")

    template = """
                You are an AI Assistant tasked with understanding detailed
                information from text and tables. You are to answer the question based on the 
                context provided to you. You must not go beyond the context given to you.
                
                Context:
                {context}

                Question:
                {question}
                """
    qa_prompt = ChatPromptTemplate.from_template(template)

    # Step 1: gather the inputs the prompt needs from the incoming question.
    gather_inputs = {
        "context": retriever | RunnableLambda(parse_retriver_output),
        "question": RunnablePassthrough(),
    }

    # Step 2: produce the answer while keeping the gathered inputs in the output.
    answer = qa_prompt | llm | StrOutputParser()
    return gather_inputs | RunnablePassthrough().assign(response=answer)

In [32]:
rag_chain = chat_with_llm()

In [45]:
response = rag_chain.invoke(
    "I want to know more about MSF, be concise please"
)

In [46]:
print(response['response'])

Médecins Sans Frontières (MSF) was founded in 1971 in France by a group of doctors and journalists. It is an independent organization focused on delivering emergency medical aid in crisis situations, regardless of race, religion, or political affiliation. MSF operates in over 70 countries and is composed mainly of healthcare professionals, with a current workforce of more than 69,000 people. The organization emphasizes neutrality, impartiality, and independence in its humanitarian efforts. Its funding comes from private donations, grants, and legacies, with a goal of retaining and engaging donors for long-term support.


In [39]:
response = rag_chain.invoke(
    "What is the objective of this project, and when is the deadline"
)

In [41]:
print(response['response'])

The objective of the project is to develop data-driven strategies for reducing donor churn and identifying new donor segments by leveraging predictive analytics, market segmentation, and effective data visualization. The deadline for the project is 13 April 2024, which is when the final deliverables are to be submitted.


In [42]:
response = rag_chain.invoke(
    "What are the key factors driving donor attrition and how does the project plan to predict and reduce churn?"
)

In [34]:
response

{'context': ['PROJECT SCOPE (1/3)\n\nResearch Title:\n\n"Optimizing Donor Retention and Lifetime Value Strategies: Predictive Analytics and Segmentation for Enhanced Donor Engagement"\n\nResearch Objective:\n\nTo develop data-driven strategies for reducing donor churn and identifying new donor segments by leveraging predictive analytics, market segmentation, and effective data visualization.\n\nLE MEDECINS SANS FRONTIERES ARZTE OHNE GRENZEN eV.\n\nPROJECT SCOPE (2/3) Research Questions:\n\n1. What are the key factors contributing to donor attrition, and how can predictive models help in identifying probable donors?\n\n2. How can the existing donor data be segmented to better understand future donor lifetime value and enhance targeted communication and improve donor retention?\n\n3. How can the findings from donor behavior analysis be effectively visualized to support decision-making and stakeholder engagement?\n\n4. Which external data sources can be utilized to better predict the dono

In [35]:
print(response['response'])

The key factors contributing to donor attrition are addressed in the project's research questions, specifically the first one: "What are the key factors contributing to donor attrition, and how can predictive models help in identifying probable donors?" The project plans to predict and reduce churn by developing validated predictive models that identify highly probable donors, which can help in targeting interventions to reduce donor churn. Additionally, the project includes phases focused on predictive analysis and segmentation to understand, segment, and predict donor lifetime value and churn, thereby informing strategies to enhance donor retention.


In [43]:
# Here I deliberately ask a question the PDF does not answer: MSF does not specify which algorithms to use, so the model should decline rather than guess.
response = rag_chain.invoke(
    "What predictive modeling techniques and evaluation metrics are proposed in the MSF Churn Prediction Project to forecast donor churn and lifetime value?"
)

In [44]:
print(response['response'])

The provided context does not specify any predictive modeling techniques or evaluation metrics proposed in the MSF Churn Prediction Project to forecast donor churn and lifetime value. Therefore, I cannot provide an answer to that question based on the given information.
