# Task 3: RAG pipeline and qualitative evaluation for customer-complaint questions

In [None]:
!pip install -q langchain-huggingface langchain-community sentence-transformers faiss-cpu transformers accelerate

from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import pipeline
import pandas as pd
import torch


In [None]:
# Define Retriever class
class ComplaintRetriever:
    """Similarity-search retriever backed by a FAISS index stored on disk.

    The index is loaded once in ``__init__`` and queried through
    :meth:`retrieve`.
    """

    def __init__(self, vector_store_path, model_name="sentence-transformers/all-MiniLM-L6-v2", device="cuda" if torch.cuda.is_available() else "cpu"):
        """Load the embedding model and the persisted FAISS index.

        Args:
            vector_store_path: Directory passed to ``FAISS.load_local``.
            model_name: Hugging Face model id used to embed queries.
            device: Device string handed to the embedding model. The default
                is computed once, when the class is defined: ``"cuda"`` if
                ``torch.cuda.is_available()`` else ``"cpu"``.
        """
        self.embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': device},
            encode_kwargs={'normalize_embeddings': True}
        )
        # SECURITY NOTE: allow_dangerous_deserialization=True permits LangChain
        # to unpickle the stored index data, which can execute arbitrary code.
        # Only point this at an index you built yourself or otherwise trust.
        self.db = FAISS.load_local(vector_store_path, self.embeddings, allow_dangerous_deserialization=True)

    def retrieve(self, query, top_k=5):
        """Return the ``top_k`` stored documents most similar to ``query``.

        Args:
            query: Free-text question to search for.
            top_k: Number of documents to return; must be at least 1.

        Returns:
            The list produced by ``self.db.similarity_search``; an empty list
            when ``query`` is empty or whitespace-only, because an empty query
            has no meaningful nearest neighbours.

        Raises:
            ValueError: If ``top_k`` is smaller than 1.
        """
        if top_k < 1:
            raise ValueError(f"top_k must be a positive integer, got {top_k!r}")
        if not query or not query.strip():
            return []
        return self.db.similarity_search(query, k=top_k)


In [None]:
class ComplaintAnswerGenerator:
    """Builds a prompt from complaint excerpts and runs it through a text2text pipeline."""

    def __init__(self, model_name="google/flan-t5-base"):
        """Create the ``text2text-generation`` pipeline for ``model_name``.

        The pipeline is placed on device ``0`` when CUDA is available and on
        ``-1`` otherwise.
        """
        if torch.cuda.is_available():
            device_index = 0
        else:
            device_index = -1
        self.pipe = pipeline("text2text-generation", model=model_name, device=device_index)

    def format_prompt(self, context_chunks, question):
        """Return the full prompt: instructions, joined context, then the question.

        ``context_chunks`` are joined with a blank line between them.
        """
        context_text = "\n\n".join(context_chunks)
        prompt_lines = [
            "You are a financial analyst assistant for CrediTrust. Your task is to answer questions about customer complaints.",
            "Use the following retrieved complaint excerpts to formulate your answer. If the context doesn't contain the answer, state that you don't have enough information.",
            "",
            "Context:",
            context_text,
            "",
            f"Question: {question}",
            "Answer:",
        ]
        return "\n".join(prompt_lines)

    def generate_answer(self, context_chunks, question):
        """Generate an answer for ``question`` and return it with outer whitespace removed."""
        generations = self.pipe(
            self.format_prompt(context_chunks, question),
            max_new_tokens=256,
        )
        # Only the first generation's text is used.
        first_text = generations[0]["generated_text"]
        return first_text.strip()


In [None]:
class RAGPipeline:
    """Retrieve-then-generate pipeline: fetch complaint excerpts, then answer from them."""

    def __init__(self, vector_store_path):
        """Create the retriever for ``vector_store_path`` and a default answer generator."""
        self.retriever = ComplaintRetriever(vector_store_path)
        self.generator = ComplaintAnswerGenerator()

    def answer_question(self, question, top_k=5, num_sources=2):
        """Answer ``question`` from the ``top_k`` retrieved excerpts.

        Args:
            question: The user question.
            top_k: Number of excerpts to retrieve and pass to the generator.
            num_sources: How many of the retrieved excerpts (and their
                metadata) to include in the returned dict. Defaults to 2;
                all ``top_k`` excerpts are still used to generate the answer.

        Returns:
            A dict with keys ``"question"``, ``"answer"``, ``"sources"`` and
            ``"metadata"``.

        Raises:
            ValueError: If ``num_sources`` is negative (a negative slice bound
                would silently drop excerpts from the end instead).
        """
        if num_sources < 0:
            raise ValueError(f"num_sources must be non-negative, got {num_sources!r}")

        results = self.retriever.retrieve(question, top_k)
        chunks = [res.page_content for res in results]
        metadata = [res.metadata for res in results]
        answer = self.generator.generate_answer(chunks, question)
        return {
            "question": question,
            "answer": answer,
            "sources": chunks[:num_sources],
            "metadata": metadata[:num_sources]
        }


In [None]:
# Install transformers (only once)
!pip install -q transformers

# Imports
import pandas as pd
from transformers import pipeline

# Build the summarization pipeline once so every answer reuses the same model.
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
)

# Define summarization function
def summarize_answer(text, max_length=150, min_length=50):
    """Condense ``text`` with the module-level ``summarizer`` pipeline.

    Forcing a ``min_length``-token summary out of an input that is only a few
    tokens long makes the model pad the output with invented text, so short
    inputs are returned unchanged instead of being summarized.

    Args:
        text: Answer text to condense. ``None`` or blank text yields ``""``.
        max_length: Upper bound on summary length in tokens; capped at the
            input's own token count so the summary is never asked to be
            longer than its source.
        min_length: Lower bound on summary length in tokens. Inputs with at
            most this many whitespace-separated words are returned as-is.

    Returns:
        The summary string, the stripped input when it is already short, or
        ``""`` for empty input.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return ""

    # Checked before the model is touched: if there are no more words than
    # min_length, there is nothing worth condensing.
    if len(cleaned.split()) <= min_length:
        return cleaned

    # Keep the length bounds consistent with the actual input size.
    token_count = len(summarizer.tokenizer(cleaned)["input_ids"])
    effective_max = max(2, min(max_length, token_count))
    effective_min = max(0, min(min_length, effective_max - 1))

    summary = summarizer(
        cleaned,
        max_length=effective_max,
        min_length=effective_min,
        do_sample=False,
        truncation=True,  # inputs beyond the model's context window are cut, not crashed on
    )
    return summary[0]['summary_text']

# Build the RAG pipeline on top of the persisted FAISS index.
vector_store_path = "/content/drive/MyDrive/vector_store/faiss_index_sample"
rag = RAGPipeline(vector_store_path)

# Questions used for the qualitative evaluation.
questions = [
    "Why was my credit card payment rejected?",
    "How do I dispute a charge from a BNPL provider?",
    "What are common issues with money transfers?",
    "Why was my personal loan denied?",
    "How long does it take to get a savings account refund?",
]


def _evaluation_row(question):
    """Answer one question and package the result as one evaluation-table row."""
    response = rag.answer_question(question)
    return {
        "Question": question,
        "Generated Answer": summarize_answer(response['answer']),
        "Source Excerpts": "\n---\n".join(response['sources']),
        "Quality Score (1-5)": "",  # left blank: scored manually
        "Comments": "",             # left blank: filled in manually
    }


# One row per question, in the order the questions are listed.
results = [_evaluation_row(question) for question in questions]

# Tabulate the rows and display the table as the cell output.
eval_df = pd.DataFrame(results)
eval_df


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Device set to use cuda:0
Device set to use cuda:0
Your max_length is set to 150, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 150, but your input_length is only 5. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)
Your max_length is set to 150, but your input_length is only 110. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)
Your max_length is set to 150, but your input_length is only 116. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length m

Unnamed: 0,Question,Generated Answer,Source Excerpts,Quality Score (1-5),Comments
0,Why was my credit card payment rejected?,Late payment credit card even though money ac...,tried paying discover credit card xxxx time xx...,,
1,How do I dispute a charge from a BNPL provider?,Bankers speak bank speak bank confirm bank co...,unauthorized charge removed due winning disput...,,
2,What are common issues with money transfers?,Money sent deposited transferred lost used us...,several occasion sending receiving money probl...,,
3,Why was my personal loan denied?,Credit denial happened year applied personal ...,applied loan denied due application submitted ...,,
4,How long does it take to get a savings account...,Day of the day is the 24th anniversary of the...,waiting fund account returned since year told ...,,


In [None]:
# Save the evaluation table as a Markdown file.
# NOTE: DataFrame.to_markdown needs the optional `tabulate` package.
# The encoding is set explicitly because the table holds free-text complaint
# excerpts; the platform default encoding may not be able to write them.
with open("evaluation_table.md", "w", encoding="utf-8") as f:
    f.write(eval_df.to_markdown(index=False))
print("✅ Evaluation table saved to evaluation_table.md")


✅ Evaluation table saved to evaluation_table.md
