In [None]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
import pandas as pd
# Location of the source PDF (Kaggle dataset input); it is opened with pymupdf
# further down to extract the text that feeds the retrieval index.
book_path = "/kaggle/input/gpt-dataset/book_input.pdf"

In [None]:
import subprocess

from kaggle_secrets import UserSecretsClient

# Secrets are read from the Kaggle secrets store; `user_secrets` is reused by
# later cells (e.g. for the W&B key).
user_secrets = UserSecretsClient()
PAT = user_secrets.get_secret("pat")


GITHUB_USERNAME = "vladkisin"
REPO_NAME = "workmind-dev"
REPO_URL = f"https://{GITHUB_USERNAME}:{PAT}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git"
# Absolute clone target: cloning relative to the current directory would nest a
# second checkout inside the first one when this cell is re-run after os.chdir.
REPO_DIR = f"/kaggle/working/{REPO_NAME}"

# Clone only when the checkout is missing so the cell is safe to re-run.
if not os.path.isdir(REPO_DIR):
    # Argument list (no shell) keeps the token out of shell parsing.
    clone_result = subprocess.run(["git", "clone", REPO_URL, REPO_DIR])
    if clone_result.returncode != 0:
        # Deliberately omit REPO_URL from the message: it embeds the access token.
        raise RuntimeError(
            f"git clone of {GITHUB_USERNAME}/{REPO_NAME} failed "
            f"with exit code {clone_result.returncode}"
        )
os.chdir(REPO_DIR)

In [None]:
! pip install -U -r requirements.txt --quiet

In [None]:
import wandb
# Authenticate with Weights & Biases using the key stored under "wandb_pat"
# in the Kaggle secrets store.
wandb_api_key = user_secrets.get_secret("wandb_pat")
wandb.login(key=wandb_api_key)

In [None]:
import pymupdf

def filter_page(text):
  """Return True when the page text contains one of the marker headings.

  The check is a case-sensitive substring match. The caller skips pages for
  which this returns True and keeps the rest.
  """
  marker_headings = (
      "Chapter Outline",
      "KEY TERMS",
      "EXERCISES/ACTIVITIES FOR TEACHERS AND STUDENT",
      "CASE STUDY:",
      "LEARNING OUTCOMES",
  )
  return any(heading in text for heading in marker_headings)


# Collect the text of every content page that is not flagged by filter_page,
# then join the kept pages into one string for sentence splitting.
pages = []
# Opening the PDF in a `with` block closes the document (and its file handle)
# even if text extraction fails part-way through.
with pymupdf.open(book_path) as doc:
  for page in doc[20:532]: # iterate over the relevant document pages (actual text)
    # Separate name so the per-page text does not shadow the final `text`.
    page_text = page.get_text()
    if not filter_page(page_text):
      pages.append(page_text)

text = "\n".join(pages)

In [None]:
import nltk
# Fetch the Punkt sentence-tokenizer resources before sent_tokenize is used below.
nltk.download("punkt")
nltk.download("punkt_tab")
from nltk.tokenize import sent_tokenize

import torch
from llama_index.core.schema import TextNode
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer, BitsAndBytesConfig
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import ChatPromptTemplate

from workmind.experiment.wandb.interventions import InterventionExperiment
from workmind.generators.interventions import RAGInterventionGenerator
from workmind.data.utils import chunk_text_by_sentences


# Split the extracted book text into sentences and group them into chunks
# with the project helper.
sentences = sent_tokenize(text)
chunks = chunk_text_by_sentences(sentences)

# Wrap each chunk in a TextNode so it can be handed to the vector index.
nodes = []
for chunk in chunks:
    nodes.append(TextNode(text=chunk))

In [None]:
# System prompt template for the intervention generator.
# `{entity}` is its only placeholder and is filled with str.format (with
# "email(s)" in this notebook) before the text is wrapped in a ChatMessage.
system_prompt = """
You are an advanced text analysis assistant. Your task is to:
- Read the provided {entity}.
- Determine why the content indicates dissatisfaction or frustration. Most likely it does as it was identified by a sentiment analysis engine.
- If dissatisfaction is detected:
- Summarize the core issues clearly.
- Propose concise short-term and long-term HR interventions.
- If no dissatisfaction is detected, simply indicate that there is no frustration.
- Output the response in a structured format:
    0. Dissatisfaction detected: [Yes/No]
    1. Dissatisfaction reason: [Brief summary]
    2. Interventions:
        a) Short term: [Actionable recommendations]
        b) Long term: [Actionable recommendations]
"""

# User prompt template with three named placeholders:
#   {entity}      - kind of input being analysed, substituted up front via str.format
#   {context_str} - retrieved context; presumably filled by the llama_index prompt template at query time
#   {query_str}   - the input text(s); presumably filled by the llama_index prompt template at query time
# Any str.format call on this template must account for all three fields,
# because str.format raises KeyError for a missing named field.
# NOTE(review): the "\n" escapes below sit inside a triple-quoted string, so each
# one adds a newline on top of the literal line break - confirm the resulting
# blank lines in the rendered prompt are intended.
# NOTE(review): "If there are certainly no signs of dissatisfaction are found" is
# ungrammatical; left untouched here because rewording the prompt can change model output.
# NOTE(review): the "Emails:" label is hard-coded although {entity} is a parameter.
user_prompt = """
Please analyze the following {entity} and:
1. Check if there is any dissatisfaction or frustration expressed.
2. If so, identify the main reasons for the employee’s frustration.
3. Summarize these concerns briefly and clearly.
4. Recommend actionable short-term and long-term HR personnel interventions. Be clear and concise.
If there are certainly no signs of dissatisfaction are found, just indicate "Dissatisfaction detected: No" and stop generation.

Context information is below. Use it if applicable.\n
---------------------\n
{context_str}\n
---------------------\n

Emails:
{query_str}
"""


# Refine-step prompt, used as the user message of the refine chat template.
# Placeholders: {context_msg} (additional context), {query_str} (the question)
# and {existing_answer} (the answer produced so far); presumably filled by
# llama_index during response refinement - TODO confirm.
refine_prompt_str = (
    "We have the opportunity to refine the original answer "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{context_msg}\n"
    "------------\n"
    "Given the new context, refine the original answer to better "
    "answer the question: {query_str}. "
    "If the context isn't useful, output the original answer again.\n"
    "Original Answer: {existing_answer}"
)



# Build the question-answering chat prompt (system + user message).
# Only `{entity}` is resolved at this point. `user_prompt` also contains
# `{context_str}` and `{query_str}`, which must survive as placeholders for the
# ChatPromptTemplate; str.format raises KeyError for any named field it is not
# given, so both are passed through as themselves.
chat_text_qa_msgs = [
    ChatMessage(
        role=MessageRole.SYSTEM,
        content=(
            system_prompt.format(entity="email(s)")
        ),
    ),
    ChatMessage(
        role=MessageRole.USER,
        content=user_prompt.format(
            entity="email(s)",
            context_str="{context_str}",
            query_str="{query_str}",
        ),
    ),
]
text_qa_template = ChatPromptTemplate(chat_text_qa_msgs)

# Chat prompt for the refine step: a fixed system instruction followed by the
# refine prompt (placeholders left intact) as the user message.
refine_system_msg = ChatMessage(
    role=MessageRole.SYSTEM,
    content="Always answer the question, even if the context isn't helpful. Keep it concise.",
)
refine_user_msg = ChatMessage(role=MessageRole.USER, content=refine_prompt_str)

chat_refine_msgs = [refine_system_msg, refine_user_msg]
refine_template = ChatPromptTemplate(chat_refine_msgs)


In [None]:
# Register the embedding model globally, then build the vector index over the
# book chunks prepared earlier.
embedding_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embedding_model

# NOTE(review): `top_k` does not look like a documented VectorStoreIndex
# constructor option (retrieval depth is normally set with `similarity_top_k`
# on the retriever / query engine) - confirm it has any effect here.
index = VectorStoreIndex(nodes, top_k=2)

In [None]:
# Evaluation set for the intervention experiments.
# Alternative dataset: "/kaggle/input/gpt-dataset/interventions_on_glassdoor_eval_gpt4o.csv"
eval_csv_path = "/kaggle/input/gpt-dataset/interventions_eval_gpt4o.csv"
df = pd.read_csv(eval_csv_path)

In [None]:
# Reference interventions that the generated outputs are evaluated against.
anchor = df["intervention"].tolist()
#input_texts = [[text] for text in df["text"].tolist()]
# Each "texts" cell is a string that is evaluated into a Python object
# (presumably a list of texts - TODO confirm against the CSV).
# NOTE(review): eval() executes arbitrary code found in the CSV; if the cells are
# plain literals, ast.literal_eval would be the safe equivalent - confirm and switch.
input_texts = [eval(x) for x in df["texts"]]

In [None]:
import gc

PROJECT_NAME = "workmind-interventions"

# Models evaluated one after another with the same RAG setup.
MODEL_NAMES = [
    "tiiuae/Falcon3-7B-Instruct",
    "microsoft/Phi-3-mini-4k-instruct",
    "Qwen/Qwen2.5-7B-Instruct",
]

# 4-bit NF4 quantization with double quantization and float16 compute,
# passed to every model through model_kwargs.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

for model_name in MODEL_NAMES:
  llm = HuggingFaceLLM(
      model_name=model_name,
      tokenizer_name=model_name,
      context_window=2048,
      max_new_tokens=512,
      model_kwargs={"quantization_config": quantization_config},
      device_map="cuda",
  )

  generator = RAGInterventionGenerator(
          llm=llm,
          index=index,
          text_qa_template=text_qa_template,
          refine_template=refine_template)
  try:
    with InterventionExperiment(generator, model_name + " RAG",  project_name=PROJECT_NAME) as exp:
      exp.evaluate(input_texts, anchor)
  finally:
    # Release the model before the next one is loaded. empty_cache() can only
    # return memory that nothing references any more, so every name that keeps
    # the model alive is cleared first: `llm` directly, `generator` through its
    # llm argument, and `exp` (presumably holds the generator - TODO confirm).
    # Rebinding to None instead of `del` avoids a NameError if `exp` was never bound.
    exp = generator = llm = None
    gc.collect()
    torch.cuda.empty_cache()