# Install

In [None]:
!pip install PyMuPDF bertopic

Collecting PyMuPDF
  Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting bertopic
  Downloading bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecti

# Import Libraries

In [None]:
import fitz
import os
import re
from google import genai
import json
from zipfile import ZipFile
import ast
import zipfile
from IPython.display import clear_output
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import pandas as pd

# Data Preprocessing

In [None]:
# Base location of the raw transcripts: "<path>.zip" is the uploaded archive
# and "<path>" is the directory it is extracted into.
path = "/content/Transcript"

In [None]:
# Unpack the uploaded transcript archive into the directory named by `path`.
with ZipFile(f"{path}.zip", mode="r") as transcripts_zip:
  transcripts_zip.extractall(path)

In [None]:
# Take the Gemini API key from the environment so it is never stored in the
# notebook; when GEMINI_API_KEY is unset this passes the same empty string as before.
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY", ""))

In [None]:
# Phrases that mark where the Q&A part of an earnings-call transcript begins.
qa_patterns = [
    r"^\s*Q&A\s*$",
    r"\banalyst q&a\s*\(ceo and cfo\)",
    r"\bour first question\b",
    r"\bQUESTION AND ANSWER SECTION\b",
]

# All phrases as one alternation inside a single capturing group, matched
# case-insensitively with ^/$ anchoring at line boundaries.
combined_pattern = re.compile(
    "({})".format("|".join(qa_patterns)),
    flags=re.MULTILINE | re.IGNORECASE,
)

In [None]:
# Instruction text that split_QA prepends to each Q&A chunk before sending it to
# Gemini. The reply is parsed as JSON and its top-level "qa" list is collected.
user_prompt = """You are a financial NLP assistant used to structure transcripts from earnings calls. Given the transcript below, remove all boilerplate text and headings and extract a structured section: Q&A.

Within Q&A, identify question and answer pairs with speaker names and roles.

- A single question may have multiple people answering. If so, include each responder in a separate field: `responder1`, `responder2`, etc., and their corresponding `answer1`, `answer2`, etc.
- Omit any follow-up expressions such as "Thank you", "Thanks", or other acknowledgments that occur after an answer is given. Only include the original question and the substantive answers.

Return a *valid JSON* object with this structure and nothing else:

{
  "qa": [
    {
      "questioner": "Analyst",
      "question": "What about your exposure to CRE?",
      "responder1": "John Smith, CFO",
      "answer1": "We’re confident in our position..."
    },
    {
      "questioner": "Investor",
      "question": "What can you tell us about the change in capital requirements?",
      "responder1": "Steve Harris, CEO",
      "answer1": "Our capital requirements this year...",
      "responder2": "Jane Doe, CRO",
      "answer2": "Additionally, we’ve taken measures to..."
    }
  ]
}"""


In [None]:
# Instruction text that transcript_name prepends to the Q&A text to obtain the
# ordered list of questioners. split_QA parses the reply with ast.literal_eval,
# so the model is expected to answer with a Python-style list of strings.
split_QA_context = """You are an AI assistant. Given the following transcript of an earnings call Q&A session, extract and return the full names of the individuals who asked questions.

Only include names of those asking questions — typically analysts or investors. Do not include names of those answering the questions and Do not repeat a name. Maintain the exact order in which the questions were asked.

Example output:
["Betsy Graseck", "Gerard Cassid"]

"""

In [None]:
def _questioner_regex(name):
  """Compile a line-anchored, case-insensitive pattern for a questioner's name.

  Every word of ``name`` is regex-escaped so punctuation in a name cannot alter
  the pattern, and an optional middle initial (e.g. " A.") is accepted between
  consecutive words. Leading spaces and a trailing colon are optional.
  """
  words = [re.escape(word) for word in name.split(" ")]
  return re.compile(
      r"^ *" + r"(?: [A-Z]\.)? ".join(words) + r":?",
      flags=re.IGNORECASE | re.MULTILINE,
  )


def split_QA(qa_split, QA):
  """Split the Q&A text into chunks and extract question/answer pairs from each.

  The text is cut at the first line that starts with the 3rd, 5th, 7th, ...
  questioner's name, so every chunk covers (up to) two questioners. Each
  non-blank chunk is sent to Gemini together with ``user_prompt`` and the
  returned "qa" lists are concatenated in order.

  Args:
    qa_split: String holding a Python list literal of questioner names, in the
      order the questions were asked.
    QA: Raw text of the Q&A section.

  Returns:
    ``{"qa": [...]}`` with the question/answer dicts of all chunks.

  Raises:
    ValueError / SyntaxError: if ``qa_split`` is not a valid list literal.
    json.JSONDecodeError: if the model reply for a chunk is not valid JSON.
  """
  names = ast.literal_eval(qa_split)
  if not isinstance(names, (list, tuple)):
    raise ValueError(f"Expected a list of questioner names, got {type(names).__name__}")

  parts = []
  if names:
    # The first chunk always starts at the top of the Q&A text, which also covers
    # the case of only one or two questioners (no further cut point exists).
    starts = [0]
    cursor = 0
    for name in names[2::2]:
      # Search forward from the previous boundary so cut points stay ordered.
      found = _questioner_regex(str(name)).search(QA, cursor)
      if found is None:
        # Name not found at the start of any line: keep its text in the current chunk.
        continue
      starts.append(found.start())
      cursor = found.end()

    ends = starts[1:] + [len(QA)]
    # Blank chunks carry no transcript text and would only elicit invented output.
    parts = [QA[begin:end] for begin, end in zip(starts, ends) if QA[begin:end].strip()]

  clear_output(wait=False)

  combined_qa = []
  for qa in parts:
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=f"{user_prompt} {qa}",
        config={"response_mime_type": "application/json",
                "temperature": 0.0},
        )
    parsed = json.loads(response.text.strip())
    combined_qa.extend(parsed.get("qa") or [])

  return {"qa": combined_qa}

In [None]:
def _first_list_literal(reply):
  """Return the first ``[...]`` span of ``reply``, or the stripped reply if there is none.

  This tolerates Markdown code fences of any language tag around the list.
  """
  text = (reply or "").strip()
  found = re.search(r"\[.*?\]", text, flags=re.DOTALL)
  return found.group(0) if found else text


def transcript_name():
  """Convert every transcript PDF under ``path`` into a structured JSON file.

  ``path`` is walked as ``<path>/<bank>/<file>``. For each file the page texts
  are concatenated and split at the first match of ``combined_pattern``: the
  text before the match is stored as the presentation, and the text from the
  match onwards is sent to Gemini to list the questioners and then to
  ``split_QA`` to extract the question/answer pairs. When no Q&A marker is
  found a notice is printed and an empty "qa" list is stored.

  "Year" and "quarter" are taken from the file name split on "_"
  (presumably ``<year>_<quarter>.pdf`` - verify against the input archive).
  Output goes to ``Clean_transcript/<bank>/<file stem>.json`` relative to the
  current working directory.

  Returns:
    None. The results are written to disk only.
  """
  for banks in os.listdir(path):

    bank_path = os.path.join(path, banks)
    if not os.path.isdir(bank_path):
      # Stray files next to the bank folders cannot be listed; skip them.
      continue

    for pdf in os.listdir(bank_path):

      pdf_path = os.path.join(bank_path, pdf)
      # The context manager closes the document even if text extraction fails.
      with fitz.open(pdf_path) as doc:
        transcript = "".join(page.get_text() for page in doc)

      presentation = ""
      # Must stay a mapping: it is merged into the output dict below even when
      # the transcript has no recognisable Q&A section.
      qa = {"qa": []}

      match = combined_pattern.search(transcript)
      if match:
        index = match.start()
        first_part = transcript[:index].strip()
        second_part = transcript[index:].strip()

        response_split = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=f"{split_QA_context} {second_part}",
            config={"temperature": 0.0}
            )

        split_sentence = _first_list_literal(response_split.text)

        qa = split_QA(split_sentence, second_part)
        presentation = first_part
      else:
        print(f"No Q&A transition phrase found. {banks, pdf}")

      year_quarter = os.path.splitext(pdf)[0].split("_")

      other_details = {
          "Details": [
              {
                  "Presentation": presentation,
                  "Year": year_quarter[0],
                  # File names without an underscore have no quarter component.
                  "quarter": year_quarter[1] if len(year_quarter) > 1 else "",
              }]}
      merged_json = {**other_details, **qa}

      nested_directory = os.path.join("Clean_transcript", banks)
      clean_json = pdf.split(".")[0]

      os.makedirs(nested_directory, exist_ok=True)
      clean_path = os.path.join(nested_directory, clean_json+".json")
      with open(clean_path, 'w', encoding='utf-8') as file:
        json.dump(merged_json, file, indent=4)


In [None]:
# Run the extraction over all transcripts. transcript_name() has no return
# statement, so bank_trans is None; the results are the JSON files it writes.
bank_trans = transcript_name()

In [None]:
def zip_folder(folder_path, output_path):
    """Write every file found under ``folder_path`` into a deflate-compressed zip.

    Member names are stored relative to ``folder_path``, so the archive does
    not contain the leading directory components of the source folder.
    """
    with zipfile.ZipFile(output_path, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        for current_dir, _subdirs, filenames in os.walk(folder_path):
            for filename in filenames:
                source = os.path.join(current_dir, filename)
                member = os.path.relpath(source, start=folder_path)
                archive.write(source, arcname=member)

In [None]:
# Archive the cleaned JSON transcripts. The output name is relative, so the zip
# is created in the current working directory.
zip_folder('/content/Clean_transcript', 'Clean_transcript.zip')

# Question Avoidance

In [None]:
# Directory holding the cleaned per-bank JSON transcripts.
clean_path = "/content/Clean_transcript"

# Restore the cleaned transcripts from their archive into that directory.
with ZipFile("/content/Clean_transcript.zip", mode='r') as clean_zip:
    clean_zip.extractall(path=clean_path)

In [None]:
def detect_avoidance(context, question, response):
    """Ask Gemini whether ``response`` avoids answering ``question``.

    Args:
        context: Background text given to the model (the caller passes the
            presentation part of the transcript).
        question: The question that was asked.
        response: The answer text to evaluate.

    Returns:
        "Answered" if the first word of the model's reply is "answered"
        (ignoring case and surrounding punctuation or markup), otherwise
        "Avoided". An empty reply also yields "Avoided".
    """

    prompt = f"""
You are a helpful assistant that evaluates whether a response avoids answering a question.

### Context:
{context}

### Question:
{question}

### Response:
{response}

Does the response avoid answering the question?

Return only one word:
- Answered
- Avoided
"""

    result = client.models.generate_content(
              model="gemini-2.0-flash",
              contents= prompt,
              config={"temperature": 0.0}
              )

    # Take the first alphabetic word so bullets, asterisks, or a trailing period
    # around the verdict do not matter; `.text` can be empty or None.
    first_word = re.search(r"[A-Za-z]+", result.text or "")
    if first_word is not None and first_word.group(0).lower() == "answered":
        return "Answered"
    # Same fallback as before: anything that is not a clear "Answered".
    return "Avoided"

In [None]:
# Label every extracted question as "Answered" or "Avoided" and write the
# labels back into the same JSON file.
for banks in os.listdir(clean_path):

  bank_path = os.path.join(clean_path, banks)
  if not os.path.isdir(bank_path):
    # Only bank folders contain transcripts; skip stray files.
    continue

  for transcript in os.listdir(bank_path):
    transcript_path = os.path.join(bank_path, transcript)

    with open(transcript_path, "r", encoding="utf-8") as file:
      data = json.load(file)

    for i in data["qa"]:

      # The extraction prompt allows any number of responders (answer1,
      # answer2, answer3, ...), so gather all of them in numeric order.
      answer_keys = sorted(
          (key for key in i if re.fullmatch(r"answer\d+", key)),
          key=lambda key: int(key[len("answer"):]),
      )
      # Join with a space so text from different speakers does not run together.
      full_answer = " ".join(i[key] for key in answer_keys if i[key])
      answer = detect_avoidance(data["Details"][0]["Presentation"], i["question"], full_answer)
      i["detect_avoidance"] = answer

    merged_json = {"Details": data["Details"], "qa": data["qa"]}

    with open(transcript_path, "w", encoding="utf-8") as file:
      json.dump(merged_json, file, indent=4, ensure_ascii=False)

In [None]:
# Re-archive the transcripts now that each question carries a "detect_avoidance" label.
zip_folder(clean_path, 'Clean_transcript.zip')

# Commonly Avoided Questions by Quarter

In [None]:
# Template for filtering avoided questions down to the risk-related ones.
# The "{questions}" placeholder is filled by risk_assessment via str.format.
# The calling loop treats a reply of exactly "No" as "no risk questions".
prompt_risk_template = """
You are a risk analyst evaluating questions asked to banks.

Task:
- Review the list of questions.
- Identify only the questions that are clearly asking about **risk to the bank** — such as:
  - Credit risk
  - Regulatory or compliance risk
  - Market or interest rate risk
  - Macroeconomic uncertainty
  - Capital adequacy
  - Reputational risk
  - Operational or integration risk
- Only include a question if it contains a clear concern about potential negative outcomes or exposure.

Instructions:
- Be strict: Do not include general strategy, growth, or performance questions unless they explicitly or implicitly focus on risk.
- If **no** questions are about risk, reply with: No
- If there **are** risk-related questions, return them in a list format with the bank name as a header.

Format:
Bank Name:
- question 1
- question 2
...

Questions:
{questions}
"""

In [None]:
# Template for the final summary over all collected risk-related questions.
# The "{questions}" placeholder is filled by risk_assessment via str.format.
prompt_summarization = """
You are a risk-focused summarization analyst.

You have been given a list of risk-related questions that have already been identified across multiple banks. Your tasks are:

1. **Summarize** each risk-related question in 1–2 concise sentences.
2. **Group** the summaries by bank name.
3. At the end, **return a count** of how many risk-related questions came from each bank.

Only include the summaries and the final count section. Do not include any unrelated questions or commentary.

Questions:
{questions}
"""

In [None]:
def risk_assessment(prompt, questions):
  """Fill the ``{questions}`` placeholder of ``prompt`` and send it to Gemini.

  Returns the response object of ``client.models.generate_content`` unchanged;
  callers read its ``.text`` attribute.
  """
  return client.models.generate_content(
      model="gemini-2.0-flash",
      contents=prompt.format(questions=questions),
      config={"temperature": 0.0},
  )

In [None]:
# Bank folders under the cleaned-transcript directory, and the per-quarter file
# names as found in the first listed bank folder only.
banks = os.listdir(clean_path)
quarters_and_years = os.listdir(os.path.join(clean_path, banks[0]))

In [None]:
# For every quarter: count the avoided questions per bank, then ask Gemini
# which of those avoided questions are about risk.
Avoided_count = {}
risk_questions = []
for quarter in quarters_and_years:

  questions = []
  Avoided_by_quarter = []
  for bank in banks:

    file_path = os.path.join(clean_path, bank, quarter)
    if not os.path.isfile(file_path):
      # The quarter list comes from the first bank only, so another bank may
      # not have this file; its cell in the table below is left empty.
      print(f"Missing transcript, skipped: {file_path}")
      continue

    with open(file_path, "r", encoding="utf-8") as file:
      data = json.load(file)

    # `counter` numbers the avoided questions of this bank, starting at 1.
    counter = 1
    for i in data["qa"]:
      if i.get("detect_avoidance") == "Avoided":
        questions.append({bank :f'{counter}. {i["question"]} '})
        counter += 1

    Avoided_by_quarter.append({bank: counter - 1})
  Avoided_count[os.path.splitext(quarter)[0]] = Avoided_by_quarter

  if not questions:
    # Nothing was avoided in this quarter, so there is nothing to classify.
    continue

  assessing_risk  = risk_assessment(prompt_risk_template, questions)

  clean_output = (assessing_risk.text or "").strip()
  # The prompt asks for "No" when nothing is risk-related; accept "no" / "No." too.
  if clean_output and clean_output.rstrip(".").lower() != "no":
    year_quarter = os.path.splitext(quarter)[0].split("_")
    print(f"Year: {year_quarter[0]}, Quarter: {year_quarter[1]}")
    risk_questions.append(clean_output)
    print(clean_output)
    print("\n")

# Flatten [{bank: n}, ...] per quarter into {quarter: {bank: n}}.
cleaned_data = {}
for quarter, entries in Avoided_count.items():
    cleaned_data[quarter] = {k: v for d in entries for k, v in d.items()}


# Rows are year/quarter labels, columns are banks.
df = pd.DataFrame(cleaned_data).T
df = df.sort_index()
print("Number of Avoided Questions by Year and Quarter per Bank")
print(df)
print("_____________________________________________________________")
if risk_questions:
  print( risk_assessment(prompt_summarization, risk_questions).text.strip())
else:
  # Avoid asking the model to summarize an empty list.
  print("No risk-related avoided questions were identified.")

Year: 2024, Quarter: Q2
Citi:
- 2. Yeah, just to say real short follow-up to that. So you're doing all this great stuff, but you still fell short, just in like one sentence, despite doing all of this great stuff that you described, the regulator still said you didn't get it done. Why, after doing all that, didn't you get it done in the eyes of the regulators and why will it be fixed now? Just like a one sentence explanation for that if you have it. 

JPMorgan:
- 1. So, wanted to start off with a question on capital just given some indications that the Fed is considering favorable revisions to both Basel III Endgame and the GSIB surcharge calculations, which I know you've been pushing for some time. As you evaluate just different capital scenarios, are these revisions material enough where they could support a higher normalized ROTCE at the Firm versus the 17% target? And if so, just how that might impact or inform your appetite for buybacks going forward? 
- 3. Your 17% through the cyc

In [None]:
# Show the raw per-quarter risk-question blocks collected by the loop above.
print(risk_questions)

[]
