<a href="https://colab.research.google.com/github/the-lixy/DiSCoAI-Coding-Task/blob/main/DiSCoAICodingTaskStreamlit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Runtime dependencies for the Streamlit app that a later cell writes to DiSCoAIapp.py.
!pip install spacy
!pip install contractions
# English pipeline that the app loads with spacy.load("en_core_web_sm")
!python -m spacy download en_core_web_sm
# NOTE(review): presumably pinned because the app calls `openai.ChatCompletion.create` - confirm before unpinning
!pip install openai==0.28.1
!pip install streamlit
# provides the `lt` command that the launch cell starts to tunnel to port 8501
!npm install -g localtunnel

In [2]:
import os
from google.colab import userdata
# Copy the Colab secret stored under 'GPTkey' into this process's environment;
# the app written below reads it back with os.getenv("OPENAI_API_KEY").
os.environ.update({"OPENAI_API_KEY": userdata.get('GPTkey')})

In [35]:
%%writefile DiSCoAIapp.py
import streamlit as st
import contractions
import spacy
from spacy.matcher import PhraseMatcher
import re
import os
from google.colab import userdata
import openai


# page heading and upload prompt
st.title("Dialogue Function Classifier")
st.write("Please upload dialogue as a .txt file: ")

# file uploader widget, restricted to .txt; the rest of the script only runs
# once this is something other than None
uploaded_file = st.file_uploader(label="Choose a file", type=["txt"])

if uploaded_file is not None:
  # punctuation removal using regex
  def remove_punctuation(text: str) -> str:
      """Return *text* with everything except word characters, whitespace and '?' removed.

      '?' is deliberately kept: the classifier further down treats a question
      mark in the normalised text as the signal for a query. Note that ``\\w``
      also keeps digits and underscores.
      """
      return re.sub(r'[^\w\s\?]', '', text)

  # parse dialogue to separate speaker and utterance
  # rawUtterances keeps each turn as typed (used for display), utterances keeps
  # the normalised text (used for matching); both are appended together, so
  # position i in one corresponds to position i in the other
  rawUtterances = []
  utterances = []
  try:
      # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order mark,
      # which would otherwise end up attached to the first speaker's name
      f = uploaded_file.read().decode("utf-8-sig")
  except UnicodeDecodeError:
      st.error("Could not read the file: it is not valid UTF-8 text.")
      st.stop()
  lines = f.splitlines()
  for line in lines:
      line = line.strip()
      if ":" not in line:
          # no "speaker: text" separator on this line, nothing to parse
          continue
      # split on the first colon only, so colons inside the utterance survive
      speaker, utterance = line.split(":", 1)
      speaker = speaker.strip()
      # strip before storing so the displayed text has no stray space after the colon
      utterance = utterance.strip()
      rawUtterances.append((speaker, utterance))
      # normalisation steps - lowercase, noise removal, contraction expansion, remove punctuation
      utterance = contractions.fix(utterance.lower())
      utterance = remove_punctuation(utterance)
      utterances.append((speaker, utterance))

  # display original dialogue turns
  st.write("Original Dialogue:")
  # rawUtterances already pairs each speaker with the text as uploaded
  for speaker, spoken in rawUtterances:
      st.write(f"{speaker}: {spoken}")

  # (testing only) display normalised dialogue turns
  #st.write("Normalised Dialogue:")
  #for i, (speaker, utterance) in enumerate(utterances):
    #st.write(speaker + ": " + utterances[i][1])

  nlp = spacy.load("en_core_web_sm")

  # Cue phrases per dialogue function. They are matched against the normalised
  # utterances (lowercased, contractions expanded, punctuation other than '?'
  # removed), so every phrase has to be written in that same form.
  # "{subject}" is a placeholder that is later filled with each entry of `subjects`.
  subjects = ["i", "you", "we", "he", "she", "they", "us"]

  commitment_phrases = [
      "will", "shall", "must", "ought to", "have to",
      "going to", "gotta", "intend to", "promise to", "swear to", "vow to",
      "guarantee", "commit to"
  ]

  proposal_phrases = [
      "should", "could", "would", "may", "might", "{subject} can",
      "can possibly", "can perhaps",
      "recommend", "suggest", "advise",
      "consider", "plan to", "aim to", "hope to",
      "wish to", "try to",
      "supposed to", "expected to",
      "would like to", "let us"
  ]

  justification_phrases = [
      "because", "since", "due to", "as a result of", "true but", "yes but",
      "therefore", "thus", "consequently", "for this reason"
  ]

  # each pattern listed once; repeated entries added nothing
  query_phrases = [
      "can {subject}", "could {subject}", "would {subject}", "will {subject}", "may {subject}",
      "might {subject}", "would {subject} mind", "do {subject} know",
      "is it possible", "shall {subject}"
  ]

  deferral_phrases = [
      "maybe", "perhaps", "possibly", "might be", "could be", "not yet",
      "let us wait", "hold off", "postpone", "delay",
      "put off", "wait and see", "hold on", "defer", "for now", "need to think"
  ]

  # "should not {subject}" is spelled out: a contracted "shouldn't" cannot occur
  # in the normalised text, so a contracted pattern would never match
  challenge_phrases = [
      "why do not {subject}", "why does not {subject}", "not convinced", "how about", "what if",
      "are not {subject}", "is not {subject}", "is not it", "do not {subject} think", "would not it be",
      "could not {subject}", "should not {subject}", "is it really", "are you sure", "i am not sure"
  ]

  # add all subjects to phrase lists
  def _expand_subjects(phrases, subject_words):
      """Return *phrases* with every "{subject}" placeholder filled in.

      A phrase containing the placeholder yields one variant per entry of
      *subject_words*; a phrase without it is kept once. Repeats are dropped
      and first-seen order is preserved, so the set of distinct patterns is
      the same as a per-subject expansion of the whole list, without the
      redundant copies.
      """
      expanded = []
      for phrase in phrases:
          if "{subject}" in phrase:
              expanded.extend(phrase.format(subject=s) for s in subject_words)
          else:
              expanded.append(phrase)
      # dict.fromkeys removes duplicates while keeping insertion order
      return list(dict.fromkeys(expanded))

  challenge_phrases = _expand_subjects(challenge_phrases, subjects)
  # assigned back to `deferral_phrases`, the name the matcher setup reads
  deferral_phrases = _expand_subjects(deferral_phrases, subjects)
  query_phrases = _expand_subjects(query_phrases, subjects)
  proposal_phrases = _expand_subjects(proposal_phrases, subjects)
  commitment_phrases = _expand_subjects(commitment_phrases, subjects)
  justification_phrases = _expand_subjects(justification_phrases, subjects)



  # attr="LOWER" makes the matcher compare lowercase token text
  matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

  # one matcher entry per dialogue function; the label is recovered from the
  # match id when an utterance is classified
  labelled_phrases = (
      ("COMMITMENT", commitment_phrases),
      ("PROPOSAL", proposal_phrases),
      ("JUSTIFICATION", justification_phrases),
      ("QUERY", query_phrases),
      ("DEFERRAL", deferral_phrases),
      ("CHALLENGE", challenge_phrases),
  )
  for label_name, phrase_list in labelled_phrases:
      matcher.add(label_name, [nlp(text) for text in phrase_list])

  def categorize_most_confident_category(text):
      """Classify one normalised utterance by its dialogue function.

      Args:
          text: the utterance after normalisation.

      Returns:
          A dict with two keys: "category" (one of "query", "commitment",
          "proposal", "justification", "deferral", "challenge", "statement")
          and "matches" (the list of text spans that led to that category).
          Any '?' in the text yields "query" with matches ["?"]; no phrase
          match at all yields "statement" with matches ["none found"].
      """
      # A question mark settles the result on its own, so check it before
      # running the spaCy pipeline and the matcher.
      if "?" in text:
          return {"category": "query", "matches": ["?"]}

      doc = nlp(text)

      # Listed in tie-break order: max() returns the first of several equal
      # maxima, and dicts iterate in insertion order.
      categories = (
          "commitment", "proposal", "justification",
          "query", "deferral", "challenge",
      )
      matched_phrases = {name: [] for name in categories}

      for match_id, start, end in matcher(doc):
          # matcher labels were registered in upper case; the keys here are lower case
          label = nlp.vocab.strings[match_id].lower()
          matched_phrases[label].append(doc[start:end].text)

      # find category with max matches
      max_category = max(matched_phrases, key=lambda name: len(matched_phrases[name]))

      # if no matches found, return "statement"
      if not matched_phrases[max_category]:
          return {"category": "statement", "matches": ["none found"]}

      return {"category": max_category, "matches": matched_phrases[max_category]}

    # classify all turns
  # one result dict per turn, in the same order as `utterances`
  classified = [
      categorize_most_confident_category(normalised)
      for _, normalised in utterances
  ]

  # display a table of utterances and their classifications
  import pandas as pd
  # the three lists are index-aligned, so walk them in lockstep; the table shows
  # the text as uploaded, not the normalised form
  results = [
      {
          "Speaker": speaker,
          "Utterance": raw_text,
          "Function": outcome['category'],
          "Reason": ", ".join(f'"{m}"' for m in outcome['matches']),
      }
      for (speaker, _norm), (_spk, raw_text), outcome
      in zip(utterances, rawUtterances, classified)
  ]
  df = pd.DataFrame(results)
  st.table(df)  # or st.dataframe(df)


  # format the results for passing to the LLM
  # the first entry is a header describing the layout of the rows that follow
  formattedResults = ["Speaker: Utterance | Dialogue Function"]

  # one row per turn: speaker, text as uploaded, and the assigned function
  for (speaker, raw_text), outcome in zip(rawUtterances, classified):
      formattedResults.append(f"{speaker}: {raw_text} | {outcome['category']}\n")

  # Task 2 - LLM
  # make sure the API key is available
  api_key = os.getenv("OPENAI_API_KEY")

  if not api_key:
      st.error("API key not found. Please set it in Colab before running the app.")
      # without a key the request below can only fail, so end this script run here
      st.stop()

  openai.api_key = api_key

  instructions = (
      "Very briefly summarise this conversation in past-tense narrative format. You must always make reference to the supplied dialogue function (Proposal, Challenge, Commitment, Justification, Query, Deferral, or Statement) of each utterance. Do not include the actual dialogue. Do not make your own assumptions about the dialogue function of any utterances."
  )
  # send the classified turns as plain text, one per line, instead of the
  # repr of a Python list; trailing newlines on entries are normalised first
  transcript = "\n".join(str(entry).rstrip("\n") for entry in formattedResults)

  # request
  try:
      response = openai.ChatCompletion.create(
          model="gpt-4o-mini",
          messages=[
              {"role": "user", "content": instructions + "\n\n" + transcript}
          ]
      )
  except openai.error.OpenAIError as exc:
      # report API failures in the page rather than as a raw traceback
      st.error(f"The summary request failed: {exc}")
      st.stop()

  st.write(response.choices[0].message["content"])

Overwriting DiSCoAIapp.py


In [None]:
import subprocess
import threading
import time
import re

# start streamlit app
def run_streamlit():
    """Run `streamlit run DiSCoAIapp.py` and block until that process exits."""
    command = ["streamlit", "run", "DiSCoAIapp.py"]
    subprocess.run(command)

# run Streamlit on a background thread so this cell can carry on
threading.Thread(target=run_streamlit).start()

# give Streamlit some time to boot
time.sleep(5)

# start localtunnel and capture output
# stderr is folded into stdout: a separate stderr pipe that is never read
# could fill up and stall `lt`, and its error text would be lost
lt_process = subprocess.Popen(
    ['lt', '--port', '8501', '--subdomain', 'albu-streamlit'],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True
)

# read and display the public URL from localtunnel's output
seen_output = []
for line in lt_process.stdout:
    seen_output.append(line)
    if 'your url is:' in line:
        url = re.search(r'(https://.*\.loca\.lt)', line)
        if url:
            print("Streamlit available at:", url.group(1), ".", "Tunnel may ask you for a password, the cell below will show you the password.")
            break
else:
    # reached only when lt closed its output without a URL line being found
    print("localtunnel ended without reporting a URL. Its output was:")
    print("".join(seen_output) or "(no output)")

In [None]:
# Print the password that the tunnel page may ask for
# (-q: no progress output, -O -: write the response body to stdout).
!wget -q -O - https://loca.lt/mytunnelpassword

In [29]:
# uncomment this to kill Streamlit and LocalTunnel processes
#!pkill streamlit
#!pkill lt