In [261]:
import pandas as pd
import numpy as np
import nltk
from nltk import pos_tag, word_tokenize
import re
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer
import json
import spacy

In [262]:
### Human Evaluator Verified Dataset
# Load the human-verified rows and reduce them to one row per (group, prefix)
# holding the list of zero-based follow-up positions that were verified.
data = pd.read_csv("output_mode_2_only_valid_questions.csv")
og_data_size = len(data)

# NOTE(review): duplicates are dropped on `prefix` alone, so a prefix shared by
# two groups would keep only its first occurrence -- confirm this is intended.
df_gp = data[["group", "prefix"]].drop_duplicates(subset=["prefix"])
print(f"Original Data size of {og_data_size} reduced to -> {len(df_gp)}")

# A prefix is split on "_": the first part (e.g. 3000) becomes the new prefix,
# the second part is a one-based position that is shifted to zero-based.
prefix_parts = df_gp['prefix'].str.split('_')
df_gp['prefix'] = prefix_parts.str[0]
df_gp['index'] = prefix_parts.str[1].astype(int) - 1

# Only the leading part of the group label is kept.
df_gp['group'] = df_gp['group'].str.split('_').str[0]

# Collect every position belonging to the same (group, prefix) into one list.
df_gp = df_gp.groupby(['group', 'prefix'])['index'].apply(list).reset_index()

Original Data size of 909 reduced to -> 155


In [263]:
### Human Evaluator Verified Dataset
# For each (group, prefix) row of `df_gp`, find the record with the matching
# `id` in that group's clustered JSON file and keep only the follow-up
# questions at the positions listed in the row's `index` list. One filtered
# frame is produced per group.
full_path = '/Users/tkang/Documents/research/nlp_followupqg/Auto_Evaluation/full_clustered.json'
gpt_path = '/Users/tkang/Documents/research/nlp_followupqg/Auto_Evaluation/gpt_clustered.json'
org_path = '/Users/tkang/Documents/research/nlp_followupqg/Auto_Evaluation/org_clustered.json'

# Load data
full_df = pd.read_json(full_path)
gpt_df = pd.read_json(gpt_path)
org_df = pd.read_json(org_path)

# Group name -> source frame. A single lookup replaces the two parallel
# `match` statements, so an unknown group is detected in exactly one place.
source_frames = {'full': full_df, 'gpt': gpt_df, 'org': org_df}

# Integer ids per group, computed once, so each lookup is a vectorised
# comparison rather than a Python-level scan over every JSON row.
source_ids = {group: frame['id'].astype(int) for group, frame in source_frames.items()}

# Matches are collected as plain dicts and converted to frames once at the
# end; growing a DataFrame with `.loc[len(df)] = row` re-allocates per insert.
kept_records = {group: [] for group in source_frames}

json_df = None

for index, row in df_gp.iterrows():
    group = row['group']
    json_df = source_frames.get(group)
    if json_df is None:
        # Control flow kept from the original: the first unknown group stops
        # processing of all remaining rows.
        print("Invalid File Found")
        break

    id_matches = json_df[source_ids[group] == int(row['prefix'])]
    if id_matches.empty:
        # No record with this id in the group's file: nothing to keep.
        continue

    # Only the first matching record is used, as before.
    record = id_matches.iloc[0].to_dict()
    # `row['index']` is a list of positions; indexing a NumPy array with it
    # selects exactly those follow-ups (and raises if a position is out of range).
    record['generated_follow_up'] = np.array(record['generated_follow_up'])[row['index']]
    kept_records[group].append(record)

# Passing `columns=` keeps the source schema even when a group has no matches.
filtered_data_full = pd.DataFrame(kept_records['full'], columns=full_df.columns)
filtered_data_gpt = pd.DataFrame(kept_records['gpt'], columns=gpt_df.columns)
filtered_data_org = pd.DataFrame(kept_records['org'], columns=org_df.columns)





















In [264]:
### Entire Dataset Collected by Each of the 3 Models
# Locations of the three clustered JSON exports (one per model).
full_path = '/Users/tkang/Documents/research/nlp_followupqg/Auto_Evaluation/full_clustered.json'
gpt_path = '/Users/tkang/Documents/research/nlp_followupqg/Auto_Evaluation/gpt_clustered.json'
org_path = '/Users/tkang/Documents/research/nlp_followupqg/Auto_Evaluation/org_clustered.json'

# Read each file into its own DataFrame, in the order full, gpt, org.
full_df, gpt_df, org_df = [
    pd.read_json(json_path) for json_path in (full_path, gpt_path, org_path)
]

In [293]:
# Shared resources for the question-validity checks defined in this cell.
# Fetch the NLTK data packages (a no-op when they are already up to date).
for nltk_resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# Regex for an angle-bracketed run of word characters, e.g. "<unk>"; a
# follow-up question containing such a token is rejected by contains_invalid_word.
invalid_words_pattern = r'<\w+>'

# spaCy pipeline used by the dependency-parsing check.
nlp = spacy.load("en_core_web_sm")

def contains_question_mark(sentence, invalid_questions):
    """Return True when `sentence` ends with a question mark.

    Args:
        sentence: The candidate follow-up question (a string).
        invalid_questions: List that collects rejected sentences; `sentence`
            is appended to it when the check fails.

    Returns:
        True if the last character of `sentence` is '?', otherwise False.
        An empty string is rejected (and recorded) instead of raising
        IndexError as `sentence[-1]` would.
    """
    # str.endswith is False for "" whereas indexing [-1] would raise.
    if sentence.endswith('?'):
        return True

    invalid_questions.append(sentence)
    return False

def is_question_dependency_parsing(sentence, invalid_questions):
    """Heuristically decide from a spaCy dependency parse whether `sentence` is a question.

    The sentence is accepted as soon as any token satisfies one of:
      * its dependency label is attr/nsubj/advmod and its head's label is
        ROOT/nsubj/advmod;
      * it is an auxiliary ("aux") attached directly to the ROOT token;
      * its lower-cased text is "why", "how" or "where" (at any position in
        the sentence, not only at the start).

    Args:
        sentence: The candidate follow-up question.
        invalid_questions: List that collects rejected sentences; `sentence`
            is appended to it when no token matches.

    Returns:
        True when at least one token matches, otherwise False.
    """
    dependent_labels = {"attr", "nsubj", "advmod"}
    head_labels = {"ROOT", "nsubj", "advmod"}
    question_words = {"why", "how", "where"}

    def _is_question_signal(tok):
        # Subject/attribute/adverbial modifier hanging off ROOT, a subject or an adverb.
        if tok.dep_ in dependent_labels and tok.head.dep_ in head_labels:
            return True
        # Auxiliary verb attached to the sentence root (Do you..., Can we..., Is it...).
        if tok.dep_ == "aux" and tok.head.dep_ == "ROOT":
            return True
        # Explicit question word anywhere in the sentence.
        return tok.text.lower() in question_words

    parsed = nlp(sentence)
    if any(_is_question_signal(tok) for tok in parsed):
        return True

    # No indicator found: record the sentence as rejected.
    invalid_questions.append(sentence)
    return False

def contains_invalid_word(question, invalid_questions):
    """Report whether `question` contains a token matching `invalid_words_pattern`.

    Args:
        question: The candidate follow-up question.
        invalid_questions: List that collects rejected questions; `question`
            is appended to it when a match is found.

    Returns:
        True when the pattern occurs anywhere in `question`, otherwise False.
    """
    found = re.search(invalid_words_pattern, question) is not None
    if found:
        invalid_questions.append(question)
    return found

def get_consecutive_word_sequences(sentence, min_length=10):
    """Return every run of `min_length` consecutive words found in `sentence`.

    Words are the `\\b\\w+\\b` matches of the sentence, so punctuation is
    dropped; case is preserved. Each run is returned as its words joined by
    single spaces.

    Args:
        sentence: Text to scan.
        min_length: Number of consecutive words per run. Defaults to 10, the
            value that was previously hard-coded.

    Returns:
        A set of strings; empty when the sentence has fewer than
        `min_length` words.

    Raises:
        ValueError: If `min_length` is smaller than 1.
    """
    if min_length < 1:
        # A window of zero words would yield only empty strings, which would
        # then "match" between any two sentences.
        raise ValueError("min_length must be at least 1")

    words = re.findall(r'\b\w+\b', sentence)  # Extract words

    # Slide a window of `min_length` words across the sentence; the range is
    # empty when there are not enough words.
    return {
        " ".join(words[start:start + min_length])
        for start in range(len(words) - min_length + 1)
    }

def contains_duplicate_words(og_question, og_answer, follow_up_question, invalid_questions):
    """Detect a follow-up that copies a long run of words from the original question/answer.

    The original question and answer are joined with a space, and the word
    runs produced by `get_consecutive_word_sequences` for that text are
    compared with the runs of the follow-up question.

    Args:
        og_question: Original question text.
        og_answer: Original answer text.
        follow_up_question: Candidate follow-up question.
        invalid_questions: List that collects rejected questions; the
            follow-up is appended to it when a shared run is found.

    Returns:
        True when the two texts share at least one run, otherwise False.
    """
    source_text = " ".join((og_question, og_answer))

    source_runs = get_consecutive_word_sequences(source_text)
    follow_up_runs = get_consecutive_word_sequences(follow_up_question)

    # No run in common means nothing was copied verbatim.
    if source_runs.isdisjoint(follow_up_runs):
        return False

    invalid_questions.append(follow_up_question)
    return True

# combining all the other methods
def is_valid_question(question, og_question, og_answer, invalid_questions):
    """Run all validity checks on a follow-up question, stopping at the first failure.

    Checks, in order: ends with '?', passes the dependency-parse heuristic,
    contains no invalid token, and shares no long word run with the original
    question/answer. Each check appends the question to `invalid_questions`
    when it fails, and later checks are skipped, so a rejected question is
    recorded once.

    Returns:
        True when every check passes, otherwise False.
    """
    if not contains_question_mark(question, invalid_questions):
        return False
    if not is_question_dependency_parsing(question, invalid_questions):
        return False
    if contains_invalid_word(question, invalid_questions):
        return False
    if contains_duplicate_words(og_question, og_answer, question, invalid_questions):
        return False
    return True

def filterInvalidFollowUpQuestions(df):
    """Drop invalid follow-up questions from every row of `df`.

    For each row, the `generated_follow_up` list is replaced by the entries
    that pass `is_valid_question` against that row's `question` and `answer`.
    The frame is modified in place, so callers pass a copy when the original
    must be preserved.

    Args:
        df: Frame with at least the columns `question`, `answer` and
            `generated_follow_up`.

    Returns:
        A tuple `(df, invalid_questions)`: the same frame object and the list
        of every rejected follow-up question.
    """
    rejected = []

    for row_label, record in df.iterrows():
        source_question = record['question']
        source_answer = record['answer']

        kept = []
        for candidate in record['generated_follow_up']:
            if is_valid_question(candidate, source_question, source_answer, rejected):
                kept.append(candidate)

        # .at writes the whole list into the single cell for this row.
        df.at[row_label, "generated_follow_up"] = kept

    return df, rejected

[nltk_data] Downloading package punkt to /Users/tkang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tkang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [280]:
# og_question = "ELI5: What is a heat dome?"
# og_answer = "Is when the high pressure in the atmosphere traps the hot air in below. As you would know, the hot air rises, which only causes the air to compress because of the pressure from above and it gets hotter, hotter, hotter and denser (That's why you would kill for a glass of water, the hot air is literally pushing you against more hot air)"
# question = ">That's why you would kill for a glass of water  Is that why people die from heatstroke?"

# contains_duplicate_words(og_question, og_answer, question)

In [294]:
# Write each model's frame to a JSON file of records in the working directory.
for frame, out_name in (
    (full_df, "full_df.json"),
    (gpt_df, "gpt_df.json"),
    (org_df, "org_df.json"),
):
    frame.to_json(out_name, orient="records", indent=4)

In [295]:
def _filter_with_report(label, frame):
    """Filter a copy of `frame` and print the follow-up counts before and after."""
    print(f"before filtering out all invalid follow up questions in {label}: {len(frame['generated_follow_up'].explode())}")
    # A copy is passed because the filter rewrites the column in place.
    valid_only, rejected = filterInvalidFollowUpQuestions(frame.copy())
    print(f"after filtering out all invalid follow up questions in {label}: {len(valid_only['generated_follow_up'].explode())}")
    return valid_only, rejected


full_df_valid_follow_up_only, full_invalid_questions = _filter_with_report("FULL", full_df)

gpt_df_valid_follow_up_only, gpt_invalid_questions = _filter_with_report("GPT", gpt_df)

org_df_valid_follow_up_only, org_invalid_questions = _filter_with_report("ORG", org_df)


before filtering out all invalid follow up questions in FULL: 2061
after filtering out all invalid follow up questions in FULL: 1931
before filtering out all invalid follow up questions in GPT: 1895
after filtering out all invalid follow up questions in GPT: 1827
before filtering out all invalid follow up questions in ORG: 2349
after filtering out all invalid follow up questions in ORG: 1568


In [296]:
### cluster by similar follow up questions
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer
import json
from nltk.corpus import stopwords
nltk.download("stopwords")

# Sentence-embedding model used to vectorise the follow-up questions.
embedder = SentenceTransformer("all-mpnet-base-v2")

# Filtered frames and their labels, in matching order.
models = [full_df_valid_follow_up_only, gpt_df_valid_follow_up_only, org_df_valid_follow_up_only]
model_names = ["full", "gpt", "org"]

# Thresholds 0.0, 0.1, ..., 1.0. np.arange with a fractional step accumulates
# floating-point error (0.30000000000000004, 0.6000000000000001, ...), and
# those values end up in the `clustered_dt_*` column names and in the logs.
# linspace fixes the end points and rounding gives the clean one-decimal value.
distance_thresholds = np.round(np.linspace(0, 1, 11), 1)
# distance_thresholds = [0,1]
clustered_models = []
stop_words = set(stopwords.words("english"))

def remove_stopwords(sentence):
    """Return `sentence` with every token whose lower-cased form is in `stop_words` removed.

    The sentence is tokenised with `word_tokenize`; the surviving tokens are
    joined back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(sentence):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# For every model and every distance threshold, cluster each row's follow-up
# questions and record the clusters plus the number of clusters per row.
for idx, (model_name, model_df) in enumerate(zip(model_names, models)):
    print("Start working on model")
    data = model_df.copy()

    # Stopword removal and sentence embedding do not depend on the distance
    # threshold, so they are done once per row here rather than once per
    # (row, threshold) pair inside the threshold loop.
    prepared_rows = []
    for index, row in data.iterrows():
        follow_ups = row["generated_follow_up"]

        # Normalise the raw cell BEFORE iterating over it: a bare string would
        # otherwise be split into characters, and a non-sequence value (e.g. a
        # missing entry) cannot be iterated at all.
        if isinstance(follow_ups, str):
            follow_ups = [follow_ups]
        elif not isinstance(follow_ups, (list, tuple, np.ndarray)):
            prepared_rows.append((row['id'], None, None))
            continue

        # remove stopwords
        current_corpus = [remove_stopwords(sentence) for sentence in follow_ups]

        # Clustering needs at least two sentences, so only those rows are embedded.
        corpus_embeddings = embedder.encode(current_corpus) if len(current_corpus) > 1 else None
        prepared_rows.append((row['id'], current_corpus, corpus_embeddings))

    for dt in distance_thresholds:
        print(f"Evaluating distance threshold: {dt}...")
        # Store clustered follow-up questions
        clustered_follow_ups = []
        clustered_follow_ups_count = []

        for task_id, current_corpus, corpus_embeddings in prepared_rows:
            if current_corpus is None:
                # Unusable cell: no clusters.
                clustered_follow_ups.append([])
                clustered_follow_ups_count.append(0)
                continue

            if corpus_embeddings is None:
                # Zero or one sentence: nothing to cluster. The count is the
                # real number of sentences, so a row whose follow-ups were all
                # filtered out contributes 0 clusters rather than 1, and the
                # stored value keeps the list-of-clusters shape used below.
                if current_corpus:
                    clustered_follow_ups.append([current_corpus])
                    print(f"Task {task_id} - Only 1 sample, skipping clustering.")
                else:
                    clustered_follow_ups.append([])
                    print(f"Task {task_id} - No samples, skipping clustering.")
                clustered_follow_ups_count.append(len(current_corpus))
                continue

            # Perform agglomerative clustering; with n_clusters=None the number
            # of clusters is determined by the distance threshold.
            clustering_model = AgglomerativeClustering(
                metric='cosine', linkage='average',
                n_clusters=None, distance_threshold=dt
            )
            cluster_assignment = clustering_model.fit(corpus_embeddings).labels_

            # Group sentences by cluster label, in order of first appearance.
            clustered_sentences = {}
            for sentence, cluster_id in zip(current_corpus, cluster_assignment):
                clustered_sentences.setdefault(cluster_id, []).append(sentence)

            # Collect all clustered follow-ups
            clustered_list = list(clustered_sentences.values())

            # Append clustered results to columns
            clustered_follow_ups.append(clustered_list)
            clustered_follow_ups_count.append(len(clustered_list))

        # Add new columns to DataFrame
        data[f"clustered_dt_{dt}"] = clustered_follow_ups
        data[f"clustered_count_dt_{dt}"] = clustered_follow_ups_count

    clustered_models.append(data)

print("Clustering complete for all models.")


[nltk_data] Downloading package stopwords to /Users/tkang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Start working on model
Evaluating distance threshold: 0.0...
Task 3034 - Only 1 sample, skipping clustering.
Task 3094 - Only 1 sample, skipping clustering.
Task 3452 - Only 1 sample, skipping clustering.
Evaluating distance threshold: 0.1...
Task 3034 - Only 1 sample, skipping clustering.
Task 3094 - Only 1 sample, skipping clustering.
Task 3452 - Only 1 sample, skipping clustering.
Evaluating distance threshold: 0.2...
Task 3034 - Only 1 sample, skipping clustering.
Task 3094 - Only 1 sample, skipping clustering.
Task 3452 - Only 1 sample, skipping clustering.
Evaluating distance threshold: 0.30000000000000004...
Task 3034 - Only 1 sample, skipping clustering.
Task 3094 - Only 1 sample, skipping clustering.
Task 3452 - Only 1 sample, skipping clustering.
Evaluating distance threshold: 0.4...
Task 3034 - Only 1 sample, skipping clustering.
Task 3094 - Only 1 sample, skipping clustering.
Task 3452 - Only 1 sample, skipping clustering.
Evaluating distance threshold: 0.5...
Task 3034 - O

In [297]:
# One clustered frame per model, in the order full, gpt, org.
df_clustered_full, df_clustered_gpt, df_clustered_org = [
    pd.DataFrame(clustered_models[position]) for position in range(3)
]

# Model name -> list of mean cluster counts, one entry per distance threshold.
average_cluster_count_per_dt = {"full": [], "gpt": [], "org": []}

for model_i in range(3):
    clustered_frame = clustered_models[model_i]
    average_cluster_count_per_dt[model_names[model_i]] = [
        clustered_frame[f"clustered_count_dt_{threshold}"].mean()
        for threshold in distance_thresholds
    ]

# df_clustered_full["generated_follow_ups_clustered_count"]

# temp = df_clustered_full[df_clustered_full["generated_follow_ups_clustered_count"] > 1]
# for group in temp["generated_follow_ups_clustered"]:
#     print(group)

In [298]:
# Display, per model, the list of mean cluster counts (one entry per distance threshold).
average_cluster_count_per_dt

{'full': [3.8542914171656686,
  3.852295409181637,
  3.8043912175648704,
  3.5668662674650697,
  2.9301397205588824,
  2.2215568862275448,
  1.6107784431137724,
  1.2355289421157685,
  1.0718562874251496,
  1.0159680638722555,
  1.001996007984032],
 'gpt': [3.6467065868263475,
  3.6447105788423153,
  3.5808383233532934,
  3.241516966067864,
  2.590818363273453,
  1.9001996007984032,
  1.3912175648702594,
  1.127744510978044,
  1.0199600798403194,
  1.001996007984032,
  1.0],
 'org': [3.129740518962076,
  3.1277445109780437,
  3.1197604790419162,
  3.057884231536926,
  2.904191616766467,
  2.6127744510978044,
  2.269461077844311,
  1.874251497005988,
  1.5049900199600799,
  1.1736526946107784,
  1.0]}

In [299]:
# Print, for every distance threshold, each model's mean cluster count.
# Iterating over `distance_thresholds` itself (instead of a hard-coded
# range(11)) keeps this cell correct when the threshold grid changes.
for i, threshold in enumerate(distance_thresholds):
    print(f"Average Cluster Count for Distance Threshold ({threshold}):")
    for model, values in average_cluster_count_per_dt.items():
        print(f"Model {model}: {values[i]}")
    print("\n")

Average Cluster Count for Distance Threshold (0.0):
Model full: 3.8542914171656686
Model gpt: 3.6467065868263475
Model org: 3.129740518962076


Average Cluster Count for Distance Threshold (0.1):
Model full: 3.852295409181637
Model gpt: 3.6447105788423153
Model org: 3.1277445109780437


Average Cluster Count for Distance Threshold (0.2):
Model full: 3.8043912175648704
Model gpt: 3.5808383233532934
Model org: 3.1197604790419162


Average Cluster Count for Distance Threshold (0.30000000000000004):
Model full: 3.5668662674650697
Model gpt: 3.241516966067864
Model org: 3.057884231536926


Average Cluster Count for Distance Threshold (0.4):
Model full: 2.9301397205588824
Model gpt: 2.590818363273453
Model org: 2.904191616766467


Average Cluster Count for Distance Threshold (0.5):
Model full: 2.2215568862275448
Model gpt: 1.9001996007984032
Model org: 2.6127744510978044


Average Cluster Count for Distance Threshold (0.6000000000000001):
Model full: 1.6107784431137724
Model gpt: 1.391217564

In [300]:
### after removing stopwords and using min_length = 10
# Mean of the per-threshold averages, reported per model.
print("Average Cluster Count across all Distance Threshold")
for label, key in (("FULL", "full"), ("ORG", "org"), ("GPT", "gpt")):
    print(f"Model {label}: {np.mean(average_cluster_count_per_dt[key])}")

Average Cluster Count across all Distance Threshold
Model FULL: 2.378697151152241
Model ORG: 2.343131917982218
Model GPT: 2.195064416621303


In [292]:
### ^ check the second one above
# Same summary as the other "Average Cluster Count" cells: mean of the
# per-threshold averages for each model.
print("Average Cluster Count across all Distance Threshold")
for label, key in (("FULL", "full"), ("ORG", "org"), ("GPT", "gpt")):
    print(f"Model {label}: {np.mean(average_cluster_count_per_dt[key])}")

Average Cluster Count across all Distance Threshold
Model FULL: 2.378697151152241
Model ORG: 2.343131917982218
Model GPT: 2.195064416621303


In [277]:
### after removing stopwords - and using 1/3 length as min_length
# Mean of the per-threshold averages, reported per model.
print("Average Cluster Count across all Distance Threshold")
for label, key in (("FULL", "full"), ("ORG", "org"), ("GPT", "gpt")):
    print(f"Model {label}: {np.mean(average_cluster_count_per_dt[key])}")

Average Cluster Count across all Distance Threshold
Model FULL: 2.4071856287425146
Model ORG: 2.6162221012520415
Model GPT: 2.1981491562329887


In [254]:
# for when min_length = 10 for get_consecutive_word_sequences function
# Mean of the per-threshold averages, reported per model.
print("Average Cluster Count across all Distance Threshold")
for label, key in (("FULL", "full"), ("ORG", "org"), ("GPT", "gpt")):
    print(f"Model {label}: {np.mean(average_cluster_count_per_dt[key])}")

Average Cluster Count across all Distance Threshold
Model FULL: 2.4289602612955905
Model ORG: 2.3124659771366356
Model GPT: 2.256033387769915


In [180]:
# Calculate the average number of clusters for each OQ/OA pair
mean_cluster_count_full_df = df_clustered_full["generated_follow_ups_clustered_count"].mean()
mean_cluster_count_org_df = df_clustered_org["generated_follow_ups_clustered_count"].mean()
mean_cluster_count_gpt_df = df_clustered_gpt["generated_follow_ups_clustered_count"].mean()

print(f"Average Number of Unique Follow Up Questions (Clusters) for Full Dataset: {mean_cluster_count_full_df}")
print(f"Average Number of Unique Follow Up Questions (Clusters) for ORG Dataset: {mean_cluster_count_org_df}")
print(f"Average Number of Unique Follow Up Questions (Clusters) for GPT Dataset: {mean_cluster_count_gpt_df}")

Average Number of Unique Follow Up Questions (Clusters) for Full Dataset: 3.165668662674651
Average Number of Unique Follow Up Questions (Clusters) for ORG Dataset: 3.3473053892215567
Average Number of Unique Follow Up Questions (Clusters) for GPT Dataset: 2.784431137724551


In [None]:
# import nltk
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.model_selection import cross_val_score, StratifiedKFold
# from sklearn.metrics import classification_report

# # Download dataset
# nltk.download('nps_chat')
# posts = nltk.corpus.nps_chat.xml_posts()

# # Extract text and labels
# posts_text = [post.text for post in posts]
# y = [post.get('class') for post in posts]

# # Split into train and test (80-20 split)
# # NOTE: the held-out slice starts at the 80% mark; slicing from 20% made it overlap the training data.
# train_text = posts_text[:int(len(posts_text) * 0.8)]
# test_text = posts_text[int(len(posts_text) * 0.8):]

# y_train = y[:int(len(posts_text) * 0.8)]
# y_test = y[int(len(posts_text) * 0.8):]

# # Get TF-IDF features
# vectorizer = TfidfVectorizer(ngram_range=(1,3), 
#                              min_df=0.001, 
#                              max_df=0.7, 
#                              analyzer='word')

# X_train = vectorizer.fit_transform(train_text)
# X_test = vectorizer.transform(test_text)

# # Define classifier
# gb = GradientBoostingClassifier(n_estimators=400, random_state=0)

# # Use 5-fold cross-validation on the training set
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# cv_scores = cross_val_score(gb, X_train, y_train, cv=cv, scoring='accuracy')

# # Print cross-validation results
# print("Cross-Validation Scores:", cv_scores)
# print("Mean Accuracy:", cv_scores.mean())

# gb.fit(X_train, y_train)

# predictions_rf = gb.predict(X_test)

# #Accuracy of 86% not bad
# print(classification_report(y_test, predictions_rf))

[nltk_data] Downloading package nps_chat to /Users/tkang/nltk_data...
[nltk_data]   Package nps_chat is already up-to-date!


Cross-Validation Scores: [0.76049675 0.74630396 0.7439385  0.73491124 0.73727811]
Mean Accuracy: 0.7445857113363823
