In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import pos_tag, word_tokenize
import re
from sklearn.cluster import AgglomerativeClustering
# from sentence_transformers import SentenceTransformer
import json
import spacy

In [9]:
### Human Evaluator Verified Dataset
data = pd.read_csv("output_mode_2_only_valid_questions.csv")
og_data_size = len(data)

df_gp = data[["group", "prefix"]]
df_gp = df_gp.drop_duplicates(subset=["prefix"])
print(f"Original Data size of {og_data_size} reduced to -> {len(df_gp)}")

# group by the first part of the prefix - e.g 3000
df_gp['prefix'], df_gp['index'] = zip(*df_gp['prefix'].str.split('_').apply(lambda x: (x[0], x[1])))
df_gp['index'] = df_gp['index'].astype(int) - 1
df_gp['group'] = df_gp['group'].str.split('_').str[0]
df_gp = df_gp.groupby(['group', 'prefix'])['index'].apply(list).reset_index()

Original Data size of 934 reduced to -> 155


In [10]:
### Human Evaluator Verified Dataset
# algorithm to parse through each group and every prefix to collect a list of OQ/OA/CA/FUQS
full_path = '/Users/tkang/Documents/research/nlp_followupqg/Auto_Evaluation/full_clustered.json'
gpt_path = '/Users/tkang/Documents/research/nlp_followupqg/Auto_Evaluation/gpt_clustered.json'
org_path = '/Users/tkang/Documents/research/nlp_followupqg/Auto_Evaluation/org_clustered.json'

# Load data
full_df = pd.read_json(full_path)
gpt_df = pd.read_json(gpt_path)
org_df = pd.read_json(org_path)

filtered_data_full = pd.DataFrame(columns=full_df.columns)
filtered_data_gpt = pd.DataFrame(columns=gpt_df.columns)
filtered_data_org = pd.DataFrame(columns=org_df.columns)

json_df = None

# Explode the `generated_follow_up` column
# json_data = json_data.explode('generated_follow_up', ignore_index=True)

for index, row in df_gp.iterrows():
    match row['group']:
        case 'full':
            json_df = full_df
        case 'gpt':
            print()
            json_df = gpt_df
        case 'org':
            json_df = org_df
        case _:
            print("Invalid File Found")
            json_df = None
            break
    
    for _, json_data in json_df.iterrows():
        if int(json_data['id']) != int(row['prefix']):
            continue
        
        relevant_follow_ups = np.array(json_data['generated_follow_up'])
        relevant_follow_ups = relevant_follow_ups[row['index']]
        row_data = json_data
        row_data['generated_follow_up'] = relevant_follow_ups

        match row['group']:
            case 'full':
                filtered_data_full.loc[len(filtered_data_full)] = row_data
            case 'gpt':
                filtered_data_gpt.loc[len(filtered_data_gpt)] = row_data
            case 'org':
                filtered_data_org.loc[len(filtered_data_org)] = row_data
            case _:
                print("Invalid File Found")
                break
        break





















In [2]:
### Entire Dataset Collected by Each of the 3 Models
full_path = '/Users/tkang/Documents/research/nlp_followupqg/Auto_Evaluation/full_clustered.json'
gpt_path = '/Users/tkang/Documents/research/nlp_followupqg/Auto_Evaluation/gpt_clustered.json'
org_path = '/Users/tkang/Documents/research/nlp_followupqg/Auto_Evaluation/org_clustered.json'

# Load data
full_df = pd.read_json(full_path)
gpt_df = pd.read_json(gpt_path)
org_df = pd.read_json(org_path)

In [3]:
print(f"FULL model average number of follow-ups: {full_df['generated_follow_up'].apply(lambda x : len(x)).mean()}")
print(f"ORG model average number of follow-ups: {org_df['generated_follow_up'].apply(lambda x : len(x)).mean()}")
print(f"GPT model average number of follow-ups: {gpt_df['generated_follow_up'].apply(lambda x : len(x)).mean()}")

FULL model average number of follow-ups: 4.11377245508982
ORG model average number of follow-ups: 4.688622754491018
GPT model average number of follow-ups: 3.7824351297405188


In [4]:
full_df["generated_follow_up"].head()

0    [What are the primary sources of heat for anim...
1    [What are the potential long-term effects of f...
2    [What are some examples of the side conflicts ...
3    [What is the role of the human ear in interpre...
4    [Can you explain what "sandbox game" means in ...
Name: generated_follow_up, dtype: object

In [16]:
def find_average_distinct_n(df, n):
    distinct_dfs = []
    for follow_ups in df["generated_follow_up"]:
        unique_n_grams = set()
        ngrams = []
        for follow_up in follow_ups:
            words = follow_up.split()
            if len(words) < n:
                return 0 
    
            ngrams.extend([tuple(words[i:i+n]) for i in range(len(words) - n + 1)])
            unique_n_grams.update(ngrams)
    
        distinct_dfs.append((len(unique_n_grams) / len(ngrams)) * 100)
    
    return distinct_dfs

In [None]:
# 1-distinct df 
n = 1
full_distinct_dfs_1 = find_average_distinct_n(full_df, n)
org_distinct_dfs_1 = find_average_distinct_n(org_df, n)
gpt_distinct_dfs_1 = find_average_distinct_n(gpt_df, n)

print(f"Average Distinct-1 (%) for Full Model: {np.mean(full_distinct_dfs_1)}")
print(f"Average Distinct-1 (%) for ORG Model: {np.mean(org_distinct_dfs_1)}")
print(f"Average Distinct-1 (%) for GPT Model: {np.mean(gpt_distinct_dfs_1)}")

n = 2
full_distinct_dfs_2 = find_average_distinct_n(full_df, n)
org_distinct_dfs_2 = find_average_distinct_n(org_df, n)
gpt_distinct_dfs_2 = find_average_distinct_n(gpt_df, n)

print(f"Average Distinct-2 (%) for Full Model: {np.mean(full_distinct_dfs_2)}")
print(f"Average Distinct-2 (%) for ORG Model: {np.mean(org_distinct_dfs_2)}")
print(f"Average Distinct-2 (%) for GPT Model: {np.mean(gpt_distinct_dfs_2)}")

Average Distinct-1 (%) for Full Model: 77.09446487061521
Average Distinct-1 (%) for ORG Model: 66.06013286221676
Average Distinct-1 (%) for GPT Model: 77.36019186050663
Average Distinct-2 (%) for Full Model: 94.85221978683695
Average Distinct-2 (%) for ORG Model: 91.11533928099017
Average Distinct-2 (%) for GPT Model: 94.41104283452837


In [12]:
# Methods to Determine if a Sentence is a valid question
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Define a regex pattern to match informative questions
invalid_words_pattern = r'<\w+>'
nlp = spacy.load("en_core_web_sm")

def contains_question_mark(sentence, invalid_questions):
    if sentence[-1] == '?':
        return True
    else:
        invalid_questions.append(sentence)
        return False

def is_question_dependency_parsing(sentence, invalid_questions):
    """Detects whether a sentence is a question using dependency parsing."""
    doc = nlp(sentence)
    
    # Track question indicators
    is_wh_question = False
    is_aux_question = False
    
    # ✅ Track negation but don't invalidate questions outright
    # negation_found = any(token.dep_ == "neg" for token in doc)

    for token in doc:
        # ✅ WH-Questions (What, Why, How, Where, etc.)
        if token.dep_ in {"attr", "nsubj", "advmod"} and token.head.dep_ in {"ROOT", "nsubj", "advmod"}:
            is_wh_question = True
        
        # ✅ Yes/No Questions (Do you..., Can we..., Is it...)
        if token.dep_ == "aux" and token.head.dep_ == "ROOT":
            is_aux_question = True
        
        # ✅ Special Case: "Why", "How", "Where" directly at the start are always questions
        if token.text.lower() in {"why", "how", "where"}:
            is_wh_question = True
    
    # ✅ Final Decision:
    if not (is_wh_question or is_aux_question):
        # 🔥 **NEW: Only invalidate if negation makes it rhetorical**  
        invalid_questions.append(sentence)
        return False  # Negation in non-WH questions is more likely rhetorical
    # ❌ Not a valid question
    return True

def contains_invalid_word(question, invalid_questions):
    if bool(re.search(invalid_words_pattern, question)):
        invalid_questions.append(question)
        return True
    else:
        return False

def get_consecutive_word_sequences(sentence):
    """Extracts consecutive word sequences of at least `min_length` words."""
    # min_length = int(len(sentence.split(" "))/3) # if a sentence uses over half of it's words, it's copying
    min_length = 10
    words = re.findall(r'\b\w+\b', sentence)  # Extract words
    sequences = set()

    for i in range(len(words) - min_length + 1):
        phrase = " ".join(words[i:i + min_length])  # Create word sequence
        sequences.add(phrase)

    return sequences

def contains_duplicate_words(og_question, og_answer, follow_up_question, invalid_questions):
    """Checks if there is a common substring of at least `min_length` consecutive words between two sentences."""
    og_question_answer = og_question + " " + og_answer  # Merge question and answer
    
    og_seq = get_consecutive_word_sequences(og_question_answer)
    follow_up_seq = get_consecutive_word_sequences(follow_up_question)

    common_sequences = og_seq & follow_up_seq  # Only allow consecutive matches

    if common_sequences:
        invalid_questions.append(follow_up_question)
        return True  # Found duplicate consecutive words
    return False

# combining all the other methods
def is_valid_question(question, og_question, og_answer, invalid_questions):
    # print(question)
    return (
        contains_question_mark(question, invalid_questions) and 
        is_question_dependency_parsing(question, invalid_questions) and 
        not contains_invalid_word(question, invalid_questions) and
        not contains_duplicate_words(og_question, og_answer, question, invalid_questions)
        )

def filterInvalidFollowUpQuestions(df):
    # df columns = ['id', 'question', 'answer', 'follow-up', 'relation', 'generated_follow_up']
    invalid_questions = []

    for index, row in df.iterrows():
        original_question = row['question']
        original_answer = row['answer']

        valid_questions = [follow_up for follow_up in row['generated_follow_up'] if is_valid_question(follow_up, original_question, original_answer, invalid_questions)]
        df.at[index, "generated_follow_up"] = valid_questions
    
    return df, invalid_questions

[nltk_data] Downloading package punkt to /Users/tkang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tkang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [13]:
# og_question = "ELI5: What is a heat dome?"
# og_answer = "Is when the high pressure in the atmosphere traps the hot air in below. As you would know, the hot air rises, which only causes the air to compress because of the pressure from above and it gets hotter, hotter, hotter and denser (That's why you would kill for a glass of water, the hot air is literally pushing you against more hot air)"
# question = ">That's why you would kill for a glass of water  Is that why people die from heatstroke?"

# contains_duplicate_words(og_question, og_answer, question)

In [14]:
full_df.to_json("full_df.json", orient="records", indent=4)
gpt_df.to_json("gpt_df.json", orient="records", indent=4)
org_df.to_json("org_df.json", orient="records", indent=4)

In [15]:
print(f"before filtering out all invalid follow up questions in FULL: { len(full_df['generated_follow_up'].explode()) }")
full_df_valid_follow_up_only, full_invalid_questions = filterInvalidFollowUpQuestions(full_df.copy())
print(f"after filtering out all invalid follow up questions in FULL: {len(full_df_valid_follow_up_only['generated_follow_up'].explode())}")

print(f"before filtering out all invalid follow up questions in GPT: {len(gpt_df['generated_follow_up'].explode())}")
gpt_df_valid_follow_up_only, gpt_invalid_questions = filterInvalidFollowUpQuestions(gpt_df.copy())
print(f"after filtering out all invalid follow up questions in GPT: {len(gpt_df_valid_follow_up_only['generated_follow_up'].explode())}")

print(f"before filtering out all invalid follow up questions in ORG: {len(org_df['generated_follow_up'].explode())}")
org_df_valid_follow_up_only, org_invalid_questions = filterInvalidFollowUpQuestions(org_df.copy())
print(f"after filtering out all invalid follow up questions in ORG: {len(org_df_valid_follow_up_only['generated_follow_up'].explode())}")


before filtering out all invalid follow up questions in FULL: 2061
after filtering out all invalid follow up questions in FULL: 1931
before filtering out all invalid follow up questions in GPT: 1895
after filtering out all invalid follow up questions in GPT: 1827
before filtering out all invalid follow up questions in ORG: 2349
after filtering out all invalid follow up questions in ORG: 1568


In [31]:
full_df_valid_follow_up_only.to_json("diversity_output/full_valid_fq_only.json", orient="records", indent=4)
gpt_df_valid_follow_up_only.to_json("diversity_output/gpt_valid_fq_only.json", orient="records", indent=4)
org_df_valid_follow_up_only.to_json("diversity_output/org_valid_fq_only.json", orient="records", indent=4)

In [17]:
### cluster by similar follow up questions
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer
import json
from nltk.corpus import stopwords
nltk.download("stopwords")

embedder = SentenceTransformer("all-mpnet-base-v2")

models = [full_df_valid_follow_up_only, gpt_df_valid_follow_up_only, org_df_valid_follow_up_only]
model_names = ["full", "gpt", "org"]
# distance_thresholds = np.arange(0, 1.1, 0.1)
distance_thresholds = [0.3]
clustered_models = []
stop_words = set(stopwords.words("english"))

def remove_stopwords(sentence):
    words = word_tokenize(sentence)  # Tokenize sentence into words
    filtered_words = [word for word in words if word.lower() not in stop_words]  # Remove stopwords
    return " ".join(filtered_words)  # Reconstruct sentence

for idx in range(len(models)):
    print("Start working on model")
    model_name = model_names[idx]
    data = models[idx].copy()

    # Loop over all rows in the dataset
    for dt in distance_thresholds:
        print(f"Evaluating distance threshold: {dt}...")
        # Store clustered follow-up questions
        clustered_follow_ups = []
        clustered_follow_ups_count = []

        for index, row in data.iterrows():
            task_id = row['id']
            
            # remove stopwrods
            current_corpus = [remove_stopwords(sentence) for sentence in row["generated_follow_up"]]

            # Ensure it's a list (not a single string)
            if isinstance(current_corpus, str):
                current_corpus = [current_corpus]
            elif not isinstance(current_corpus, list):
                clustered_follow_ups.append([])
                clustered_follow_ups_count.append(0)
                continue

            if len(current_corpus) <= 1:
                clustered_follow_ups.append(current_corpus)  # Keep as is
                clustered_follow_ups_count.append(1)  # Single item cluster
                print(f"Task {task_id} - Only 1 sample, skipping clustering.")
                continue

            # Convert sentences to embeddings
            corpus_embeddings = embedder.encode(current_corpus)

            # Perform agglomerative clustering
            clustering_model = AgglomerativeClustering(
                metric='cosine', linkage='average',
                n_clusters=None, distance_threshold=dt
            ) 
            clustering_model.fit(corpus_embeddings)
            cluster_assignment = clustering_model.labels_

            # Organize clustered sentences
            clustered_sentences = {}
            for sentence_id, cluster_id in enumerate(cluster_assignment):
                if cluster_id not in clustered_sentences:
                    clustered_sentences[cluster_id] = []
                clustered_sentences[cluster_id].append(current_corpus[sentence_id])

            # Collect all clustered follow-ups
            clustered_list = list(clustered_sentences.values())

            # Append clustered results to columns
            clustered_follow_ups.append(clustered_list)
            clustered_follow_ups_count.append(len(clustered_list))

        # Add new columns to DataFrame
        data[f"clustered_dt_{dt}"] = clustered_follow_ups
        data[f"clustered_count_dt_{dt}"] = clustered_follow_ups_count

    clustered_models.append(data)

print("Clustering complete for all models.")


[nltk_data] Downloading package stopwords to /Users/tkang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Start working on model
Evaluating distance threshold: 0.3...
Task 3034 - Only 1 sample, skipping clustering.
Task 3094 - Only 1 sample, skipping clustering.
Task 3452 - Only 1 sample, skipping clustering.
Start working on model
Evaluating distance threshold: 0.3...
Task 3032 - Only 1 sample, skipping clustering.
Task 3219 - Only 1 sample, skipping clustering.
Task 3311 - Only 1 sample, skipping clustering.
Task 3326 - Only 1 sample, skipping clustering.
Task 3359 - Only 1 sample, skipping clustering.
Task 3393 - Only 1 sample, skipping clustering.
Task 3410 - Only 1 sample, skipping clustering.
Start working on model
Evaluating distance threshold: 0.3...
Task 3002 - Only 1 sample, skipping clustering.
Task 3010 - Only 1 sample, skipping clustering.
Task 3012 - Only 1 sample, skipping clustering.
Task 3029 - Only 1 sample, skipping clustering.
Task 3034 - Only 1 sample, skipping clustering.
Task 3036 - Only 1 sample, skipping clustering.
Task 3059 - Only 1 sample, skipping clustering.
T

In [18]:
df_clustered_full = pd.DataFrame(clustered_models[0])
df_clustered_gpt = pd.DataFrame(clustered_models[1])
df_clustered_org = pd.DataFrame(clustered_models[2])
average_cluster_count_per_dt = {
    "full": [],
    "gpt": [],
    "org": []
}

for model_i in range(3):
    l = []
    for i in distance_thresholds:
        l.append(clustered_models[model_i][f"clustered_count_dt_{i}"].mean())
    average_cluster_count_per_dt[model_names[model_i]] = l

# df_clustered_full["generated_follow_ups_clustered_count"]

# temp = df_clustered_full[df_clustered_full["generated_follow_ups_clustered_count"] > 1]
# for group in temp["generated_follow_ups_clustered"]:
#     print(group)

In [19]:
average_cluster_count_per_dt

{'full': [3.5668662674650697],
 'gpt': [3.241516966067864],
 'org': [3.057884231536926]}

In [21]:
for i in range(1):
    print(f"Average Cluster Count for Distance Threshold ({distance_thresholds[i]}):")
    for model, values in average_cluster_count_per_dt.items():
        print(f"Model {model}: {values[i]}")
    print("\n")

Average Cluster Count for Distance Threshold (0.3):
Model full: 3.5668662674650697
Model gpt: 3.241516966067864
Model org: 3.057884231536926




In [22]:
### after removing stopwords and using min_length = 10
print(f"Average Cluster Count across all Distance Threshold")
print(f"Model FULL: {np.mean(average_cluster_count_per_dt['full'])}")
print(f"Model ORG: {np.mean(average_cluster_count_per_dt['org'])}")
print(f"Model GPT: {np.mean(average_cluster_count_per_dt['gpt'])}")

Average Cluster Count across all Distance Threshold
Model FULL: 3.5668662674650697
Model ORG: 3.057884231536926
Model GPT: 3.241516966067864


In [292]:
### ^ check the second one above
print(f"Average Cluster Count across all Distance Threshold")
print(f"Model FULL: {np.mean(average_cluster_count_per_dt['full'])}")
print(f"Model ORG: {np.mean(average_cluster_count_per_dt['org'])}")
print(f"Model GPT: {np.mean(average_cluster_count_per_dt['gpt'])}")

Average Cluster Count across all Distance Threshold
Model FULL: 2.378697151152241
Model ORG: 2.343131917982218
Model GPT: 2.195064416621303


In [277]:
### after removing stopwords - and using 1/3 length as min_length
print(f"Average Cluster Count across all Distance Threshold")
print(f"Model FULL: {np.mean(average_cluster_count_per_dt['full'])}")
print(f"Model ORG: {np.mean(average_cluster_count_per_dt['org'])}")
print(f"Model GPT: {np.mean(average_cluster_count_per_dt['gpt'])}")

Average Cluster Count across all Distance Threshold
Model FULL: 2.4071856287425146
Model ORG: 2.6162221012520415
Model GPT: 2.1981491562329887


In [254]:
# for when min_length = 10 for get_consecutive_word_sequences function
print(f"Average Cluster Count across all Distance Threshold")
print(f"Model FULL: {np.mean(average_cluster_count_per_dt['full'])}")
print(f"Model ORG: {np.mean(average_cluster_count_per_dt['org'])}")
print(f"Model GPT: {np.mean(average_cluster_count_per_dt['gpt'])}")

Average Cluster Count across all Distance Threshold
Model FULL: 2.4289602612955905
Model ORG: 2.3124659771366356
Model GPT: 2.256033387769915


In [23]:
# Calculate the average number of clusters for each OQ/OA pair
mean_cluster_count_full_df = df_clustered_full["generated_follow_ups_clustered_count"].mean()
mean_cluster_count_org_df = df_clustered_org["generated_follow_ups_clustered_count"].mean()
mean_cluster_count_gpt_df = df_clustered_gpt["generated_follow_ups_clustered_count"].mean()

print(f"Average Number of Unique Follow Up Questions (Clusters) for Full Dataset: {mean_cluster_count_full_df}")
print(f"Average Number of Unique Follow Up Questions (Clusters) for ORG Dataset: {mean_cluster_count_org_df}")
print(f"Average Number of Unique Follow Up Questions (Clusters) for GPT Dataset: {mean_cluster_count_gpt_df}")

KeyError: 'generated_follow_ups_clustered_count'