In [6]:
import pandas as pd
import numpy as np
import nltk
from nltk import pos_tag, word_tokenize
import re
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer
import json
import spacy

In [3]:
### Human Evaluator Verified Dataset
data = pd.read_csv("output_mode_2_only_valid_questions.csv")
og_data_size = len(data)

df_gp = data[["group", "prefix"]]
df_gp = df_gp.drop_duplicates(subset=["prefix"])
print(f"Original Data size of {og_data_size} reduced to -> {len(df_gp)}")

# group by the first part of the prefix - e.g 3000
df_gp['prefix'], df_gp['index'] = zip(*df_gp['prefix'].str.split('_').apply(lambda x: (x[0], x[1])))
df_gp['index'] = df_gp['index'].astype(int) - 1
df_gp['group'] = df_gp['group'].str.split('_').str[0]
df_gp = df_gp.groupby(['group', 'prefix'])['index'].apply(list).reset_index()

Original Data size of 909 reduced to -> 155


In [4]:
### Human Evaluator Verified Dataset
# algorithm to parse through each group and every prefix to collect a list of OQ/OA/CA/FUQS
full_path = '/Users/tkang/Documents/research/nlp_followupqg/Auto_Evaluation/full_clustered.json'
gpt_path = '/Users/tkang/Documents/research/nlp_followupqg/Auto_Evaluation/gpt_clustered.json'
org_path = '/Users/tkang/Documents/research/nlp_followupqg/Auto_Evaluation/org_clustered.json'

# Load data
full_df = pd.read_json(full_path)
gpt_df = pd.read_json(gpt_path)
org_df = pd.read_json(org_path)

filtered_data_full = pd.DataFrame(columns=full_df.columns)
filtered_data_gpt = pd.DataFrame(columns=gpt_df.columns)
filtered_data_org = pd.DataFrame(columns=org_df.columns)

json_df = None

# Explode the `generated_follow_up` column
# json_data = json_data.explode('generated_follow_up', ignore_index=True)

for index, row in df_gp.iterrows():
    match row['group']:
        case 'full':
            json_df = full_df
        case 'gpt':
            print()
            json_df = gpt_df
        case 'org':
            json_df = org_df
        case _:
            print("Invalid File Found")
            json_df = None
            break
    
    for _, json_data in json_df.iterrows():
        if int(json_data['id']) != int(row['prefix']):
            continue
        
        relevant_follow_ups = np.array(json_data['generated_follow_up'])
        relevant_follow_ups = relevant_follow_ups[row['index']]
        row_data = json_data
        row_data['generated_follow_up'] = relevant_follow_ups

        match row['group']:
            case 'full':
                filtered_data_full.loc[len(filtered_data_full)] = row_data
            case 'gpt':
                filtered_data_gpt.loc[len(filtered_data_gpt)] = row_data
            case 'org':
                filtered_data_org.loc[len(filtered_data_org)] = row_data
            case _:
                print("Invalid File Found")
                break
        break





















In [5]:
### Entire Dataset Collected by Each of the 3 Models
full_path = '/Users/tkang/Documents/research/nlp_followupqg/Auto_Evaluation/full_clustered.json'
gpt_path = '/Users/tkang/Documents/research/nlp_followupqg/Auto_Evaluation/gpt_clustered.json'
org_path = '/Users/tkang/Documents/research/nlp_followupqg/Auto_Evaluation/org_clustered.json'

# Load data
full_df = pd.read_json(full_path)
gpt_df = pd.read_json(gpt_path)
org_df = pd.read_json(org_path)

In [24]:
# Methods to Determine if a Sentence is a valid question
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Define a regex pattern to match informative questions
invalid_words_pattern = r'<\w+>'
nlp = spacy.load("en_core_web_sm")

def contains_question_mark(sentence):
    return sentence[-1] == '?'

def is_question_dependency_parsing(sentence):
    doc = nlp(sentence)
    for token in doc:
        if token.dep_ == "aux" and token.head.dep_ == "ROOT":
            return True  # Finds auxiliary verbs like "is", "does" in questions
        if token.dep_ == "attr" and token.head.dep_ == "ROOT":
            return True  # Finds WH-questions like "What is..."

def contains_invalid_words(question):
    return not bool(re.search(invalid_words_pattern, question))

def get_word_sequences(sentence):
    """Returns a set of word sequences of at least `min_length` words from a sentence."""
    words = re.findall(r'\b\w+\b', sentence)  # Extract words
    sequences = set()

    for i in range(len(words) - min_length + 1):
        phrase = " ".join(words[i:i + min_length])  # Create word sequence
        sequences.add(phrase)

    return sequences

def contains_duplicate_words(og_question, og_answer, follow_up_question):
    """Checks if there is a common substring of more than `min_length` words between two sentences."""
    og_question_answer = og_question + og_answer
    
    og_seq = get_word_sequences(og_question_answer)
    follow_up_seq = get_word_sequences(follow_up_question)

    return not bool(og_seq.intersection(follow_up_seq))  # Find common sequences

# combining all the other methods
def is_valid_question(question, og_question, og_answer):
    # print(question)
    return (
        contains_question_mark(question) and 
        is_question_dependency_parsing(question) and 
        contains_invalid_words(question) and
        contains_duplicate_words(og_question, og_answer, question)
        )

[nltk_data] Downloading package punkt to /Users/tkang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tkang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
og_question = 

In [23]:
full_df.to_json("full_df.json", orient="records", indent=4)
gpt_df.to_json("gpt_df.json", orient="records", indent=4)
org_df.to_json("org_df.json", orient="records", indent=4)

In [21]:
def filterInvalidFollowUpQuestions(df):
    # df columns = ['id', 'question', 'answer', 'follow-up', 'relation', 'generated_follow_up']
    for index, row in df.iterrows():
        valid_questions = [follow_up for follow_up in row['generated_follow_up'] if is_valid_question(follow_up)]
        df.at[index, "generated_follow_up"] = valid_questions
    
    return df

print(f"before filtering out all invalid follow up questions in FULL: { len(full_df['generated_follow_up'].explode()) }")
full_df_valid_follow_up_only = filterInvalidFollowUpQuestions(full_df.copy())
print(f"after filtering out all invalid follow up questions in FULL: {len(full_df_valid_follow_up_only['generated_follow_up'].explode())}")

print(f"before filtering out all invalid follow up questions in GPT: {len(gpt_df['generated_follow_up'].explode())}")
gpt_df_valid_follow_up_only = filterInvalidFollowUpQuestions(gpt_df.copy())
print(f"after filtering out all invalid follow up questions in GPT: {len(gpt_df_valid_follow_up_only['generated_follow_up'].explode())}")

print(f"before filtering out all invalid follow up questions in ORG: {len(org_df['generated_follow_up'].explode())}")
full_org_valid_follow_up_only = filterInvalidFollowUpQuestions(org_df.copy())
print(f"after filtering out all invalid follow up questions in ORG: {len(full_org_valid_follow_up_only['generated_follow_up'].explode())}")


before filtering out all invalid follow up questions in FULL: 2061
after filtering out all invalid follow up questions in FULL: 2011
before filtering out all invalid follow up questions in GPT: 1895
after filtering out all invalid follow up questions in GPT: 1877
before filtering out all invalid follow up questions in ORG: 2349
after filtering out all invalid follow up questions in ORG: 2067


In [8]:
json_data = full_df_valid_follow_up_only.to_json(orient="records", indent=4)

In [33]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report

# Download dataset
nltk.download('nps_chat')
posts = nltk.corpus.nps_chat.xml_posts()

# Extract text and labels
posts_text = [post.text for post in posts]
y = [post.get('class') for post in posts]

# Split into train and test (80-20 split)
train_text = posts_text[:int(len(posts_text) * 0.8)]
test_text = posts_text[int(len(posts_text) * 0.2):]

y_train = y[:int(len(posts_text) * 0.8)]
y_test = y[int(len(posts_text) * 0.2):]

# Get TF-IDF features
vectorizer = TfidfVectorizer(ngram_range=(1,3), 
                             min_df=0.001, 
                             max_df=0.7, 
                             analyzer='word')

X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

# Define classifier
gb = GradientBoostingClassifier(n_estimators=400, random_state=0)

# Use 5-fold cross-validation on the training set
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(gb, X_train, y_train, cv=cv, scoring='accuracy')

# Print cross-validation results
print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())

[nltk_data] Downloading package nps_chat to /Users/tkang/nltk_data...
[nltk_data]   Package nps_chat is already up-to-date!


Cross-Validation Scores: [0.76049675 0.74630396 0.7439385  0.73491124 0.73727811]
Mean Accuracy: 0.7445857113363823


In [31]:
test_text[0:2]

['hey there asl please?', 'i jus wanna know sumfin really important ....']

In [25]:
gb.fit(X_train, y_train)

predictions_rf = gb.predict(X_test)

#Accuracy of 86% not bad
print(classification_report(y_test, predictions_rf))

              precision    recall  f1-score   support

      Accept       0.80      0.71      0.75       167
         Bye       0.89      0.77      0.83       155
     Clarify       0.62      0.33      0.43        24
   Continuer       0.65      0.43      0.52       115
     Emotion       0.94      0.67      0.78       868
    Emphasis       0.82      0.48      0.61       132
       Greet       0.97      0.91      0.94      1044
       Other       0.00      0.00      0.00        32
      Reject       0.88      0.70      0.78       122
   Statement       0.73      0.94      0.82      2505
      System       0.99      0.98      0.99      2279
     nAnswer       0.69      0.76      0.72        58
  whQuestion       0.90      0.88      0.89       432
     yAnswer       0.79      0.64      0.71        89
  ynQuestion       0.93      0.62      0.74       432

    accuracy                           0.86      8454
   macro avg       0.77      0.65      0.70      8454
weighted avg       0.87   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
