# Cluster Human Questions

In [5]:
# collect all human questions
# The file is read as tab-separated `<question id>\t<question text>` rows;
# the first line is skipped.
path_qs = '../data/cast19/human_questions.txt'

gs_qs = {}
# Explicit encoding so decoding does not depend on the platform locale.
# NOTE(review): assumes the file is UTF-8 - confirm against the data source.
with open(path_qs, 'r', encoding='utf-8') as f:
    next(f, None)  # skip the first line; the default avoids StopIteration on an empty file
    for line in f:
        line = line.rstrip('\n')
        if not line.strip():
            # blank line (e.g. trailing newline at the end of the file)
            continue
        # Split on the first tab only, so a tab inside the question text
        # cannot break the two-value unpacking.
        q_id, q = line.split('\t', 1)
#         print(q_id)
        gs_qs[q_id] = q
print("Total questions: %d" % len(gs_qs))
# print(gs_qs['1_11'])
all_questions = list(gs_qs.values())
print(all_questions[:2])

# GT labels
# The ground-truth cluster of a question is the part of its id before the
# first underscore (ids look like '1_11').
gt_clusters = [q_id.split('_')[0] for q_id in gs_qs]
print(gt_clusters[:2])
n_clusters = len(set(gt_clusters))
print("# GT clusters: %d" % n_clusters)
print(all_questions[0])

Total questions: 748
["What is a physician's assistant?", "What are the educational requirements required to become a physician's assistant?"]
['1', '1']
# GT clusters: 80
What is a physician's assistant?


In [6]:
# 0. preprocess questions -> keywords
# Count how often each non-punctuation, non-stop-word token occurs across
# all (lower-cased) questions.
from collections import Counter
import spacy

nlp = spacy.load('en_core_web_sm')
# Extra stopword list. It is only referenced by the disabled `terms` line at
# the bottom of this cell, so it is NOT applied to the counts printed here.
stopwords = ['tell']

keywords = Counter(
    token.text
    for doc in (nlp(question.lower()) for question in all_questions)
    for token in doc
    if not (token.is_punct or token.is_stop)
)
#     print(keywords)
# fts = [word for (word, freq) in keywords.items() if freq > 1]
print(keywords.most_common(10))
# terms = [token.text for token in nlp(q.lower()) if token.text in fts and token.text not in stopwords]

[('tell', 49), ('important', 28), ('energy', 24), ('describe', 22), ('different', 21), ('chemical', 21), ('types', 20), ('music', 19), ('good', 16), ('main', 15)]


In [9]:
# 1. embed questions
# Pre-trained sentence-transformers checkpoints considered:
#   'bert-base-wikipedia-sections-mean-tokens'  - transformer trained on Wiki (disabled)
#   'roberta-large-nli-stsb-mean-tokens'        - transformer trained for STS (in use)
# https://github.com/UKPLab/sentence-transformers/blob/master/examples/applications/clustering_wikipedia_sections.py
from sentence_transformers import SentenceTransformer

# Keep the checkpoint name and the loaded model under separate names.
model_name = 'roberta-large-nli-stsb-mean-tokens'
model = SentenceTransformer(model_name)

# embed: one vector per question, in the order of `all_questions`
q_vectors = model.encode(all_questions)
print(len(q_vectors))

748


In [10]:
# 1 (alternative, currently disabled). embed questions
# embed with TF-IDF (scikit-learn) instead of the sentence transformer
# from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer = TfidfVectorizer()
# q_vectors = vectorizer.fit_transform(all_questions).todense()
# print(q_vectors[0])
# print(len(q_vectors))

In [11]:
# 2. compare embeddings
# compare all questions with each other
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Square matrix: entry [i, j] is the cosine similarity between the
# embeddings of question i and question j.
similarities = cosine_similarity(q_vectors, q_vectors)
print(len(similarities))

# Total similarity of each question to all questions (column sums).
# np.sum with an explicit axis replaces the built-in sum(), which added the
# rows one at a time in a Python-level loop.
sum_similarities = np.sum(similarities, axis=0)
# print(len(sum_similarities))

# Index of the question with the largest total similarity; left as the last
# expression so the notebook displays it.
np.argmax(sum_similarities)
# sum_similarities

748


479

In [12]:
# 3. cluster questions
from sklearn.cluster import KMeans

num_clusters = n_clusters

km = KMeans(n_clusters=num_clusters)

%time km.fit(similarities)

clusters = km.labels_.tolist()
# from sklearn.externals import joblib
#joblib.dump(km,  'doc_cluster.pkl')
# km = joblib.load('doc_cluster.pkl')
# clusters = km.labels_.tolist()
print(clusters)

CPU times: user 4.47 s, sys: 211 ms, total: 4.68 s
Wall time: 2.36 s
[53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 12, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 63, 38, 17, 63, 63, 63, 63, 38, 63, 63, 63, 5, 5, 5, 5, 5, 5, 5, 5, 5, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 68, 46, 33, 33, 11, 11, 33, 33, 33, 19, 58, 58, 58, 58, 23, 23, 23, 23, 58, 58, 11, 11, 11, 11, 76, 2, 78, 24, 24, 78, 76, 76, 24, 46, 24, 24, 51, 51, 51, 51, 51, 14, 51, 51, 78, 2, 2, 2, 2, 2, 2, 76, 2, 54, 16, 54, 54, 61, 54, 54, 54, 54, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 76, 76, 15, 26, 46, 3, 16, 6, 76, 77, 77, 77, 77, 61, 61, 14, 56, 56, 56, 64, 64, 56, 56, 56, 56, 17, 3, 3, 19, 19, 19, 19, 3, 17, 52, 44, 44, 44, 44, 44, 44, 44, 61, 76, 44, 44, 17, 65, 65, 65, 17, 73, 17, 37, 37, 46, 70, 70, 70, 70, 70, 70, 73, 73, 73, 17, 73, 73, 73, 73, 74, 4, 74, 74, 74, 4, 15, 38, 74, 74, 57, 57, 0, 0, 0, 57, 62, 57, 9, 75, 75, 75, 75, 38, 14, 30, 7, 7, 7, 7, 7, 7, 7, 7, 7, 17, 17, 65, 17, 17, 17, 73, 73, 52, 71, 4, 

In [13]:
# evaluate clustering
# Compare the predicted cluster ids against the ground-truth labels.
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_mutual_info_score, fowlkes_mallows_score

print(gt_clusters)

# Scores are printed one per line, in this order:
# adjusted Rand, normalized mutual information, adjusted mutual information,
# Fowlkes-Mallows.
for score_fn in (
    adjusted_rand_score,
    normalized_mutual_info_score,
    adjusted_mutual_info_score,
    fowlkes_mallows_score,
):
    print(score_fn(gt_clusters, clusters))

['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '4', '4', '4', '4', '4', '4', '4', '4', '4', '5', '5', '5', '5', '5', '5', '5', '5', '5', '5', '6', '6', '6', '6', '6', '6', '6', '6', '6', '7', '7', '7', '7', '7', '7', '7', '7', '7', '7', '7', '8', '8', '8', '8', '8', '8', '8', '9', '9', '9', '9', '9', '9', '9', '9', '9', '10', '10', '10', '10', '10', '10', '10', '10', '10', '11', '11', '11', '11', '11', '11', '11', '11', '12', '12', '12', '12', '12', '12', '12', '12', '12', '13', '13', '13', '13', '13', '13', '13', '13', '13', '13', '13', '14', '14', '14', '14', '14', '14', '14', '14', '14', '15', '15', '15', '15', '15', '15', '15', '16', '16', '16', '16', '16', '16', '16', '16', '16', '17', '17', '17', '17', '17', '17', '17', '17', '17', '17', '18', '18', '18', '18', '18', '18', '18', '18', '18', '18', '18', '19', '19', '19', '19', '19', '19', '19', '19', '19', '2

In [14]:
# show predicted clusters
from collections import defaultdict

# Map each predicted cluster id to the indices of the questions assigned to it
# (clusters appear in order of first occurrence).
predicted_clusters = defaultdict(list)
for question_idx, cluster_id in enumerate(clusters):
    predicted_clusters[cluster_id].append(question_idx)
# print(predicted_clusters)

# Print the questions of each cluster, one per line, followed by a separator.
for member_indices in predicted_clusters.values():
    print('\n'.join(all_questions[idx] for idx in member_indices))
    # new cluster
    print('\n')

What is a physician's assistant?
What are the educational requirements required to become a physician's assistant?
What does it cost for the educational requirements required to become a physician's assistant?
What's the average starting salary of physician's assistants in the UK?
What's the average starting salary of physician's assistants in the US?
What school subjects are needed to become a registered nurse?
What is the physician's assistant PA average salary vs a registered nurse RN?
What the difference between a physician's assistant PA and a nurse practitioner?
Do nurse practitioners NPs or physician's assistants PAs make more?
Is a physician's assistant PA above a nurse practitioner NP?
What is the fastest way to become a nurse practitioner NP?


How much longer does it take to become a doctor after being an nurse practitioner NP?
What is taught in sociology?


What are the main breeds of goat?
Tell me about boer goats.
What breed of goat is good for meat?
Are angora goats good

# Order Human Questions

In [None]:
# collect all human questions
# The file is read as tab-separated `<question id>\t<question text>` rows;
# the first line is skipped.
path_qs = '../data/cast19/human_questions.txt'

gs_qs = {}
# Explicit encoding so decoding does not depend on the platform locale.
# NOTE(review): assumes the file is UTF-8 - confirm against the data source.
with open(path_qs, 'r', encoding='utf-8') as f:
    next(f, None)  # skip the first line; the default avoids StopIteration on an empty file
    for line in f:
        line = line.rstrip('\n')
        if not line.strip():
            # blank line (e.g. trailing newline at the end of the file)
            continue
        # Split on the first tab only, so a tab inside the question text
        # cannot break the two-value unpacking.
        q_id, q = line.split('\t', 1)
#         print(q_id)
        gs_qs[q_id] = q
print("Total questions: %d" % len(gs_qs))
# print(gs_qs['1_11'])
all_questions = list(gs_qs.values())
print(all_questions[:2])

# GT labels
# The ground-truth cluster of a question is the part of its id before the
# first underscore (ids look like '1_11').
gt_clusters = [q_id.split('_')[0] for q_id in gs_qs]
print(gt_clusters[:2])
n_clusters = len(set(gt_clusters))
print("# GT clusters: %d" % n_clusters)

# Generated Questions

In [None]:
# collect all generated questions

