In [1]:
import io
import os.path
import re
import tarfile
import json
import smart_open

In [12]:
def extract_documents(ds_path):
    """Read a JSON-lines file and return one space-joined string per record.

    Args:
        ds_path: path of a file holding one JSON object per line; each object
            must have a ``'text'`` entry whose items are joined with spaces.

    Returns:
        A list of strings, one per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no ``'text'`` entry.
    """
    # The context manager closes the handle even if a line fails to parse.
    with open(ds_path) as handle:
        # Blank lines (e.g. a trailing newline at end of file) are skipped
        # instead of being handed to json.loads.
        return [
            ' '.join(json.loads(line)['text'])
            for line in handle
            if line.strip()
        ]

# Training splits: one list of raw document strings per domain.
nq_docs, race_docs, sciq_docs = (
    list(extract_documents(train_path))
    for train_path in (
        "../../data/nq/train.jsonl",
        "../../data/RACE/train.jsonl",
        "../../data/SciQ/train.jsonl",
    )
)
print(f"number of docs: {len(nq_docs)}, {len(race_docs)}, {len(sciq_docs)}")
# Combined training collection, ordered NQ -> RACE -> SciQ.
docs = [*nq_docs, *race_docs, *sciq_docs]

# Held-out splits used later for evaluation.
nq_test_docs, race_test_docs, sciq_test_docs = (
    list(extract_documents(eval_path))
    for eval_path in (
        "../../data/nq/dev.jsonl",
        "../../data/RACE/dev.jsonl",
        "../../data/SciQ/test.jsonl",
    )
)


number of docs: 89453, 18613, 11679


In [34]:
# Raw lines of the NQ training file (trailing newlines kept), so selected
# records can later be written back out verbatim.
with open("../../data/nq/train.jsonl") as nq_file:
    nq_ds = nq_file.readlines()

In [4]:
from nltk.corpus import stopwords
# English stop words from the NLTK stopwords corpus; used by split_tokens to drop common words.
stop_words = stopwords.words('english')

In [6]:
# Convert to a set so the `token not in stop_words` test in split_tokens is a hash lookup.
stop_words = set(stop_words)

In [13]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer
def split_tokens(docs):
    """Lowercase, tokenize and filter a collection of raw document strings.

    Each document is lowercased and split on runs of word characters
    (``\\w+``). A token is kept only if it is longer than one character,
    is not purely numeric (words that merely contain digits are kept) and
    is not in the module-level ``stop_words`` collection.

    Args:
        docs: iterable of document strings. It is NOT modified; a new list
            is built, so the caller's raw strings stay intact.

    Returns:
        A list of token lists, one per input document, in input order.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    cleaned = []
    for text in docs:
        tokens = tokenizer.tokenize(text.lower())
        # Single pass applying all three filters at once.
        cleaned.append([
            token for token in tokens
            if len(token) > 1
            and not token.isnumeric()
            and token not in stop_words
        ])
    return cleaned
# Tokenize every training collection (same order as the original one-by-one calls).
nq_docs, race_docs, sciq_docs, docs = (
    split_tokens(raw_docs)
    for raw_docs in (nq_docs, race_docs, sciq_docs, docs)
)

In [27]:
# Tokenize and filter the NQ dev documents with the same routine used for the training sets.
nq_test_docs = split_tokens(nq_test_docs)

In [14]:
# Tokenize the RACE and SciQ held-out documents.
race_test_docs, sciq_test_docs = (
    split_tokens(raw_docs) for raw_docs in (race_test_docs, sciq_test_docs)
)

In [15]:
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def _lemmatize_docs(token_docs):
    """Return a new list of documents in which every token is passed through the lemmatizer."""
    return [list(map(lemmatizer.lemmatize, tokens)) for tokens in token_docs]


docs = _lemmatize_docs(docs)
nq_docs = _lemmatize_docs(nq_docs)
race_docs = _lemmatize_docs(race_docs)
sciq_docs = _lemmatize_docs(sciq_docs)
race_test_docs = _lemmatize_docs(race_test_docs)
sciq_test_docs = _lemmatize_docs(sciq_test_docs)

In [28]:
# Lemmatize the NQ dev documents token by token.
nq_test_docs = [list(map(lemmatizer.lemmatize, tokens)) for tokens in nq_test_docs]

In [16]:
# Compute bigrams.
from gensim.models import Phrases

def add_bigrams(docs, min_count=20):
    """Append detected bigrams to every tokenized document, in place.

    A gensim ``Phrases`` model is fitted on ``docs`` itself and then applied
    to each document; every resulting token that contains ``'_'`` is appended
    to that document. Only a single ``Phrases`` pass is run, so bigrams (not
    trigrams) are added.

    Args:
        docs: list of token lists. Each inner list is extended in place.
        min_count: forwarded to ``Phrases(min_count=...)``. Defaults to 20,
            the value that used to be hard-coded.

    Returns:
        None. The function is used for its side effect, so a bare call at
        the end of a notebook cell produces no output.

    Notes:
        * The ``'_' in token`` test also matches original tokens that already
          contain an underscore (the ``\\w+`` tokenizer keeps them); such
          tokens are appended a second time.
        * Calling this twice on the same ``docs`` appends the bigrams again.
    """
    bigram = Phrases(docs, min_count=min_count)
    for doc in docs:
        # Collect first, then extend, so the list being read is never the
        # list being appended to.
        detected = [token for token in bigram[doc] if '_' in token]
        doc.extend(detected)
# add_bigrams fits a separate phrase model on whichever collection it is given.
for token_docs in (docs, nq_docs, race_docs, sciq_docs, race_test_docs, sciq_test_docs):
    add_bigrams(token_docs)



In [30]:
# Extend the NQ dev documents in place with bigrams detected on that split alone.
add_bigrams(nq_test_docs)

In [17]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

def get_dictionary(docs):
    """Build a gensim ``Dictionary`` from tokenized documents and prune extreme tokens.

    Args:
        docs: list of token lists.

    Returns:
        The pruned ``Dictionary``.
    """
    vocab = Dictionary(docs)
    # Drop tokens seen in fewer than 20 documents or in more than 50% of them.
    vocab.filter_extremes(no_below=20, no_above=0.5)
    return vocab
# Single vocabulary shared by every bag-of-words corpus built afterwards.
# NOTE(review): `docs` is built from nq + race + sciq, so the NQ training texts are
# passed twice here (once inside `docs`, once as `nq_docs`), which changes the
# document counts that filter_extremes thresholds on. The RACE and SciQ held-out
# token lists are included while `nq_test_docs` is not — confirm this is intended.
dictionary = get_dictionary(docs+nq_docs+race_test_docs+sciq_test_docs)

In [18]:
# Bag-of-words representation of the documents. need to use the same dictionary.
# Every collection is encoded against the same shared dictionary.
corpus = list(map(dictionary.doc2bow, docs))
nq_corpus = list(map(dictionary.doc2bow, nq_docs))
race_corpus = list(map(dictionary.doc2bow, race_docs))
sciq_corpus = list(map(dictionary.doc2bow, sciq_docs))
race_test_corpus = list(map(dictionary.doc2bow, race_test_docs))
sciq_test_corpus = list(map(dictionary.doc2bow, sciq_test_docs))

In [31]:
# Bag-of-words encoding of the NQ dev documents, using the shared dictionary.
nq_test_corpus = list(map(dictionary.doc2bow, nq_test_docs))

In [19]:
# Quick sanity check of vocabulary size and per-corpus document counts.
vocab_size = len(dictionary)
print('Number of unique tokens: %d' % vocab_size)
print(f'Number of documents: {len(nq_corpus)}, {len(race_corpus)}, {len(sciq_corpus)}, {len(race_test_corpus)}, {len(sciq_test_corpus)}')

Number of unique tokens: 34838
Number of documents: 89453, 18613, 11679, 1036, 1000


In [20]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
# Fixed seed so that re-running the notebook trains the same model. Outputs
# recorded from earlier, unseeded runs will not be reproduced exactly.
lda_seed = 42

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary (id2token is filled lazily).
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=lda_seed,
)

In [21]:
top_topics = model.top_topics(corpus)

# Each entry pairs a topic's weighted terms with its coherence score; the
# average is the summed coherence divided by the configured number of topics.
coherence_total = sum(coherence for _, coherence in top_topics)
avg_topic_coherence = coherence_total / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -3.6039.
[([(0.012310617, 'people'),
   (0.011563973, 'said'),
   (0.01077379, 'like'),
   (0.010208301, 'one'),
   (0.008664568, 'time'),
   (0.008580215, 'school'),
   (0.008215999, 'day'),
   (0.007836241, 'get'),
   (0.007749909, 'child'),
   (0.0073356824, 'make'),
   (0.007298846, 'go'),
   (0.0071585346, 'life'),
   (0.0067995302, 'say'),
   (0.006612989, 'would'),
   (0.006522541, 'good'),
   (0.0060412483, 'thing'),
   (0.0058971485, 'see'),
   (0.005868483, 'take'),
   (0.0058029527, 'help'),
   (0.0057608425, 'way')],
  -1.7604466637069196),
 ([(0.019915888, 'cell'),
   (0.013187877, 'energy'),
   (0.010464404, 'water'),
   (0.010382532, 'figure'),
   (0.009793314, 'body'),
   (0.009465443, 'food'),
   (0.009312558, 'called'),
   (0.008784117, 'also'),
   (0.008638417, 'may'),
   (0.008340241, 'example'),
   (0.008281657, 'plant'),
   (0.008275462, 'form'),
   (0.0073485067, 'blood'),
   (0.0065326546, 'molecule'),
   (0.0062180455, 'system'),
   (0.

In [22]:
from gensim import corpora, models, similarities, downloader

In [23]:
# Project the combined training corpus into LDA topic space and index it for
# similarity queries. The vector width is given explicitly so it always equals
# the model's topic count and does not have to be inferred by an extra scan of
# the transformed corpus.
index = similarities.MatrixSimilarity(model[corpus], num_features=model.num_topics)


In [24]:
import numpy as np

In [25]:
def domain_prediction(index, corpus, domain_sizes=(89453, 18613, 11679), lda_model=None):
    """Predict a domain id for each document from its most similar indexed document.

    Each document is transformed with ``lda_model`` and looked up in ``index``;
    the position of the highest-scoring indexed document is mapped to a domain
    using the cumulative ``domain_sizes``.

    Args:
        index: similarity index; ``index[vector]`` must return one score per
            indexed document.
        corpus: iterable of documents accepted by ``lda_model[...]``.
        domain_sizes: number of indexed documents per domain, in index order.
            Defaults to the previously hard-coded sizes (89453, 18613, 11679).
        lda_model: model used to transform each document. Defaults to the
            module-level ``model``.

    Returns:
        A list of int domain ids (0-based positions in ``domain_sizes``), one
        per document in ``corpus``.

    Raises:
        ValueError: if ``domain_sizes`` is empty, or if the best match lies
            past the last domain boundary. Previously such a document was
            silently dropped, shifting every later prediction.
    """
    if lda_model is None:
        lda_model = model
    if len(domain_sizes) == 0:
        raise ValueError("domain_sizes must contain at least one domain")

    # boundaries[k] is the exclusive upper index of domain k.
    boundaries = np.cumsum(domain_sizes)
    total = int(boundaries[-1])

    preds = []
    for d in corpus:
        p_idx = int(np.argmax(index[lda_model[d]]))
        if p_idx >= total:
            raise ValueError(
                f"nearest neighbour {p_idx} is outside the {total} documents "
                f"described by domain_sizes"
            )
        # First boundary strictly greater than p_idx gives the domain id.
        preds.append(int(np.searchsorted(boundaries, p_idx, side='right')))
    return preds

In [32]:
# Predict a domain for every held-out document, one evaluation corpus at a time.
nq_pred, race_pred, sciq_pred = (
    domain_prediction(index, eval_corpus)
    for eval_corpus in (nq_test_corpus, race_test_corpus, sciq_test_corpus)
)

In [33]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# The true label of a document is the position of its evaluation set: 0 = NQ, 1 = RACE, 2 = SciQ.
per_domain_preds = (nq_pred, race_pred, sciq_pred)
ground_truth = [
    label
    for label, domain_preds in enumerate(per_domain_preds)
    for _ in domain_preds
]
preds = [pred for domain_preds in per_domain_preds for pred in domain_preds]
precision, recall, f1, _ = precision_recall_fscore_support(ground_truth, preds, average=None)
print(precision, recall, f1,)

[0.83474155 0.79383634 0.69134253] [0.90150295 0.72104247 0.551     ] [0.86683871 0.75569044 0.6132443 ]


In [39]:
def domain_selection(index, corpus, num_keeped=1, lda_model=None, source_lines=None):
    """Select, for each query document, the most similar source records.

    Each document is transformed with ``lda_model`` and scored against
    ``index``. Only the first ``len(source_lines)`` scores are considered,
    i.e. the source records are assumed to occupy the leading positions of
    the index (the combined training corpus is ordered NQ first).

    Args:
        index: similarity index; ``index[vector]`` must return one score per
            indexed document.
        corpus: iterable of query documents accepted by ``lda_model[...]``.
        num_keeped: how many top-scoring source records to keep per query.
        lda_model: model used to transform each query. Defaults to the
            module-level ``model``.
        source_lines: sequence of source records aligned with the leading
            entries of the index. Defaults to the module-level ``nq_ds``; its
            length replaces the previously hard-coded 89453.

    Returns:
        A flat list with the selected records, best match first for each
        query. The same record may appear more than once when several queries
        select it.
    """
    if lda_model is None:
        lda_model = model
    if source_lines is None:
        source_lines = nq_ds
    num_source = len(source_lines)

    selection = []
    for d in corpus:
        source_scores = index[lda_model[d]][:num_source]
        # Ascending argsort reversed gives indices from best to worst score.
        ranked = np.argsort(source_scores)[::-1]
        selection.extend(source_lines[i] for i in ranked[:num_keeped])
    return selection

# For 1000 randomly sampled RACE training documents, pick the most similar NQ
# training line for each and write the picks out as JSONL.
# NOTE(review): the shuffle is unseeded, so a different sample is drawn on every run.
d = np.arange(len(race_corpus))
np.random.shuffle(d)
# Run the selection before opening the output file, so a failure cannot leave
# a truncated file behind; the context manager closes the file on any exit.
race_selected = domain_selection(index, [race_corpus[i] for i in d[:1000]])
with open("../../data/nq/race_lda/1000.jsonl", "w") as f:
    # The selected lines already end with their newline.
    f.writelines(race_selected)

# Same procedure for 1000 randomly sampled SciQ training documents.
d = np.arange(len(sciq_corpus))
np.random.shuffle(d)
sciq_selected = domain_selection(index, [sciq_corpus[i] for i in d[:1000]])
with open("../../data/nq/sciq_lda/1000.jsonl", "w") as f:
    f.writelines(sciq_selected)