Code referenced from https://github.com/sahanbull/context-agnostic-engagement/blob/0472e76c6bd00d686b235d844e2fb4d71649400c/context_agnostic_engagement/feature_extraction/_api_utils.py#L47

In [None]:
pip install python-terrier


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python-terrier
  Downloading python-terrier-0.8.1.tar.gz (97 kB)
[K     |████████████████████████████████| 97 kB 3.2 MB/s 
Collecting wget
  Downloading wget-3.2.zip (10 kB)
Collecting pyjnius~=1.3.0
  Downloading pyjnius-1.3.0-cp37-cp37m-manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 24.0 MB/s 
[?25hCollecting matchpy
  Downloading matchpy-0.5.5-py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 7.4 MB/s 
[?25hCollecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting deprecation
  Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Collecting chest
  Downloading chest-0.2.3.tar.gz (9.6 kB)
Collecting nptyping==1.4.4
  Downloading nptyping-1.4.4-py3-none-any.whl (31 kB)
Collecting ir_datasets>=0.3.2
  Downloading ir_datasets-0.5.3-py3-none-any.whl (303 kB)
[K     |██████████████████████████████

In [None]:
import pyterrier as pt

# Start Terrier once per kernel; the guard keeps this cell safe to re-run.
if not pt.started():
    pt.init()


terrier-assemblies 5.6 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.6 jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.8.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)



In [None]:
# Mount Google Drive at /content/drive; the /content/drive/MyDrive/... paths
# used further down in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

from collections import defaultdict
import time

import requests
import ujson as json

# Keys looked up in the JSON returned by Wikifier and reused as keys of the
# records built by the helpers in this notebook.
ERROR_KEY = u'error'  # presence of this key in a response is treated as a failure
STATUS_FIELD = u'status'  # holds 'success' or the error message of a request

URL_FIELD = u'url'  # annotation field copied into each concept record
PAGERANK_FIELD = u'pageRank'  # annotation field copied into each concept record

COSINE_FIELD = u'cosine'  # annotation field copied into each concept record

ANNOTATION_DATA_FIELD = u'annotation_data'  # key under which the concept records are returned

_WIKIFIER_URL = u"http://www.wikifier.org/annotate-article"  # endpoint the text is POSTed to
_WIKIFIER_MAX_SERVER_LIMIT = 25000  # presumably the server's maximum characters per request — TODO confirm
WIKIFIER_MAX_CHAR_CEILING = round(_WIKIFIER_MAX_SERVER_LIMIT * .99)  # 99% of max allowed num chars for a post request




import re

# Separator placed between sentences when they are joined back into a chunk,
# and its length, which is used when estimating the size of a chunk.
SENTENCE_AGGREGATOR = " "
LEN_SENTENCE_AGGR = len(SENTENCE_AGGREGATOR)


def _make_regex_with_escapes(escapers):
    words_regex = r'{}[^_\W]+{}'

    temp_regexes = []
    for escaper_pair in escapers:
        (start, end) = escaper_pair
        temp_regexes.append(words_regex.format(start, end))

    return r"|".join(temp_regexes)


def shallow_word_segment(phrase, escape_pairs=None):
    r""" Takes in a string phrase and segments it into words based on simple regex
    Args:
        phrase (str): phrase to be segmented to words
        escape_pairs ([(str, str)]): list of tuples where each tuple is a pair of substrings that should not be
                    used as word separators. The motivation is to escape special tags such as [HESITATION], ~SILENCE~
                    IMPORTANT: Raw regex has to be used when defining escape pairs
                        ("[", "]") will not work as [] are special chars in regex. Instead (r"\[", r"\]")
                    The list passed in is not modified.
    Returns:
        ([str]): list of words extracted from the phrase
    """
    # Work on a copy: appending to the caller's list would grow it on every call.
    pairs = list(escape_pairs) if escape_pairs is not None else []

    # The empty pair is the fallback alternative that matches plain words.
    pairs.append(("", ""))

    _regex = _make_regex_with_escapes(pairs)
    return re.findall(_regex, phrase, flags=re.UNICODE)


def _segment_sentences(text):
    """segments a text into a set of sentences
    Args:
        text:
    Returns:
    """
    import en_core_web_sm
    nlp = en_core_web_sm.load()

    text_sentences = nlp(text)

    for sentence in text_sentences.sents:
        yield sentence.text


def partition_text(text, max_size):
    """takes a text string and creates a list of substrings that are no longer than a given length
    Sentences are kept whole: they are greedily packed into chunks joined with SENTENCE_AGGREGATOR.
    A single sentence that is longer than max_size on its own is emitted as its own (oversized) chunk.
    Args:
        text (str): text to be partitioned (usually a lecture transcript)
        max_size (int): maximum number of characters one partition should contain
    Returns:
        chunks([str]): list of sub strings where each substring has at most max_size characters
                       (except for single sentences that are longer than max_size)
    """
    chunks = []

    current = []  # sentences collected for the chunk being built
    current_len = 0  # exact length of SENTENCE_AGGREGATOR.join(current)

    for sentence in _segment_sentences(text):
        sentence_len = len(sentence)
        # a separator is only paid for when the sentence follows another one in the chunk
        separator_len = LEN_SENTENCE_AGGR if current else 0

        if current and current_len + separator_len + sentence_len > max_size:
            chunks.append(SENTENCE_AGGREGATOR.join(current))  # flush the chunk built so far
            current = []
            current_len = 0
            separator_len = 0

        current.append(sentence)
        # count the separator too, otherwise the chunk can silently outgrow max_size
        current_len += separator_len + sentence_len

    if current:
        chunks.append(SENTENCE_AGGREGATOR.join(current))  # send the remainder chunk

    return chunks






def _get_wikififier_concepts(resp):
    annotations = [{URL_FIELD: ann[URL_FIELD],
                    COSINE_FIELD: ann[COSINE_FIELD],
                    PAGERANK_FIELD: ann[PAGERANK_FIELD]} for ann in resp.get("annotations", [])]

    return {
        ANNOTATION_DATA_FIELD: annotations,
        STATUS_FIELD: resp[STATUS_FIELD]
    }


def _get_wikifier_response(text, api_key, df_ignore, words_ignore):
    params = {"text": text,
              "userKey": api_key,
              "nTopDfValuesToIgnore": df_ignore,
              "nWordsToIgnoreFromList": words_ignore}
    r = requests.post(_WIKIFIER_URL, params)
    if r.status_code == 200:
        resp = json.loads(r.content)
        if ERROR_KEY in resp:
            raise ValueError("error in response : {}".format(resp[ERROR_KEY]))
        return resp
    else:
        raise ValueError("http status code 200 expected, got status code {} instead".format(r.status_code))


def wikify(text, key, df_ignore, words_ignore):
    """This function takes in a text representation of a lecture transcript and associates relevant Wikipedia topics to
    it using www.wikifier.org entity linking technology.
    Args:
        text (str): text that needs to be Wikified (usually lecture transcript string)
        key (str): API key for Wikifier obtained from http://www.wikifier.org/register.html
        df_ignore (int): Most common words to ignore based on Document frequency
        words_ignore (int): Most common words to ignore based on Term frequency
    Returns:
        {key:val}: a dict with the status of the request and the list of Wiki topics linked to the text.
            On failure the status holds the error message and the topic list is empty, so callers can
            always read the annotation list without checking the status first.
    """
    try:
        resp = _get_wikifier_response(text, key, df_ignore, words_ignore)
    except ValueError as e:
        # Python 2 exceptions exposed .message; Python 3 only carries the text in .args
        status = getattr(e, "message", None)
        if status is None:
            status = e.args[0] if e.args else str(e)
        return {
            STATUS_FIELD: status,
            ANNOTATION_DATA_FIELD: []
        }

    resp[STATUS_FIELD] = 'success'
    time.sleep(0.5)  # pause between successful requests
    return _get_wikififier_concepts(resp)
# Values sent to Wikifier as nTopDfValuesToIgnore (document frequency) and
# nWordsToIgnoreFromList (words to ignore); more details about these parameters
# can be found at: http://www.wikifier.org/info.html
DF_IGNORE_VAL = 50
WORDS_IGNORE_VAL = 50


def get_wikipedia_topic_features(text, api_key, chunk_size=5000):
    """ Wikify a text chunk by chunk using http://www.wikifier.org
    Args:
        text (str): text that needs to be Wikified
        api_key (str): API key for Wikifier obtained from http://www.wikifier.org/register.html
        chunk_size (int): maximum number of characters requested for each Wikified fragment.
    Returns:
        enrichments ([{str: val}]): one record per fragment holding its 1-based "part" number,
            its "text" and the "annotations" returned by wikify
    """
    enrichments = []
    fragments = partition_text(text, max_size=chunk_size)
    for part_number, fragment in enumerate(fragments, start=1):
        enrichments.append({
            "part": part_number,
            "text": fragment,
            "annotations": wikify(fragment, api_key, DF_IGNORE_VAL, WORDS_IGNORE_VAL),
        })

    return enrichments


def get_ranked_topics(chunks, option, top_n):
    """ ranks the topics using the aggregated score across multiple Wikified chunks of the text.
    Each chunk's scores are weighted by the share of the total text length the chunk accounts for.
    Chunks whose Wikification failed (no "annotation_data" entry) contribute no topics but their
    text still counts towards the total length.
    Args:
        chunks ([{str: val}]): list of Wikified chunks for the transcript
        option {str}: pageRank or cosine
        top_n (int): n top ranked topics of interest
    Returns:
        final_rec ({str:val}): dict with key for top_n_url or top_n_value and the URL or value of the topic
            (empty when there are no chunks or all chunk texts are empty)
    """
    chunks = list(chunks)

    total_length = sum(len(part["text"]) for part in chunks)
    if total_length == 0:
        # nothing to weight by; avoids dividing by zero below
        return {}

    records = defaultdict(list)
    for part in chunks:
        # a failed chunk only carries a status, so fall back to "no annotations"
        annotations = (part.get("annotations") or {}).get("annotation_data", [])
        norm = len(part["text"]) / total_length
        for concept in annotations:
            records[concept["url"]].append(concept.get(option, 0.) * norm)

    rec = [(title, sum(val)) for title, val in records.items()]

    # sort by normalised weight
    rec.sort(key=lambda l: l[1], reverse=True)

    final_rec = {}
    for _idx, (url, val) in enumerate(rec[:top_n], start=1):
        final_rec["topic_{}_{}_url".format(_idx, option)] = url
        final_rec["topic_{}_{}_val".format(_idx, option)] = val

    return final_rec


def get_authority_wiki_features(text, api_key, top_n):
    """ Wikifies the text (http://www.wikifier.org/) and ranks its topics by PageRank score.
    Args:
        text (str): text that needs to be Wikified for authority
        api_key (str): API key for Wikifier obtained from http://www.wikifier.org/register.html
        top_n (int): n top ranking topics to be returned with PageRank scores
    Returns:
        ({str:val}): dict with key for top_n_url or top_n_value and the URL or value of the topic
    """
    wikified_parts = get_wikipedia_topic_features(text, api_key)
    return get_ranked_topics(wikified_parts, "pageRank", top_n)


def get_coverage_wiki_features(text, api_key, top_n):
    """ Wikifies the text (http://www.wikifier.org/) and ranks its topics by cosine similarity score.
    Args:
        text (str): text that needs to be Wikified for coverage
        api_key (str): API key for Wikifier obtained from http://www.wikifier.org/register.html
        top_n (int): n top ranking topics to be returned with cosine scores
    Returns:
        ({str:val}): dict with key for top_n_url or top_n_value and the URL or value of the topic
    """
    wikified_parts = get_wikipedia_topic_features(text, api_key)
    return get_ranked_topics(wikified_parts, "cosine", top_n)



import requests
import spacy
from collections import Counter
import matplotlib.pyplot as plt
import spacy

import argparse
import sys
import string
import spacy
import xml.etree.ElementTree as ET



def convert_string(hj):
  """Strip leading/trailing punctuation from every element of ``hj`` and concatenate the results.

  Iterating a plain string visits it character by character, so for a str input
  every punctuation character is dropped and all other characters are kept.
  """
  return ''.join(str(piece).strip(string.punctuation) for piece in hj)

def get_concept_list(response):
  """Turn a wikify() result into a list of [concept name, cosine, pageRank] entries.

  The concept name is the last path segment of the annotation URL with underscores
  replaced by spaces and punctuation removed by convert_string.

  Args:
      response ({str: val}): dict returned by wikify; a failed request that carries
          no "annotation_data" entry yields an empty list instead of a KeyError
  Returns:
      ([[str, val, val]]): one [name, cosine, pageRank] entry per annotation
  """
  concepts = []
  for annotation in response.get("annotation_data", []):
    url = annotation['url']
    cos = annotation['cosine']
    pr = annotation['pageRank']

    title = ' '.join(url.split('/')[-1].split('_'))
    concepts.append([convert_string(title), cos, pr])
  return concepts

In [None]:
def get_wikipedia(doc_list, api_key=None):
    """Wikify the concatenated texts of ``doc_list`` and return the extracted concepts.

    Args:
        doc_list: object whose ``text`` attribute iterates over the document texts
            (presumably a pandas DataFrame with a ``text`` column — TODO confirm)
        api_key (str): Wikifier API key. When omitted, the WIKIFIER_API_KEY environment
            variable is used, falling back to the key that used to be hardcoded here.
    Returns:
        ([[str, val, val]]): [concept name, cosine, pageRank] entries from get_concept_list
    """
    import os  # local import so this cell does not depend on the later cell that imports os

    if api_key is None:
        # NOTE(review): the fallback literal is a credential committed in the notebook;
        # rotate it and remove the fallback once the environment variable is set everywhere.
        api_key = os.environ.get("WIKIFIER_API_KEY", "boifszokpjjzgdxacixjqudgagqpgb")

    # NOTE(review): texts are concatenated without a separator, so the last word of one
    # document is glued to the first word of the next — confirm this is intended.
    gh = ''.join(str(t) for t in doc_list.text)
    print(gh)

    get_response = wikify(gh, api_key, DF_IGNORE_VAL, WORDS_IGNORE_VAL)
    converted_concepts = get_concept_list(get_response)
    print(converted_concepts)
    return converted_concepts



def update_xml(xml, nouns, f_name):
    """Append expansion terms to every <query> element of a topics file and write the result.

    The i-th <query> (in document order) gets the de-duplicated terms of nouns[i]
    appended after a space, and the whole query text is lower-cased. Queries that have
    no matching entry in ``nouns`` are only lower-cased. A warning is written to stderr
    when the number of queries and the number of entries in ``nouns`` differ.

    Args:
        xml (str): path of the topics XML file to read
        nouns ([[str]]): expansion terms, one list per query
        f_name (str): path the updated XML is written to
    """
    n_queries = 0

    tree = ET.parse(xml)
    for node in tree.iter('query'):
        n_queries += 1
        query_text = node.text or ''  # an empty <query/> has text None
        print(query_text)

        if n_queries <= len(nouns):
            # dict.fromkeys de-duplicates while keeping first-seen order, so the
            # written file is identical from run to run (set() order is not)
            gather = ' '.join(dict.fromkeys(nouns[n_queries - 1]))
            tmp = query_text + ' ' + gather
        else:
            tmp = query_text  # no expansion available for this query
        print(tmp)
        node.text = tmp.lower()

    if n_queries != len(nouns):
        print('Warning: n_queries do not match with number '
                'of descriptions parsed', file=sys.stderr)

    tree.write(f_name)



# Topics file on the mounted Drive; its <num>/<query> elements are read when the query dict is built.
top="/content/drive/MyDrive/IRDM/ent_test.xml"
# spaCy English pipeline; it is not referenced again in the visible cells — TODO confirm it is still needed.
nlp = spacy.load("en_core_web_sm")
# Parsed topics tree, iterated further down to build queries_dict.
tree = ET.parse(top)


In [None]:
# Create a query dict: maps the text of each <num> element to the Wikifier
# concepts extracted from the <query> element that follows it.
import os

# Prefer a key supplied through the environment. NOTE(review): the fallback literal is a
# credential committed in the notebook; rotate it and drop the fallback.
WIKIFIER_API_KEY = os.environ.get("WIKIFIER_API_KEY", 'boifszokpjjzgdxacixjqudgagqpgb')

queries_dict = {}
key = None
seen_num = False
for node in tree.iter():
    if node.tag == 'num':
        key = node.text
        seen_num = True
    elif node.tag == "query":
        if not seen_num:
            # previously this surfaced as a NameError on `key`
            raise ValueError("found a <query> element before any <num> element")
        queries_dict[key] = get_concept_list(wikify(node.text, WIKIFIER_API_KEY, 50, 50))
  


In [None]:
import numpy as np

In [None]:
# np.save stores the dict as a pickled object array; reading it back requires
# np.load(..., allow_pickle=True).item(), the pattern mergmast uses for the batch files.
np.save('/content/drive/MyDrive/IRDM/queries_doc_test3.npy', queries_dict) 

In [None]:
# Walk the Drive folder and remember the path of every batch .npy file so they can be merged.
import os
name_dir = "/content/drive/MyDrive/IRDM/"
iter = []  # NOTE: shadows the built-in iter(); the name is kept because later cells use it
for root, folder, data in os.walk(name_dir):
    for a in data:
        is_batch_file = ".npy" in a and "batch" in a
        if is_batch_file:
            iter.append(os.path.join(root, a))
          

In [None]:
# Merger function
def mergmast(list_of_dicts):
    """Load every .npy file that holds a pickled dict and merge them into one dict.

    Args:
        list_of_dicts ([str]): paths of .npy files written with np.save(path, some_dict)
    Returns:
        (dict): union of all loaded dicts; on duplicate keys the file that comes
            later in the list wins
    """
    master = {}
    for path in list_of_dicts:
        # NOTE(security): allow_pickle=True unpickles the file content, which can execute
        # arbitrary code — only load files this notebook produced itself.
        batch = np.load(path, allow_pickle=True).item()
        master.update(batch)
    return master

In [None]:
# Drop the test batch from the list of files to merge. The membership check makes
# the cell safe to re-run: list.remove raises ValueError once the path is gone.
TEST_BATCH_PATH = "/content/drive/MyDrive/IRDM/batch_o.npy"
if TEST_BATCH_PATH in iter:
    iter.remove(TEST_BATCH_PATH)

'/content/drive/MyDrive/IRDM/batch_o.npy'

In [None]:
# Generating the entire batch: merge every collected batch file into one dict
# (on duplicate keys the later file wins, because mergmast uses dict.update).
total_batch_156k=mergmast(iter)

In [None]:
# Number of merged entries; the recorded run shows 150270 despite the "156k" in the variable name.
len(total_batch_156k)

150270

In [None]:
# Saving the entire batch. The dict is stored as a pickled object array, so it has to be
# read back with np.load(..., allow_pickle=True).item().
np.save('/content/drive/MyDrive/IRDM/master_set_docs.npy', total_batch_156k) 