In [1]:
import pyterrier as pt
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import re

from tira.third_party_integrations import persist_and_normalize_run,  ensure_pyterrier_is_loaded
from tira.rest_api_client import Client

In [2]:
# Initialize PyTerrier
if not pt.started():
    pt.init()

ensure_pyterrier_is_loaded()

PyTerrier 0.10.0 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

In [4]:
docs =  pt_dataset.get_corpus_iter()

ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:   0%|          | 0/126958 [00:00<?, ?it/s]

expermient with dataset

In [5]:
for i,doc in enumerate(docs):
    print(doc['text'])
    if i == 3:
        break

A Study on Word Similarity using Context Vector Models


 There is a need to measure word similarity when processing natural languages, especially when using generalization, classification, or example -based approaches. Usually, measures of similarity between two words are defined according to the distance between their semantic classes in a semantic taxonomy . The taxonomy approaches are more or less semantic -based that do not consider syntactic similarit ies. However, in real applications, both semantic and syntactic similarities are required and weighted differently. Word similarity based on context vectors is a mixture of syntactic and semantic similarit ies. In this paper, we propose using only syntactic related co-occurrences as context vectors and adopt information theoretic models to solve the problems of data sparseness and characteristic precision. The probabilistic distribution of co-occurrence context features is derived by parsing the contextual environment of each word ,

Tokenizer

In [6]:
#This is our ngram tokenizer. It takes a string and returns a dict of all ngrams, where each ngram is seperated by $$ so it will be parsed as one token

def tokenize_ngrams_to_dict(text, n1=1, n2=3):
    # Replace spaces with dollar signs
    text_with_dollar_signs = re.sub(r'\s+', '$', text)
    
    # Tokenize the text into words
    words = text_with_dollar_signs.split('$')
    
    # Initialize an empty Counter to hold all n-grams
    all_ngram_counts = Counter()
    
    # Loop through each n from n1 to n2
    for n in range(n1, n2 + 1):
        # Generate n-grams for the current n
        ngrams = ['$$'.join(words[i:i+n]) for i in range(len(words)-n+1)]
        
        # Update the Counter with the current n-grams
        all_ngram_counts.update(ngrams)
    
    return dict(all_ngram_counts)

In [7]:
# Apply n-gram tokenization to the dataset
documents = list()

for i,doc in enumerate(docs):
    if i == 1000: #limit to 1000 for now since kernel crashes if performed on whole set of docs
        break
    doc_1gram = tokenize_ngrams_to_dict(doc['text'], n1=1, n2=3)

    doc['toks'] = doc_1gram
    del doc['text']  # Remove the 'text' field as it's not needed anymore
    #if doc['docno'] == 'N09-1041':
    #    print("N09-1041:",doc)
    documents.append(doc)

ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:   0%|          | 585/126958 [00:00<00:38, 3293.72it/s]

In [10]:
for i,doc in enumerate(documents):
    if i == 3:
        break
    print(doc)

{'docno': 'L02-1309', 'toks': {'Proposal': 1, 'of': 1, 'a': 1, 'very-large-corpus': 1, 'acquisition': 1, 'method': 1, 'by': 1, 'cell-formed': 1, 'registration': 1, 'Proposal$$of': 1, 'of$$a': 1, 'a$$very-large-corpus': 1, 'very-large-corpus$$acquisition': 1, 'acquisition$$method': 1, 'method$$by': 1, 'by$$cell-formed': 1, 'cell-formed$$registration': 1, 'Proposal$$of$$a': 1, 'of$$a$$very-large-corpus': 1, 'a$$very-large-corpus$$acquisition': 1, 'very-large-corpus$$acquisition$$method': 1, 'acquisition$$method$$by': 1, 'method$$by$$cell-formed': 1, 'by$$cell-formed$$registration': 1}}
{'docno': 'R13-1044', 'toks': {'Recognizing': 1, 'semantic': 3, 'relations': 1, 'within': 2, '{P}olish': 1, 'noun': 2, 'phrase:': 1, 'A': 2, 'rule-based': 2, 'approach': 2, 'The': 2, 'paper': 1, '1': 1, 'presents': 1, 'a': 1, 'to': 1, 'relation': 1, 'recognition': 1, 'the': 4, 'Polish': 2, 'phrase.': 1, 'set': 1, 'of': 2, 'relations,': 2, 'including': 1, 'some': 1, 'thematic': 1, 'has': 1, 'been': 1, 'dete

In [15]:
import nltk
from nltk.corpus import stopwords

# Ensure stopwords are downloaded
nltk.download('stopwords')

# Get the set of stopwords for the English language
stop_words = set(stopwords.words('english'))

# Function to remove stopwords from a single document
def remove_stopwords(doc):
    docno = doc['docno']
    toks = doc['toks']
    filtered_toks = {token: freq for token, freq in toks.items() if token.lower() not in stop_words}
    return {'docno': docno, 'toks': filtered_toks}

# Apply the stopword removal to the entire document collection
filtered_collection = [remove_stopwords(doc) for doc in documents]

# Display the filtered collection
for i,doc in enumerate(filtered_collection):
    if i == 3:
        break
    print(doc)

[{'docno': 'L02-1309', 'toks': {'Proposal': 1, 'very-large-corpus': 1, 'acquisition': 1, 'method': 1, 'cell-formed': 1, 'registration': 1, 'Proposal$$of': 1, 'of$$a': 1, 'a$$very-large-corpus': 1, 'very-large-corpus$$acquisition': 1, 'acquisition$$method': 1, 'method$$by': 1, 'by$$cell-formed': 1, 'cell-formed$$registration': 1, 'Proposal$$of$$a': 1, 'of$$a$$very-large-corpus': 1, 'a$$very-large-corpus$$acquisition': 1, 'very-large-corpus$$acquisition$$method': 1, 'acquisition$$method$$by': 1, 'method$$by$$cell-formed': 1, 'by$$cell-formed$$registration': 1}}, {'docno': 'R13-1044', 'toks': {'Recognizing': 1, 'semantic': 3, 'relations': 1, 'within': 2, '{P}olish': 1, 'noun': 2, 'phrase:': 1, 'rule-based': 2, 'approach': 2, 'paper': 1, '1': 1, 'presents': 1, 'relation': 1, 'recognition': 1, 'Polish': 2, 'phrase.': 1, 'set': 1, 'relations,': 2, 'including': 1, 'thematic': 1, 'determined': 1, 'need': 1, 'experiments.': 1, 'method': 1, 'consists': 1, 'two': 2, 'steps:': 1, 'first': 1, 'syst

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Index

In [12]:

# Initialize the IterDictIndexer with pretokenised set to True
iter_indexer = pt.IterDictIndexer("./pretokindex", overwrite=True, meta={'docno': 20}, pretokenised=True)

# Index the pretokenized dataset
index_ref = iter_indexer.index(documents)

print(f"Indexing complete: {index_ref}")
# Now you can use the index_ref as usual
index = pt.IndexFactory.of(index_ref)

print(index.getCollectionStatistics())

# Access the MetaIndex and Lexicon
meta = index.getMetaIndex()
lexicon = index.getLexicon()

11:23:35.512 [ForkJoinPool-3-worker-3] ERROR org.terrier.structures.seralization.FixedSizeTextFactory - Term 1$$https://github.com/UCDenver-ccp/$$CRAFT/releases/tag/v3.1.3; written in 64 bytes. Max expected size was 61. Increase relevant property: max.term.length for Lexicon, or indexer.meta.forward.keylens for metadata
java.lang.ArrayIndexOutOfBoundsException: arraycopy: length -3 is negative
	at java.base/java.lang.System.arraycopy(Native Method)
	at java.base/java.io.BufferedOutputStream.write(BufferedOutputStream.java:129)
	at java.base/java.io.DataOutputStream.write(DataOutputStream.java:107)
	at org.terrier.structures.seralization.FixedSizeTextFactory$FixedSizeText.write(FixedSizeTextFactory.java:75)
	at org.terrier.structures.collections.FSOrderedMapFile$1.write(FSOrderedMapFile.java:1163)
	at org.terrier.structures.FSOMapFileLexiconOutputStreamGeneric.writeNextEntry(FSOMapFileLexiconOutputStreamGeneric.java:100)
	at org.terrier.structures.indexing.LexiconMap.storeToStream(Lexic

JavaException: JVM exception occurred: java.util.concurrent.ExecutionException: java.nio.BufferUnderflowException java.lang.RuntimeException

In [14]:
import nltk
from nltk.corpus import stopwords

# Ensure stopwords are downloaded
nltk.download('stopwords')

# Get the set of stopwords for the English language
stop_words = set(stopwords.words('english'))

# Sample document collection
document_collection = [
    {'docno': 'L02-1309', 'toks': {'Proposal': 1, 'of': 1, 'a': 1, 'very-large-corpus': 1, 'acquisition': 1}},
    # Add more documents as needed
]

# Function to remove stopwords from a single document
def remove_stopwords(doc):
    docno = doc['docno']
    toks = doc['toks']
    filtered_toks = {token: freq for token, freq in toks.items() if token.lower() not in stop_words}
    return {'docno': docno, 'toks': filtered_toks}

# Apply the stopword removal to the entire document collection
filtered_collection = [remove_stopwords(doc) for doc in document_collection]

# Display the filtered collection
print(filtered_collection)

[{'docno': 'L02-1309', 'toks': {'Proposal': 1, 'very-large-corpus': 1, 'acquisition': 1}}]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
