# Test Synset-Based Refactor

This notebook tests the refactored WordsDacc to ensure synset-based indexing works correctly.

In [None]:
# Set project rootdir here. Leaving it empty triggers the config lookup below.
rootdir = ""

# ... or, if you'll be sharing this notebook, make it so that the rootdir will be entered by the user
# and placed in a config file...
if not rootdir:
    # Imported only on this branch, so config2py is required only when rootdir is left empty above.
    from config2py import config_getter  # pip install config2py

    # If the env variable is not set, running this will ask the user to enter the rootdir
    # and it will save it for them for future use
    rootdir = config_getter('WORDNET_WORDS_PROJECT_ROOTDIR')

In [None]:
from imbed_data_prep.wordnet_words import WordsDacc

# Instantiate the WordsDacc under test, passing it the project root directory set earlier.
# Every test cell below reads its data from this single `dacc` object.
dacc = WordsDacc(rootdir)

## Test 1: word_and_synset (bipartite mapping)

In [None]:
# Fetch the word/synset table and report its overall shape and columns.
word_synset = dacc.word_and_synset
print(f"Shape: {word_synset.shape}")
print(f"Columns: {list(word_synset.columns)}")

# Preview the first ten rows.
print("\nFirst 10 rows:")
print(word_synset.head(10))

# Spot-check one word: keep only the rows whose 'word' column equals it.
example_word = 'dog'
print(f"\nSynsets for '{example_word}':")
is_example_word = word_synset['word'] == example_word
print(word_synset.loc[is_example_word])

## Test 2: wordnet_metadata (synset-indexed with lemmas column)

In [None]:
# Fetch the metadata table and report its shape, index name and columns.
metadata = dacc.wordnet_metadata
print(f"Shape: {metadata.shape}")
print(f"Index name: {metadata.index.name}")
print(f"Columns: {list(metadata.columns)}")

# Show the first row in full.
print("\nFirst row:")
print(metadata.iloc[0])

# Spot-check one synset, but only when it is actually present in the index.
example_synset = 'dog.n.01'
if example_synset in metadata.index:
    print(f"\nMetadata for '{example_synset}':")
    print(metadata.loc[example_synset])

## Test 3: synset_definition_links (synset→synset edges)

In [None]:
# Fetch the definition-link table and report its shape and columns.
links = dacc.synset_definition_links
print(f"Shape: {links.shape}")
print(f"Columns: {list(links.columns)}")
print(f"\nFirst 10 links:")
print(links.head(10))

# Check that both source and target are synset names, i.e. end in
# `.<one of n/v/a/s/r>.<digits>` like 'dog.n.01'.
# The regex is defined outside the f-strings on purpose: before Python 3.12 a
# backslash inside an f-string replacement field is a SyntaxError, which made
# this whole cell fail to compile on older interpreters.
synset_name_pattern = r'\.[nvasr]\.\d+$'
sources_are_synsets = links['source'].str.contains(synset_name_pattern).all()
print(f"\nAll sources are synset names: {sources_are_synsets}")
targets_are_synsets = links['target'].str.contains(synset_name_pattern).all()
print(f"All targets are synset names: {targets_are_synsets}")

# Count unique synsets
print(f"\nUnique source synsets: {links['source'].nunique()}")
print(f"Unique target synsets: {links['target'].nunique()}")
print(f"Total edges: {len(links)}")

## Test 4: word_indexed_metadata (word view)

In [None]:
# Fetch the word-indexed view and report its shape, index name and columns.
word_meta = dacc.word_indexed_metadata
print(f"Shape: {word_meta.shape}")
print(f"Index name: {word_meta.index.name}")
print(f"Columns: {list(word_meta.columns)}")

# Show the first row in full.
print("\nFirst row:")
print(word_meta.iloc[0])

# Spot-check one word, but only when it is actually present in the index.
example_word = 'dog'
if example_word in word_meta.index:
    print(f"\nMetadata for '{example_word}':")
    print(word_meta.loc[example_word])

## Test 5: words_used_in_definition_of_words (word→word links)

In [None]:
# Fetch the word-to-word link table and report its shape and columns.
word_links = dacc.words_used_in_definition_of_words
print(f"Shape: {word_links.shape}")
print(f"Columns: {list(word_links.columns)}")

# Preview the first ten links.
print("\nFirst 10 links:")
print(word_links.head(10))

# Example: list (up to 20 of) the targets linked from one source word,
# skipping the check entirely when that word never appears as a source.
example_word = 'computer'
if example_word in word_links['source'].values:
    from_example_word = word_links['source'] == example_word
    comp_links = word_links.loc[from_example_word]
    print(f"\nWords in '{example_word}' definition ({len(comp_links)} words):")
    print(list(comp_links['target'].head(20)))

## Test 6: Helper methods

In [None]:
# synset_to_lemmas: report the container type and size, then look up one synset if present.
synset_lemmas = dacc.synset_to_lemmas
print(f"Type: {type(synset_lemmas)}")
print(f"Number of synsets: {len(synset_lemmas)}")
example_synset = 'dog.n.01'
if example_synset in synset_lemmas:
    print(f"\nLemmas for '{example_synset}': {synset_lemmas[example_synset]}")

# word_to_synsets: same report for the reverse mapping, looked up by word.
word_synsets = dacc.word_to_synsets
print(f"\nType: {type(word_synsets)}")
print(f"Number of words: {len(word_synsets)}")
example_word = 'dog'
if example_word in word_synsets:
    print(f"\nSynsets for '{example_word}': {word_synsets[example_word]}")

## Summary

The cells above print their results rather than asserting them, so check each output by eye. The refactoring is successful if:
- word_and_synset is a bipartite mapping (no lemma column)
- wordnet_metadata is synset-indexed with a lemmas column
- synset_definition_links has synset→synset edges only
- word_indexed_metadata provides a word-level view
- Helper methods provide mappings between words and synsets