# Tesserae v5 Demo

This demo will go over the basics of Tesserae v5 development up through February 5, 2019.

In [None]:
import json

from tesserae.db import TessMongoConnection
from tesserae.db.entities import Frequency, Match, Text, Token, Unit
from tesserae.utils import TessFile
from tesserae.tokenizers import GreekTokenizer, LatinTokenizer
from tesserae.unitizer import Unitizer
from tesserae.matchers import AggregationMatcher

# Open the database connection that every later cell reuses
# (host 127.0.0.1, port 27017).
connection = TessMongoConnection('127.0.0.1', 27017, None, None, 'tesstest')

# Empty every collection the demo writes to, so a re-run starts from scratch.
for collection_name in ('feature_sets', 'frequencies', 'matches',
                        'match_sets', 'texts', 'tokens', 'units'):
    connection.connection[collection_name].delete_many({})

## Loading and Storing New Texts

The Tesserae database catalogs metadata, including the title, author, and year published, as well as integrity information like filepath, MD5 hash, and CTS URN.

We start by loading in some metadata from `text_metadata.json`.

In [None]:
# Parse the metadata file; the loop below expects one record per text with
# 'title', 'author', 'language' and 'year' keys.
with open('text_metadata.json', 'r') as metadata_file:
    text_meta = json.load(metadata_file)

# Header row and rule, left-justified in 15-character columns.
print('{}{}{}{}'.format('Title'.ljust(15), 'Author'.ljust(15), 'Language'.ljust(15), 'Year'))
print('{}{}{}{}'.format('-----'.ljust(15), '------'.ljust(15), '--------'.ljust(15), '----'))

# One table row per metadata record.
for record in text_meta:
    row = (
        record['title'].ljust(15),
        record['author'].ljust(15),
        record['language'].ljust(15),
        str(record['year']).ljust(15),  # year is stringified before padding
    )
    print('{}{}{}{}'.format(*row))

Then insert the new texts with `TessMongoConnection.insert` after converting the raw JSON to Tesserae `Text` entities.

In [None]:
# Turn each raw JSON record into a Text entity, then store them in one call.
texts = [Text.json_decode(record) for record in text_meta]
result = connection.insert(texts)

# Report how many entries were written and the ids they received.
print('Inserted {} texts.'.format(len(result.inserted_ids)))
print(result.inserted_ids)

We can retrieve the inserted texts with `TessMongoConnection.find`. These texts will be converted to objects representing the database entries. The returned text list can be filtered by any valid field in the text database.

In [None]:
# Fetch the texts that were just inserted, filtering on their ids.
texts = connection.find('texts', _id=result.inserted_ids)

# Same table layout as before, now read from entity attributes.
print('{}{}{}{}'.format('Title'.ljust(15), 'Author'.ljust(15), 'Language'.ljust(15), 'Year'))
for entry in texts:
    row = (
        entry.title.ljust(15),
        entry.author.ljust(15),
        entry.language.ljust(15),
        entry.year,
    )
    print('{}{}{}{}'.format(*row))

## Loading .tess Files

Text metadata includes the path to the .tess file on the local filesystem. Using a Text retrieved from the database, the file can be loaded for further processing.

In [None]:
# Open the .tess file of the first retrieved text and attach its metadata.
first_text = texts[0]
tessfile = TessFile(first_text.path, metadata=first_text)

print(tessfile.path)   # where the file lives on disk
print(len(tessfile))   # the file's length as reported by len()
print(tessfile[270])   # a single entry fetched by index

We can iterate through the file line-by-line.

In [None]:
# readlines() is consumed with next(), one entry at a time;
# show the first ten entries.
lines = tessfile.readlines()
for line_no in range(10):
    print(next(lines))

We can also iterate token-by-token.

In [None]:
# read_tokens() is likewise consumed with next(); show the first ten tokens.
tokens = tessfile.read_tokens()
for token_no in range(10):
    print(next(tokens))

## Tokenizing a Text

Texts can be tokenized with `tesserae.tokenizers` objects. These objects are designed to normalize and compute features for tokens of a specific language.

In [None]:
# Choose the tokenizer from the language recorded in the text's metadata;
# anything that is not 'greek' falls through to the Latin tokenizer.
tokenizer = GreekTokenizer(connection) if tessfile.metadata.language == 'greek' else LatinTokenizer(connection)

# tokenize() returns four collections. The returned token list is replaced
# just below by tokenizer.tokens, which is what the rest of the demo uses.
tokens, tags, frequencies, feature_sets = tokenizer.tokenize(tessfile.read(), text=tessfile.metadata)

tokens = tokenizer.tokens
print(len(tokens), len(frequencies), len(feature_sets))

# Header and rule use the same column widths as the rows (15, 20, 20) so
# that the table lines up.
print('{}{}{}{}'.format('Raw'.ljust(15), 'Normalized'.ljust(20), 'Lemmata'.ljust(20), 'Frequency'))
print('{}{}{}{}'.format('---'.ljust(15), '----------'.ljust(20), '-------'.ljust(20), '---------'))

# Preview at most the first 20 tokens; capping at len(tokens) keeps a very
# short text from raising IndexError.
for i in range(min(20, len(tokens))):
    token = tokens[i]
    # A feature_set that is None or a plain string has no .form/.lemmata
    # attributes to display, so such tokens are skipped.
    if token.feature_set is not None and not isinstance(token.feature_set, str):
        print('{}{}{}{}'.format(token.display.ljust(15),
                              str(token.feature_set.form).ljust(20),
                              str(token.feature_set.lemmata).ljust(20),
                              token.frequency.frequency))

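Processed token data can then be stored in and retrieved from the database, similar to text metadata. Here we store the feature sets and frequencies computed during tokenization; the tokens themselves are inserted after unitizing.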

In [None]:
# Store the feature sets and report how many were written.
result = connection.insert(feature_sets)
feature_set_count = len(result.inserted_ids)
print('Inserted {} feature set entities out of {}'.format(feature_set_count, len(feature_sets)))

# Same again for the frequency entries.
result = connection.insert(frequencies)
frequency_count = len(result.inserted_ids)
print('Inserted {} frequency entities out of {}'.format(frequency_count, len(frequencies)))

## Unitizing a Text

Texts can be unitized into lines and phrases. Intertext matches are then found between these units of text.


In [None]:
# Group the tokens into line units and phrase units.
unitizer = Unitizer()
lines, phrases = unitizer.unitize(tokens, tags, tessfile.metadata)

# Preview the first 20 units of each kind as "<tags>: <joined token text>".
print('Lines\n-----')
for unit in lines[:20]:
    label = str(unit.tags)
    print(label + ': ' + ''.join(t.display for t in unit.tokens))

print('\n\nPhrases\n-------')
for unit in phrases[:20]:
    label = str(unit.tags)
    print(label + ': ' + ''.join(t.display for t in unit.tokens))

In [None]:
# Store the line and phrase units together in a single insert call.
units = lines + phrases
unit_total = len(units)
result = connection.insert(units)
print('Inserted {} units out of {}.'.format(len(result.inserted_ids), unit_total))

# Store the tokens themselves.
result = connection.insert(tokens)
print('Inserted {} tokens out of {}.'.format(len(result.inserted_ids), len(tokens)))

In [None]:
# Run the same tokenize -> unitize -> store steps for every text after the
# first one, which was processed cell by cell above.
for text in texts[1:]:
    tessfile = TessFile(text.path, metadata=text)

    # Greek texts get the Greek tokenizer; everything else is treated as Latin.
    if tessfile.metadata.language == 'greek':
        tokenizer = GreekTokenizer(connection)
    else:
        tokenizer = LatinTokenizer(connection)

    tokens, tags, frequencies, feature_sets = tokenizer.tokenize(tessfile.read(), text=tessfile.metadata)
    # As above, continue with the tokenizer's own token list.
    tokens = tokenizer.tokens

    # Insert order: feature sets, frequencies, units, and finally tokens.
    result = connection.insert(feature_sets)
    result = connection.insert(frequencies)

    unitizer = Unitizer()
    lines, phrases = unitizer.unitize(tokens, tags, tessfile.metadata)
    result = connection.insert(lines + phrases)

    result = connection.insert(tokens)

## Matching

Once the Texts, Tokens, and Units are in the database, we can then find intertext matches.

In [None]:
import time

matcher = AggregationMatcher(connection)
# Only the Greek texts take part in this match run.
match_texts = [candidate for candidate in texts if candidate.language == 'greek']

# Time the match call with wall-clock timestamps taken before and after it.
start = time.time()
matches, match_set = matcher.match(
    match_texts, 'phrase', 'form',
    distance_metric='span', stopwords=20, max_distance=10)
elapsed = time.time() - start
print("Completed matching in {0:.2f}s".format(elapsed))

# Order the matches in place, highest score first.
matches.sort(key=lambda match: match.score, reverse=True)

# NOTE(review): inserting the match set itself is disabled below, yet the
# next cell filters on match_set.id -- confirm whether matcher.match() already
# stores the match set.
# result = connection.insert(match_set)
# print('Inserted {} match set entities out of {}'.format(len(result.inserted_ids), 1))
result = connection.insert(matches)
print('Inserted {} match entities out of {}'.format(len(result.inserted_ids), len(matches)))

In [None]:
# The aggregation pipeline is assembled from named stages, in execution order.

# 1. Keep only matches from this run's match set, the 20 best by score.
select_top_matches = [
    {'$match': {'match_set': match_set.id}},
    {'$sort': {'score': -1}},
    {'$limit': 20},
]

# 2. Replace each match's unit ids with the unit documents, each carrying the
#    tokens whose 'phrase' field points at that unit; units sorted by 'index'.
expand_units = {'$lookup': {
    'from': 'units',
    'let': {'m_units': '$units'},
    'pipeline': [
        {'$match': {'$expr': {'$in': ['$_id', '$$m_units']}}},
        {'$lookup': {
            'from': 'tokens',
            'localField': '_id',
            'foreignField': 'phrase',
            'as': 'tokens'
        }},
        {'$sort': {'index': 1}}
    ],
    'as': 'units'
}}

# 3. Replace each match's token ids with the token documents.
expand_tokens = {'$lookup': {
    'from': 'tokens',
    'localField': 'tokens',
    'foreignField': '_id',
    'as': 'tokens'
}}

# 4. Keep units and score; reduce the tokens to their feature-set references.
keep_feature_set_refs = {'$project': {
    'units': True,
    'score': True,
    'tokens': '$tokens.feature_set'
}}

# 5. Resolve those references into feature-set documents.
expand_feature_sets = {'$lookup': {
    'from': 'feature_sets',
    'localField': 'tokens',
    'foreignField': '_id',
    'as': 'tokens'
}}

pipeline = select_top_matches + [
    expand_units,
    expand_tokens,
    keep_feature_set_refs,
    expand_feature_sets,
]
matches = connection.aggregate('matches', pipeline)

print('\n')
print('{}{}'.format('Score'.ljust(15), 'Match Tokens'.ljust(15)))
print('{}{}'.format('-----'.ljust(15), '------------'.ljust(15)))
for m in matches:
    # Score column, then the distinct matched forms (set order is arbitrary).
    score_cell = ('%.3f' % m.score).ljust(15)
    distinct_forms = set(t['form'] for t in m.tokens)
    print('{}{}'.format(score_cell, ', '.join(distinct_forms)))

    # One line per side of the match: unit i is labelled with match text i.
    for side in (0, 1):
        print('{} {} {}: {}'.format(
            match_texts[side].author,
            match_texts[side].title,
            m.units[side]['tags'],
            ''.join(t['display'] for t in m.units[side]['tokens'])))
    print('\n')