# Tesserae v5 Demo

This demo covers the basics of Tesserae v5 as developed through October 11, 2018.

In [None]:
import json

from tesserae.db import TessMongoConnection
from tesserae.db.entities import Frequency, Match, Text, Token, Unit
from tesserae.utils import TessFile
from tesserae.tokenizers import GreekTokenizer, LatinTokenizer
from tesserae.unitizer import Unitizer
from tesserae.matchers import DefaultMatcher

# Connect to the Tesserae database used by this demo.
connection = TessMongoConnection('127.0.0.1', 27017, None, None, 'tesstest')

# Empty every collection the demo writes to, so that each run starts from a
# clean slate instead of piling up on top of the previous run's data.
for collection_name in ('frequencies', 'matches', 'texts', 'tokens', 'units'):
    connection.connection[collection_name].delete_many({})

## Loading and Storing New Texts

The Tesserae database catalogs metadata, including the title, author, and year published, as well as integrity information like filepath, MD5 hash, and CTS URN.

We start by loading in some metadata from `text_metadata.json`.

In [None]:
# Load the raw text metadata. JSON is UTF-8 by specification, so the encoding
# is passed explicitly rather than left to the platform default (which would
# garble non-ASCII titles or author names on some systems).
with open('text_metadata.json', 'r', encoding='utf-8') as f:
    text_meta = json.load(f)

# Print the metadata as a table. The first three columns are left-aligned in
# 15-character fields; the final column is printed as-is.
row = '{:<15}{:<15}{:<15}{}'
print(row.format('Title', 'Author', 'Language', 'Year'))
print(row.format('-----', '------', '--------', '----'))
for t in text_meta:
    print(row.format(t['title'], t['author'], t['language'], t['year']))

Then insert the new texts with `TessMongoConnection.insert` after converting the raw JSON to Tesserae `Text` entities.

In [None]:
# Turn each raw metadata record into a Text entity, then store the whole
# batch with a single insert call.
texts = [Text.json_decode(record) for record in text_meta]
result = connection.insert(texts)

# Report how many entries were stored and show the ids they were given.
inserted_ids = result.inserted_ids
print('Inserted {} texts.'.format(len(inserted_ids)))
print(inserted_ids)

We can retrieve the inserted texts with `TessMongoConnection.find`. These texts will be converted to objects representing the database entries. The returned text list can be filtered by any valid field in the text database.

In [None]:
# Look the freshly inserted texts up again, filtering on their database ids.
texts = connection.find('texts', _id=result.inserted_ids)

# Show the retrieved entities as a table with 15-character columns.
header_cells = ('Title'.ljust(15), 'Author'.ljust(15), 'Language'.ljust(15), 'Year')
print('{}{}{}{}'.format(*header_cells))
for text in texts:
    row_cells = (text.title.ljust(15), text.author.ljust(15), text.language.ljust(15), text.year)
    print('{}{}{}{}'.format(*row_cells))

## Loading .tess Files

Text metadata includes the path to the .tess file on the local filesystem. Using a Text retrieved from the database, the file can be loaded for further processing.

In [None]:
# Open the .tess file referenced by the first retrieved text, keeping the
# Text entity attached to the file object as its metadata.
first_text = texts[0]
tessfile = TessFile(first_text.path, metadata=first_text)

print(tessfile.path)   # where the file lives
print(len(tessfile))   # the file's length as reported by len()
print(tessfile[270])   # the entry at index 270

We can iterate through the file line-by-line.

In [None]:
# readlines() hands back an iterator; pull the first ten items off it.
lines = tessfile.readlines()
for line_count in range(10):
    line = next(lines)
    print(line)

We can also iterate token-by-token.

In [None]:
# read_tokens() hands back an iterator; pull the first ten items off it.
tokens = tessfile.read_tokens()
for token_count in range(10):
    token = next(tokens)
    print(token)

## Tokenizing a Text

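Texts can be tokenized with the language-specific tokenizers in `tesserae.tokenizers` (`GreekTokenizer` and `LatinTokenizer`). The `tokenize` method takes a line of text along with the text's metadata; the processed tokens are then available as `tokenizer.tokens`.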

In [None]:
# Choose the tokenizer that matches the language recorded in the metadata.
if tessfile.metadata.language == 'greek':
    tokenizer = GreekTokenizer()
else:
    tokenizer = LatinTokenizer()

# Tokenize only the first ten lines to keep the demo small.
for line_number, line in enumerate(tessfile.readlines(include_tag=False)):
    if line_number >= 10:
        break
    tokens, frequencies = tokenizer.tokenize(line, text=tessfile.metadata)

# From here on, work with the token collection held by the tokenizer itself.
tokens = tokenizer.tokens

# Show the first ten tokens: raw display text, normalized form, and lemmata.
print('{}{}{}'.format('Raw'.ljust(15), 'Normalized'.ljust(15), 'Lemmata'))
print('{}{}{}'.format('---'.ljust(15), '----------'.ljust(15), '-------'))
for position in range(10):
    token = tokenizer.tokens[position]
    print('{}{}{}'.format(token.display.ljust(15),
                          str(token.form).ljust(15),
                          token.lemmata))



Processed tokens can then be stored in and retrieved from the database, similar to text metadata.

In [None]:
# Store the processed tokens and report how many of them were inserted.
result = connection.insert(tokens)
token_counts = (len(result.inserted_ids), len(tokens))
print('Inserted {} tokens out of {}.'.format(*token_counts))

# Store the frequency entities the same way.
result = connection.insert(frequencies)
frequency_counts = (len(result.inserted_ids), len(frequencies))
print('Inserted {} frequency entities out of {}'.format(*frequency_counts))

## Unitizing a Text

Texts can be unitized into lines and phrases; intertext matches are then found between these units of text.


In [None]:
# Split the tokenized text into line units and phrase units.
unitizer = Unitizer()
lines, phrases = unitizer.unitize(tokens, tessfile.metadata)

# Each unit's `tokens` holds indices into `tokens`; joining the display text
# of those tokens rebuilds the unit's text.
print('Lines\n-----')
for line in lines:
    line_text = ''.join(tokens[index].display for index in line.tokens)
    print(line_text)

print('\n\nPhrases\n-------')
for phrase in phrases:
    phrase_text = ''.join(tokens[index].display for index in phrase.tokens)
    print(phrase_text)

In [None]:
# Store the line units and phrase units together in a single insert and
# report how many of them were inserted.
units = lines + phrases
result = connection.insert(units)
print('Inserted {} units out of {}.'.format(len(result.inserted_ids), len(units)))

In [None]:
# Run the same tokenize -> store -> unitize -> store pipeline shown above on
# every remaining text (the first text has already been processed).
for text in texts[1:]:
    tessfile = TessFile(text.path, metadata=text)

    # Choose the tokenizer that matches the text's language.
    if tessfile.metadata.language == 'greek':
        tokenizer = GreekTokenizer()
    else:
        tokenizer = LatinTokenizer()

    # Tokenize only the first ten lines of each text.
    for line_number, line in enumerate(tessfile.readlines(include_tag=False)):
        if line_number >= 10:
            break
        tokens, frequencies = tokenizer.tokenize(line, text=tessfile.metadata)

    # Store the tokens held by the tokenizer, then the frequency entities.
    tokens = tokenizer.tokens
    result = connection.insert(tokens)
    result = connection.insert(frequencies)

    # Unitize the tokens and store line and phrase units together.
    unitizer = Unitizer()
    lines, phrases = unitizer.unitize(tokens, tessfile.metadata)
    result = connection.insert(lines + phrases)

## Matching

Once the Texts, Tokens, and Units are in the database, we can then find intertext matches.

In [None]:
# Search for intertext matches among the Latin texts, comparing 'phrase'
# units on their 'lemmata', and list the results from highest score down.
matcher = DefaultMatcher(connection)
match_texts = [t for t in texts if t.language == 'latin']
matches = matcher.match(match_texts, 'phrase', 'lemmata', distance_metric='frequency', max_distance=999)
matches.sort(key=lambda x: x.score, reverse=True)

# One token collection per matched text, in the same order as match_texts;
# unit token indices are resolved against these below.
tokens = [connection.find('tokens', text=t.path) for t in match_texts]

print('\n')
# Four placeholders for four columns: with only three, str.format silently
# ignored the extra argument and the 'Target Text' heading was never printed.
print('{}{}{}{}'.format('Score'.ljust(15), 'Match Tokens'.ljust(15), 'Source Text'.ljust(15), 'Target Text'))
print('{}{}{}{}'.format('-----'.ljust(15), '------------'.ljust(15), '-----------'.ljust(15), '-----------'))
for m in matches:
    # De-duplicate the matched forms and sort them so that the printed order
    # is the same on every run (plain set iteration order is not).
    match_forms = sorted({t.form for t in m.match_tokens[0]})
    print('{}{}'.format(('%.3f' % m.score).ljust(15), ', '.join(match_forms)))
    print('Source Text: {}'.format(''.join(tokens[0][t].display for t in m.units[0].tokens)))
    print('Target Text: {}'.format(''.join(tokens[1][t].display for t in m.units[1].tokens)))
    print('\n')