In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import logging
from collections import defaultdict, Counter
import random
import copy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from hydra import initialize, compose
import hydra
from greek_accentuation.characters import strip_accents, strip_breathing

from cgpos.utils.util import import_pkl, export_pkl

In [3]:
# Reset hydra
hydra.core.global_hydra.GlobalHydra.instance().clear()
# Load hydra params
initialize("../conf/", version_base=None)
config = compose(config_name='main')
# Init logger
logging.basicConfig(level=logging.INFO) 

In [4]:
# Load data
data_raw = import_pkl(config.perseus.featurized)
# Clean
data = [word for word in data_raw if 'pos' in word]

INFO:cgpos.utils.util:Importing /home/tejomay/cgpos/data/processed/perseus_featurized.pkl


In [5]:
def ngram(sequence, n):
    """
    Generate ngram from a sequence of tokens.
    """
    len_sequence = len(sequence)
    n_passes = max(1, len_sequence - n + 1)
    return (tuple(sequence[i:(i + n)]) for i in range(n_passes))

def ngram_bos(data, n=1):
    """
    Generate n-gram bag of syllables for given data.
    """
    bos = defaultdict(Counter)
    for word in data:
        pos = word['pos']
        syllables = word['syllables']
        for gram in ngram(syllables, n):
            bos[gram][pos] +=1 
    return bos

In [187]:
# Bag of syllables
def ngram_bos_model(data, n=2, train=0.8, verbose=True, seed=20):
    """
    Implement n-gram bag of syllable model.
    """
    logger = logging.getLogger(__name__)
    logger.info(f"N-gram bag of syllables model: n={n}")
    
    # Train test split
    random.seed(seed)
    shuffled = random.sample(data, len(data))
    train_ind = int(len(data) * train)
    train_data = shuffled[:train_ind] 
    test_data = shuffled[train_ind:]

    if verbose:
        logger.info(f"Train-test split {train}: (train={len(train_data)}, test={len(test_data)}) [seed={seed}]")

    # Train
    train_grams = {}
    for i in range(1, n + 1):
        train_gram = ngram_bos(train_data, n=i)
        train_grams[i] = train_gram

    # Test
    unseen_grams = []
    for word in test_data:
        gram_preds = []
        word_grams = ngram(word['syllables'], n)
        for gram in word_grams:
            if (gram in train_grams[n]):
                gram_dist = train_grams[n][gram]
                gram_pred = max(gram_dist, key=gram_dist.get)    
                gram_preds.append(gram_pred)
            else:
                subgrams = ngram(gram, n - 1)
                for subgram in subgrams:
                    subgram_preds = []
                    if (subgram in train_grams[n - 1]):
                        subgram_dist = train_grams[n - 1][subgram]
                        subgram_pred = max(subgram_dist, key=subgram_dist.get)
                        subgram_preds.append(subgram_pred)
                    else:
                        subsubgrams = ngram(subgram, n - 2)
                        for subsubgram in subsubgrams:
                            subsubgram_preds = []
                            if (subsubgram in train_grams[n - 2]):
                                subsubgram_dist = train_grams[n - 2][subsubgram]
                                subsubgram_pred = max(subsubgram_dist, key=subsubgram_dist.get)
                                subsubgram_preds.append(subsubgram_pred)
                        if subsubgram_preds:
                            subsubgram_pos = max(set(subsubgram_preds), key=subsubgram_preds.count)
                            subgram_preds.append(subsubgram_pos)
                if subgram_preds:
                    subgram_pos = max(set(subgram_preds), key=subgram_preds.count)
                    gram_preds.append(subgram_pos)
                else:
                    unseen_grams.append(gram)
                # unseen_grams.append(gram)
        if gram_preds:
            pos_pred = max(set(gram_preds), key=gram_preds.count)
        else:
            pos_pred = None
        word['pos_pred'] = pos_pred
    
    accuracy = sum([(word['pos'] == word['pos_pred']) for word in test_data]) / len(test_data)
    
    if verbose:
        logger.info(f"Accuracy: {accuracy * 100:.2f}%, unseen grams: {len(unseen_grams)}")

    results = {
        'train_data': train_data,
        'test_data': test_data,
        'train_grams': train_grams,
        'unseen_grams': unseen_grams,
        'accuracy': accuracy,
        'seed': seed,
    }
    
    return results


def cv(data, model=ngram_bos_model, folds=10, **kwargs):
    logger = logging.getLogger(__name__)
    logger.info(f"Running {folds}-fold CV on n-gram bag-of-syllables model:")
    results = []
    accuracies = []
    for i in range(1, folds + 1):
        logger.info(f"Training fold {i}:")
        seed = random.randint(1, 2**10)
        result = model(data, seed=seed, **kwargs)
        accuracy = result['accuracy']
        results.append(result)
        accuracies.append(accuracy)
    logger.info(f"{folds}-fold CV results: accuracy {np.mean(accuracies) * 100:.2f}%, std {np.std(accuracies):.2f}")
    accuracies.append(accuracy)
    return results

In [197]:
results = ngram_bos_model(data, n=3)

INFO:__main__:N-gram bag of syllables model: n=3
INFO:__main__:Train-test split 0.8: (train=439916, test=109980) [seed=20]
INFO:__main__:Accuracy: 90.24%, unseen grams: 298


In [58]:
# Try removing accents
data_no_accent = copy.deepcopy(data)
for word in data_no_accent:
    if 'syllables' in word:
        word['syllables'] = [strip_breathing(strip_accents(syllable)) for syllable in word['syllables']] 

In [146]:
[word['syllables'] for word in data][:10]

[['Θου', 'κυ', 'δί', 'δης'],
 ['Ἀ', 'θη', 'ναῖ', 'ος'],
 ['ξυ', 'νέ', 'γρα', 'ψε'],
 ['τὸν'],
 ['πό', 'λε', 'μον'],
 ['τῶν'],
 ['Πε', 'λο', 'πον', 'νη', 'σί', 'ων'],
 ['καὶ'],
 ['Ἀ', 'θη', 'ναί', 'ων'],
 [',']]

In [147]:
[word['syllables'] for word in data_no_accent][:10]

[['Θου', 'κυ', 'δι', 'δης'],
 ['Α', 'θη', 'ναι', 'ος'],
 ['ξυ', 'νε', 'γρα', 'ψε'],
 ['τον'],
 ['πο', 'λε', 'μον'],
 ['των'],
 ['Πε', 'λο', 'πον', 'νη', 'σι', 'ων'],
 ['και'],
 ['Α', 'θη', 'ναι', 'ων'],
 [',']]

In [196]:
_ = cv(data_no_accent, folds=1, n=3)

INFO:__main__:Running 1-fold CV on n-gram bag-of-syllables model:
INFO:__main__:Training fold 1:
INFO:__main__:N-gram bag of syllables model: n=3
INFO:__main__:Train-test split 0.8: (train=439916, test=109980) [seed=565]
INFO:__main__:Accuracy: 88.93%, unseen grams: 129
INFO:__main__:1-fold CV results: accuracy 88.93%, std 0.00
