In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules (e.g. cgpos.utils.util) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [26]:
import os
import logging
from collections import defaultdict, Counter
import random
import copy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from hydra import initialize, compose
import hydra
from greek_accentuation.characters import strip_accents, strip_breathing

from cgpos.utils.util import import_pkl, export_pkl, get_abs_dir

In [3]:
# Reset hydra
# Clears the global Hydra instance before initialize() is called below;
# presumably needed so this cell can be re-run in the same kernel -- TODO confirm.
hydra.core.global_hydra.GlobalHydra.instance().clear()
# Load hydra params
# "../conf/" is a relative config directory (presumably resolved relative to
# this notebook -- verify); the composed 'main' config is used by later cells
# for the data and results paths.
initialize("../conf/", version_base=None)
config = compose(config_name='main')
# Init logger
# INFO level, so the progress/accuracy messages logged by the model functions are shown.
logging.basicConfig(level=logging.INFO) 

In [4]:
# Load data
# The pickle path is taken from the hydra config (perseus.featurized).
# NOTE(review): unpickling executes code from the file -- only load trusted data.
data_raw = import_pkl(config.perseus.featurized)
# Clean
# Keep only the entries that contain 'pos': ngram_bos and ngram_bos_model
# read word['pos'] for every word and would raise KeyError without it.
data = [word for word in data_raw if 'pos' in word]

INFO:cgpos.utils.util:Importing /home/tejomay/cgpos/data/processed/perseus_featurized.pkl


In [5]:
def ngram(sequence, n):
    """
    Slide a window of width n over a token sequence and yield each window as a tuple.

    Returns a lazy generator. At least one window is always produced: a
    sequence shorter than n yields a single tuple holding all of its tokens
    (an empty tuple for an empty sequence).
    """
    window_count = len(sequence) - n + 1
    # Never fewer than one pass, so short sequences still contribute a gram.
    if window_count < 1:
        window_count = 1
    return (tuple(sequence[start:start + n]) for start in range(window_count))

def ngram_bos(data, n=1):
    """
    Build the n-gram bag of syllables for the given words.

    Every word must provide 'pos' and 'syllables'. The result is a
    defaultdict that maps each syllable n-gram (a tuple, as produced by
    ngram) to a Counter of how many times it occurred under each POS tag.
    """
    pos_counts_by_gram = defaultdict(Counter)
    for entry in data:
        tag = entry['pos']
        grams = ngram(entry['syllables'], n)
        for gram in grams:
            # One more occurrence of this gram under the word's POS tag.
            pos_counts_by_gram[gram].update((tag,))
    return pos_counts_by_gram

In [11]:
# Bag of syllables
def ngram_bos_model(data, n=2, train=0.8, verbose=True, seed=20):
    """
    Train and evaluate an n-gram bag-of-syllables POS model with backoff.

    The words are shuffled and split into a train and a test portion. For
    every order 1..n a table mapping syllable n-grams to POS-tag Counters is
    built from the training words (see ngram_bos). Each test word is scored by
    summing the POS Counters of its n-grams; an n-gram that was not seen in
    training backs off to its (n-1)-grams, and so on down to single syllables.
    The prediction is the tag with the highest summed count (first-inserted
    tag wins ties), or None when nothing in the word was seen in training.

    Parameters
    ----------
    data : sequence of dict-like words, each providing 'pos' and 'syllables'.
    n : highest n-gram order to use; must be >= 1.
    train : fraction of `data` placed in the training portion.
    verbose : when true, log the split sizes and the accuracy.
    seed : seed for the shuffle; the same seed gives the same split.

    Returns
    -------
    dict with keys 'train_data', 'test_data', 'train_grams', 'accuracy' and
    'seed'. The test words are shallow copies of the input words with
    'pos_pred' and 'gram_dist' added; the words in `data` are left unmodified.
    'accuracy' is NaN when the test portion is empty.

    Raises
    ------
    ValueError if n < 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    logger = logging.getLogger(__name__)
    logger.info(f"N-gram bag of syllables model: n={n}")

    # Train test split.
    # A private Random instance yields the same shuffle as seeding the global
    # generator would, without resetting the global generator's state for
    # other callers that draw from it.
    rng = random.Random(seed)
    shuffled = rng.sample(data, len(data))
    train_ind = int(len(data) * train)
    train_data = shuffled[:train_ind]
    # Predictions are attached to copies so the caller's words are not mutated
    # and repeated runs cannot leave stale 'pos_pred' / 'gram_dist' behind.
    test_data = [copy.copy(word) for word in shuffled[train_ind:]]

    if verbose:
        logger.info(f"Train-test split {train}: (train={len(train_data)}, test={len(test_data)}) [seed={seed}]")

    # Train: one gram -> POS-count table per order, keyed by the order.
    train_grams = {order: ngram_bos(train_data, n=order) for order in range(1, n + 1)}

    # Test
    for word in test_data:
        gram_dist = Counter()
        for gram in ngram(word['syllables'], n):
            _add_backoff_counts(gram_dist, gram, n, train_grams)
        # An empty distribution means no gram of any order was seen in training.
        pos_pred = max(gram_dist, key=gram_dist.get) if gram_dist else None
        word['pos_pred'] = pos_pred
        word['gram_dist'] = gram_dist

    if test_data:
        n_correct = sum(word['pos'] == word['pos_pred'] for word in test_data)
        accuracy = n_correct / len(test_data)
    else:
        # Nothing to evaluate (e.g. train=1.0 or empty data).
        accuracy = float('nan')

    if verbose:
        logger.info(f"Accuracy: {accuracy * 100:.2f}%")

    results = {
        'train_data': train_data,
        'test_data': test_data,
        'train_grams': train_grams,
        'accuracy': accuracy,
        'seed': seed,
    }

    return results


def _add_backoff_counts(gram_dist, gram, order, train_grams):
    """
    Add the training POS counts for `gram` to `gram_dist` in place, backing off when unseen.

    If `gram` is present in the order-`order` table its Counter is added.
    Otherwise each of its (order-1)-grams is tried recursively, stopping at
    order 1. A shorter gram shared by two overlapping unseen grams is counted
    once per parent, matching the model's original scoring.
    """
    seen = train_grams[order]
    if gram in seen:
        gram_dist.update(seen[gram])
    elif order > 1:
        for subgram in ngram(gram, order - 1):
            _add_backoff_counts(gram_dist, subgram, order - 1, train_grams)


def cv(data, model=ngram_bos_model, folds=10, **kwargs):
    """
    Run `model` on `folds` independently seeded rounds and log the accuracy spread.

    Each round draws a seed in [1, 1024] and calls
    `model(data, seed=seed, **kwargs)`; the model is expected to return a dict
    with an 'accuracy' entry. Note that, despite the "k-fold" wording of the
    log messages, no disjoint folds are built here: how the data is split is
    left entirely to `model` and the seed it receives.

    Parameters
    ----------
    data : passed through to `model` unchanged.
    model : callable accepting `data`, a `seed` keyword and `**kwargs`.
    folds : number of rounds to run.
    **kwargs : extra keyword arguments forwarded to `model`
        (must not include `seed`).

    Returns
    -------
    list of the per-round result dicts, in run order.
    """
    logger = logging.getLogger(__name__)
    logger.info(f"Running {folds}-fold CV on n-gram bag-of-syllables model:")
    # Draw the per-round seeds from a private generator. It is seeded once from
    # the global one (so seeding `random` beforehand still makes a run
    # reproducible), but a model that reseeds the global generator can no
    # longer make each seed a function of the previous one.
    seed_rng = random.Random(random.getrandbits(64))
    results = []
    accuracies = []
    for i in range(1, folds + 1):
        logger.info(f"Training fold {i}:")
        seed = seed_rng.randint(1, 2**10)
        result = model(data, seed=seed, **kwargs)
        results.append(result)
        accuracies.append(result['accuracy'])
    # Skip the summary when no round ran; mean/std of nothing is undefined.
    if accuracies:
        # Mean and std are both reported in percent so they are comparable.
        logger.info(f"{folds}-fold CV results: accuracy {np.mean(accuracies) * 100:.2f}%, std {np.std(accuracies) * 100:.2f}%")
    return results

In [12]:
# Single run of the model with n-grams up to order 3, using the function's
# default split fraction (0.8) and seed (20).
results = ngram_bos_model(data, n=3)

INFO:__main__:N-gram bag of syllables model: n=3
INFO:__main__:Train-test split 0.8: (train=439916, test=109980) [seed=20]
INFO:__main__:Accuracy: 89.38%


In [20]:
# Tabulate the test words returned by the model; these carry the 'pos_pred'
# and 'gram_dist' entries the model attached (assumes each word is dict-like,
# giving one row per word -- TODO confirm).
df = pd.DataFrame(results['test_data'])

In [27]:
# Export the per-word predictions as CSV under the results location from the
# hydra config. Despite its name, export_dir holds the CSV file path
# (presumably made absolute by get_abs_dir -- verify against the helper).
export_dir = get_abs_dir(config.data.results + '/additive_ngram_bos_output.csv')
# NOTE(review): 'gram_dist' holds Counter objects, so it will presumably be
# written as their text representation; the DataFrame index is written too
# (to_csv default).
df[['uid', 'form', 'norm', 'syllables', 'pos', 'pos_pred', 'gram_dist']].to_csv(export_dir)