In [1]:
%load_ext autoreload
%autoreload 2

In [19]:
import os
import logging
from collections import defaultdict, Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from hydra import initialize, compose
import hydra

from cgpos.utils.util import import_pkl

In [149]:
from greek_accentuation.characters import strip_accents

In [3]:
# Reset hydra
hydra.core.global_hydra.GlobalHydra.instance().clear()
# Load hydra params
initialize("../conf/", version_base=None)
config = compose(config_name='main')
# Init logger
logging.basicConfig(level=logging.INFO) 

In [4]:
# Load data
data = import_pkl(config.perseus.featurized)

In [150]:
# Remove accents
data_no_accent = data.copy()
for word in data_no_accent:
    if 'syllables' in word:
        word['syllables'] = [strip_accents(syllable) for syllable in word['syllables']] 

In [151]:
# Bag of syllables
def bos(data, train=0.8): 
    logger = logging.getLogger(__name__)
    # Train test split
    logger.info(f"Train-test split: {train * 100}%")
    shuffled = random.sample(data, len(data))
    train_ind = int(len(data) * train)
    train_data = shuffled[:train_ind] 
    test_data = shuffled[train_ind:]
    logger.info(f"Train obs: {len(train_data)}")
    logger.info(f"Test obs:  {len(test_data)}")

    # Build bag of syllables feature
    bos = defaultdict(Counter)
    for word in train_data:
        if 'pos' in word:
            pos = word['pos']
            for syllable in word['syllables']:
                bos[syllable][pos] +=1
    
    results = []
    for obs in test_data:
        try:
            syl_data = []
            pred_syl = []
            for syl in obs['syllables']:
                syl_dict = bos[syl]
                syl_data.append((syl, syl_dict))
                pred_syl.append(max(syl_dict, key=syl_dict.get))
            true = obs['pos']    
            pred = max(set(pred_syl), key=pred_syl.count)
            results.append({
                'true': true,
                'pred': pred,
                'pred_syl': pred_syl,
                'syl_data': syl_data,
            })
        except (KeyError, ValueError):
            pass

    acc = np.mean([res['true'] == res['pred'] for res in results]) * 100
    logger.info(f"Accuracy {acc:.{2}f}%")

    return [train_data, test_data, bos, test, results]

In [157]:
data_results = bos(data)

INFO:__main__:Train-test split: 80.0%
INFO:__main__:Train obs: 446334
INFO:__main__:Test obs:  111584
INFO:__main__:Accuracy 62.58%


In [158]:
data_no_accent_results = bos(data_no_accent, 0.1)

INFO:__main__:Train-test split: 10.0%
INFO:__main__:Train obs: 55791
INFO:__main__:Test obs:  502127
INFO:__main__:Accuracy 62.93%
