In [1]:
# Enable the autoreload extension in mode 2 so that edits to imported project
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import logging
from collections import defaultdict, Counter
import random
import copy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from hydra import initialize, compose
import hydra
from greek_accentuation.characters import strip_accents, strip_breathing
from nltk import edit_distance

from cgpos.utils.util import import_pkl, export_pkl

In [3]:
# Reset hydra: clear the GlobalHydra singleton before initialize() is called below
# (presumably so this cell can be re-run in the same kernel -- TODO confirm)
hydra.core.global_hydra.GlobalHydra.instance().clear()
# Load hydra params: config directory is ../conf/, composed config is 'main'
initialize("../conf/", version_base=None)
config = compose(config_name='main')
# Init logger at INFO level so the logger.info progress messages emitted by the
# cells below are shown
logging.basicConfig(level=logging.INFO) 

In [4]:
# Load data from the pickle path stored in the hydra config (perseus.featurized).
# The cells below iterate over `data` and read the keys 'pos', 'syllables',
# 'uid', 'form' and 'norm' from its entries.
data = import_pkl(config.perseus.featurized)

INFO:cgpos.utils.util:Importing /home/tejomay/cgpos/data/processed/perseus_featurized.pkl


In [5]:
# Bag of syllables
def bos(data, train=0.8, verbose=True, seed=20):
    """Train and evaluate a bag-of-syllables part-of-speech classifier.

    Entries of ``data`` without a ``'pos'`` key are dropped. The rest are
    shuffled with ``seed`` and split into a training part (the first
    ``int(len * train)`` entries) and a test part (the remainder).

    Training counts, for every syllable, how often it occurs under each
    part-of-speech. A test word is tagged by letting each of its known
    syllables vote for that syllable's most frequent training tag; the tag
    with the most votes wins, ties going to the tag that was voted first.
    A word whose syllables were all unseen gets ``pred_pos = None``.

    Args:
        data: iterable of dicts. Entries that carry ``'pos'`` must also carry
            ``'uid'``, ``'form'``, ``'norm'`` and ``'syllables'``.
        train: fraction of the filtered data used for training.
        verbose: when true, log the split sizes and the evaluation summary.
        seed: seed for the shuffle. A private generator is used, so the
            module-level ``random`` state is left untouched.

    Returns:
        dict with keys ``'train_data'``, ``'test_data'``, ``'word_data'``
        (one record per test word), ``'bos'`` (syllable -> Counter of tags),
        ``'unseen_syllables'``, ``'accuracy'`` and ``'seed'``.
        ``'accuracy'`` is NaN when the test split is empty.
    """
    logger = logging.getLogger(__name__)

    # Drop data missing part-of-speech
    data = [word for word in data if 'pos' in word]

    # Train test split. Random(seed).sample yields the same permutation as
    # random.seed(seed); random.sample(...), without reseeding the global
    # generator that callers (e.g. a CV loop drawing fold seeds) may rely on.
    rng = random.Random(seed)
    shuffled = rng.sample(data, len(data))
    train_ind = int(len(data) * train)
    train_data = shuffled[:train_ind]
    test_data = shuffled[train_ind:]

    if verbose:
        logger.info(f"Train-test split {train}: (train={len(train_data)}, test={len(test_data)}) [seed={seed}]")

    # Train: every entry already has 'pos' thanks to the filter above
    syllable_counts = defaultdict(Counter)
    for word in train_data:
        pos = word['pos']
        for syllable in word['syllables']:
            syllable_counts[syllable][pos] += 1

    # Test
    unseen_syllables = []
    word_data = []
    for word in test_data:
        uid = word['uid']
        form = word['form']
        norm = word['norm']
        syllables = word['syllables']
        true_pos = word['pos']
        syllable_data = []
        syllable_preds = []
        has_unseen = False
        for syllable in syllables:
            # Membership test (not indexing) so the defaultdict is not grown
            # with empty counters for unseen syllables.
            if syllable in syllable_counts:
                counts = syllable_counts[syllable]
                syllable_data.append((syllable, counts))
                syllable_preds.append(max(counts, key=counts.get))
            else:
                has_unseen = True
                unseen_syllables.append({
                    'uid': uid,
                    'form': form,
                    'norm': norm,
                    # Tag of the test word itself, not the leftover loop
                    # variable from the training pass.
                    'pos': true_pos,
                    'syllable': syllable,
                    'syllables': syllables,
                })
        pred_pos = None
        if syllable_preds:
            # Counter keeps first-seen order and max() returns the first
            # maximum, so ties are broken deterministically instead of by
            # set iteration order.
            votes = Counter(syllable_preds)
            pred_pos = max(votes, key=votes.get)
        word_data.append({
            'has_unseen_syllable': has_unseen,
            'uid': uid,
            'form': form,
            'norm': norm,
            # NOTE: misspelled key kept as-is; exported results use it.
            'sylables': syllables,
            'true_pos': true_pos,
            'pred_pos': pred_pos,
            'syllable_preds': syllable_preds,
            'syllable_data': syllable_data,
        })

    n_correct = sum(word['true_pos'] == word['pred_pos'] for word in word_data)
    # Accuracy is undefined for an empty test split; report NaN, don't raise.
    accuracy = n_correct / len(word_data) if word_data else float('nan')

    if verbose:
        len_unseen_syllables = len(unseen_syllables)
        total_test_syllables = sum(len(word['syllables']) for word in test_data)
        percent_unseen_syllables = (
            len_unseen_syllables / total_test_syllables if total_test_syllables else 0.0
        )
        logger.info(f"Accuracy: {accuracy * 100:.2f}%, unseen syllables: {len_unseen_syllables} ({percent_unseen_syllables * 100:.2f}% missing)")

    results = {
        'train_data': train_data,
        'test_data': test_data,
        'word_data': word_data,
        'bos': syllable_counts,
        'unseen_syllables': unseen_syllables,
        'accuracy': accuracy,
        'seed': seed,
    }

    return results

def cv(data, model=bos, folds=10, *, seed=None, **kwargs):
    """Evaluate ``model`` repeatedly and log the mean and spread of its accuracy.

    Each "fold" is an independent call ``model(data, seed=<fold seed>, **kwargs)``
    on the full ``data``; any train/test splitting is done by ``model`` itself
    from the seed it is given.

    Args:
        data: passed unchanged to ``model`` on every fold.
        model: callable accepting ``(data, seed=..., **kwargs)`` and returning
            a dict with an ``'accuracy'`` entry.
        folds: number of evaluations to run.
        seed: optional master seed. When given, the per-fold seeds are drawn
            from a private generator so the whole run can be reproduced; when
            ``None`` they are drawn from the module-level ``random`` generator.
        **kwargs: forwarded to ``model``.

    Returns:
        list with one result dict per fold, in execution order.
    """
    logger = logging.getLogger(__name__)
    logger.info(f"Running {folds}-fold CV on Bag-of-Syllables model:")
    # The random module and a Random instance both expose randint, so either
    # can serve as the source of fold seeds.
    rng = random if seed is None else random.Random(seed)
    results = []
    accuracies = []
    for i in range(1, folds + 1):
        logger.info(f"Training fold {i}:")
        fold_seed = rng.randint(1, 2**10)
        result = model(data, seed=fold_seed, **kwargs)
        results.append(result)
        accuracies.append(result['accuracy'])
    # Skip the summary when nothing was run (folds < 1), since mean/std of an
    # empty list are undefined. Mean and std are reported in the same unit (%).
    if accuracies:
        logger.info(f"Final {folds}-fold CV accuracy: accuracy {np.mean(accuracies) * 100:.2f}%, std {np.std(accuracies) * 100:.2f}%")
    return results

In [6]:
# Run the repeated evaluation with the default model (bos) on the full data;
# `results` holds one result dict per fold and is inspected and exported below.
results = cv(data)

INFO:__main__:Running 10-fold CV on Bag-of-Syllables model:
INFO:__main__:Training fold 1:
INFO:__main__:Train-test split 0.8: (train=439916, test=109980) [seed=345]
INFO:__main__:Accuracy: 69.01%, unseen syllables: 466 (0.21% missing)
INFO:__main__:Training fold 2:
INFO:__main__:Train-test split 0.8: (train=439916, test=109980) [seed=94]
INFO:__main__:Accuracy: 68.96%, unseen syllables: 488 (0.22% missing)
INFO:__main__:Training fold 3:
INFO:__main__:Train-test split 0.8: (train=439916, test=109980) [seed=517]
INFO:__main__:Accuracy: 69.00%, unseen syllables: 478 (0.21% missing)
INFO:__main__:Training fold 4:
INFO:__main__:Train-test split 0.8: (train=439916, test=109980) [seed=18]
INFO:__main__:Accuracy: 68.90%, unseen syllables: 498 (0.22% missing)
INFO:__main__:Training fold 5:
INFO:__main__:Train-test split 0.8: (train=439916, test=109980) [seed=86]
INFO:__main__:Accuracy: 68.68%, unseen syllables: 452 (0.20% missing)
INFO:__main__:Training fold 6:
INFO:__main__:Train-test split 0

In [7]:
# Peek at the first ten per-word prediction records of the first fold
results[0]['word_data'][:10]

[{'has_unseen_syllable': False,
  'uid': '76364',
  'form': 'τέ',
  'norm': 'τέ',
  'sylables': ['τέ'],
  'true_pos': 'particle',
  'pred_pos': 'verb',
  'syllable_preds': ['verb'],
  'syllable_data': [('τέ',
    Counter({'verb': 920,
             'adjective': 799,
             'noun': 726,
             'particle': 163,
             'adverb': 158,
             'pronoun': 15,
             'conjunction': 4,
             'irregular': 1,
             'N/A': 1}))]},
 {'has_unseen_syllable': False,
  'uid': '374327',
  'form': 'εἴ',
  'norm': 'εἴ',
  'sylables': ['εἴ'],
  'true_pos': 'conjunction',
  'pred_pos': 'verb',
  'syllable_preds': ['verb'],
  'syllable_data': [('εἴ',
    Counter({'verb': 613,
             'conjunction': 427,
             'adverb': 60,
             'noun': 56,
             'adjective': 52,
             'adposition': 32,
             'numeral': 19,
             'interjection': 5,
             'N/A': 2}))]},
 {'has_unseen_syllable': False,
  'uid': '240086',
  'form': 

In [8]:
# Export results for all folds to the path configured under results.bos.
# Each fold's dict also carries its own train_data / test_data lists.
export_pkl(results, config.results.bos)

INFO:cgpos.utils.util:Exporting /home/tejomay/cgpos/data/results/bos.pkl


In [9]:
# Try removing accents: work on a deep copy so `data` keeps its original
# syllables, then strip accents and breathing marks from every syllable.
# Entries without a 'syllables' key are left as they are.
data_no_accent = copy.deepcopy(data)
for word in data_no_accent:
    if 'syllables' in word:
        word['syllables'] = [strip_breathing(strip_accents(syllable)) for syllable in word['syllables']] 

In [10]:
# First ten syllable lists from the original data, for comparison with the stripped copy
[word['syllables'] for word in data][:10]

[['Θου', 'κυ', 'δί', 'δης'],
 ['Ἀ', 'θη', 'ναῖ', 'ος'],
 ['ξυ', 'νέ', 'γρα', 'ψε'],
 ['τὸν'],
 ['πό', 'λε', 'μον'],
 ['τῶν'],
 ['Πε', 'λο', 'πον', 'νη', 'σί', 'ων'],
 ['καὶ'],
 ['Ἀ', 'θη', 'ναί', 'ων'],
 [',']]

In [11]:
# The same ten words from the accent-stripped copy
[word['syllables'] for word in data_no_accent][:10]

[['Θου', 'κυ', 'δι', 'δης'],
 ['Α', 'θη', 'ναι', 'ος'],
 ['ξυ', 'νε', 'γρα', 'ψε'],
 ['τον'],
 ['πο', 'λε', 'μον'],
 ['των'],
 ['Πε', 'λο', 'πον', 'νη', 'σι', 'ων'],
 ['και'],
 ['Α', 'θη', 'ναι', 'ων'],
 [',']]

In [13]:
# Repeat the evaluation on the accent-stripped data. The returned per-fold
# results are discarded; only the accuracies logged by cv are of interest here.
_ = cv(data_no_accent)

INFO:__main__:Running 10-fold CV on Bag-of-Syllables model:
INFO:__main__:Training fold 1:
INFO:__main__:Train-test split 0.8: (train=439916, test=109980) [seed=555]
INFO:__main__:Accuracy: 61.54%, unseen syllables: 225 (0.10% missing)
INFO:__main__:Training fold 2:
INFO:__main__:Train-test split 0.8: (train=439916, test=109980) [seed=388]
INFO:__main__:Accuracy: 61.78%, unseen syllables: 180 (0.08% missing)
INFO:__main__:Training fold 3:
INFO:__main__:Train-test split 0.8: (train=439916, test=109980) [seed=258]
INFO:__main__:Accuracy: 61.62%, unseen syllables: 214 (0.09% missing)
INFO:__main__:Training fold 4:
INFO:__main__:Train-test split 0.8: (train=439916, test=109980) [seed=451]
INFO:__main__:Accuracy: 61.58%, unseen syllables: 192 (0.08% missing)
INFO:__main__:Training fold 5:
INFO:__main__:Train-test split 0.8: (train=439916, test=109980) [seed=661]
INFO:__main__:Accuracy: 61.40%, unseen syllables: 198 (0.09% missing)
INFO:__main__:Training fold 6:
INFO:__main__:Train-test spli