In [1]:
from abc import ABC, abstractmethod
import re
from sacremoses import MosesTokenizer
import Levenshtein
import spacy
import nltk
import pickle
import urllib
import os
import tarfile
import zipfile
import seaborn as sns
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import numpy as np
# import paths
from string import punctuation
import matplotlib.pyplot as plt
# Fetch the NLTK stop-word corpus only when it is not installed yet, so a
# fresh "Restart & Run All" neither needs nor fails on network access.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
from nltk import word_tokenize
import gensim

# Project paths, relative to the notebook's working directory.
RESOURCES_DIR = Path('../resources')
DATASETS_PATH = RESOURCES_DIR / "datasets"
# Base name (without extension) of the embeddings file and of its pickle cache.
WORD_EMBEDDINGS_NAME = 'combined_320'
DUMPS_DIR = RESOURCES_DIR / "DUMPS"

# NOTE: this rebinds `stopwords` from the NLTK corpus reader to a plain set of
# Dutch stop words; WordRankRatio._remove_stopwords reads this module-level set.
stopwords = set(stopwords.words("dutch"))

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [2]:
def ControlDivisionByZero(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when ``denominator`` equals zero."""
    if denominator == 0:
        return 0
    return numerator / denominator


class FeatureAbstract(ABC):
    """Abstract interface for ratio features: subclasses must provide both methods."""

    @abstractmethod
    def get_ratio(self, kwargs):
        """Process the ``kwargs`` object of one example; behaviour is defined by subclasses."""

    @abstractmethod
    def calculate_ratio(self, simple_text, original_text):
        """Compute the feature value for a simple/original text pair."""


class Feature(FeatureAbstract):
    """Base class that turns a ratio into a control token appended to the example.

    ``split`` selects where the ratio comes from: ``"train"`` computes it from
    the example's texts via ``calculate_ratio``; ``"valid"`` and ``"test"`` use
    the fixed ``target_ratio``.
    """

    def __init__(self, split, target_ratio):
        self.split = split
        self.target_ratio = target_ratio

    def get_ratio(self, kwargs):
        """Append ``"<name>_<ratio> "`` to ``kwargs['original_text_preprocessed']``.

        The mapping is modified in place (the key is created as an empty string
        when absent) and returned.

        Raises:
            ValueError: if ``self.split`` is not "train", "valid" or "test".
        """
        kwargs.setdefault('original_text_preprocessed', "")

        if self.split == "train":
            result_ratio = self.calculate_ratio(kwargs.get('simple_text'),
                                                kwargs.get('original_text'))
        elif self.split in ("valid", "test"):
            result_ratio = self.target_ratio
        else:
            raise ValueError("stage value not supported")

        kwargs['original_text_preprocessed'] += f'{self.name}_{result_ratio} '
        return kwargs

    @property
    def name(self):
        """Initials of the CamelCase class name (e.g. ``WordLengthRatio`` -> ``WLR``).

        Falls back to the full class name when it contains no upper-case letter.
        """
        class_name = self.__class__.__name__
        initials = ''.join(chunk[0] for chunk in re.findall('[A-Z][^A-Z]*', class_name))
        return initials or class_name


class WordLengthRatio(Feature):
    """Ratio of Moses token counts: simple text over original text."""

    def __init__(self, stage, target_ratio):
        super().__init__(stage, target_ratio)
        # Ratios are only computed from text for the "train" split, so the
        # Dutch Moses word tokenizer is created for that split alone.
        if stage == "train":
            self.tokenizer = MosesTokenizer(lang='nl')

    def calculate_ratio(self, simple_text, original_text):
        """Return ``len(tokens(simple)) / len(tokens(original))`` rounded to 2 decimals (0 if the original has no tokens)."""
        simple_count = len(self.tokenizer.tokenize(simple_text))
        original_count = len(self.tokenizer.tokenize(original_text))
        return round(ControlDivisionByZero(simple_count, original_count), 2)


class CharLengthRatio(Feature):
    """Ratio of character counts: simple text over original text."""

    def __init__(self, stage, target_ratio):
        super().__init__(stage, target_ratio)

    def calculate_ratio(self, simple_text, original_text):
        """Return ``len(simple) / len(original)`` rounded to 2 decimals (0 if the original is empty)."""
        simple_chars, original_chars = len(simple_text), len(original_text)
        return round(ControlDivisionByZero(simple_chars, original_chars), 2)


class LevenshteinRatio(Feature):
    """Token-sequence similarity between the original and the simple text."""

    def __init__(self, stage, target_ratio):
        super().__init__(stage, target_ratio)

    def calculate_ratio(self, simple_text, original_text):
        """Return ``Levenshtein.seqratio`` over the Dutch NLTK word tokens of both texts, rounded to 2 decimals."""
        simple_tokens = word_tokenize(simple_text, language='dutch')
        original_tokens = word_tokenize(original_text, language='dutch')
        similarity = Levenshtein.seqratio(original_tokens, simple_tokens)
        return round(similarity, 2)


class DependencyTreeDepthRatio(Feature):
    """Ratio of dependency-tree depths: simple text over original text."""

    def __init__(self, stage, target_ratio):
        super().__init__(stage, target_ratio)
        # The parser is only needed when ratios are computed from text ("train").
        if stage == "train":
            self.nlp = self.get_spacy_model()

    def get_spacy_model(self):
        """Load the Dutch spaCy pipeline, downloading it first when it is not installed."""
        model = 'nl_core_news_sm'  # small Dutch spaCy pipeline
        installed = spacy.util.is_package(model)
        if not installed:
            spacy.cli.download(model)
            # NOTE(review): shortcut links are a spaCy v2 concept; confirm this
            # call is still needed with the installed spaCy version.
            spacy.cli.link(model, model, force=True, model_path=spacy.util.get_package_path(model))
        return spacy.load(model)

    def calculate_ratio(self, simple_text, original_text):
        """Return depth(simple) / depth(original) rounded to 2 decimals (0 if the original depth is 0)."""
        simple_depth = self.get_dependency_tree_depth(simple_text)
        original_depth = self.get_dependency_tree_depth(original_text)
        return round(ControlDivisionByZero(simple_depth, original_depth), 2)

    def get_dependency_tree_depth(self, sentence):
        """Return the deepest dependency tree among the parsed sentences of ``sentence`` (0 if there are none)."""

        def get_subtree_depth(node):
            # A leaf has depth 0; otherwise one more than its deepest child.
            return max((1 + get_subtree_depth(child) for child in node.children), default=0)

        parsed = self.nlp(sentence)
        return max((get_subtree_depth(sent.root) for sent in parsed.sents), default=0)


class WordRankRatio(Feature):
    """Lexical-complexity ratio based on word ranks in an embeddings vocabulary.

    A word's rank is its line index in ``DUMPS_DIR/<WORD_EMBEDDINGS_NAME>.txt``.
    A sentence is scored by a quantile of ``log(1 + rank)`` over those of its
    tokens that occur in the vocabulary; the feature is the simple/original
    score ratio, capped at 2.

    The tokenizer and the rank table are only loaded for ``stage == "train"``.
    ``_load_word_embeddings`` and ``_download_twitter_embeddings`` are kept as
    helpers but are not called by the class itself.
    """

    def __init__(self, stage, target_ratio):
        super().__init__(stage, target_ratio)
        if stage == "train":
            self.tokenizer = MosesTokenizer(lang='nl')
            self.word2rank = self._get_word2rank()
            print('finished get word2rank')
            print('length of word2rank', len(self.word2rank))
            # Vocabulary size; used as the fallback rank and for empty sentences.
            self.length_rank = len(self.word2rank)

    def calculate_ratio(self, simple_text, original_text):
        """Return score(simple) / score(original), capped at 2 and rounded to 2 decimals."""
        ratio = ControlDivisionByZero(self.get_lexical_complexity_score(simple_text),
                                      self.get_lexical_complexity_score(original_text))
        return round(min(ratio, 2), 2)

    def get_lexical_complexity_score(self, sentence, quantile_value=0.75):
        """Return the ``quantile_value`` quantile of ``log(1 + rank)`` over the sentence's known words.

        Punctuation and stop words are removed first. When no remaining token
        is in ``word2rank`` the score is ``log(1 + vocabulary size)``.
        """
        words = self.tokenizer.tokenize(self._remove_stopwords(self._remove_punctuation(sentence)))
        # NOTE(review): the lookup is case-sensitive, and the vocabulary shown
        # elsewhere in this notebook looks lower-case only, so capitalised
        # tokens are presumably dropped here — confirm this is intended.
        words = [word for word in words if word in self.word2rank]
        if not words:
            return np.log(1 + self.length_rank)
        return np.quantile([self._get_rank(word) for word in words], quantile_value)

    def _remove_punctuation(self, text):
        """Re-join the tokens of ``text`` that are not made up solely of punctuation characters."""
        return ' '.join(word for word in self.tokenizer.tokenize(text) if not self._is_punctuation(word))

    def _remove_stopwords(self, text):
        """Re-join the tokens of ``text`` whose lower-cased form is not in the module-level ``stopwords`` set."""
        return ' '.join(w for w in self.tokenizer.tokenize(text) if w.lower() not in stopwords)

    def _is_punctuation(self, word):
        """Return True when every character of ``word`` is in ``string.punctuation`` (also True for '')."""
        return all(char in punctuation for char in word)

    def _get_rank(self, word):
        """Return ``log(1 + rank)`` of ``word``; unknown words get the vocabulary size as rank."""
        rank = self.word2rank.get(word, self.length_rank)
        return np.log(1 + rank)

    def _get_word2rank(self, vocab_size=np.inf):
        """Return the ``word -> rank`` dict, reading the pickle cache or building it from the text file.

        Args:
            vocab_size: maximum number of lines to rank (default: all lines).
        """
        model_filepath = DUMPS_DIR / f"{WORD_EMBEDDINGS_NAME}.pk"
        if model_filepath.exists():
            # SECURITY: unpickling runs arbitrary code from the file; only load
            # cache files that this project wrote itself.
            with open(model_filepath, 'rb') as f:
                return pickle.load(f)

        print("Opening conll model...")
        print("Preprocessing word2rank...")
        DUMPS_DIR.mkdir(parents=True, exist_ok=True)
        embeddings_path = DUMPS_DIR / f'{WORD_EMBEDDINGS_NAME}.txt'
        word2rank = {}
        # NOTE(review): if the text file starts with a word2vec "<count> <dim>"
        # header, that header is ranked as word 0 here — confirm and skip it if so.
        for i, line in enumerate(self._yield_lines(embeddings_path)):
            if i >= vocab_size:
                break
            word2rank[line.split(' ')[0]] = i

        # Context manager so the cache file is flushed and closed deterministically.
        with open(model_filepath, 'wb') as f:
            pickle.dump(word2rank, f)
        return word2rank

    def _load_word_embeddings(self, filepath):
        """Load a binary word2vec file with gensim and return its ``index_to_key`` word list."""
        model = gensim.models.KeyedVectors.load_word2vec_format(filepath, binary=True)
        return model.index_to_key

    def _download_twitter_embeddings(self, model_name, dest_dir):
        """Download the named pretrained embeddings into ``dest_dir`` and extract them if needed.

        Raises:
            ValueError: if ``model_name`` is not a known model.
        """
        urls = {
            'coosto_model': 'https://github.com/coosto/dutch-word-embeddings/releases/download/v1.0/model.bin',
        }
        if model_name not in urls:
            # Previously an unknown name fell through to an empty download URL.
            raise ValueError(f"unknown embeddings model: {model_name!r}")
        file_path = self._download_url(urls[model_name], dest_dir)
        out_filepath = Path(file_path)
        out_filepath = out_filepath.parent / f'{out_filepath.stem}.txt'
        if not out_filepath.exists():
            print("Extracting: ", Path(file_path).name)
            self._unzip(file_path, dest_dir)

    def _yield_lines(self, filepath, encoding="utf-8", errors="replace"):
        """Yield the right-stripped lines of a text file.

        The embeddings file is UTF-8 (gensim loads it with ``encoding='utf8'``
        elsewhere in this notebook); decoding it as latin-1 garbles every
        non-ASCII word. ``errors="replace"`` keeps the read from failing on a
        stray invalid byte. An existing ``.pk`` cache built with the old
        decoding must be deleted to pick this up.
        """
        filepath = Path(filepath)
        with filepath.open('r', encoding=encoding, errors=errors) as f:
            for line in f:
                yield line.rstrip()

    def _download_url(self, url, output_path):
        """Download ``url`` into directory ``output_path`` unless the file exists; return the file path."""
        # `import urllib` alone does not guarantee the `request` submodule is loaded.
        import urllib.request

        name = url.split('/')[-1]
        file_path = f'{output_path}/{name}'
        if not Path(file_path).exists():
            with tqdm(unit='B', unit_scale=True, leave=True, miniters=1,
                      desc=name) as t:
                urllib.request.urlretrieve(url, filename=file_path,
                                           reporthook=self._download_report_hook(t), data=None)
        return file_path

    def _unzip(self, file_path, dest_dir=None):
        """Extract a .zip / .tar.gz / .tgz / .tar archive into ``dest_dir`` (default: the archive's directory).

        Files with any other extension are left untouched.
        """
        if dest_dir is None:
            dest_dir = os.path.dirname(file_path)
        # SECURITY: extractall trusts the member paths stored in the archive;
        # only use this on archives from a trusted source.
        if file_path.endswith('.zip'):
            with zipfile.ZipFile(file_path, "r") as zip_ref:
                zip_ref.extractall(dest_dir)
        elif file_path.endswith(("tar.gz", "tgz")):
            with tarfile.open(file_path, "r:gz") as tar:
                tar.extractall(dest_dir)
        elif file_path.endswith("tar"):
            with tarfile.open(file_path, "r:") as tar:
                tar.extractall(dest_dir)

    def _download_report_hook(self, t):
        """Return a ``urlretrieve`` reporthook that advances the tqdm bar ``t``."""
        last_b = [0]  # block count seen at the previous callback

        def inner(b=1, bsize=1, tsize=None):
            if tsize is not None:
                t.total = tsize
            t.update((b - last_b[0]) * bsize)
            last_b[0] = b

        return inner

In [3]:
# Example pair 1: an original (complex) Dutch sentence and its simplified version.
complex_sentence = "Sommige steden aan de Eyre Highway in de zuidoostelijke hoek van West-Australië, tussen de Zuid-Australische grens bijna tot aan Caiguna, volgen de officiële West-Australische tijd niet."
simple_sentence = "Sommige steden in West-Australië gebruiken geen West-Australische tijd."

In [4]:
# "train" makes the constructor load the tokenizer and the word2rank table;
# 0.8 is the target ratio, which get_ratio only uses for the valid/test splits.
wordRank = WordRankRatio("train", 0.8)

finished get word2rank
length of word2rank 989820


In [5]:
# Score of the complex sentence: 0.75-quantile of log(1 + rank) over its in-vocabulary tokens.
wordRank.get_lexical_complexity_score(complex_sentence)

6.9917687012441645

In [6]:
# Lexical complexity score of the simplified sentence of the current pair.
wordRank.get_lexical_complexity_score(simple_sentence)

6.579682160499392

In [7]:
# Example pair 2 (rebinds both sentence variables).
complex_sentence = "Het ruimtevaartuig bestaat uit twee hoofdelementen: de NASA Cassini-orbiter, genoemd naar de Italiaans-Franse astronoom Giovanni Domenico Cassini, en de ESA Huygens-sonde, genoemd naar de Nederlandse astronoom, wiskundige en natuurkundige Christiaan Huygens."
simple_sentence = "Het ruimtevaartuig heeft twee hoofdelementen: de NASA Cassini-orbiter en de ESA Huygens-sonde."

In [8]:
# Lexical complexity score of the complex sentence of the current pair.
wordRank.get_lexical_complexity_score(complex_sentence)

8.972449192984996

In [9]:
# NOTE(review): in the recorded run this simplified sentence scores higher than
# its complex counterpart. Tokens missing from word2rank are dropped before the
# quantile is taken, which may explain it — verify.
wordRank.get_lexical_complexity_score(simple_sentence)

11.278365319538235

In [10]:
# Example pair 3 (rebinds both sentence variables).
complex_sentence = "Dit was het gebied ten oosten van de monding van de rivier de Vistula, later ook wel 'eigenlijk Pruisen' genoemd."
simple_sentence="Pruisen was eigenlijk de plaats ten oosten van de monding van de rivier de Vistula."

In [11]:
# Lexical complexity score of the complex sentence of the current pair.
wordRank.get_lexical_complexity_score(complex_sentence)

7.927081784052815

In [12]:
# Lexical complexity score of the simplified sentence of the current pair.
wordRank.get_lexical_complexity_score(simple_sentence)

6.842432913359531

In [67]:
# Load the embeddings text file that WordRankRatio ranks, via the shared path
# constants instead of a second hard-coded copy of the path.
# NOTE(review): limit=980000 stops short of the 989820 entries reported for
# word2rank — confirm why the tail of the file is skipped.
model = gensim.models.KeyedVectors.load_word2vec_format(
    str(DUMPS_DIR / f'{WORD_EMBEDDINGS_NAME}.txt'), binary=False, limit=980000, encoding='utf8')

In [68]:
# Sanity check of the loaded vectors: nearest neighbours of the word "de".
model.most_similar("de")

[('van', 0.6756626963615417),
 ('in', 0.6316277384757996),
 ('een', 0.6282400488853455),
 ('en', 0.5985819101333618),
 ('eveneens', 0.5922880172729492),
 ('honderddagenoffensief', 0.5661090612411499),
 ('die', 0.565862238407135),
 ('maslenica', 0.562157928943634),
 ('tourseizoen', 0.5596410036087036),
 ('andreaspenning', 0.5588655471801758)]

In [69]:
# Inspect the nearest neighbours of "monding".
model.most_similar("monding")

[('rivier', 0.8399328589439392),
 ('benedenloop', 0.8128615021705627),
 ('bovenloop', 0.7880266308784485),
 ('estuarium', 0.7785959243774414),
 ('samenvloeiing', 0.778411865234375),
 ('uitmonding', 0.7749341726303101),
 ('stroomafwaarts', 0.7470329403877258),
 ('pjasina', 0.7449982166290283),
 ('stroomopwaarts', 0.7386379837989807),
 ('baai', 0.7342506051063538)]

In [70]:
# Inspect the nearest neighbours of "schiereiland".
model.most_similar("schiereiland")

[('eiland', 0.7068321704864502),
 ('koerileneiland', 0.7001560926437378),
 ('tsjoektsjenschiereiland', 0.6942179799079895),
 ('iberisch', 0.6905044913291931),
 ('vasteland', 0.6888741254806519),
 ('reykjanes', 0.6877791285514832),
 ('noordkust', 0.6849595904350281),
 ('zuidkust', 0.6840841174125671),
 ('westelijkste', 0.6813684105873108),
 ('snæfellsnes', 0.680156409740448)]

In [71]:
# TODO(review): the author left an open question on this word ("what to do
# with that?"); state which handling is being considered.
model.most_similar("chitine")

[('keratine', 0.8063088059425354),
 ('exoskelet', 0.7815277576446533),
 ('peptidoglycaan', 0.777212917804718),
 ('polysachariden', 0.7538168430328369),
 ('suberine', 0.7455270290374756),
 ('polysacharide', 0.7441681027412415),
 ('conchyoline', 0.7357071042060852),
 ('hemicellulose', 0.7300746440887451),
 ('celwanden', 0.7300340533256531),
 ('bètakeratine', 0.7290903329849243)]

In [72]:
# Inspect the nearest neighbours of "muiterij".
model.most_similar("muiterij")

[('muiters', 0.6793466806411743),
 ('sepoy', 0.6448593735694885),
 ('hmav', 0.6437531113624573),
 ('oproer', 0.6362000107765198),
 ('opstand', 0.6251309514045715),
 ('vlieterincident', 0.6169967651367188),
 ('rebellie', 0.6056457757949829),
 ('scheepsmacht', 0.6023922562599182),
 ('mantotmangevechten', 0.6016296148300171),
 ('torpedering', 0.6006870269775391)]

In [73]:
# Inspect the nearest neighbours of "moslimminderheid".
model.most_similar("moslimminderheid")

[('pomaken', 0.7338395118713379),
 ('turkomannen', 0.6837456822395325),
 ('beloetsjen', 0.6833829283714294),
 ('gorani', 0.6724549531936646),
 ('arbëreshë', 0.672112762928009),
 ('aroemenen', 0.666069746017456),
 ('vlachen', 0.6630237102508545),
 ('gagaoezen', 0.6590104103088379),
 ('janjevci', 0.6576855778694153),
 ('pomakken', 0.6495280861854553)]

In [74]:
# Inspect the nearest neighbours of "benedenverdieping".
model.most_similar("benedenverdieping")

[('bovenverdieping', 0.8295695781707764),
 ('beletage', 0.763651430606842),
 ('verdieping', 0.7369603514671326),
 ('bovenverdiepingen', 0.7291617393493652),
 ('oostvleugel', 0.7169917821884155),
 ('binnenkoer', 0.7102579474449158),
 ('dienstruimten', 0.6976855397224426),
 ('hoofdverdieping', 0.6954277157783508),
 ('begane', 0.6929664015769958),
 ('dienstruimte', 0.692493736743927)]

In [75]:
# Inspect the nearest neighbours of "tandheelkundige".
model.most_similar("tandheelkundige")

[('chirurgische', 0.7050020098686218),
 ('medische', 0.6709998846054077),
 ('orthodontische', 0.64791339635849),
 ('trombolyse', 0.6358797550201416),
 ('orthopedische', 0.6326757669448853),
 ('fysiotherapeutische', 0.6326483488082886),
 ('interventionele', 0.6310393810272217),
 ('minimaalinvasieve', 0.6308104395866394),
 ('anesthesioloog', 0.6302860379219055),
 ('medischspecialistische', 0.6259549260139465)]

In [76]:
# Position of "impasse" in the loaded vocabulary.
model.key_to_index["impasse"]

16965

In [77]:
# Raises KeyError in the recorded run: "gitaarsnaar" is not in the loaded vocabulary.
model.key_to_index["gitaarsnaar"]

KeyError: 'gitaarsnaar'

In [78]:
# Position of "aardnoten" in the loaded vocabulary.
model.key_to_index["aardnoten"]

154282

In [79]:
# Position of "van" in the loaded vocabulary.
model.key_to_index["van"]

1

In [None]:
# Raises KeyError in the recorded run: this capitalised form is not in the
# vocabulary (the neighbours listed in this notebook are all lower-case).
model.most_similar("Warmtekrachtkoppeling")

KeyError: "Key 'Warmtekrachtkoppeling' not present in vocabulary"

In [80]:
# Sanity check of the loaded vectors: nearest neighbours of the word "van".
model.most_similar("van")

[('de', 0.6756627559661865),
 ('en', 0.63153475522995),
 ('in', 0.6282373070716858),
 ('eveneens', 0.626492977142334),
 ('tevens', 0.6048368811607361),
 ('cralingen', 0.6007931232452393),
 ('notabelenvergadering', 0.5996974110603333),
 ('evenals', 0.597119927406311),
 ('burense', 0.59560626745224),
 ('voocht', 0.5947650671005249)]