In [30]:
from abc import ABC, abstractmethod
import re
from sacremoses import MosesTokenizer
import Levenshtein
import spacy
import nltk
import pickle
import urllib
import os
import tarfile
import zipfile
import seaborn as sns
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import numpy as np
# import paths
import gensim
from string import punctuation
import matplotlib.pyplot as plt
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
from nltk import word_tokenize
# Root folder for project resources, resolved relative to the notebook's working directory.
RESOURCES_DIR = Path('../resources')
# The `datasets` sub-folder of the resources directory.
DATASETS_PATH = RESOURCES_DIR / "datasets"
# Base file name (no extension) used for the embeddings binary and the pickled word2rank cache.
WORD_EMBEDDINGS_NAME = 'coosto_model' 
# Folder that holds the embeddings binary and the word2rank pickle.
DUMPS_DIR = RESOURCES_DIR / "DUMPS"

# NOTE: this rebinds the name `stopwords` from the NLTK corpus reader imported above
# to a plain set of Dutch stopwords; after this line the corpus reader is no longer
# reachable under that name.
stopwords = set(stopwords.words("dutch"))

In [31]:
def ControlDivisionByZero(numerator, denominator):
    """Divide ``numerator`` by ``denominator``, yielding 0 for a zero denominator.

    Args:
        numerator: value to be divided.
        denominator: value to divide by.

    Returns:
        ``numerator / denominator`` when ``denominator`` is non-zero, otherwise ``0``.
    """
    if denominator != 0:
        return numerator / denominator
    # Zero denominator: fall back to 0 instead of raising ZeroDivisionError.
    return 0


class FeatureAbstract(ABC):
    """Abstract interface of a feature: subclasses must provide both methods below."""

    @abstractmethod
    def get_ratio(self, kwargs):
        """Abstract hook taking a ``kwargs`` mapping; the base implementation does nothing."""

    @abstractmethod
    def calculate_ratio(self, simple_text, original_text):
        """Abstract hook taking a simple and an original text; the base implementation does nothing."""


class Feature(FeatureAbstract):
    """Common behaviour of all ratio features.

    Attributes:
        split: data split the feature is used for ("train", "valid" or "test").
        target_ratio: ratio emitted as-is for the "valid" and "test" splits.
    """

    def __init__(self, split, target_ratio):
        self.split = split
        self.target_ratio = target_ratio

    def get_ratio(self, kwargs):
        """Append this feature's ``<name>_<ratio> `` token to the preprocessed text.

        For the "train" split the ratio is computed from ``kwargs['simple_text']``
        and ``kwargs['original_text']``; for "valid" and "test" the configured
        ``target_ratio`` is used.

        Args:
            kwargs: dict that is updated in place; the key
                ``'original_text_preprocessed'`` is created (as an empty string)
                when missing.

        Returns:
            The same ``kwargs`` dict, with the token appended.

        Raises:
            ValueError: if ``self.split`` is not one of the supported values.
        """
        kwargs.setdefault('original_text_preprocessed', "")

        if self.split == "train":
            ratio = self.calculate_ratio(kwargs.get('simple_text'),
                                         kwargs.get('original_text'))
        elif self.split in ("valid", "test"):
            ratio = self.target_ratio
        else:
            raise ValueError("stage value not supported")

        kwargs['original_text_preprocessed'] += f'{self.name}_{ratio} '
        return kwargs

    @property
    def name(self):
        """Initials of the CamelCase class name (e.g. ``WordLengthRatio`` -> ``WLR``).

        Falls back to the full class name when it contains no upper-case letter.
        """
        class_name = self.__class__.__name__
        initials = "".join(chunk[0] for chunk in re.findall('[A-Z][^A-Z]*', class_name))
        return initials or class_name


class WordLengthRatio(Feature):
    """Ratio between the token counts of the simple and the original text."""

    def __init__(self, stage, target_ratio):
        super().__init__(stage, target_ratio)
        # The Dutch Moses word tokenizer is only created for the "train" stage,
        # the only stage in which `calculate_ratio` is invoked by `get_ratio`.
        if stage == "train":
            self.tokenizer = MosesTokenizer(lang='nl')

    def calculate_ratio(self, simple_text, original_text):
        """Return (#tokens in simple) / (#tokens in original), rounded to 2 decimals.

        The division helper yields 0 when the original text has no tokens.
        """
        simple_token_count = len(self.tokenizer.tokenize(simple_text))
        original_token_count = len(self.tokenizer.tokenize(original_text))
        ratio = ControlDivisionByZero(simple_token_count, original_token_count)
        return round(ratio, 2)


class CharLengthRatio(Feature):
    """Ratio between the character lengths of the simple and the original text."""

    def __init__(self, stage, target_ratio):
        super().__init__(stage, target_ratio)

    def calculate_ratio(self, simple_text, original_text):
        """Return len(simple) / len(original), rounded to 2 decimals.

        The division helper yields 0 when the original text is empty.
        """
        simple_chars = len(simple_text)
        original_chars = len(original_text)
        ratio = ControlDivisionByZero(simple_chars, original_chars)
        return round(ratio, 2)


class LevenshteinRatio(Feature):
    """Token-sequence similarity between the original and the simple text."""

    def __init__(self, stage, target_ratio):
        super().__init__(stage, target_ratio)

    def calculate_ratio(self, simple_text, original_text):
        """Return ``Levenshtein.seqratio`` of the two Dutch-tokenized texts, rounded to 2 decimals."""
        simple_tokens = word_tokenize(simple_text, language='dutch')
        original_tokens = word_tokenize(original_text, language='dutch')
        similarity = Levenshtein.seqratio(original_tokens, simple_tokens)
        return round(similarity, 2)


class DependencyTreeDepthRatio(Feature):
    """Ratio between the dependency-tree depths of the simple and the original text."""

    def __init__(self, stage, target_ratio):
        super().__init__(stage, target_ratio)
        # The spaCy pipeline is only loaded for the "train" stage, the only stage
        # in which `calculate_ratio` is invoked by `get_ratio`.
        if stage == "train":
            self.nlp = self.get_spacy_model()

    def get_spacy_model(self):
        """Load the Dutch spaCy pipeline, downloading it first when it is not installed.

        Returns:
            The loaded ``nl_core_news_sm`` pipeline.
        """
        model = 'nl_core_news_sm'  # Dutch spaCy pipeline optimized for CPU
        if not spacy.util.is_package(model):
            spacy.cli.download(model)
            # `spacy link` (shortcut links) only exists in spaCy 2.x; it was removed
            # in spaCy 3, where calling it unconditionally raises AttributeError.
            link = getattr(spacy.cli, "link", None)
            if link is not None:
                link(model, model, force=True, model_path=spacy.util.get_package_path(model))
        return spacy.load(model)

    def calculate_ratio(self, simple_text, original_text):
        """Return depth(simple) / depth(original), rounded to 2 decimals.

        The division helper yields 0 when the original text has depth 0.
        """
        return round(ControlDivisionByZero(
            self.get_dependency_tree_depth(simple_text),
            self.get_dependency_tree_depth(original_text)), 2)

    def get_dependency_tree_depth(self, sentence):
        """Return the maximum dependency-tree depth over all sentences in ``sentence``.

        A root without children has depth 0; text with no sentences yields 0.
        """

        def get_subtree_depth(node):
            # Leaf -> 0; otherwise one more than the deepest child subtree.
            return max((1 + get_subtree_depth(child) for child in node.children), default=0)

        return max((get_subtree_depth(spacy_sentence.root)
                    for spacy_sentence in self.nlp(sentence).sents), default=0)


class WordRankRatio(Feature):
    """Ratio between the lexical complexity scores of the simple and the original text.

    The complexity of a text is derived from the rank (position in the embedding
    vocabulary) of its words. Methods prefixed with a single underscore are
    internal helpers.
    """

    def __init__(self, stage, target_ratio):
        super().__init__(stage, target_ratio)
        # Tokenizer and rank table are only built for the "train" stage, the only
        # stage in which `calculate_ratio` is invoked by `get_ratio`.
        if stage == "train":
            self.tokenizer = MosesTokenizer(lang='nl')
            self.word2rank = self._get_word2rank()
            print('finished get word2rank')
            print('length of word2rank', len(self.word2rank))
            # Rank assigned to unknown words: one past the last known rank.
            self.length_rank = len(self.word2rank)

    def calculate_ratio(self, simple_text, original_text):
        """Return score(simple) / score(original), capped at 2 and rounded to 2 decimals."""
        ratio = ControlDivisionByZero(self.get_lexical_complexity_score(simple_text),
                                      self.get_lexical_complexity_score(original_text))
        return round(min(ratio, 2), 2)

    def get_lexical_complexity_score(self, sentence, quantile_value=0.75):
        """Return the ``quantile_value`` quantile of log(1 + rank) over the known words.

        Punctuation and Dutch stopwords are removed first and only tokens present
        in ``self.word2rank`` are scored. When no token is known, the score of an
        unknown word, ``log(1 + len(word2rank))``, is returned.
        """
        # NOTE(review): tokens are looked up verbatim (no lower-casing). The
        # vocabulary looks lower-cased, so capitalised words are presumably
        # dropped here - confirm whether that is intended.
        words = self.tokenizer.tokenize(self._remove_stopwords(self._remove_punctuation(sentence)))
        ranks = [self._get_rank(word) for word in words if word in self.word2rank]
        if not ranks:
            return np.log(1 + self.length_rank)
        return np.quantile(ranks, quantile_value)

    def _remove_punctuation(self, text):
        """Re-join the tokens of ``text`` that are not made up solely of punctuation."""
        return ' '.join(word for word in self.tokenizer.tokenize(text) if not self._is_punctuation(word))

    def _remove_stopwords(self, text):
        """Re-join the tokens of ``text`` whose lower-cased form is not a Dutch stopword."""
        return ' '.join(w for w in self.tokenizer.tokenize(text) if w.lower() not in stopwords)

    def _is_punctuation(self, word):
        """Return True when every character of ``word`` is in ``string.punctuation`` (also for '')."""
        return all(char in punctuation for char in word)

    def _get_rank(self, word):
        """Return log(1 + rank) of ``word``; unknown words get rank ``self.length_rank``."""
        rank = self.word2rank.get(word, self.length_rank)
        return np.log(1 + rank)

    def _get_word2rank(self, vocab_size=np.inf):
        """Return the word -> rank mapping, building and caching it on first use.

        The mapping is cached as ``<WORD_EMBEDDINGS_NAME>.pk`` in ``DUMPS_DIR``.
        When no cache exists, the embeddings binary is downloaded if missing,
        its vocabulary order is turned into ranks and the result is pickled.

        Args:
            vocab_size: keep only the first ``vocab_size`` vocabulary entries.

        Returns:
            dict mapping each word to its index in the embedding vocabulary.
        """
        model_filepath = DUMPS_DIR / f"{WORD_EMBEDDINGS_NAME}.pk"
        if model_filepath.exists():
            # SECURITY: unpickling can execute arbitrary code; only load cache
            # files that were written by this class.
            with open(model_filepath, 'rb') as f:
                return pickle.load(f)

        # The target folder must exist before anything is downloaded into it.
        DUMPS_DIR.mkdir(parents=True, exist_ok=True)
        embeddings_path = DUMPS_DIR / f'{WORD_EMBEDDINGS_NAME}.bin'
        if not embeddings_path.exists():
            print("Downloading dutch embeddings ...")  # pretrained vectors
            self._download_twitter_embeddings(model_name=WORD_EMBEDDINGS_NAME, dest_dir=str(DUMPS_DIR))

        print("Preprocessing word2rank...")
        vocabulary = self._load_word_embeddings(embeddings_path)
        print('vocab_size', vocab_size)
        word2rank = {}
        for rank, word in enumerate(vocabulary):
            if rank >= vocab_size:
                break
            # Vocabulary entries are used as-is: splitting them on ',' would
            # truncate tokens containing a comma and let them overwrite the
            # rank of the shorter word.
            word2rank[word] = rank

        with open(model_filepath, 'wb') as f:
            pickle.dump(word2rank, f)
        return word2rank

    def _load_word_embeddings(self, filepath):
        """Load a binary word2vec file and return its vocabulary (``index_to_key``)."""
        model = gensim.models.KeyedVectors.load_word2vec_format(str(filepath), binary=True)
        return model.index_to_key

    def _download_twitter_embeddings(self, model_name, dest_dir):
        """Download the pretrained embeddings ``model_name`` into ``dest_dir``.

        The file is stored as ``<model_name>.bin`` so that it matches the name
        the loader expects.

        Returns:
            Path (str) of the downloaded file.

        Raises:
            ValueError: if no download URL is known for ``model_name``.
        """
        urls = {
            'coosto_model': 'https://github.com/coosto/dutch-word-embeddings/releases/download/v1.0/model.bin',
        }
        if model_name not in urls:
            raise ValueError(f"no download URL known for embeddings '{model_name}'")
        return self._download_url(urls[model_name], dest_dir, file_name=f'{model_name}.bin')

    def _yield_lines(self, filepath):
        """Yield the lines of the latin-1 encoded text file ``filepath``, right-stripped."""
        filepath = Path(filepath)
        with filepath.open('r', encoding="latin-1") as f:
            for line in f:
                yield line.rstrip()

    def _download_url(self, url, output_path, file_name=None):
        """Download ``url`` into the folder ``output_path`` unless the file already exists.

        Args:
            url: address of the file to fetch.
            output_path: destination folder (created when missing).
            file_name: name of the local file; defaults to the last URL segment.

        Returns:
            Path (str) of the local file.
        """
        # `import urllib` alone does not guarantee that the `request` submodule is loaded.
        import urllib.request

        name = file_name or url.split('/')[-1]
        file_path = f'{output_path}/{name}'
        if not Path(file_path).exists():
            Path(output_path).mkdir(parents=True, exist_ok=True)
            # Download to a temporary name first so that an interrupted transfer
            # is not mistaken for a complete file on the next run.
            partial_path = f'{file_path}.part'
            with tqdm(unit='B', unit_scale=True, leave=True, miniters=1,
                      desc=name) as t:
                urllib.request.urlretrieve(url, filename=partial_path,
                                           reporthook=self._download_report_hook(t), data=None)
            os.replace(partial_path, file_path)
        return file_path

    def _unzip(self, file_path, dest_dir=None):
        """Extract a .zip, .tar.gz/.tgz or .tar archive; other file types are ignored.

        Args:
            file_path: path (str) of the archive.
            dest_dir: extraction folder; defaults to the archive's own folder.
        """
        if dest_dir is None:
            dest_dir = os.path.dirname(file_path)
        # SECURITY: members are extracted without path validation; only use
        # this with archives from a trusted source.
        if file_path.endswith('.zip'):
            with zipfile.ZipFile(file_path, "r") as zip_ref:
                zip_ref.extractall(dest_dir)
        elif file_path.endswith("tar.gz") or file_path.endswith("tgz"):
            with tarfile.open(file_path, "r:gz") as tar:
                tar.extractall(dest_dir)
        elif file_path.endswith("tar"):
            with tarfile.open(file_path, "r:") as tar:
                tar.extractall(dest_dir)

    def _download_report_hook(self, t):
        """Return a ``urlretrieve`` report hook that advances the tqdm bar ``t``."""
        last_b = [0]  # number of blocks reported by the previous call

        def inner(b=1, bsize=1, tsize=None):
            # b: blocks transferred so far, bsize: block size, tsize: total size.
            if tsize is not None:
                t.total = tsize
            t.update((b - last_b[0]) * bsize)
            last_b[0] = b

        return inner

In [32]:
# Example pair 1: a complex Dutch sentence and its simplified counterpart.
complex_sentence = "Sommige steden aan de Eyre Highway in de zuidoostelijke hoek van West-Australië, tussen de Zuid-Australische grens bijna tot aan Caiguna, volgen de officiële West-Australische tijd niet."
simple_sentence = "Sommige steden in West-Australië gebruiken geen West-Australische tijd."

In [33]:
# Build the feature in "train" mode so that the tokenizer and the word2rank table are loaded.
wordRank = WordRankRatio("train", 0.8)

finished get word2rank
length of word2rank 250479


In [34]:
# Lexical complexity score of the complex sentence of example pair 1.
wordRank.get_lexical_complexity_score(complex_sentence)

7.3459952252974965

In [35]:
# Lexical complexity score of the simplified sentence of example pair 1.
wordRank.get_lexical_complexity_score(simple_sentence)

6.840188941986389

In [36]:
# Example pair 2 (rebinds the sentence variables defined for the previous pair).
complex_sentence = "Het ruimtevaartuig bestaat uit twee hoofdelementen: de NASA Cassini-orbiter, genoemd naar de Italiaans-Franse astronoom Giovanni Domenico Cassini, en de ESA Huygens-sonde, genoemd naar de Nederlandse astronoom, wiskundige en natuurkundige Christiaan Huygens."
simple_sentence = "Het ruimtevaartuig heeft twee hoofdelementen: de NASA Cassini-orbiter en de ESA Huygens-sonde."

In [37]:
# Lexical complexity score of the complex sentence of example pair 2.
wordRank.get_lexical_complexity_score(complex_sentence)

11.734924361416946

In [38]:
# Lexical complexity score of the simplified sentence of example pair 2.
wordRank.get_lexical_complexity_score(simple_sentence)

10.280871668478664

In [39]:
# Example pair 3 (rebinds the sentence variables defined for the previous pair).
complex_sentence = "Dit was het gebied ten oosten van de monding van de rivier de Vistula, later ook wel 'eigenlijk Pruisen' genoemd."
simple_sentence="Pruisen was eigenlijk de plaats ten oosten van de monding van de rivier de Vistula."

In [40]:
# Lexical complexity score of the complex sentence of example pair 3.
wordRank.get_lexical_complexity_score(complex_sentence)

9.43182170096659

In [41]:
# Lexical complexity score of the simplified sentence of example pair 3.
wordRank.get_lexical_complexity_score(simple_sentence)

8.520619551491778

In [42]:
# Load the word2vec binary directly to inspect the embeddings and their vocabulary.
# NOTE: the hard-coded path points at the same file as DUMPS_DIR / f'{WORD_EMBEDDINGS_NAME}.bin'.
model = gensim.models.KeyedVectors.load_word2vec_format('../resources/DUMPS/coosto_model.bin', binary=True)

In [43]:
# Nearest neighbours of "monding" in the embedding space.
model.most_similar("monding")

[('rivier', 0.6764729619026184),
 ('oever', 0.657546877861023),
 ('ten_zuiden', 0.6298789381980896),
 ('ten_noorden', 0.6290280818939209),
 ('stroomafwaarts', 0.6211031675338745),
 ('ten_oosten', 0.619391918182373),
 ('elbe', 0.6125018000602722),
 ('oevers', 0.6068341732025146),
 ('drooggevallen', 0.6008706092834473),
 ('ten_westen', 0.5969594717025757)]

In [44]:
# Nearest neighbours of "schiereiland" in the embedding space.
model.most_similar("schiereiland")

[('eiland', 0.7548738718032837),
 ('baai', 0.7172025442123413),
 ('kustlijn', 0.7141824960708618),
 ('vasteland', 0.7004197239875793),
 ('eilandengroep', 0.7000138163566589),
 ('eilanden', 0.6944781541824341),
 ('gebergte', 0.6839443445205688),
 ('zuidkust', 0.6789382100105286),
 ('eilandje', 0.6788842678070068),
 ('klein_eilandje', 0.6703616380691528)]

In [45]:
# Nearest neighbours of "muiterij" in the embedding space.
model.most_similar("muiterij")

[('oproer', 0.5124261975288391),
 ('opstand', 0.5085952281951904),
 ('staatsgreep', 0.4826657474040985),
 ('kapitein', 0.4724387526512146),
 ('rebellie', 0.4609268009662628),
 ('volksopstand', 0.45814624428749084),
 ('revolte', 0.4464385509490967),
 ('coup', 0.4457050561904907),
 ('machtsovername', 0.44217342138290405),
 ('petrograd', 0.44001343846321106)]

In [46]:
# Nearest neighbours of "moslimminderheid" in the embedding space.
model.most_similar("moslimminderheid")

[('rohingya', 0.7361916303634644),
 ('rohingyas', 0.7077074646949768),
 ('myanmar', 0.6966372132301331),
 ('rakhine', 0.6898940801620483),
 ('birma', 0.6730502247810364),
 ('rohingyamoslims', 0.6431218981742859),
 ('burma', 0.6042240858078003),
 ('#rohingya', 0.5963991284370422),
 ('#myanmar', 0.5848981738090515),
 ('oeigoeren', 0.5661291480064392)]

In [47]:
# Nearest neighbours of "vaginale" in the embedding space.
model.most_similar("vaginale") 

[('vagina', 0.6811878085136414),
 ('genitale', 0.5975792407989502),
 ('prostaat', 0.5925841331481934),
 ('schimmelinfecties', 0.5768560767173767),
 ('vaginaal', 0.571540355682373),
 ('rectale', 0.5708329081535339),
 ('schaamlippen', 0.5556153655052185),
 ('baarmoederhals', 0.5504263639450073),
 ('plasbuis', 0.5503061413764954),
 ('clitoris', 0.5497023463249207)]

In [48]:
# Nearest neighbours of "tandheelkundige" in the embedding space.
model.most_similar("tandheelkundige")

[('tandheelkunde', 0.6549472212791443),
 ('chirurgische', 0.6509051322937012),
 ('chirurgie', 0.6212051510810852),
 ('medische', 0.619494616985321),
 ('dental', 0.5601959824562073),
 ('klinische', 0.5597237944602966),
 ('orthopedische', 0.5496146082878113),
 ('behandelingen', 0.547005832195282),
 ('mondhygiene', 0.5408808588981628),
 ('orthodontie', 0.5370855927467346)]

In [49]:
# Index of "impasse" in the model's vocabulary.
model.key_to_index["impasse"]

26538

In [50]:
# "gitaarsnaar" is not in the vocabulary: this lookup raises KeyError (see the recorded output).
model.key_to_index["gitaarsnaar"]

KeyError: 'gitaarsnaar'

In [55]:
# "aardnoten" is not in the vocabulary: this lookup raises KeyError (see the recorded output).
model.key_to_index["aardnoten"]

KeyError: 'aardnoten'

In [56]:
# "Warmtekrachtkoppeling" is not in the vocabulary: this call raises KeyError (see the recorded output).
model.most_similar("Warmtekrachtkoppeling")

KeyError: "Key 'Warmtekrachtkoppeling' not present in vocabulary"

In [57]:
# "chitine" is not in the vocabulary either: this call raises KeyError (see the recorded output).
model.most_similar("chitine") # TODO: decide how out-of-vocabulary words such as this one should be handled

KeyError: "Key 'chitine' not present in vocabulary"