In [24]:
import csv
import re
import random
from Levenshtein import distance as levenshtein_distance, editops, seqratio
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from nltk.corpus import stopwords
import nltk
from nltk import SnowballStemmer
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from scipy.sparse import csr_matrix
from telwoord import ordinal, cardinal
from datetime import datetime
import re
import math
from stop_words import get_stop_words

In [25]:
# Load the Dutch spaCy pipeline "nl_core_news_lg" once at module level.
# `nlp` is the shared pipeline object that the feature functions call on text.

nlp = spacy.load("nl_core_news_lg")

In [26]:
# Load the Dutch ('nl') stop-word collection from the `stop_words` package.
# It is used for membership tests on lower-cased token text.

stop_words = get_stop_words('nl')

In [27]:
# Import the list of common Dutch words (one word per line), lower-cased.
#
# The file is opened in a `with` block so the handle is closed again.
# Because newline='' disables newline translation, a line may end in '\n',
# '\r' or '\r\n'; all of them are stripped so that no word keeps a stray '\r'.

with open("..\\Data\\data_processing_knowledge_input\\dutch_dictionary.txt", encoding='utf-8', newline='') as dutch_dictionary:
    dutch_words = [word.rstrip('\r\n').lower() for word in dutch_dictionary]

In [28]:
# Sentence boundary used by split_text:
#   * a space that directly follows '.', '!' or '?', optionally with one closing
#     quote character (", ” or ') between the punctuation mark and the space, or
#   * a non-breaking space (U+00A0), or
#   * a newline.
_SENTENCE_BOUNDARY = re.compile("(?:(?<=[!?.])|(?<=[!?.][\"”'])) |\\u00a0|\n")


def split_text(text):
    """Split a text into sentences, grouped per line.

    The text is first cut on newlines; every resulting part is then cut on the
    sentence boundary pattern above. The separating space / non-breaking space
    is dropped, the punctuation and closing quote stay with the sentence.

    Parameters
    ----------
    text : str
        The text to split.

    Returns
    -------
    list of list of str
        One inner list per newline-separated part, holding its sentences.
        An empty input gives ``[['']]``.
    """
    # The '."' case (full stop + straight double quote) is handled like the
    # '?"' and '!"' cases, so a quoted sentence ending in '."' is also split.
    return [_SENTENCE_BOUNDARY.split(part) for part in text.split("\n")]

In [29]:
# Helper used for the features about numerical information in an atomic change.
def compare_numbers(original_number, new_number):
    """Encode how two numbers relate to each other.

    Returns 1 when the original number is larger than the new one, 2 when both
    are equal and 3 when the original number is smaller. If none of the three
    comparisons holds (e.g. a NaN operand) the result is None.
    """
    if original_number > new_number:
        code = 1
    elif original_number == new_number:
        code = 2
    elif original_number < new_number:
        code = 3
    else:
        code = None

    return code

In [30]:
# Lists of doubt words, certain words, negation words and temporary/approximate
# words that can be used to compute several features.
# Entries (including repeated ones) are kept in their original order.

doubt_words = [
    'vaak', 'meestal', 'dikwijls', 'vooral',
    'soms', 'misschien', 'mogelijk', 'mogelijks', 'waarschijnlijk', 'wellicht', 'eventueel',
    'in principe', 'op zich', 'zouden', 'kunnen', 'zou', 'kan', 'lijkt', 'lijken',
    'voorlopig', 'tijdelijk', 'even',
    'doorgaans', 'typisch', 'af en toe', 'zelden', 'sporadisch', 'incidenteel', 'weleens',
    'deels', 'gedeeltelijk', 'bijna', 'onzeker', 'sommige', 'quasi',
    'een paar', 'enkele', 'ettelijke', 'weinige', 'enkele', 'een paar', 'enige',
    'bijvoorbeeld', 'onder meer', 'onder andere', 'bv.', 'gans', 'een deel', 'vermoedelijk',
]

certain_words = [
    'altijd', 'nooit', 'volledig', 'helemaal', 'zeker', 'absoluut', 'alleszins', 'beslist',
    'gegarandeerd', 'gewis', 'natuurlijk', 'ongetwijfeld', 'ontwijfelbaar', 'overtuigd',
    'stellig', 'verzekerd', 'wis', 'zekerlijk', 'meeste', 'alle', 'geen enkele', 'niemand', 'iedereen',
    'niets', 'alles', 'gans', 'duidelijk', 'afgetekend',
    'apert', 'evident', 'flagrant', 'helder', 'klaarblijkend', 'klaarblijkelijk', 'merkelijk',
    'onbetwistbaar', 'ondubbelzinnig', 'onloochenbaar', 'onmiskenbaar', 'onomstotelijk', 'ontegensprekelijk',
    'ontegenzeggelijk', 'ontegenzeglijk', 'onweerlegbaar', 'overduidelijk', 'zonneklaar',
]

negation_words = ['geen', 'niet']

# "zo'n" is written with double quotes: the former 'zo''n' was two adjacent
# string literals, which Python concatenates to 'zon'.
temporary_words = ['minstens', 'bijna', 'zeker', 'circa', 'ongeveer', "zo'n", 'tal', 'tallen']

In [31]:
# Import Dutch names for colors.
# `colors` maps the lower-cased first CSV column to the lower-cased second
# column and vice versa, so either name can be used to look up the other.

colors = {}

# `with` closes the file handle once all rows have been read.
with open('..\\Data\\data_processing_knowledge_input\\colors.csv', newline='', encoding='utf-8') as color_file:
    reader = csv.reader(color_file, delimiter=',', quotechar='"')

    for line in reader:
        colors[line[0].lower()] = line[1].lower()
        colors[line[1].lower()] = line[0].lower()

In [32]:
# Import Dutch names for days of the week.
# `days` is the set of lower-cased values from the first CSV column.

days_file = open('..\\Data\\data_processing_knowledge_input\\days_of_the_week.csv', newline='', encoding='utf-8')
reader = csv.reader(days_file, delimiter=',', quotechar='"')

days = {row[0].lower() for row in reader}

In [33]:
# Import Dutch names for currencies.
# `currencies` is the set of lower-cased values from the first CSV column.

currencies = set()

# The reader must be built on `currencies_file`; reading from the file handle
# of another cell leaves this set empty.
# `with` closes the file handle once all rows have been read.
with open('..\\Data\\data_processing_knowledge_input\\currencies.csv', newline='', encoding='utf-8') as currencies_file:
    reader = csv.reader(currencies_file, delimiter=',', quotechar='"')

    for line in reader:
        currencies.add(line[0].lower())

In [34]:
# Import Dutch names for months.
# `months` is the set of lower-cased values from the first CSV column.

months = set()

# `with` closes the file handle once all rows have been read.
with open('..\\Data\\data_processing_knowledge_input\\months.csv', newline='', encoding='utf-8') as months_file:
    reader = csv.reader(months_file, delimiter=',', quotechar='"')

    for line in reader:
        months.add(line[0].lower())

In [35]:
# Import Dutch names for wind directions.
# `wind_directions` is the set of lower-cased values from the first CSV column.

wind_directions = set()

# `with` closes the file handle once all rows have been read.
with open('..\\Data\\data_processing_knowledge_input\\wind_directions.csv', newline='', encoding='utf-8') as wind_directions_file:
    reader = csv.reader(wind_directions_file, delimiter=',', quotechar='"')

    for line in reader:
        wind_directions.add(line[0].lower())

In [36]:
# Import Dutch names for states and provinces.
# Every row is read as five names; `states_and_provinces` maps each of the
# five to the set of the values in the other four columns of the same row.
# NOTE(review): names are stored as written in the CSV (not lower-cased) —
# confirm this matches how the mapping is queried.

states_and_provinces = {}

# `with` closes the file handle once all rows have been read.
with open('..\\Data\\data_processing_knowledge_input\\states_and_provinces.csv', newline='', encoding='utf-8') as states_and_provinces_file:
    reader = csv.reader(states_and_provinces_file, delimiter=',', quotechar='"')

    for line in reader:
        for i in range(0, 5):
            # setdefault creates the set on first sight of a name and reuses it afterwards
            related_names = states_and_provinces.setdefault(line[i], set())

            for j in range(0, 5):
                if i != j:
                    related_names.add(line[j])

In [37]:
# Import Dutch names for countries.
# `countries` is the set of values from the first CSV column, as written.
# NOTE(review): the file is opened without an explicit encoding (platform
# default) — confirm the encoding of countries.csv.

countries = set()

# `with` closes the file handle once all rows have been read.
with open('..\\Data\\input\\countries.csv', newline='') as countries_file:
    reader = csv.reader(countries_file, delimiter=',', quotechar='"')

    for line in reader:
        countries.add(line[0])

In [38]:
# Import Dutch names for world cities.
# `world_cities` is a list of the distinct values of the first CSV column, in
# order of first appearance.
# NOTE(review): the file is opened without an explicit encoding (platform
# default) — confirm the encoding of worldcities.csv.

# `with` closes the file handle once all rows have been read.
with open('..\\Data\\input\\worldcities.csv', newline='') as world_cities_file:
    reader = csv.reader(world_cities_file, delimiter=',', quotechar='"')

    # dict.fromkeys keeps the first occurrence of every name in file order and
    # avoids scanning the whole list again for every row.
    world_cities = list(dict.fromkeys(line[0] for line in reader))

In [39]:
# Import Dutch names for Belgian communities.
# `belgian_communities` maps the lower-cased second CSV column to the set of
# first-column values that appear on rows with that second column.
#
# The key is lower-cased both when testing for and when storing an entry.
# Testing the lower-cased name but storing the original spelling resets the
# set of a capitalised name on every row (or raises KeyError on mixed case).

belgian_communities = {}

# `with` closes the file handle once all rows have been read.
with open('..\\Data\\input\\belgian_communities.csv', newline='', encoding='utf-8') as belgian_communities_file:
    reader = csv.reader(belgian_communities_file, delimiter=',', quotechar='"')

    for line in reader:
        belgian_communities.setdefault(line[1].lower(), set()).add(line[0])

In [40]:
# Import Dutch names for nationalities.
# Every row is read as five names; `nationalities` maps each of the five to
# the set of the values in the other four columns of the same row.
# NOTE(review): the file is opened without an explicit encoding (platform
# default) — confirm the encoding of nationalities.csv.

nationalities = {}

# `with` closes the file handle once all rows have been read.
with open('..\\Data\\input\\nationalities.csv', newline='') as nationalities_file:
    reader = csv.reader(nationalities_file, delimiter=',', quotechar='"')

    for line in reader:
        for i in range(0, 5):
            # setdefault creates the set on first sight of a name and reuses it afterwards
            related_names = nationalities.setdefault(line[i], set())

            for j in range(0, 5):
                if i != j:
                    related_names.add(line[j])

In [41]:
# Helper that scans a tokenized text for known entities (colors, days, ...).
# It can be used to calculate features about entities present in atomic changes.

def entity_spotting(doc):
    """Collect the known entities found in the tokens of `doc`.

    Every token's lower-cased text is checked against the module-level lookup
    tables. For colors, days, months, wind directions and states an exact match
    is tried first; when that fails, every table entry that occurs as a
    substring of the token is reported instead. Currencies, countries, world
    cities, Belgian communities and nationalities are matched exactly only.

    Besides the matched name itself, the following related names are added:
    the mapped counterpart for colors, the related names of a state for
    substring matches only, and the related names for Belgian communities and
    nationalities.

    Parameters
    ----------
    doc : iterable of tokens exposing a ``text`` attribute.

    Returns
    -------
    tuple of ten lists, in this order: colors, days, currencies, months,
    wind directions, states, countries, world cities, Belgian communities,
    nationalities.
    """
    col, day, curr, mon, winds = [], [], [], [], []
    states, countr, cit, belgian, nationality = [], [], [], [], []

    for token in doc:
        word = token.text.lower()

        # colors: the name plus its mapped counterpart
        if word in colors:
            color_hits = [word]
        else:
            color_hits = [name for name in colors if name in word]

        for name in color_hits:
            col.extend((name, colors[name]))

        # days
        if word in days:
            day.append(word)
        else:
            day.extend([name for name in days if name in word])

        # currencies
        if word in currencies:
            curr.append(word)

        # months
        if word in months:
            mon.append(word)
        else:
            mon.extend([name for name in months if name in word])

        # wind directions
        if word in wind_directions:
            winds.append(word)
        else:
            winds.extend([name for name in wind_directions if name in word])

        # states: related names are only added for substring matches
        if word in states_and_provinces:
            states.append(word)
        else:
            for name in [key for key in states_and_provinces if key in word]:
                states.append(name)
                states.extend(states_and_provinces[name])

        # countries
        if word in countries:
            countr.append(word)

        # world cities
        if word in world_cities:
            cit.append(word)

        # belgian communities
        if word in belgian_communities:
            belgian.append(word)
            belgian.extend(belgian_communities[word])

        # nationalities
        if word in nationalities:
            nationality.append(word)
            nationality.extend(nationalities[word])

    return col, day, curr, mon, winds, states, countr, cit, belgian, nationality

In [42]:
# Helper that compares two arrays (typically entities of the same category, e.g.
# colors, one array for the original article version and one for the new one)
# and encodes to what extent entities were added to / removed from the atomic change.

# The meaning of the resulting values is described in the input data model documentation.

def array_comparison(array1, array2, result):
    """Append a comparison code for `array1` versus `array2` to `result`.

    Codes, checked in this order:
      0 - both arrays are empty
      1 - the arrays are equal
      2 - every element of `array1` also occurs in `array2`
      3 - every element of `array2` also occurs in `array1`
      4 - neither of the above and both arrays hold exactly one element
      5 - anything else

    `result` is modified in place and also returned.
    """
    if not len(array1) and not len(array2):
        code = 0
    elif array1 == array2:
        code = 1
    elif set(array1) <= set(array2):
        code = 2
    elif set(array2) <= set(array1):
        code = 3
    elif len(array1) == 1 and len(array2) == 1:
        code = 4
    else:
        code = 5

    result.append(code)
    return result

In [43]:
#function calculating all features related to the spaCy model, as described in the input data model documentation

def use_spacy(true, original_full, original_diff, new_full, new_diff, original_textpart, new_textpart,  \
                                                        original_changed_sentences, new_changed_sentences):

    numbers_mapping = {'anderhalf': '1.5', 'anderhalve': '1.5', 'kwart': '0.25', 'drie kwart': '0.75', 'driekwart': '0.75',
                       'de helft': '0.5', '1/2': '0.5', '1/2e': '0.5',
                       'een derde': '0.3333333', 'één derde': '0.3333333', '1/3': '0.3333333', '1/3e': '0.3333333',
                       'een vierde': '0.25', 'één vierde': '0.25', '1/4': '0.25', '1/4e': '0.25',
                       'een vijfde': '0.2', 'één vijfde': '0.2', '1/5': '0.2', '1/5e': '0.2', 
                       'een zesde': '0.16666667', 'één zesde': '0.16666667', '1/6': '0.16666667', '1/6e': '0.16666667',
                       'een zevende': '0.14286', 'één zevende': '0.14286', '1/7': '0.14286', '1/7e': '0.14286',
                       'een achtste': '0.125', 'één achtste': '0.125', '1/8': '0.125', '1/8e': '0.125',
                       'een negende': '0.1111111111', 'één negende': '0.1111111111', '1/9': '0.1111111111', '1/9e': '0.1111111111',
                       'een tiende': '0.1', 'één tiende': '0.1', '1/10': '0.1', '1/10e': '0.1',
                       'een zestiende': '0.0625', 'één zestiende': '0.0625','1/16': '0.0625', '1/16e': '0.0625',
                       'nul': '0', 'nulde': '0', '0de': '0',
                       'één': '1', 'een': '1', 'eén': '1', 'eerste': '1', '1ste': '1', '1e': '1',
                       'twee': '2','tweede': '2', '2de': '2', '2e': '2', 'tweeën': '2',
                       'drie': '3', 'derde': '3', '3de': '3', '3e': '3', 'drieën': '3',
                       'vier': '4', 'vierde': '4', '4de': '4', '4e': '4', 'vieren': '4',
                       'vijf': '5', 'vijfde': '5', '5de': '5', '5e': '5', 'vijven': '5',
                       'zes': '6', 'zesde': '6', '6de': '6', '6e': '6', 'zessen': '6',
                      'zeven': '7', 'zevende': '7', '7de': '7', '7e': '7', 'zevenen': '7',
                       'acht': '8', 'achtste': '8', '8ste': '8', '8e': '8', 'achten': '8',
                       'negen': '9', 'negende': '9', '9de': '9', '9e': '9', 'negenen': '9',
                       'tien': '10', 'tiende': '10', '10de': '10', '10e': '10', 'tienen': '10',
                       'elf': '11', 'elfde': '11', '11de': '11', '11e': '11', 'elven': '11',
                       'twaalf': '12', 'twaalfde': '12', '12de': '12', '12e': '12', 'twaalven': '12',
                       'dertien': '13', 'dertiende': '13', '13de': '13', '13e': '13', 'dertienen': '13',
                      'veertien': '14', 'veertiende': '14', '14de': '14', '14e': '14', 'veertienen': '14',
                       'vijftien': '15', 'vijftiende': '15', '15de': '15', '15e': '15', 'vijftienen': '15',
                       'zestien': '16', 'zestiende': '16', '16de': '16', '16e': '16', 'zestienen': '16',
                       'zeventien': '17', 'zeventiende': '17', '17de': '17', '17e': '17', 'zeventienen': '17',
                       'achttien': '18', 'achttiende': '18', '18de': '18', '18e': '18', 'achttienen': '18',
                      'negentien': '19', 'negentiende': '19', '19de': '19', '19e': '19', 'negentienen': '19',
                       'twintig': '20', 'twintigste': '20', '20ste': '20', '20e': '20',
                       'dertig': '30', 'dertigste': '30', '30ste': '30', '30e': '30',
                       'veertig': '40', 'veertigste': '40', '40ste': '40', '40e': '40',
                       'vijftig': '50', 'vijftigste': '50', '50ste': '50', '50e': '50',
                       'zestig': '60', 'zestigste': '60', '60ste': '60', '60e': '60',
                       'zeventig': '70','zeventigste': '70', '70ste': '70', '70e': '70',
                       'tachtig': '80', 'tachtigste': '80', '80ste': '80', '80e': '80',
                       'negentig': '90', 'negentigste': '90', '90ste': '90', '90e': '90',
                       'honderd': '100', 'honderdste': '100', '100ste': '100', '100e': '100',
                       'tweehonderd': '200', 'tweehonderdste': '200', '20ste': '200', '200e': '200',
                       'driehonderd': '300', 'driehonderdste': '300', '300ste': '300', '300e': '300',
                       'vierhonderd': '400', 'vierhonderdste': '400', '400ste': '400', '400e': '400',
                      'vijfhonderd': '500', 'vijfhonderdste': '500', '500ste': '500', '500e': '500',
                       'zeshonderd': '600', 'zeshonderdste': '600', '600ste': '600', '600e': '600',
                       'zevenhonderd': '700', 'zevenhonderdste': '700', '700ste': '700', '700e': '700',
                       'achthonderd': '800', 'achthonderdste': '800', '800ste': '800', '800e': '800',
                       'negenhonderd': '900', 'negenhonderdste': '900', '900ste': '900', '900e': '900',
                       'duizend': '1000', 'duizendste': '1000', '1000ste': '1000', '1000e': '1000', 
                       'tweeduizend': '2000', 'tweeduizendste': '2000', '2000ste': '2000', '2000e': '2000', 
                       'drieduizend': '3000', 'drieduizendste': '3000', '3000ste': '3000', '3000e': '3000', 
                       'vierduizend': '4000', 'vierduizendste': '4000', '4000ste': '4000', '4000e': '4000', 
                       'vijfduizend': '5000', 'vijfduizendste': '5000', '5000ste': '5000', '5000e': '5000', 
                       'zesduizend': '6000', 'zesduizendste': '6000', '6000ste':  '6000', '6000e': '6000', 
                       'zevenduizend': '7000', 'zevenduizendste': '7000', '7000ste': '7000', '7000e': '7000', 
                       'achtduizend': '8000', 'achtduizendste': '8000', '8000ste': '8000', '8000e': '8000', 
                       'negenduizend': '9000', 'negenduizendste': '9000', '9000ste': '9000', '9000e': '9000', 
                       'tienduizend': '10000', 'tienduizendste': '100000', '10000ste': '10000', '100000e': '10000',
                       'honderdduizend': '100000', 'honderdduizendste': '100000', '100000ste': '100000', '100000e': '100000',
                       'tweehonderdduizend': '200000', 'tweehonderdduizendste': '200000', '200000ste': '200000', '200000e': '200000',
                       'driehonderdduizend': '300000', 'driehonderdduizendste': '300000', '300000ste': '300000', '300000e': '300000',
                       'vierhonderdduizend': '400000', 'vierhonderdduizendste': '400000', '400000ste': '400000', '400000e': '400000',
                       'vijfhonderdduizend': '500000', 'vijfhonderdduizendste': '400000', '500000ste': '500000', '500000e': '500000',
                       'zeshonderdduizend': '600000', 'zeshonderdduizendste': '600000', '600000ste': '600000', '600000e': '600000',
                       'zevenhonderdduizend': '700000', 'zevenhonderdduizendste': '700000', '700000ste': '700000', '700000e': '700000',
                       'achthonderdduizend': '800000', 'achthonderdduizendste': '800000', '800000ste': '800000', '800000e': '800000',
                       'negenhonderdduizend': '900000', 'negenhonderdduizendste': '900000', '900000ste': '900000', '900000e': '900000',
                       'één miljoen': '1000000', 'een miljoen': '1000000', '1 miljoen': '1000000',
                       'twee miljoen': '1000000', '2 miljoen': '1000000',
                       'drie miljoen': '1000000', '3 miljoen': '1000000',
                       'vier miljoen': '1000000', '4 miljoen': '1000000',
                       'vijf miljoen': '1000000', '5 miljoen': '1000000',
                       'zes miljoen': '1000000', '6 miljoen': '1000000',
                       'zeven miljoen': '1000000', '7 miljoen': '1000000',
                       'acht miljoen': '1000000', '8 miljoen': '1000000',
                       'negen miljoen': '1000000', '9 miljoen': '1000000',
                       'tien miljoen': '1000000', '10 miljoen': '1000000',
                        'één miljard': '1000000', 'een miljard': '1000000', '1 miljard': '1000000',
                       'twee miljard': '1000000', '2 miljard': '1000000',
                       'drie miljard': '1000000', '3 miljard': '1000000',
                       'vier miljard': '1000000', '4 miljard': '1000000',
                       'vijf miljard': '1000000', '5 miljard': '1000000',
                       'zes miljard': '1000000', '6 miljard': '1000000',
                       'zeven miljard': '1000000', '7 miljard': '1000000',
                       'acht miljard': '1000000', '8 miljard': '1000000',
                       'negen miljard': '1000000', '9 miljard': '1000000',
                       'tien miljard': '1000000', '10 miljard': '1000000',
                      }
    
    punct_mapping = {"'": '"', '“': '"', '”': '"', '`': '"', '‘': '"', '’': '"'}
    
    pos_tags = ['ADV', 'NOUN', '.', ',', '"', '(', ':', '-', '...', 'PUNCT', 'X', 'PROPN', 'PRON', 
                'DET', 'SCONJ', 'SPACE', 'SYM', 'NUM', 'ADP', 'INTJ', 'AUX', 'inf', 'pv_verl_ev', 'pv_verl_mv', 'pv_tgw_ev',
                'pv_tgw_mv',
                'od_prenom', 'od_nom', 'od_postnom', 'od_vrij', 'vd_vrij', 'vd_prenom', 'vd_postnom', 'vd_nom', 'VERB', 'CCONJ',
                'adj_sup', 'adj_comp', 'adj_basis']

    result = []
    
    doc_original = nlp(original_full.strip())
    doc_new = nlp(new_full.strip())
    
    
    doc_diff_original = nlp(original_diff.strip())
    doc_diff_new = nlp(new_diff.strip())
    
    
    total_words = len(doc_diff_original) + len(doc_diff_new)
    total_stop_words = 0
    
    for token in doc_diff_original:
        if token.text.lower() in stop_words:
            total_stop_words = total_stop_words + 1
            
    for token in doc_diff_new:
        if token.text.lower() in stop_words:
            total_stop_words = total_stop_words + 1
            
    if total_words > 0:
        result.append(float(total_stop_words)/total_words)
    else:
        result.append(0)

    
    ent_original = 0
    ent_new = 0
    
    startindex = 0
    
    numbers_original = set()
    numbers_new = set()
    
    person_original = set()
    person_new = set()
    
    date_original = set()
    date_new = set()
    
    #ORIGINAL ENTS
    
    temp_original_diff = original_diff
    temp_new_diff = new_diff
    
    other_ent_occurences_in_new = True
    
    nrs = re.findall(r'\d+', original_diff)
    for nr in nrs:
        numbers_original.add(nr)
    

    for ent in doc_original.ents:
        ents_tokenized = word_tokenize(ent.text)
        
        for ent_tok in ents_tokenized:
            index =temp_original_diff.find(ent_tok)
        
            if index >= 0:
                if ent.text.lower() in numbers_mapping:
                    numbers_original.add(numbers_mapping[ent.text.lower()])
                else:
                    if ent_tok.lower() in numbers_mapping:
                        numbers_original.add(numbers_mapping[ent_tok.lower()])
                    else:
                        if (ent.label_ == 'CARDINAL') or (ent.label_ == 'ORDINAL') \
                        or (ent.label_ == 'PERCENT') or (ent.label_ == 'TIME'):
                            
                            if any(map(str.isdigit, ent_tok.lower())):
                                numbers_original.add(ent_tok.lower())
                        else:

                            if (ent.label_ == 'DATE') and ent_tok.isnumeric():
                                date_original.add(ent_tok.lower())

                            else:
                                if (ent.label_ == 'PERSON'):
                                    person_original.add(ent_tok.lower())

                                else:
                                    ent_original = ent_original + 1 
                
                startindex = startindex + len(ent_tok)
                temp_original_diff = temp_original_diff[startindex:]
                    
            #look if entity is present in same text part at different position
            if not(ent_tok) in new_textpart:
                other_ent_occurences_in_new = False
            
    if len(doc_diff_original) == 0:
        result.append(0)
        
    else:
        result.append(float(ent_original)/len(doc_diff_original))
    
    result.append(len(doc_diff_original))
    
    #ORIGINAL TOKENS
    original_pos_counts = {}
    original_diff_lookup = [token.text for token in doc_diff_original]
    
    original_spelling_ok = 1
    
    original_double_word = False
    
    
    temp_original_diff = original_diff
    temp_new_diff = new_diff

    for (i, token) in enumerate(doc_original):
        index = temp_original_diff.find(token.text)
        
        if index >= 0:
            temp_original_diff = temp_original_diff[len(token.text):]

    
        if (token.pos_ != 'PUNCT') and (index >= 0) and ((new_full.find(token.text) >= 0) or len(new_full) == 0) and (temp_original_diff.find(token.text) < 0) and (((i > 0) and token.text == doc_original[i - 1].text) or ((i < len(doc_original) - 1) and (token.text == doc_original[i + 1].text))):
            original_double_word = True
        
        if (len(original_diff_lookup) > 0) and (token.text == original_diff_lookup[0]):
            
            pos = token.pos_
            
            if token.pos_ == 'PUNCT':
                if token.text == '.' or  token.text == '!' or token.text == '?':
                    pos = '.'
                
                else: 
                    if token.text in punct_mapping or token.text == '"':
                        pos = '"'
                    
                    else:
                        if token.text == ',':
                            pos = ','
                            
                        else:
                            if '(' in token.text or ')' in token.text:
                                pos = '('
                                
                            else:
                                if ':' in token.text:
                                    pos = ':'
                                    
                                else:
                                    if token.text == '-' or token.text == ';' or token.text == '–':
                                        pos = '-'
                                        
                                    else:
                                        
                                        if token.text == '..' or token.text == '...' or token.text == '....':
                                            pos = '...'
                                            
            if token.pos_ == 'ADJ':
                if ('sup' in token.tag_):
                    pos = 'adj_sup'
                    
                else:
                    if ('comp' in token.tag_):
                        pos = 'adj_comp'
                    
                    else:
                        pos = 'adj_basis'
                
            if token.pos_ == 'VERB'  or token.pos == 'AUX':
                if 'inf|' in token.tag_:
                    pos = 'inf'
                
                else:
                    if 'pv|' in token.tag_:
                        if '|tgw|' in token.tag_:
                            if ('|ev' in token.tag_) or ('|met-t' in token.tag_):
                                pos = 'pv_tgw_ev'
                            
                            else:
                                pos = 'pv_tgw_mv'
                        
                        if '|verl|' in token.tag_:
                            if ('|ev' in token.tag_) or ('|met-t' in token.tag_):
                                pos = 'pv_verl_ev'
                            
                            else:
                                pos = 'pv_verl_mv'
                    
                    else:
                        if 'od|' in token.tag_:
                            if '|prenom|' in token.tag_:
                                pos = 'od_prenom'
                                
                            if '|postnom|' in token.tag_:
                                pos = 'od_postnom'
                                
                            if '|nom|' in token.tag_:
                                pos = 'od_nom'
                                
                            if '|vrij|' in token.tag_:
                                pos = 'od_vrij'
                                
                        else:
                            if 'vd|' in token.tag_:
                                if '|prenom|' in token.tag_:
                                    pos = 'od_prenom'

                                if '|postnom|' in token.tag_:
                                    pos = 'od_postnom'

                                if '|nom|' in token.tag_:
                                    pos = 'od_nom'

                                if '|vrij|' in token.tag_:
                                    pos = 'od_vrij'
        

            if pos in original_pos_counts:
                original_pos_counts[pos] = original_pos_counts[pos] + 1
            else:
                original_pos_counts[pos] = 1
                
                
            original_diff_lookup = original_diff_lookup[1:]
            
    original_pos_counts['SPACE'] = original_diff.count(' ')
            
    original_token_sum = sum(original_pos_counts.values())
    
    if original_token_sum  - original_pos_counts['SPACE'] == 1:
        token = doc_diff_original[0]
        if not(token.text.lower() in dutch_words) and token.pos_ != 'PUNCT' and token.pos_ != 'SYM' and token.pos_ != 'NUM':
                original_spelling_ok = 0
    
    if original_token_sum == 0:
        
        for i in range(0, 39):
            result.append(0)
        
    else:
        for pos_tag in pos_tags:
            if pos_tag in original_pos_counts:
                result.append(float(original_pos_counts[pos_tag])/original_token_sum)
                
            else:
                result.append(0)          
    
    #NEW ENTS
    startindex = 0
    
    temp_original_diff = original_diff
    temp_new_diff = new_diff
    
    other_ent_occurences_in_original = True
    
    nrs = re.findall(r'\d+', new_diff)
    for nr in nrs:
        numbers_new.add(nr)
    
    for ent in doc_new.ents:
        ents_tokenized = word_tokenize(ent.text)
        
        for ent_tok in ents_tokenized:
            index =temp_new_diff.find(ent_tok)
        
            if index >= 0:
                if ent.text.lower() in numbers_mapping:
                    numbers_new.add(numbers_mapping[ent.text.lower()])
                else:
                    if ent_tok.lower() in numbers_mapping:
                        numbers_new.add(numbers_mapping[ent_tok.lower()])
                    else:
                        if (ent.label_ == 'CARDINAL') or (ent.label_ == 'ORDINAL') \
                        or (ent.label_ == 'PERCENT') or (ent.label_ == 'TIME'):
                           
                            if any(map(str.isdigit, ent_tok.lower())):
                                numbers_new.add(ent_tok.lower())
                        else:

                            if (ent.label_ == 'DATE') and ent_tok.isnumeric():
                                date_new.add(ent_tok.lower())

                            else:
                                if (ent.label_ == 'PERSON'):
                                    person_new.add(ent_tok.lower())

                                else:
                                    ent_new = ent_new + 1 
                
                startindex = startindex + len(ent_tok)
                temp_new_diff = temp_new_diff[startindex:]
                
            #look if entity is present in same text part at different position
            if not(ent_tok) in original_textpart:
                other_ent_occurences_in_original = False
    
    if len(doc_diff_new) == 0:
        result.append(0)
        
    else:
        result.append(float(ent_new)/len(doc_diff_new))
        
    result.append(len(doc_diff_new))
    
    #NEW TOKENS
    new_pos_counts = {}
    new_diff_lookup = [token.text for token in doc_diff_new]
    
    new_spelling_ok = 1
    
    new_double_word = False
    
    temp_original_diff = original_diff
    temp_new_diff = new_diff
    
    for i, token in enumerate(doc_new):
        
        index = new_diff.find(token.text)
        
        if index >= 0:
            temp_new_diff = temp_new_diff[len(token.text):]
        
        if (token.pos_ != 'PUNCT') and (index >= 0) and ((original_full.find(token.text) >= 0) or len(original_full) == 0) and (temp_new_diff.find(token.text) < 0) and (((i > 0) and token.text == doc_new[i - 1].text) or ((i < len(doc_new) - 1) and (token.text == doc_new[i + 1].text))):
            new_double_word = True

        if (len(new_diff_lookup) > 0) and (token.text == new_diff_lookup[0]):
            
            pos = token.pos_
            
            if token.pos_ == 'PUNCT':
                if token.text == '.' or  token.text == '!' or token.text == '?':
                    pos = '.'
                
                else: 
                    if token.text in punct_mapping or token.text == '"':
                        pos = '"'
                    
                    else:
                        if token.text == ',':
                            pos = ','
                            
                        else:
                            if '(' in token.text or ')' in token.text:
                                pos = '('
                                
                            else:
                                if ':' in token.text:
                                    pos = ':'
                                    
                                else:
                                    if token.text == '-' or token.text == ';' or token.text == '–':
                                        pos = '-'
                                        
                                    else:
                                        
                                        if token.text == '..' or token.text == '...' or token.text == '....':
                                            pos = '...'
                                            
            if token.pos_ == 'ADJ':
                if ('sup' in token.tag_):
                    pos = 'adj_sup'
                    
                else:
                    if ('comp' in token.tag_):
                        pos = 'adj_comp'
                    
                    else:
                        pos = 'adj_basis'
                        
            if token.pos_ == 'VERB' or token.pos == 'AUX':
                if 'inf|' in token.tag_:
                    pos = 'inf'
                
                else:
                    if 'pv|' in token.tag_:
                        if '|tgw|' in token.tag_:
                            if ('|ev' in token.tag_) or ('|met-t' in token.tag_):
                                pos = 'pv_tgw_ev'
                            
                            else:
                                pos = 'pv_tgw_mv'
                        
                        if '|verl|' in token.tag_:
                            if ('|ev' in token.tag_) or ('|met-t' in token.tag_):
                                pos = 'pv_verl_ev'
                            
                            else:
                                pos = 'pv_verl_mv'
                    
                    else:
                        if 'od|' in token.tag_:
                            if '|prenom|' in token.tag_:
                                pos = 'od_prenom'
                                
                            if '|postnom|' in token.tag_:
                                pos = 'od_postnom'
                                
                            if '|nom|' in token.tag_:
                                pos = 'od_nom'
                                
                            if '|vrij|' in token.tag_:
                                pos = 'od_vrij'
                                
                        else:
                            if 'vd|' in token.tag_:
                                if '|prenom|' in token.tag_:
                                    pos = 'od_prenom'

                                if '|postnom|' in token.tag_:
                                    pos = 'od_postnom'

                                if '|nom|' in token.tag_:
                                    pos = 'od_nom'

                                if '|vrij|' in token.tag_:
                                    pos = 'od_vrij'
            
            if pos in new_pos_counts:
                new_pos_counts[pos] = new_pos_counts[pos] + 1
            else:
                new_pos_counts[pos] = 1
            
            new_diff_lookup = new_diff_lookup[1:]
            
            
    new_pos_counts['SPACE'] = new_diff.count(' ')
    new_token_sum = sum(new_pos_counts.values())
    
    if new_token_sum - new_pos_counts['SPACE'] == 1:
        token = doc_diff_new[0]
        if not(token.text.lower() in dutch_words) and token.pos_ != 'PUNCT' and token.pos_ != 'SYM' and token.pos_ != 'NUM':
                new_spelling_ok = 0
    
    else:
        original_spelling_ok = 1
    
    if new_token_sum == 0:
        
        for i in range(0, 39):
            result.append(0)
        
    else:
        for pos_tag in pos_tags:
            if pos_tag in new_pos_counts:
                result.append(float(new_pos_counts[pos_tag])/new_token_sum)

            else:
                result.append(0)
    
    if original_double_word and not(new_double_word):
        result.append(1)
    else:
        result.append(0)
    
    #FINAL COMPARISON
    
    temp_original_diff = original_diff
    temp_new_diff = new_diff
    
    temp_original_full = original_full
    temp_new_full = new_full
    
    for token in doc_diff_original:
        if token.text.lower() in numbers_mapping:
            
            while token.text in temp_original_diff:
                temp_original_diff = temp_original_diff.replace(token.text, numbers_mapping[token.text.lower()])
                temp_original_full = temp_original_full.replace(token.text, numbers_mapping[token.text.lower()])
                
        if token.text in punct_mapping:
            
            while token.text in temp_original_diff:
                temp_original_diff = temp_original_diff.replace(token.text, punct_mapping[token.text])
                temp_original_full = temp_original_full.replace(token.text, punct_mapping[token.text])
                
    for token in doc_diff_new:
        if token.text.lower() in numbers_mapping:
            
            while token.text in temp_new_diff:
                temp_new_diff = temp_new_diff.replace(token.text, numbers_mapping[token.text.lower()])
                temp_new_full = temp_new_full.replace(token.text, numbers_mapping[token.text.lower()])
                
        if token.text in punct_mapping:
            
            while token.text in temp_new_diff:
                temp_new_diff = temp_new_diff.replace(token.text, punct_mapping[token.text])
                temp_new_full = temp_new_full.replace(token.text, punct_mapping[token.text])
    
    if temp_original_diff == temp_new_diff:
        result.append(1)
        
        doc_temp_original_full = [token.text for token in nlp(temp_original_full)]
        doc_temp_new_full = [token.text for token in nlp(temp_new_full)]
        
        bool2 = True
        
        for token in nlp(temp_original_diff):
            try:
                index_original = doc_temp_original_full.index(token.text)
            except:
                index_original = 0
            
            try:
                index_new = doc_temp_new_full.index(token.text)
            except:
                index_new = 0
            
            if ((index_original > 0) and (index_new > 0) and (doc_temp_original_full[index_original - 1]  != doc_temp_new_full[index_new - 1])) \
            or ((index_original < len(doc_temp_original_full)-1) and (index_new < len(doc_temp_new_full)-1) and (doc_temp_original_full[index_original + 1]  != doc_temp_new_full[index_new + 1])) \
            or ((index_original == 0) and (index_new != 0)) or ((index_original == len(doc_temp_original_full) -1) and (index_new != len(doc_temp_new_full) - 1)) \
            or ((index_original != 0) and (index_new == 0)) or ((index_original != len(doc_temp_original_full) -1) and (index_new == len(doc_temp_new_full) - 1)):
                bool2 = False
                break
        if bool2 == True:
            result.append(1)    
        else:
            result.append(0)
        
    else:
        result.append(0)
        result.append(0)
        
        
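    #determine the POS tag of the word before and after the diff
    #(only filled in when the diff is a single inserted or deleted comma, otherwise both stay 'other')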
    original_tokens = [token.text for token in doc_diff_original]
    
    if (len(original_tokens) > 0):
        first_token_original = original_tokens[0]
        last_token_original = original_tokens[len(original_tokens) - 1]
    
    new_tokens = [token.text for token in doc_diff_new]
    if (len(new_tokens) > 0):
        first_token_new = new_tokens[0]
        last_token_new = new_tokens[len(new_tokens) - 1]
    
    first = 'other'
    last = 'other'
    
    if (original_diff == ',') and (new_diff == ''):
    
        if (len(original_tokens) > 0):
            for i, token in enumerate(doc_original):
                if (token.text == first_token_original) and first_token_original ==',' and (i > 0) and (first == 'other'):
                    first = doc_original[i - 1].pos_

                if (token.text == last_token_original) and last_token_original ==',' and (i < len(doc_original) - 1) and (last == 'other'):
                    last = doc_original[i + 1].pos_
    
    if (original_diff == '') and (new_diff == ','):
        if (len(new_tokens) > 0):
            for i, token in enumerate(doc_new):
                if (token.text == first_token_new) and first_token_new ==',' and (i > 0) and (first =='other'):
                    first = doc_new[i - 1].pos_

                if (token.text == last_token_new)  and last_token_new ==',' and (i < len(doc_new) - 1) and (last == 'other'):
                    last = doc_new[i + 1].pos_
        
    result.append(first)
    result.append(last)
    
    result.append(original_spelling_ok)
    result.append(new_spelling_ok)
    
    #one edit change
    if (new_token_sum - new_pos_counts['SPACE'] == 1) and (original_token_sum - original_pos_counts['SPACE'] == 1):
        if ((len(new_diff) == len(original_diff) - 1) and (new_diff in original_diff)) \
        or ((len(original_diff) == len(new_diff) - 1) and (original_diff in new_diff)) \
        or set(sorted(original_diff)) == set(sorted(new_diff)):
            result.append(1)
            
        else: result.append(0)
            
    else:
        result.append(0)
        
    #comparison of numbers
    succeeded_original = True
    succeeded_new = True
    original_number = None
    new_number = None
    
    if (new_token_sum - new_pos_counts['SPACE'] == 1) and (original_token_sum - original_pos_counts['SPACE'] == 1):
        try:
            original_number = float(original_diff.replace(' ', ''))
            
        except Exception as e:
            succeeded_original = False
            
        try:
            new_number = float(new_diff.replace(' ', ''))
            
        except Exception as e:
            succeeded_new = False
            
        if (succeeded_original == True) and (succeeded_new == True):
            result.append(compare_numbers(original_number, new_number))
                
        else:
            if (succeeded_original == False) and (succeeded_new == True):
                if original_diff.replace(' ', '').lower() in numbers_mapping:
                    result.append(compare_numbers(float(numbers_mapping[original_diff.replace(' ', '').lower()]), new_number))
                    
                else:
                    result.append(0)
                    
            else:
                if (succeeded_original == True) and (succeeded_new == False):
                    if new_diff.replace(' ', '').lower() in numbers_mapping:
                        already_done = True
                        result.append(compare_numbers(original_number, float(numbers_mapping[new_diff.replace(' ', '').lower()])))

                    else:
                        result.append(0)
                        
                else:
                    if (succeeded_original == False) and (succeeded_original == False):
                        if (original_diff.replace(' ', '').lower() in numbers_mapping) and (new_diff.replace(' ', '').lower() in numbers_mapping):
                            result.append(compare_numbers(float(numbers_mapping[original_diff.replace(' ', '').lower()]), float(numbers_mapping[new_diff.replace(' ', '').lower()])))                      
            
                        else: 
                            result.append(0)
    else:
        if (len(numbers_original) == 0) and (len(numbers_new) == 0):
            result.append(0)
            
        else:
            
            if (numbers_original == numbers_new) and (len(numbers_original) > 0):
                result.append(2)

            else:
                numbers_o = set()
                numbers_n = set()
                others = False

                for token in numbers_original:
                    token = token.replace(',', '.')
                    try:
                        token_o = float(token)

                    except:
                        if token.lower() in numbers_mapping:
                            token_o = float(numbers_mapping[token_o.lower()])
                        else:
                            token_o = None
                            others = True

                    if not(token_o is None):
                        numbers_o.add(token_o)

                for token in numbers_new:
                    token = token.replace(',', '.')
                    try:
                        token_n = float(token)

                    except:
                        if token.lower() in numbers_mapping:
                            token_n = float(numbers_mapping[token_n.lower()])
                        else:
                            token_n = None
                            others = True

                    if not(token_n is None):
                        numbers_n.add(token_n)

                if (numbers_o == numbers_n) and (len(numbers_original) > 0) and (others == False):
                    result.append(2)

                else:
                    if (others == True):
                        result.append(0)
                        
                    else:
                        result.append(4)
                            
    temporary_word_in_o = False
    temporary_word_in_n = False
    
    for word in temporary_words:
        if word in original_diff.lower():
            temporary_word_in_o = True
            
        if word in new_diff.lower():
            temporary_word_in_n = True
                    
    if (len(numbers_original) > 0) and (len(numbers_new) > 0):
        if temporary_word_in_o == True and temporary_word_in_n == True:
            result.append(1)
            
        if temporary_word_in_o == True and temporary_word_in_n == False:
            result.append(2)
            
        if temporary_word_in_o == False and temporary_word_in_n == True:
            result.append(3)
            
        if temporary_word_in_o == False and temporary_word_in_n == False:
            result.append(4)
            
    else:
        result.append(0)       
            
    
    result.append(nlp(original_full).similarity(nlp(new_full)))
    result.append(nlp(original_diff).similarity(nlp(new_diff)))
    
    doubt_words_o = 0
    doubt_words_total = 0
    
    for token in doubt_words:
        if token in original_diff.lower():
            doubt_words_o = doubt_words_o + 1
            doubt_words_total = doubt_words_total + 1
            
    for token in certain_words: 
        if token in original_diff.lower():
            doubt_words_o = doubt_words_o - 1
            doubt_words_total = doubt_words_total - 1
    
    if len(doc_diff_original) > 0:
        result.append(abs(float(doubt_words_o)/len(doc_diff_original)))
    else:
        result.append(0)
        
    doubt_words_n = 0
    for token in doubt_words:
        if token in new_diff.lower():
            doubt_words_n = doubt_words_n - 1
            doubt_words_total = doubt_words_total - 1
            
    for token in certain_words: 
        if token in new_diff.lower():
            doubt_words_n = doubt_words_n + 1
            doubt_words_total = doubt_words_total + 1
            
    if len(doc_diff_new) > 0:
        result.append(abs(float(doubt_words_n)/len(doc_diff_new)))
    else:
        result.append(0)
    
    #total doubt words
    result.append(abs(doubt_words_total))

    if other_ent_occurences_in_original == True:
        result.append(1)
        
    else:
        result.append(0)
        
    if other_ent_occurences_in_new == True:
        result.append(1)
        
    else:
        result.append(0)
        
    #look if word has changed its position or not    
    if (new_token_sum - new_pos_counts['SPACE'] == 1) and (original_token_sum - original_pos_counts['SPACE'] == 1):
        
        if original_changed_sentences[0][2] == new_changed_sentences[0][2]:
            result.append(2)
            
        else:
            if (original_full[max(original_changed_sentences[0][2] - 4, 0): original_changed_sentences[0][2]] == new_full[max(new_changed_sentences[0][2] - 4, 0): new_changed_sentences[0][2]]) \
            and (original_full[original_changed_sentences[0][3] + 1: min(original_changed_sentences[0][3] + 4, len(original_full))] == new_full[new_changed_sentences[0][3] + 1: min(new_changed_sentences[0][3] + 4, len(new_full))]):
                result.append(2)
                
            else:
                result.append(1)
                
    else:
        result.append(0)
        
    
    #detailed entity change analysis
    colors_o, days_o, curr_o, months_o, winds_o, states_o, countries_o, cities_o, belgian_o, nationality_o = entity_spotting(doc_diff_original)
    colors_n, days_n, curr_n, months_n, winds_n, states_n, countries_n, cities_n, belgian_n, nationality_n = entity_spotting(doc_diff_new)
    
    #0 means no color data, 1 means no change between original and new, 2 means deletion, 3 means insertion, 4 means change into other color
    #if either diff contains 10 or more tokens, 6 is appended for each of the 12 comparisons instead
    if (len(doc_diff_original) < 10) and (len(doc_diff_new) < 10): 
        result = array_comparison(colors_o, colors_n, result)
        result = array_comparison(days_o, days_n, result)
        result = array_comparison(curr_o, curr_n, result)
        result = array_comparison(months_o, months_n, result)
        result = array_comparison(winds_o, winds_n, result)
        result = array_comparison(states_o, states_n, result)
        result = array_comparison(countries_o, countries_n, result)
        result = array_comparison(cities_o, cities_n, result)
        result = array_comparison(belgian_o, belgian_n, result)
        result = array_comparison(nationality_o, nationality_n, result)   
        result = array_comparison(date_original, date_new, result)
        result = array_comparison(person_original, person_new, result)
        
    else:
        for i in range(0, 12):
            result.append(6)
    
    negation_words_o = 0
    for token in negation_words:
        if token in negation_words:
            negation_words_o = negation_words_o + 1

    if len(doc_diff_original) > 0:
        result.append(float(negation_words_o)/len(doc_diff_original))
    else:
        result.append(0)

    negation_words_n = 0
    for token in negation_words:
        if token in new_diff.lower():
            negation_words_n = negation_words_n + 1

    if len(doc_diff_new) > 0:
        result.append(float(negation_words_n)/len(doc_diff_new))
    else:
        result.append(0)

    return result

In [44]:
#help function for the lemmatization of text

def lemmatize_text(text):
    """Return `text` with each token replaced by its lemma.

    The text is parsed with the module-level spaCy pipeline `nlp`; the
    lemmas of all resulting tokens are joined with single spaces.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [45]:
#code block calculating all features as described in the input data model dataset documentation
#the code takes the raw data as an input and writes the manually constructed feature data to separate csv files

#with open('..\\Data\\raw_data_input_obj.csv', newline='', encoding='utf-8') as csvfile:
#with open('..\\Data\\raw_data_input_subj.csv', newline='', encoding='utf-8') as csvfile:
with open('..\\Data\\raw_data_input_ling.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    
    #new_file = open('..\\Data\\input_data_model_obj_manual.csv', 'w', newline='', encoding='utf-8')
    #new_file = open('..\\Data\\input_data_model_subj_manual.csv', 'w', newline='', encoding='utf-8')
    new_file = open('..\\Data\\input_data_model_ling_manual.csv', 'w', newline='', encoding='utf-8')
    
    writer = csv.writer(new_file, delimiter=',', quotechar='"')
    header = ['newspaper', 'topic', 'textpart', 'length_original', 'length_new', 'max_version_number', 'version_number_progress',
                     'first_time', 'original_time', 'dates_difference', 'time_difference', 'original_title_length', 'original_intro_length',
               'original_text_length', 'new_title_length', 'new_intro_length', 'new_text_length', 'new_time', 
              'fraction_original_title_changed', 'fraction_new_title_changed',
              'fraction_original_intro_changed', 'fraction_new_intro_changed',
              'fraction_original_text_changed', 'fraction_new_text_changed',
              'fraction_total_changed_original', 'fraction_total_changed_new',
              'fraction_total_changed_title_original', 'fraction_total_changed_title_new',
              'fraction_total_changed_intro_original', 'fraction_total_changed_intro_new',
              'fraction_total_changed_text_original', 'fraction_total_changed_text_new',
              'type',
                     'original_changed_fraction_text_part', 'new_changed_fraction_text_part', 
              'levenshtein_maximalized', 'nr_insert_max', 'nr_delete_max', 'nr_replace_max',
              'levenshtein_minimalized', 'nr_insert_min', 'nr_delete_min', 'nr_replace_min', 'capitalized_equality',
                     'jaccard', 'seqratio', 'text_overlap_original', 'text_overlap_new', 
              'original_lemmatized_minimized_changed_text', 'new_lemmatized_minimized_changed_text', 'stop_words_ratio',
              'ent_original', 'original_token_length', 'adv_orig', 'noun_orig', 'point_orig', 'comma_orig',
              'accent_orig', 'haakje_orig', 'doublepoint_orig', 'hyphen_orig', 'threepoints_orig', 'punct_orig', 'x_orig', 'propn_orig',
              'pron_orig', 'det_orig', 'sconj_orig', 'space_orig', 'sym_orig', 'num_orig', 'adp_orig',
              'intj_orig', 'aux_orig', 'inf_orig', 'pv_verl_ev_orig', 'pv_verl_mv_orig', 'pv_tgw_ev_orig', 'pv_tgw_mv_orig',
                'od_prenom_orig', 'od_nom_orig', 'od_postnom_orig', 'od_vrij_orig', 'vd_vrij_orig', 'vd_prenom_orig', 
              'vd_postnom_orig', 'vd_nom_orig', 'verb_orig', 'cconj_orig', 'adj_sup_orig', 'adj_comp_orig', 'adj_basis_orig',
              'ent_new', 'new_token_length', 'adv_new', 'noun_new', 'point_new', 'comma_new',
              'accent_new', 'haakje_new', 'doublepoint_new', 'hyphen_new', 'threepoints_new', 'punct_new', 'x_new', 'propn_new',
              'pron_new', 'det_new', 'sconj_new', 'space_new', 'sym_new', 'num_new', 'adp_new',
              'intj_new', 'aux_new', 'inf_new', 'pv_verl_ev_new', 'pv_verl_mv_new', 'pv_tgw_ev_new', 'pv_tgw_mv_new',
                'od_prenom_new', 'od_nom_new', 'od_postnom_new', 'od_vrij_new', 'vd_vrij_new', 'vd_prenom_new', 
              'vd_postnom_new', 'vd_nom_new', 'verb_new', 'cconj_new', 'adj_sup_new', 'adj_comp_new', 'adj_basis_new',
              'double_word', 'equal_after_subst', 'globally_equal_after_subst',
              'first_wordtype', 'last_wordtype', 'one_edit_change',
               'orginal_spelling_ok', 'new_spelling_ok', 'number_comparison', 'temporary', 'sentence_sim', 'diff_sim',
              'doubt_words_orig', 'doubt_words_new', 'doubt_words_total',
              'entity_present_in_original', 'entity_present_in_new',
              'changed_position',
              'colors', 'days', 'currencies', 'months', 'winds', 'states', 'countries', 'cities', 'belgian', 'nationality',
              'date_diff', 'person_diff',
              'negation_original', 'negation_new',
              'nr_red_parts', 'nr_green_parts', 'nr_full_sentences_original', 'nr_full_sentences_new',
              'orig_part_of_new', 'new_part_of_orig', 
              'original_minimized_changed_text', 'new_minimized_changed_text',
              'original_changed_text', 'new_changed_text'
             ]
        
    writer.writerow(header)
    
    categorizationid = -1
    original_article_text = []
    new_article_text = []
    original_changed_sentences = set()
    new_changed_sentences = set()
    new_row = []
    in_other_text = False
    
    
    #enumerate over all raw data records
    for i, row in enumerate(reader):
        if i != 0:
            original = row[6]
            
            #if categorizationid of last row is not equal to that of the current row, we have reached the end of the previous atomic change
            # because the raw data input files are sorted on categorizationid
            #we can now calculate all features that require all atomic change information instead of only individual text part information
            if (categorizationid != row[0]):
                
                if (categorizationid != -1):
                    
                    #original_changed_fraction_text_part
                    if text_part == 'title':
                        if len(original_title) > 0:
                            new_row.append(new_row[3]/len(original_title))
                        else:
                            new_row.append(1.0)
                    
                    if text_part == 'intro':
                        if len(original_intro) > 0:
                            new_row.append(new_row[3]/len(original_intro))
                        else:
                            new_row.append(1.0)
                        
                    if text_part == 'text':
                        if len(original_text) > 0:
                            new_row.append(new_row[3]/len(original_text))
                        else:
                            new_row.append(1.0)
                    
                    #new_changed_fraction_text_part
                    if text_part == 'title':
                        if len(new_title) > 0:
                            new_row.append(new_row[4]/len(new_title))
                        else:
                            new_row.append(1.0)
                    
                    if text_part == 'intro':
                        if len(new_intro) > 0:
                            new_row.append(new_row[4]/len(new_intro))
                        else:
                            new_row.append(1.0)
                        
                    if text_part == 'text':
                        if len(new_text) > 0:
                            new_row.append(new_row[4]/len(new_text))
                        else:
                            new_row.append(1.0)
                    
                    original_changed_sentences = sorted([(int(n1), int(n2), int(n3), int(n4)) for (n1, n2, n3, n4) in original_changed_sentences])
                    new_changed_sentences = sorted([(int(n1), int(n2), int(n3), int(n4)) for (n1, n2, n3, n4) in new_changed_sentences])
                    
                    original_changed_text = ""
                    new_changed_text = ""
                    original_minimalized_changed_text = ""
                    new_minimalized_changed_text = ""
                    numeric = False
                    nr_red_parts = 0
                    nr_green_parts = 0
                    previous_full = False
                    
                    nr_full_sentences_original = 0
                    nr_full_sentences_new = 0
                    original_sent_added = set()
                    new_sent_added = set()
                    
                    #use collected unique identifiers of text parts belonging to the same atomic change in order
                    #to construct the original full sentence, original minimalized sentence, new full sentence and new minimalized sentence
                    
                    #text parts that do not appear right after each other in the original text are separated by a single space
                    for (i, (parnumber, sentencenumber, startindex, endindex)) in enumerate(original_changed_sentences):
                        if i == 0:
                            if not((parnumber, sentencenumber) in original_sent_added):
                                original_changed_text = original_changed_text + original_article_text[int(parnumber)][int(sentencenumber)].strip()
                                original_sent_added.add((parnumber, sentencenumber))
                                
                            original_minimalized_changed_text = original_minimalized_changed_text + original_article_text[int(parnumber)][int(sentencenumber)].strip()[int(startindex): int(endindex) + 1]
                     
                        else:
                            if not((parnumber, sentencenumber) in original_sent_added):
                                original_changed_text = original_changed_text + ' ' + original_article_text[int(parnumber)][int(sentencenumber)].strip()
                                original_sent_added.add((parnumber, sentencenumber))
                                
                            original_minimalized_changed_text = original_minimalized_changed_text + ' ' + original_article_text[int(parnumber)][int(sentencenumber)].strip()[int(startindex): int(endindex) + 1]
                    
                        if (int(startindex) != 0) or (int(endindex) != len(original_article_text[int(parnumber)][int(sentencenumber)]) - 1):
                            nr_red_parts = nr_red_parts + 1
                            previous_full = False
                        
                        else:
                            nr_full_sentences_original = nr_full_sentences_original + 1
                            
                            if not(previous_full):
                                nr_red_parts = nr_red_parts + 1
                                previous_full = True
                    
                    previous_full = False
                    
                    for (i, (parnumber, sentencenumber, startindex, endindex)) in enumerate(new_changed_sentences):
                        if i == 0: 
                            if not((parnumber, sentencenumber) in new_sent_added):
                                new_changed_text = new_changed_text + new_article_text[int(parnumber)][int(sentencenumber)].strip()
                                new_sent_added.add((parnumber, sentencenumber))
                            
                            new_minimalized_changed_text = new_minimalized_changed_text + new_article_text[int(parnumber)][int(sentencenumber)].strip()[int(startindex): int(endindex) + 1]
                                                
                        else:
                            if not((parnumber, sentencenumber) in new_sent_added):
                                new_changed_text = new_changed_text + ' ' + new_article_text[int(parnumber)][int(sentencenumber)].strip()
                                new_sent_added.add((parnumber, sentencenumber))
                                
                            new_minimalized_changed_text = new_minimalized_changed_text + ' ' + new_article_text[int(parnumber)][int(sentencenumber)].strip()[int(startindex): int(endindex) + 1]
                    
                        if (int(startindex) != 0) or (int(endindex) != len(new_article_text[int(parnumber)][int(sentencenumber)]) - 1):
                            nr_green_parts = nr_green_parts + 1
                            previous_full = False
                        
                        else:
                            nr_full_sentences_new = nr_full_sentences_new + 1
                            
                            if not(previous_full):
                                nr_green_parts = nr_green_parts + 1
                                previous_full = True
                                
                    while '  ' in original_minimalized_changed_text:
                        original_minimalized_changed_text = original_minimalized_changed_text.replace('  ', ' ')
                        
                    while '  ' in new_minimalized_changed_text:
                        new_minimalized_changed_text = new_minimalized_changed_text.replace('  ', ' ')
                    
                    #add levenshtein distance
                    levenshtein_distance_max = levenshtein_distance(original_changed_text.lower(), new_changed_text.lower())               
                    
                    new_row.append(levenshtein_distance_max)
                    
                    editops_list = editops(original_changed_text, new_changed_text)
                    
                    #add number of inserts
                    new_row.append(len([op for (op, s, d) in editops_list if op == 'insert']))
                    
                    #add number of deletes
                    new_row.append(len([op for (op, s, d) in editops_list if op == 'delete']))
                    
                    #add number of replacements
                    new_row.append(len([op for (op, s, d) in editops_list if op == 'replace']))
                    
                    #add minimalized Levenshtein distance
                    levenshtein_distance_min = levenshtein_distance(original_minimalized_changed_text.lower(), new_minimalized_changed_text.lower())
                    
                    new_row.append(levenshtein_distance_min)
                    
                    editops_list_minimalized = editops(original_minimalized_changed_text.lower(), new_minimalized_changed_text.lower())
                    
                    #add number of inserts
                    new_row.append(len([op for (op, s, d) in editops_list_minimalized if op == 'insert']))
                    
                    #add number of deletes
                    new_row.append(len([op for (op, s, d) in editops_list_minimalized if op == 'delete']))
                    
                    #add number of replacements
                    new_row.append(len([op for (op, s, d) in editops_list_minimalized if op == 'replace']))
                    
                    
                    #add capitalized_equality:
                    if (original_minimalized_changed_text == new_minimalized_changed_text):
                        new_row.append(0)
                        
                    else:
                        if (original_minimalized_changed_text.lower() == new_minimalized_changed_text.lower()):
                            new_row.append(1)
                            
                        else:
                            if (original_minimalized_changed_text.strip() == new_minimalized_changed_text.strip()):
                                new_row.append(2)
                                
                            else:
                                str1 = original_minimalized_changed_text.lower().replace(' ', '').replace('-', '').replace('.','') 
                                str2 = new_minimalized_changed_text.lower().replace(' ', '').replace('-', '').replace('.','') 
                                
                                if (str1 == str2):
                                    new_row.append(3)
                                    
                                else:
                                    new_row.append(4)
                    
                    #compute jaccard similarity
                    set1 = set([string.lower() for string in word_tokenize(original_changed_text)])
                    set2 = set([string.lower() for string in word_tokenize(new_changed_text)])
                    
                    jac_sim = float(len(set1.intersection(set2))) / len(set1.union(set2))

                    
                    new_row.append(jac_sim)
                    
                    new_row.append(seqratio([string for string in word_tokenize(original_changed_text)], [string for string in word_tokenize(new_changed_text)]))
                    
                    #add boolean whether at least one sentence is present in other text
                    
                    displaced_length_original = 0
                    
                    for (parnumber, sentencenumber, startindex, endindex) in original_changed_sentences:
                        sentence = original_article_text[int(parnumber)][int(sentencenumber)]
                        
                        if ((sentence in new_title) or (sentence in new_intro) or (sentence in new_text)) and not(sentence in new_changed_text):
                            displaced_length_original = displaced_length_original + len(sentence)
                    
                    displaced_length_new = 0        
                            
                    for (parnumber, sentencenumber, startindex, endindex) in new_changed_sentences:
                        sentence = new_article_text[int(parnumber)][int(sentencenumber)]
                            
                        if ((sentence in original_title) or (sentence in original_intro) or (sentence in original_text)) and not (sentence in original_changed_text):
                            displaced_length_new = displaced_length_new + len(sentence)      
                                
                    
                    #new_row.append(in_other_text)
                    if len(original_changed_text) == 0:
                        new_row.append(0)
                        
                    else:
                            
                        new_row.append(float(displaced_length_original)/len(original_changed_text))
                        
                    if len(new_changed_text) == 0:
                        new_row.append(0)   
                        
                    else:
                        new_row.append(float(displaced_length_new)/len(new_changed_text))
                    
                    levenshtein_insert_string = ''
                    levenshtein_delete_string = ''
                    levenshtein_replace_string = ''
                    
                    insert_previous = None
                    delete_previous = None
                    replace_previous = None
                    
                    for (op, s, d) in editops_list:
                        if op == 'insert':
                            
                            if(insert_previous != d - 1):
                                levenshtein_insert_string = levenshtein_insert_string + ' ' + new_changed_text[d]
                                
                            else:
                                levenshtein_insert_string = levenshtein_insert_string + new_changed_text[d]
                                
                            insert_previous = d
                            
                        if op == 'delete':
                            if(delete_previous != s - 1):
                                levenshtein_delete_string = levenshtein_delete_string + ' ' + original_changed_text[s]
                                
                            else:
                                levenshtein_delete_string = levenshtein_delete_string + original_changed_text[s]
                                
                            delete_previous = s
                            
                        if op == 'replace':
                            
                            if(replace_previous != s - 1):
                                levenshtein_replace_string = levenshtein_replace_string + ' ' + original_changed_text[s]
                                
                            else: 
                                levenshtein_replace_string = levenshtein_replace_string + original_changed_text[s]
                                
                            replace_previous = s
                    
                    
                    #lemmatized minimized sentences
                    new_row.append(lemmatize_text(original_minimalized_changed_text))
                    new_row.append(lemmatize_text(new_minimalized_changed_text))
                    
                    verbose = False
                    
                    #spaCy features are added
                        
                    if text_part == 'title':
                    
                        additional_features = use_spacy(verbose, original_changed_text, original_minimalized_changed_text, 
                                                    new_changed_text, new_minimalized_changed_text, original_title, new_title, \
                                                        original_changed_sentences, new_changed_sentences)
                    
                    if text_part == 'intro':
                    
                        additional_features = use_spacy(verbose, original_changed_text, original_minimalized_changed_text, 
                                                    new_changed_text, new_minimalized_changed_text, original_intro, new_intro, \
                                                        original_changed_sentences, new_changed_sentences)
                                    
                    if text_part == 'text':
                    
                        additional_features = use_spacy(verbose, original_changed_text, original_minimalized_changed_text, 
                                                    new_changed_text, new_minimalized_changed_text, original_text, new_text, \
                                                        original_changed_sentences, new_changed_sentences)
                    
                    for feature in additional_features:
                        new_row.append(feature)
                        
                        
                    #number_red_parts
                    new_row.append(nr_red_parts)
                    
                    #number_green_parts
                    new_row.append(nr_green_parts)
                    
                    #number of original full sentences
                    new_row.append(nr_full_sentences_original)
                    
                    #number of new full sentences
                    new_row.append(nr_full_sentences_new)
                    
                    
                    #original subpart of new
                    if original_minimalized_changed_text.strip() in new_minimalized_changed_text.strip() and len(original_minimalized_changed_text.strip()) > 1:
                        new_row.append(1)
                        
                    else:
                        new_row.append(0)
                    
                    #new subpart of original
                    if new_minimalized_changed_text.strip() in original_minimalized_changed_text.strip() and len(new_minimalized_changed_text.strip()) > 1:
                        new_row.append(1)
                    else:
                        new_row.append(0)
                    
                    #minimized_changed_text
                    new_row.append(original_minimalized_changed_text)
                    new_row.append(new_minimalized_changed_text)
                    
                    #changed_text 
                    new_row.append(original_changed_text)
                    new_row.append(new_changed_text)  
                
                    writer.writerow(new_row)
                    new_row = []
                    original_changed_sentences = set()
                    new_changed_sentences = set()
                    
                 
                text_part = row[5]
                
                while u'\xa0\xa0' in row[11]:
                    row[11] = row[11].replace(u'\xa0\xa0', u'\xa0')
                    
                while u'\xa0\xa0' in row[12]:
                    row[12] = row[12].replace(u'\xa0\xa0', u'\xa0')
                    
                while u'\xa0\xa0' in row[13]:
                    row[13] = row[13].replace(u'\xa0\xa0', u'\xa0')
                    
                while u'\xa0\xa0' in row[14]:
                    row[14] = row[14].replace(u'\xa0\xa0', u'\xa0')
                    
                while u'\xa0\xa0' in row[15]:
                    row[15] = row[15].replace(u'\xa0\xa0', u'\xa0')
                    
                while u'\xa0\xa0' in row[16]:
                    row[16] = row[16].replace(u'\xa0\xa0', u'\xa0')
                
                original_title = row[11]
                new_title = row[12]
                original_intro = row[13]
                new_intro = row[14]
                original_text = row[15]
                new_text = row[16]
                    
                if text_part == 'title':
                    original_article_text = split_text(row[11])
                    new_article_text = split_text(row[12])

                if text_part == 'intro':
                    original_article_text = split_text(row[13])
                    new_article_text = split_text(row[14])

                if text_part == 'text':
                    original_article_text = split_text(row[15])
                    new_article_text = split_text(row[16])
                    
                in_other_text = False
                    
                categorizationid = row[0]
                original_article = []
                new_article = []
                                
                new_row.append(row[3]) #newspaper
                new_row.append(row[4]) #topic
                new_row.append(text_part) #textpart
                
                #original length
                if original == 'true':
                    new_row.append(int(row[10]) - int(row[9]) + 1)
                    new_row.append(0)
                    
                #new length
                else:
                    new_row.append(0)
                    new_row.append(int(row[10]) - int(row[9]) + 1)
                    
                #max_version_number
                new_row.append(row[30])
                
                #version_number_progress
                if original == 'true':
                    new_row.append((float(row[2]) + 1)/float(row[30]))
                    
                else: 
                    new_row.append(float(row[2])/float(row[30]))
                
                #first_time
                new_row.append(row[19])
                
                #original_time
                new_row.append(row[17])
                
                #check if publication date of both versions is the same
                original_date = datetime.strptime(row[17], '%Y-%m-%d %H:%M:%S.%f')
                new_date = datetime.strptime(row[18], '%Y-%m-%d %H:%M:%S.%f')
                
                if original_date.date() == new_date.date():
                    new_row.append(1)
                else:
                    new_row.append(0)
                                
                #time_difference
                new_row.append(row[32])
            
                #original_title_length
                new_row.append(len(row[11]))
                
                #original_intro_length
                new_row.append(len(row[13]))
                
                #original_text_length
                new_row.append(len(row[15]))
                
                #new_title_length
                new_row.append(len(row[12]))
                
                #new_intro_length
                new_row.append(len(row[14]))
                
                #new_text_length
                new_row.append(len(row[16]))
                
                #new_time
                new_row.append(row[18])
                
                #store parts of atomic change by their unique indexes (parnumber, sentencenumber, startindex and endindex)
                if original == 'true':
                    original_changed_sentences.add((row[7], row[8], row[9], row[10]))
                else:
                    new_changed_sentences.add((row[7], row[8], row[9], row[10]))
                    
                #add changed fractions for different article parts
                if (len(original_title) == 0):
                    new_row.append(1.0)
                else:
                    new_row.append(float(row[34])/len(original_title))
                    
                if (len(new_title) == 0):
                    new_row.append(1.0)
                else:
                    new_row.append(float(row[35])/len(new_title))
                    
                if (len(original_intro) == 0):
                    new_row.append(1.0)
                else:
                    new_row.append(float(row[36])/len(original_intro))
                    
                if (len(new_intro) == 0):
                    new_row.append(1.0)
                else:
                    new_row.append(float(row[37])/len(new_intro))
                    
                if (len(original_text) == 0):
                    new_row.append(1.0)
                else:
                    new_row.append(float(row[38])/len(original_text))
                    
                if (len(new_text) == 0):
                    new_row.append(1.0)
                else:
                    new_row.append(float(row[39])/len(new_text))
                
                #add total changed fractions to text
                new_row.append(row[20])
                new_row.append(row[21])
                new_row.append(row[22])
                new_row.append(row[23])
                new_row.append(row[24])
                new_row.append(row[25])
                new_row.append(row[26])
                new_row.append(row[27])
                
                #label (an error or not?)
                new_row.append(row[29])
                
            else:
                #original length
                if original == 'true':
                    new_row[3] = new_row[3] + int(row[10]) - int(row[9]) + 1
                    
                #new length
                else:
                    new_row[4] = new_row[4] +  int(row[10]) - int(row[9]) + 1
                    
                    
                #store parts of atomic change by their unique indexes (parnumber, sentencenumber, startindex and endindex)
                if original == 'true':
                    original_changed_sentences.add((row[7], row[8], row[9], row[10]))
                else:
                    new_changed_sentences.add((row[7], row[8], row[9], row[10]))
                    
    new_file.close()

  result.append(nlp(original_full).similarity(nlp(new_full)))
  result.append(nlp(original_diff).similarity(nlp(new_diff)))


In [46]:
import csv
import random
from transformers import AutoTokenizer, AutoModel, TFAutoModel
#the tokenizer and the encoder are loaded from the same pretrained Dutch BERT (BERTje) checkpoint
bertje_checkpoint = "GroNLP/bert-base-dutch-cased"

tokenizer = AutoTokenizer.from_pretrained(bertje_checkpoint)
model = AutoModel.from_pretrained(bertje_checkpoint)
import torch

Some weights of the model checkpoint at GroNLP/bert-base-dutch-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.poole

In [None]:
#code block that takes manual features as an input and that calculates BERTje embeddings on full textual data
#these embeddings are subsequently written to a separate file (one row per sample, ';' separated)

#this code block should only be run once! (embeddings are the same for objective, subjective and linguistic errors)

#both files are managed by a with-statement, so they are flushed and closed even when encoding a row fails
#the input file is opened first, so that a missing input does not leave behind a truncated output file
with open("..\\Data\\input_data_model_ling_manual.csv", encoding='utf-8') as inputfile, \
        open("..\\Data\\bertje.csv", 'w+', newline='') as bertje_file:

    reader = csv.reader(inputfile, delimiter=',')
    objbertjewriter = csv.writer(bertje_file, delimiter=';')

    #the first row of the input contains the column names and is skipped
    next(reader, None)

    for row in reader:
        #the last two columns are read as the full changed text (original and new version)
        original_changed_text = row[-2]
        new_changed_text = row[-1]

        original_df = tokenizer(original_changed_text, padding = True, truncation = True, return_tensors="pt")
        new_df = tokenizer(new_changed_text, padding = True, truncation = True, return_tensors="pt")

        #no gradients are needed, the model is only used to extract features
        with torch.no_grad():
            hidden_original = model(**original_df)
            hidden_new = model(**new_df)

        #get only the [CLS] hidden states (first token of the single sequence in the batch)
        cls = hidden_original.last_hidden_state[:,0,:].tolist()[0]
        cls_new = hidden_new.last_hidden_state[:,0,:].tolist()[0]

        #one output row = embedding of the original text followed by the embedding of the new text
        cls.extend(cls_new)

        objbertjewriter.writerow(cls)

In [None]:
#code block that takes manual features as an input and that calculates BERTje embeddings on minimized (but not lemmatized) textual data
#these embeddings are subsequently written to a separate file (one row per sample, ';' separated)

#this code block should only be run once! (embeddings are the same for objective, subjective and linguistic errors)

#both files are managed by a with-statement, so they are flushed and closed even when encoding a row fails
#the input file is opened first, so that a missing input does not leave behind a truncated output file
with open("..\\Data\\input_data_model_ling_manual.csv", encoding='utf-8') as inputfile, \
        open("..\\Data\\bertje_minimized.csv", 'w+', newline='') as bertje_file:

    reader = csv.reader(inputfile, delimiter=',')
    objbertjewriter = csv.writer(bertje_file, delimiter=';')

    #the first row of the input contains the column names and is skipped
    next(reader, None)

    for row in reader:
        #the fourth-to-last and third-to-last columns are read as the minimized changed text
        #NOTE(review): relies on the column order of the input file - verify against its header
        original_changed_text = row[-4]
        new_changed_text = row[-3]

        original_df = tokenizer(original_changed_text, padding = True, truncation = True, return_tensors="pt")
        new_df = tokenizer(new_changed_text, padding = True, truncation = True, return_tensors="pt")

        #no gradients are needed, the model is only used to extract features
        with torch.no_grad():
            hidden_original = model(**original_df)
            hidden_new = model(**new_df)

        #get only the [CLS] hidden states (first token of the single sequence in the batch)
        cls = hidden_original.last_hidden_state[:,0,:].tolist()[0]
        cls_new = hidden_new.last_hidden_state[:,0,:].tolist()[0]

        #one output row = embedding of the original text followed by the embedding of the new text
        cls.extend(cls_new)

        objbertjewriter.writerow(cls)

In [None]:
#code block that takes manual features as an input and that calculates BERTje embeddings on lemmatized and minimized textual data
#these embeddings are subsequently written to a separate file (one row per sample, ';' separated)

#this code block should only be run once! (embeddings are the same for objective, subjective and linguistic errors)

#both files are managed by a with-statement, so they are flushed and closed even when encoding a row fails
#the input file is opened first, so that a missing input does not leave behind a truncated output file
with open("..\\Data\\input_data_model_ling_manual.csv", encoding='utf-8') as inputfile, \
        open("..\\Data\\bertje_lemmatized.csv", 'w+', newline='') as bertje_file:

    reader = csv.reader(inputfile, delimiter=',')
    objbertjewriter = csv.writer(bertje_file, delimiter=';')

    #the first row of the input contains the column names and is skipped
    next(reader, None)

    for row in reader:
        #columns 48 and 49 are read as the lemmatized minimized text (original and new version)
        #NOTE(review): fixed positions - verify against the header of the input file when features are added or removed
        original_changed_text = row[48]
        new_changed_text = row[49]

        original_df = tokenizer(original_changed_text, padding = True, truncation = True, return_tensors="pt")
        new_df = tokenizer(new_changed_text, padding = True, truncation = True, return_tensors="pt")

        #no gradients are needed, the model is only used to extract features
        with torch.no_grad():
            hidden_original = model(**original_df)
            hidden_new = model(**new_df)

        #get only the [CLS] hidden states (first token of the single sequence in the batch)
        cls = hidden_original.last_hidden_state[:,0,:].tolist()[0]
        cls_new = hidden_new.last_hidden_state[:,0,:].tolist()[0]

        #one output row = embedding of the original text followed by the embedding of the new text
        cls.extend(cls_new)

        objbertjewriter.writerow(cls)

In [None]:
import csv
import random
from sentence_transformers import SentenceTransformer
#load the multilingual sentence encoder and set its maximum sequence length to 512
sbert_checkpoint = 'distiluse-base-multilingual-cased-v1'

sbert_model = SentenceTransformer(sbert_checkpoint)
sbert_model.max_seq_length = 512

In [None]:
#code block that takes manual features as an input and that calculates SBERT embeddings on full textual data
#these embeddings are subsequently written to a separate file (one row per sample, ';' separated)

#this code block should only be run once! (embeddings are the same for objective, subjective and linguistic errors)

#both files are managed by a with-statement, so they are flushed and closed even when encoding a row fails
#the input file is opened first, so that a missing input does not leave behind a truncated output file
with open("..\\Data\\input_data_model_ling_manual.csv", encoding='utf-8') as inputfile, \
        open("..\\Data\\sbert.csv", 'w+', newline='') as sbert_file:

    reader = csv.reader(inputfile, delimiter=',')
    objsbertwriter = csv.writer(sbert_file, delimiter=';')

    #the first row of the input contains the column names and is skipped
    next(reader, None)

    for row in reader:
        #the last two columns are read as the full changed text (original and new version)
        original_changed_text = row[-2]
        new_changed_text = row[-1]

        original_embedding = sbert_model.encode(original_changed_text).tolist()
        new_embedding = sbert_model.encode(new_changed_text).tolist()

        #one output row = embedding of the original text followed by the embedding of the new text
        original_embedding.extend(new_embedding)

        objsbertwriter.writerow(original_embedding)

In [None]:
#code block that takes manual features as an input and that calculates SBERT embeddings on minimized textual data
#these embeddings are subsequently written to a separate file (one row per sample, ';' separated)

#this code block should only be run once! (embeddings are the same for objective, subjective and linguistic errors)

#both files are managed by a with-statement, so they are flushed and closed even when encoding a row fails
#the input file is opened first, so that a missing input does not leave behind a truncated output file
with open("..\\Data\\input_data_model_ling_manual.csv", encoding='utf-8') as inputfile, \
        open("..\\Data\\sbert_minimized.csv", 'w+', newline='') as sbert_file:

    reader = csv.reader(inputfile, delimiter=',')
    objsbertwriter = csv.writer(sbert_file, delimiter=';')

    #the first row of the input contains the column names and is skipped
    next(reader, None)

    for row in reader:
        #the fourth-to-last and third-to-last columns are read as the minimized changed text
        #NOTE(review): relies on the column order of the input file - verify against its header
        original_changed_text = row[-4]
        new_changed_text = row[-3]

        original_embedding = sbert_model.encode(original_changed_text).tolist()
        new_embedding = sbert_model.encode(new_changed_text).tolist()

        #one output row = embedding of the original text followed by the embedding of the new text
        original_embedding.extend(new_embedding)

        objsbertwriter.writerow(original_embedding)

In [None]:
#code block that takes manual features as an input and that calculates SBERT embeddings on minimized and lemmatized textual data
#these embeddings are subsequently written to a separate file (one row per sample, ';' separated)

#this code block should only be run once! (embeddings are the same for objective, subjective and linguistic errors)

#both files are managed by a with-statement, so they are flushed and closed even when encoding a row fails
#the input file is opened first, so that a missing input does not leave behind a truncated output file
with open("..\\Data\\input_data_model_ling_manual.csv", encoding='utf-8') as inputfile, \
        open("..\\Data\\sbert_lemmatized.csv", 'w+', newline='') as sbert_file:

    reader = csv.reader(inputfile, delimiter=',')
    objsbertwriter = csv.writer(sbert_file, delimiter=';')

    #the first row of the input contains the column names and is skipped
    next(reader, None)

    for row in reader:
        #columns 48 and 49 are read as the lemmatized minimized text (original and new version)
        #NOTE(review): fixed positions - verify against the header of the input file when features are added or removed
        original_changed_text = row[48]
        new_changed_text = row[49]

        original_embedding = sbert_model.encode(original_changed_text).tolist()
        new_embedding = sbert_model.encode(new_changed_text).tolist()

        #one output row = embedding of the original text followed by the embedding of the new text
        original_embedding.extend(new_embedding)

        objsbertwriter.writerow(original_embedding)

In [None]:
import csv
import random
from transformers import AutoTokenizer, AutoModel, TFAutoModel
#the tokenizer and the encoder are loaded from the same pretrained Dutch BERT (BERTje) checkpoint
bertje_checkpoint = "GroNLP/bert-base-dutch-cased"

tokenizer = AutoTokenizer.from_pretrained(bertje_checkpoint)
model = AutoModel.from_pretrained(bertje_checkpoint)

In [None]:
#read all embeddings and add them all in one single row per sample in the dataset

#the order of this list determines the column order of the combined rows:
#BERTje (full, lemmatized, minimized) followed by SBERT (full, lemmatized, minimized)
embedding_paths = ["..\\Data\\bertje.csv",
                   "..\\Data\\bertje_lemmatized.csv",
                   "..\\Data\\bertje_minimized.csv",
                   "..\\Data\\sbert.csv",
                   "..\\Data\\sbert_lemmatized.csv",
                   "..\\Data\\sbert_minimized.csv"]

#embeddings[k] becomes the concatenation of row k of every embedding file (values are kept as strings)
embeddings = []

for file_number, embedding_path in enumerate(embedding_paths):
    #the with-statement closes the file even when reading fails
    with open(embedding_path, encoding='utf-8') as embedding_file:
        embedding_rows = list(csv.reader(embedding_file, delimiter=';'))

    #the first file defines the number of samples
    if file_number == 0:
        embeddings = embedding_rows
        continue

    #every file must describe the same samples, otherwise the embeddings would be attached to the wrong rows
    if len(embedding_rows) != len(embeddings):
        raise ValueError(embedding_path + " contains " + str(len(embedding_rows)) + " rows, but "
                         + str(len(embeddings)) + " rows were expected")

    for combined_row, embedding_row in zip(embeddings, embedding_rows):
        combined_row.extend(embedding_row)

In [None]:
#code block that adds all embeddings to the feature data set for the objective error task
#expects the list 'embeddings' (one combined embedding row per sample) to have been built beforehand

#number of values per text in the BERTje and SBERT embedding files
bertje_dimension = 768
sbert_dimension = 512

header_bertje_original = ['original_bertje_' + str(i) for i in range(0, bertje_dimension)]
header_bertje_new = ['new_bertje_' + str(i) for i in range(0, bertje_dimension)]

header_bertje_lemmatized_original = ['original_lemmatized_bertje_' + str(i) for i in range(0, bertje_dimension)]
header_bertje_lemmatized_new = ['new_lemmatized_bertje_' + str(i) for i in range(0, bertje_dimension)]

header_bertje_minimized_original = ['original_minimized_bertje_' + str(i) for i in range(0, bertje_dimension)]
header_bertje_minimized_new = ['new_minimized_bertje_' + str(i) for i in range(0, bertje_dimension)]

header_sbert_original = ['original_sbert_' + str(i) for i in range(0, sbert_dimension)]
header_sbert_new = ['new_sbert_' + str(i) for i in range(0, sbert_dimension)]

header_sbert_lemmatized_original = ['original_lemmatized_sbert_' + str(i) for i in range(0, sbert_dimension)]
header_sbert_lemmatized_new = ['new_lemmatized_sbert_' + str(i) for i in range(0, sbert_dimension)]

header_sbert_minimized_original = ['original_minimized_sbert_' + str(i) for i in range(0, sbert_dimension)]
header_sbert_minimized_new = ['new_minimized_sbert_' + str(i) for i in range(0, sbert_dimension)]

#the column names are appended in this order: BERTje (full, lemmatized, minimized), then SBERT (full, lemmatized, minimized)
#NOTE(review): this must match the order in which the rows of 'embeddings' were concatenated
embedding_header = (header_bertje_original + header_bertje_new
                    + header_bertje_lemmatized_original + header_bertje_lemmatized_new
                    + header_bertje_minimized_original + header_bertje_minimized_new
                    + header_sbert_original + header_sbert_new
                    + header_sbert_lemmatized_original + header_sbert_lemmatized_new
                    + header_sbert_minimized_original + header_sbert_minimized_new)

#both files are managed by a with-statement, so they are closed even when a row cannot be extended
with open("..\\Data\\input_data_model_obj_manual.csv", encoding='utf-8') as obj_input_file, \
        open("..\\Data\\input_data_model_obj.csv", 'w+', newline='', encoding='utf-8') as obj_output_file:

    reader = csv.reader(obj_input_file, delimiter=',')
    writer = csv.writer(obj_output_file, delimiter=',')

    for i, row in enumerate(reader):
        if i == 0:
            #header row: add the names of the embedding columns
            row.extend(embedding_header)
        else:
            #data row i corresponds to embedding row i-1, because the embedding files have no header row
            row.extend(embeddings[i-1])

        writer.writerow(row)

In [None]:
#code block that adds all embeddings to the feature data set for the subjective error task
#reads input_data_model_subj_manual.csv and writes input_data_model_subj.csv,
#in which every record is extended with the embedding that has the same position in embeddings

#column names of the embedding features that are appended to every record:
#six groups of 768 'bertje' columns followed by six groups of 512 'sbert' columns
#(original / new text, each in its plain, lemmatized and minimized form)
header_bertje_original = ['original_bertje_' + str(i) for i in range(768)]
header_bertje_new = ['new_bertje_' + str(i) for i in range(768)]

header_bertje_lemmatized_original = ['original_lemmatized_bertje_' + str(i) for i in range(768)]
header_bertje_lemmatized_new = ['new_lemmatized_bertje_' + str(i) for i in range(768)]

header_bertje_minimized_original = ['original_minimized_bertje_' + str(i) for i in range(768)]
header_bertje_minimized_new = ['new_minimized_bertje_' + str(i) for i in range(768)]

header_sbert_original = ['original_sbert_' + str(i) for i in range(512)]
header_sbert_new = ['new_sbert_' + str(i) for i in range(512)]

header_sbert_lemmatized_original = ['original_lemmatized_sbert_' + str(i) for i in range(512)]
header_sbert_lemmatized_new = ['new_lemmatized_sbert_' + str(i) for i in range(512)]

header_sbert_minimized_original = ['original_minimized_sbert_' + str(i) for i in range(512)]
header_sbert_minimized_new = ['new_minimized_sbert_' + str(i) for i in range(512)]

#all embedding column names, in the order in which they are appended to the header line
#NOTE(review): the values in every embeddings[k] are presumed to follow this same order - confirm
embedding_header = (header_bertje_original + header_bertje_new
                    + header_bertje_lemmatized_original + header_bertje_lemmatized_new
                    + header_bertje_minimized_original + header_bertje_minimized_new
                    + header_sbert_original + header_sbert_new
                    + header_sbert_lemmatized_original + header_sbert_lemmatized_new
                    + header_sbert_minimized_original + header_sbert_minimized_new)

#the with statements close both files again, also when reading or writing fails halfway
#the input file is opened with newline='' (as the csv module requires), so that line breaks inside quoted
#text fields are passed to the csv reader unchanged
with open("..\\Data\\input_data_model_subj_manual.csv", newline='', encoding='utf-8') as subj_input_file:
    with open("..\\Data\\input_data_model_subj.csv", 'w+', newline='', encoding='utf-8') as subj_output_file:
        writer = csv.writer(subj_output_file, delimiter=',')
        reader = csv.reader(subj_input_file, delimiter=',')

        #the first line of the input file is its header line, an empty input file yields an empty output file
        header_row = next(reader, None)

        if header_row is not None:
            header_row.extend(embedding_header)
            writer.writerow(header_row)

            #embeddings is defined in an earlier code block;
            #the k-th record after the header line (counting from 0) gets embeddings[k] appended
            for record_number, row in enumerate(reader):
                row.extend(embeddings[record_number])
                writer.writerow(row)

In [None]:
#code block that adds all embeddings to the feature data set for the linguistic error task
#reads input_data_model_ling_manual.csv and writes input_data_model_ling.csv,
#in which every record is extended with the embedding that has the same position in embeddings

#column names of the embedding features that are appended to every record:
#six groups of 768 'bertje' columns followed by six groups of 512 'sbert' columns
#(original / new text, each in its plain, lemmatized and minimized form)
header_bertje_original = ['original_bertje_' + str(i) for i in range(768)]
header_bertje_new = ['new_bertje_' + str(i) for i in range(768)]

header_bertje_lemmatized_original = ['original_lemmatized_bertje_' + str(i) for i in range(768)]
header_bertje_lemmatized_new = ['new_lemmatized_bertje_' + str(i) for i in range(768)]

header_bertje_minimized_original = ['original_minimized_bertje_' + str(i) for i in range(768)]
header_bertje_minimized_new = ['new_minimized_bertje_' + str(i) for i in range(768)]

header_sbert_original = ['original_sbert_' + str(i) for i in range(512)]
header_sbert_new = ['new_sbert_' + str(i) for i in range(512)]

header_sbert_lemmatized_original = ['original_lemmatized_sbert_' + str(i) for i in range(512)]
header_sbert_lemmatized_new = ['new_lemmatized_sbert_' + str(i) for i in range(512)]

header_sbert_minimized_original = ['original_minimized_sbert_' + str(i) for i in range(512)]
header_sbert_minimized_new = ['new_minimized_sbert_' + str(i) for i in range(512)]

#all embedding column names, in the order in which they are appended to the header line
#NOTE(review): the values in every embeddings[k] are presumed to follow this same order - confirm
embedding_header = (header_bertje_original + header_bertje_new
                    + header_bertje_lemmatized_original + header_bertje_lemmatized_new
                    + header_bertje_minimized_original + header_bertje_minimized_new
                    + header_sbert_original + header_sbert_new
                    + header_sbert_lemmatized_original + header_sbert_lemmatized_new
                    + header_sbert_minimized_original + header_sbert_minimized_new)

#the with statements close both files again, also when reading or writing fails halfway
#the input file is opened with newline='' (as the csv module requires), so that line breaks inside quoted
#text fields are passed to the csv reader unchanged
with open("..\\Data\\input_data_model_ling_manual.csv", newline='', encoding='utf-8') as spel_input_file:
    with open("..\\Data\\input_data_model_ling.csv", 'w+', newline='', encoding='utf-8') as spel_output_file:
        writer = csv.writer(spel_output_file, delimiter=',')
        reader = csv.reader(spel_input_file, delimiter=',')

        #the first line of the input file is its header line, an empty input file yields an empty output file
        header_row = next(reader, None)

        if header_row is not None:
            header_row.extend(embedding_header)
            writer.writerow(header_row)

            #embeddings is defined in an earlier code block;
            #the k-th record after the header line (counting from 0) gets embeddings[k] appended
            for record_number, row in enumerate(reader):
                row.extend(embeddings[record_number])
                writer.writerow(row)

In [46]:
#code block that takes the feature file as an input (which can be constructed using the code blocks above)
#and outputs two separate files, one containing the training set, one containing the test set
#records are split randomly, but separately for the records labelled 'true' and the records labelled 'false',
#such that stratified k-fold cross validation is possible
#(equal relative amount of errors in training set and test set)
#records whose label is neither 'true' nor 'false' are written to neither of the two files

#task of which the feature file is split: 'obj', 'subj' or 'ling'
split_task = 'ling'

#fraction of the records of every label that is moved to the test set
split_test_fraction = 0.2

#seed of the random generator that performs the split, such that re-running this code block
#reproduces the same training set and test set (set to None to obtain a different split on every run)
split_seed = 42

split_input_file_name = '..\\Data\\input_data_model_' + split_task + '.csv'
split_test_file_name = '..\\Data\\input_data_model_' + split_task + '_test.csv'
split_train_file_name = '..\\Data\\input_data_model_' + split_task + '_train.csv'

#header line that is written to both output files (the header line of the feature file itself is skipped)
#the column names are used by later processing steps and are therefore kept exactly as they are,
#including the spelling of 'orginal_spelling_ok'
header = ['newspaper', 'topic', 'textpart', 'length_original', 'length_new', 'max_version_number',
          'version_number_progress',
          'first_time', 'original_time', 'dates_difference', 'time_difference', 'original_title_length',
          'original_intro_length',
          'original_text_length', 'new_title_length', 'new_intro_length', 'new_text_length', 'new_time',
          'fraction_original_title_changed', 'fraction_new_title_changed',
          'fraction_original_intro_changed', 'fraction_new_intro_changed',
          'fraction_original_text_changed', 'fraction_new_text_changed',
          'fraction_total_changed_original', 'fraction_total_changed_new',
          'fraction_total_changed_title_original', 'fraction_total_changed_title_new',
          'fraction_total_changed_intro_original', 'fraction_total_changed_intro_new',
          'fraction_total_changed_text_original', 'fraction_total_changed_text_new',
          'type',
          'original_changed_fraction_text_part', 'new_changed_fraction_text_part',
          'levenshtein_maximalized', 'nr_insert_max', 'nr_delete_max', 'nr_replace_max',
          'levenshtein_minimalized', 'nr_insert_min', 'nr_delete_min', 'nr_replace_min', 'capitalized_equality',
          'jaccard', 'seqratio', 'text_overlap_original', 'text_overlap_new',
          'original_lemmatized_minimized_changed_text', 'new_lemmatized_minimized_changed_text',
          'stop_words_ratio',
          'ent_original', 'original_token_length', 'adv_orig', 'noun_orig', 'point_orig', 'comma_orig',
          'accent_orig', 'haakje_orig', 'doublepoint_orig', 'hyphen_orig', 'threepoints_orig', 'punct_orig',
          'x_orig', 'propn_orig',
          'pron_orig', 'det_orig', 'sconj_orig', 'space_orig', 'sym_orig', 'num_orig', 'adp_orig',
          'intj_orig', 'aux_orig', 'inf_orig', 'pv_verl_ev_orig', 'pv_verl_mv_orig', 'pv_tgw_ev_orig',
          'pv_tgw_mv_orig',
          'od_prenom_orig', 'od_nom_orig', 'od_postnom_orig', 'od_vrij_orig', 'vd_vrij_orig', 'vd_prenom_orig',
          'vd_postnom_orig', 'vd_nom_orig', 'verb_orig', 'cconj_orig', 'adj_sup_orig', 'adj_comp_orig',
          'adj_basis_orig',
          'ent_new', 'new_token_length', 'adv_new', 'noun_new', 'point_new', 'comma_new',
          'accent_new', 'haakje_new', 'doublepoint_new', 'hyphen_new', 'threepoints_new', 'punct_new',
          'x_new', 'propn_new',
          'pron_new', 'det_new', 'sconj_new', 'space_new', 'sym_new', 'num_new', 'adp_new',
          'intj_new', 'aux_new', 'inf_new', 'pv_verl_ev_new', 'pv_verl_mv_new', 'pv_tgw_ev_new',
          'pv_tgw_mv_new',
          'od_prenom_new', 'od_nom_new', 'od_postnom_new', 'od_vrij_new', 'vd_vrij_new', 'vd_prenom_new',
          'vd_postnom_new', 'vd_nom_new', 'verb_new', 'cconj_new', 'adj_sup_new', 'adj_comp_new',
          'adj_basis_new',
          'double_word', 'equal_after_subst', 'globally_equal_after_subst',
          'first_wordtype', 'last_wordtype', 'one_edit_change',
          'orginal_spelling_ok', 'new_spelling_ok', 'number_comparison', 'temporary', 'sentence_sim', 'diff_sim',
          'doubt_words_orig', 'doubt_words_new', 'doubt_words_total',
          'entity_present_in_original', 'entity_present_in_new',
          'changed_position',
          'colors', 'days', 'currencies', 'months', 'winds', 'states', 'countries', 'cities', 'belgian',
          'nationality',
          'date_diff', 'person_diff',
          'negation_original', 'negation_new',
          'nr_red_parts', 'nr_green_parts', 'nr_full_sentences_original', 'nr_full_sentences_new',
          'orig_part_of_new', 'new_part_of_orig',
          'original_minimized_changed_text', 'new_minimized_changed_text',
          'original_changed_text', 'new_changed_text'
          ]

#position of the label column on which the split is stratified (index 32 in the header above)
label_column_index = header.index('type')

#column names of the embedding features: six groups of 768 'bertje' columns followed by
#six groups of 512 'sbert' columns (original / new text, each in its plain, lemmatized and minimized form)
header_bertje_original = ['original_bertje_' + str(i) for i in range(768)]
header_bertje_new = ['new_bertje_' + str(i) for i in range(768)]

header_bertje_lemmatized_original = ['original_lemmatized_bertje_' + str(i) for i in range(768)]
header_bertje_lemmatized_new = ['new_lemmatized_bertje_' + str(i) for i in range(768)]

header_bertje_minimized_original = ['original_minimized_bertje_' + str(i) for i in range(768)]
header_bertje_minimized_new = ['new_minimized_bertje_' + str(i) for i in range(768)]

header_sbert_original = ['original_sbert_' + str(i) for i in range(512)]
header_sbert_new = ['new_sbert_' + str(i) for i in range(512)]

header_sbert_lemmatized_original = ['original_lemmatized_sbert_' + str(i) for i in range(512)]
header_sbert_lemmatized_new = ['new_lemmatized_sbert_' + str(i) for i in range(512)]

header_sbert_minimized_original = ['original_minimized_sbert_' + str(i) for i in range(512)]
header_sbert_minimized_new = ['new_minimized_sbert_' + str(i) for i in range(512)]

for embedding_columns in (header_bertje_original, header_bertje_new,
                          header_bertje_lemmatized_original, header_bertje_lemmatized_new,
                          header_bertje_minimized_original, header_bertje_minimized_new,
                          header_sbert_original, header_sbert_new,
                          header_sbert_lemmatized_original, header_sbert_lemmatized_new,
                          header_sbert_minimized_original, header_sbert_minimized_new):
    header.extend(embedding_columns)


#help function that randomly splits a list of records into a test part and a training part
def split_test_train_lines(lines, test_fraction, generator):
    """Randomly divide lines into a test part and a training part.

    lines: list of records that all belong to the same class
    test_fraction: fraction of the records that is moved to the test part (the amount is rounded)
    generator: random.Random instance that draws the test records

    Returns the tuple (test_lines, train_lines); the test records are in the order in which they were
    drawn, the training records keep their order in lines.
    """
    test_indices = generator.sample(range(len(lines)), k=round(test_fraction * len(lines)))
    #set for constant-time membership tests while the remaining (training) records are collected
    test_indices_lookup = set(test_indices)

    test_lines = [lines[index] for index in test_indices]
    train_lines = [line for index, line in enumerate(lines) if index not in test_indices_lookup]

    return test_lines, train_lines


#read all records before an output file is opened, such that existing output files are not emptied
#when the feature file cannot be read; the with statement closes the feature file again
with open(split_input_file_name, 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    #the header line of the feature file is skipped, the header defined above is written instead
    all_lines = list(reader)[1:]

#own random generator, such that the split does not depend on (or change) the state of the global one
split_generator = random.Random(split_seed)
split_generator.shuffle(all_lines)

true_lines = [line for line in all_lines if line[label_column_index] == 'true']
false_lines = [line for line in all_lines if line[label_column_index] == 'false']

true_test_lines, true_train_lines = split_test_train_lines(true_lines, split_test_fraction, split_generator)
false_test_lines, false_train_lines = split_test_train_lines(false_lines, split_test_fraction, split_generator)

#both output files start with the header line, followed by the 'true' records and then the 'false' records
with open(split_test_file_name, 'w', newline='', encoding='utf-8') as new_test_file:
    test_writer = csv.writer(new_test_file, delimiter=',', quotechar='"')
    test_writer.writerow(header)
    test_writer.writerows(true_test_lines)
    test_writer.writerows(false_test_lines)

with open(split_train_file_name, 'w', newline='', encoding='utf-8') as new_train_file:
    train_writer = csv.writer(new_train_file, delimiter=',', quotechar='"')
    train_writer.writerow(header)
    train_writer.writerows(true_train_lines)
    train_writer.writerows(false_train_lines)