In [None]:
import pandas as pd
import re
from collections import Counter
from operator import itemgetter

### Load data

In [None]:
# Shared keyword arguments for pd.read_csv: tab-separated input, header on the
# first row, and only the columns used from the source-form (Kildeformer) table.
config = dict(
    delimiter="\t",
    header=0,
    usecols=["id", "DSArtikelID", "kilde_ID", "kildexml", "datering", "kildeopslag"],
)

In [None]:
# Load the source-form table using the shared read options in `config`.
kildeformer = pd.read_csv("data/danmarksstednavne_Kildeformer.csv", **config)

In [None]:
# Switch the shared read options to the columns needed from the article table.
# NOTE(review): this mutates `config` in place, so re-running the Kildeformer
# load cell after this one would request these columns instead of its own.
config["usecols"] = ["id", "DSId", "opslagsform", "toponummer", "GEO_X_WGS84_32N", "GEO_Y_WGS84_32N"]

In [None]:
# Load the article table (headword, toponym number, coordinates) with the updated read options.
opslagsformer = pd.read_csv("data/danmarksstednavne_DSArtikel.csv", **config)

In [None]:
# Display the loaded source-form table.
kildeformer

In [None]:
# Number of source-form rows loaded.
len(kildeformer)

In [None]:
# Tally the raw `datering` values, most frequent first, to see which date notations occur.
Counter(kildeformer.datering).most_common()

### Parsing date column

In [None]:
# Editorial marks wrapped around uncertain digits, e.g. "17[96]" or "133<4>".
# Raw string: the previous non-raw literal relied on invalid escape sequences.
_EDITORIAL_MARKS = re.compile(r"[\[\]<>]")

# A four-digit year starting with 1 or 2, optionally followed by a dash and an
# end year written in full ("1243-1250") or as its last two digits ("1243-50").
# The outer group is deliberately NOT repeated: with a trailing "+", directly
# adjacent years were swallowed into one match whose groups only exposed the
# last repetition, silently dropping the earlier years.
_YEAR_RANGE = re.compile(
    r"(?P<from_to>(?P<century_from>[12]\d)(?P<decade_from>\d\d)"
    r"( *[-–-] *(?P<century_to>[12]\d)?(?P<decade_to>\d\d))?)"
)


def parse_year(date):
    """Extract every year or year range mentioned in a date string.

    Parameters
    ----------
    date : str
        Raw date text, e.g. "1243", "1243-1250", "1243-50", "17[96]".

    Returns
    -------
    list of tuple
        One ``(year_from, year_to)`` pair per year expression found, in order
        of appearance. ``year_to`` is ``None`` when no end year is given. An
        abbreviated end year ("1243-50") inherits the century of the start
        year. The list is empty when the text contains no year.

    Notes
    -----
    A range whose end lies before its start is still returned unchanged, but
    a diagnostic tuple ``(matched_text, year_from, year_to, cleaned_text)`` is
    printed so that suspicious source values can be reviewed.
    """
    # Drop the editorial marks so "17[96]" is read as "1796".
    cleaned = _EDITORIAL_MARKS.sub("", date)

    years = []
    for match in _YEAR_RANGE.finditer(cleaned):
        year_from = int(match["century_from"] + match["decade_from"])

        if match["decade_to"] is None:
            # Single year, no range.
            year_to = None
        else:
            # "1243-1250" carries its own century; "1243-50" reuses the start's.
            century_to = match["century_to"] or match["century_from"]
            year_to = int(century_to + match["decade_to"])
            if year_from > year_to:
                print((match["from_to"], year_from, year_to, cleaned))

        years.append((year_from, year_to))

    return years
    

In [None]:
# Create the two date columns up front, so that rows copied in the parsing
# loop already carry `date_from` / `date_to` entries that can be overwritten.
kildeformer["date_from"] = None
kildeformer["date_to"] = None

In [None]:
# Empty frame with the same columns as `kildeformer`; it holds the extra rows
# created for dating strings that contain more than one year expression.
new_entries = pd.DataFrame(columns=kildeformer.columns)

In [None]:
# Parse every `datering` value.
# - The first (from, to) pair of a row is collected in `from_to` and written
#   back as whole columns in a later cell (faster than per-row assignment).
# - Every further pair becomes a copy of the row with that pair filled in;
#   these copies end up in `new_entries`.
# - Values without any recognisable year are recorded in `not_matched`.
from_to = []
not_matched = []
extra_rows = []
for index, row in kildeformer.iterrows():
    print(index, end="\r")

    if pd.isna(row.datering):
        from_to.append((None, None))
        continue

    parsed_years = parse_year(str(row.datering))

    if not parsed_years:
        not_matched.append((index, row.datering))
        from_to.append((None, None))
        continue

    first_range, *further_ranges = parsed_years
    from_to.append(first_range)

    for date_from, date_to in further_ranges:
        new_row = row.copy()
        new_row["date_from"] = date_from
        new_row["date_to"] = date_to
        extra_rows.append(new_row)

# Build the frame once from the collected rows: DataFrame.append was removed
# in pandas 2.0, and growing a frame row by row copies it on every iteration.
new_entries = pd.DataFrame(extra_rows, columns=kildeformer.columns)

In [None]:
# Write the collected (from, to) pairs back as two whole columns.
kildeformer["date_from"] = [year_pair[0] for year_pair in from_to]
kildeformer["date_to"] = [year_pair[1] for year_pair in from_to]

In [None]:
# Combine the original rows with the extra rows created for additional year
# expressions. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
new_kildeformer = pd.concat([kildeformer, new_entries])

In [None]:
# Summary statistics of the parsed start years.
new_kildeformer["date_from"].describe()

In [None]:
# Earliest and latest parsed start year, as a sanity check on the parser.
new_kildeformer["date_from"].min(), new_kildeformer["date_from"].max()

In [None]:
import matplotlib.pyplot as plt

# Histogram of start years between 1000 and 1600 in 20 bins. Assigning the
# return value keeps the bin arrays out of the cell output.
out = plt.hist(new_kildeformer["date_from"], bins=20, range=(1000,1600))

In [None]:
# Number of non-empty `datering` values in which no year was recognised.
len(not_matched)

In [None]:
# Tally the unparsed `datering` strings (second element of each entry), most frequent first.
Counter(list(map(itemgetter(1), not_matched))).most_common()

## Append opslagsform and geo-location

In [None]:
import numpy as np

In [None]:
# Keep only the rows with a parsed start year. The explicit copy makes
# `only_dated` an independent frame, so the columns added to it in later cells
# are not writes to a slice of `new_kildeformer` (SettingWithCopyWarning).
only_dated = new_kildeformer[new_kildeformer["date_from"].notnull()].copy()

In [None]:
# Look up headword, toponym number and coordinates for every dated source form.
# The article table is converted to a dict keyed by article id once, instead of
# filtering the whole table with a boolean mask for every single row.
lookup_columns = ["opslagsform", "toponummer", "GEO_X_WGS84_32N", "GEO_Y_WGS84_32N"]
artikel_by_id = (
    opslagsformer
    .dropna(subset=["id"])          # a missing id can never equal a DSArtikelID
    .drop_duplicates(subset="id")   # keep the first article per id
    .set_index("id")[lookup_columns]
    .to_dict("index")
)
# Placeholder used when a source form has no matching article.
no_match = dict.fromkeys(lookup_columns, np.nan)

opslag = []
toponumre = []
geo_x = []
geo_y = []

for artikel_id in only_dated["DSArtikelID"]:
    artikel = artikel_by_id.get(artikel_id, no_match)
    opslag.append(artikel["opslagsform"])
    toponumre.append(artikel["toponummer"])
    geo_x.append(artikel["GEO_X_WGS84_32N"])
    geo_y.append(artikel["GEO_Y_WGS84_32N"])

In [None]:
# Attach the looked-up article fields, one new column per collected list.
looked_up_columns = {
    "opslagsform": opslag,
    "toponummer": toponumre,
    "GEO_X_WGS84_32N": geo_x,
    "GEO_Y_WGS84_32N": geo_y,
}
for column_name, column_values in looked_up_columns.items():
    only_dated[column_name] = column_values

In [None]:
# Display the dated rows together with the attached article fields.
only_dated

## Grapheme parsing

In [None]:
# Keep only the rows that have a source spelling. The explicit copy gives
# `only_opslag` its own data, so columns can be added to it later without
# writing to a slice (SettingWithCopyWarning). It also means the intermediate
# frames deleted below are truly no longer referenced.
only_opslag = only_dated[only_dated["kildeopslag"].notnull()].copy()
del new_kildeformer
del only_dated
only_opslag

In [None]:
# Letters that form a complex graph when directly followed by "h".
_COMBINATORS = frozenset("ptkcbdg")
_SUFFIXES = frozenset("h")

# Substrings of the headword that signal an "h" which may begin a separate
# word element rather than belong to the preceding consonant.
_OPSLAG_EXCEPTIONS = ("holm", "hoved", "have", "havn", "høj", "hus", "hed")

# Manual exceptions: spellings of the text following the "h" for which the
# "h" is kept as a separate graph (only consulted when the headword contains
# one of the elements above). Built once instead of on every call.
_FOLLOWING_EXCEPTIONS = tuple(sorted(set(
    ["ø", "ö", "olm", "us", "üs", "uus", "eth", "ed", "av", "af"]
    + ["oolm", "ede", "eide","aue", "ave", "auge", "ouit", "oved", "ovi", "oed", "ods", "ollm", "ei", "ey", "ol", "alme",  "aug", "oue", "oui", "øgh", "om", "ode", "ü", "oy", "öu"]
    + ["øg", "off", "aus", "ove", "off", "yes", "owi", "ofv", "hywæ", "agæ", "ega", "awe", "yffu", "øgh", "iue", "iffue", "ifue", "if", "ws", "iw", "aun","uß", "oi", "ye", "ws", "alm", "oft", "uie", "au", "hoff"]
    + ["ove", "ye", "off", "au", "offu", "ow", "ofv", "yw", "uß", "age", "ega", "yffu", "ws", "hegn", "oi", "ye", "ues", "uns", "aus", "oj", "ow", "ws", "öu", "off", "uo", "ofu", "ofv", "og", "wed", "of", "ov", "oe", "uy", "iø" , "uo", "oet", "eü", "y"]
)))


def parse_graphs(name, opslagsform):
    """Split a source spelling into graphs, merging consonant + "h" pairs.

    Parameters
    ----------
    name : str
        The source spelling to split.
    opslagsform : str
        The headword belonging to the spelling. Anything that is not a string
        (e.g. NaN for rows without a matching article) is treated as an empty
        headword instead of raising a TypeError.

    Returns
    -------
    tuple
        ``(graphs, caught, changed)``:
        ``graphs`` is the list of graphs; ``caught`` is True when at least one
        "h" was kept separate because of an exception; ``changed`` is True
        when at least one "h" was merged into the preceding consonant.
        An empty ``name`` yields ``([], False, False)``.

    Notes
    -----
    The first character is kept exactly as written, while all following
    characters are lower-cased. NOTE(review): an upper-case first letter is
    therefore never treated as a combinator ("Th..." stays "T", "h") — confirm
    that this is intended.
    """
    if not name:
        return [], False, False
    if not isinstance(opslagsform, str):
        opslagsform = ""

    # Invariant for the whole name, so evaluated once rather than per "h".
    has_h_element = any(element in opslagsform for element in _OPSLAG_EXCEPTIONS)

    graphs = [name[0]]
    caught = False
    changed = False

    for position in range(1, len(name)):
        item = name[position].lower()

        if graphs[-1] in _COMBINATORS and item in _SUFFIXES:
            # Text after the "h", compared as written (not lower-cased).
            following = name[position + 1:]
            if has_h_element and following.startswith(_FOLLOWING_EXCEPTIONS):
                # The "h" belongs to the next word element: keep it separate.
                caught = True
                graphs.append(item)
            else:
                graphs[-1] += item
                changed = True
        else:
            graphs.append(item)

    return graphs, caught, changed

In [None]:
def normalise(graphs):
    """Return a new list in which spelling variants are mapped to one form.

    "ch" becomes "kh", "c" becomes "k", and "bh", "dh", "gh" lose their "h".
    Every other graph is passed through unchanged. The lookup is exact, so
    upper-case graphs are not replaced.
    """
    replacements = dict(ch="kh", c="k", bh="b", dh="d", gh="g")

    result = []
    for graph in graphs:
        if graph in replacements:
            result.append(replacements[graph])
        else:
            result.append(graph)
    return result

In [None]:
# Split every source spelling into graphs and normalise them; both are stored
# as ";"-joined strings. Rows in which an "h" was merged into the preceding
# consonant are logged in `changed`, rows in which an exception kept the "h"
# separate are logged in `exceptions`.
graphs, normalised, changed, exceptions = [], [], [], []

spellings = zip(only_opslag.index, only_opslag["kildeopslag"], only_opslag["opslagsform"])
for index, kildeopslag, opslagsform in spellings:
    print(index, end="\r")
    graph, was_caught, was_changed = parse_graphs(kildeopslag, opslagsform)
    normalised_graph = ";".join(normalise(graph))

    graphs.append(";".join(graph))
    normalised.append(normalised_graph)

    log_entry = (graph, normalised_graph, kildeopslag, opslagsform)
    if was_changed:
        changed.append(log_entry)
    if was_caught:
        exceptions.append(log_entry)

In [None]:
# Number of spellings in which at least one "h" was merged into the preceding consonant.
len(changed)

In [None]:
# Number of spellings in which an exception kept at least one "h" separate.
len(exceptions)

In [None]:
# Show the logged exception cases (graphs, normalised form, spelling, headword) for manual review.
exceptions

In [None]:
# Store the ";"-joined graph strings and their normalised counterparts as new columns.
for column_name, column_values in (("graphs", graphs), ("normalised", normalised)):
    only_opslag[column_name] = column_values

In [None]:
#for n in set(["\t".join(n[1:]) for n in prev_changed])-set(["\t".join(n[1:]) for n in changed]):
#    print(n)

In [None]:
# List the available columns before choosing which ones to export.
only_opslag.columns

In [None]:
# Export the selected columns as a tab-separated file, without the row index.
export_columns = ["kildeopslag", "date_from", "date_to", "opslagsform", "graphs", "normalised"]
only_opslag.to_csv(
    "danmarksstednavne.csv",
    sep="\t",
    columns=export_columns,
    index=False,
)