In [None]:
import pandas as pd

In [None]:
# Keyword arguments forwarded to pd.read_csv: tab-delimited input, header on
# the first row, and explicit dtypes for columns 0, 1, 3, 4 and 5 (by position).
config = dict(
    delimiter="\t",
    header=0,
    dtype={0: str, 1: int, 3: str, 4: str, 5: str},
)

In [None]:
# Load the place-name table; all parsing options (delimiter, header row,
# per-column dtypes) come from `config`.
opslagsformer = pd.read_csv("danmarksstednavne.csv", **config)

In [None]:
# Display the loaded table as a quick sanity check.
opslagsformer

In [None]:
# Keep only the rows whose `date_from` is strictly before 1800.
# `.copy()` makes the subset an independent frame, so the columns added to it
# further down do not trigger pandas' SettingWithCopyWarning.
opslag_pre_1800 = opslagsformer[opslagsformer["date_from"] < 1800].copy()

In [None]:
# Summary statistics for the pre-1800 subset.
opslag_pre_1800.describe()

In [None]:
import regex
def prepare_to_lm(row, col):
    """Turn the ";"-separated string in ``row[col]`` into a language-model line.

    Steps, in order:
      1. every space inside the value is replaced by "#";
      2. the value is split on ";" and the pieces are re-joined with single
         spaces, then lower-cased;
      3. the markers "{ i }" and "{ / i }" are removed;
      4. every remaining character in Unicode category Ps (open punctuation)
         or Pe (close punctuation) is removed;
      5. runs of whitespace are collapsed and the ends are trimmed.

    Args:
        row: mapping-like object (dict or DataFrame row) indexable by ``col``.
        col: key of the field holding the ";"-separated string.

    Returns:
        The cleaned, space-separated string.
    """
    pieces = row[col].replace(" ", "#").split(";")
    text = " ".join(pieces).lower()
    # The order matters: the italic markers must go before their braces are
    # stripped by the Ps/Pe pattern.
    for pattern in (r"\{( \/)? i \}", r"\p{Ps}|\p{Pe}"):
        text = regex.sub(pattern, "", text)
    return " ".join(text.split())

In [None]:
# Smoke test for prepare_to_lm: a string with italic markers and three kinds
# of brackets, turned into a ";"-separated character sequence.
v = "{i}tester{/i} br(a)g{th} [igen]"
row = {"test": ";".join(v)}  # str.join iterates the characters of `v` directly
prepare_to_lm(row, "test")

In [None]:
# Add the language-model string for every row, built from the "normalised"
# column. `assign` returns a new frame instead of writing into a filtered
# selection, which avoids SettingWithCopyWarning and keeps the cell re-runnable.
opslag_pre_1800 = opslag_pre_1800.assign(
    lmdata=opslag_pre_1800.apply(lambda row: prepare_to_lm(row, "normalised"), axis=1)
)

In [None]:
# Display the pre-1800 frame to inspect the newly added `lmdata` column.
opslag_pre_1800

In [None]:
import numpy as np

In [None]:
# Bin edges for np.digitize: a leading 0 followed by 50-year steps from 1300
# up to and including 1800.
bins = np.concatenate(([0], np.arange(1300, 1801, step=50)))
bins

In [None]:
# Label every row with the index of the bin its `date_from` falls into
# (np.digitize returns i such that bins[i-1] <= x < bins[i]).
# `assign` returns a new frame rather than mutating a filtered selection,
# which avoids SettingWithCopyWarning and keeps the cell re-runnable.
opslag_pre_1800 = opslag_pre_1800.assign(
    label=np.digitize(opslag_pre_1800["date_from"], bins)
)

In [None]:
from collections import Counter

In [None]:
# Print, for each bin label in ascending order: the label, the
# "<lower edge>-<upper edge>" range taken from `bins`, and the row count.
label_counts = Counter(opslag_pre_1800["label"])
for binx in sorted(label_counts):
    count = label_counts[binx]
    print(binx, "{}-{}".format(bins[binx-1], bins[binx]), count)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
import os

In [None]:
# Root output directory; the per-label `epoch_<label>` sub-directories are
# created beneath it.
main_dir = "data/experiment/danmarksstednavne"

In [None]:
# Create one output directory per bin label.
# exist_ok=True lets the cell be re-run without raising FileExistsError.
for label in set(opslag_pre_1800["label"]):
    os.makedirs(main_dir+"/epoch_{}".format(label), exist_ok=True)

In [None]:
# For every bin label: split its rows 90/10 (fixed seed) and write one
# `lmdata` string per line into epoch_<label>/{train,valid,test}.txt.
for label in set(opslag_pre_1800["label"]):
    epoch_dir = main_dir+"/epoch_{}".format(label)

    items = opslag_pre_1800[opslag_pre_1800["label"]==label]
    train, dev = train_test_split(items, test_size=0.1, random_state=42)

    # Each file is opened once in "w" mode, so a re-run overwrites the previous
    # content; the context manager guarantees the handle is closed even if a
    # write fails. UTF-8 is set explicitly so the output does not depend on the
    # platform's default encoding.
    with open(epoch_dir+"/train.txt", "w", encoding="utf-8") as out:
        out.writelines(line+"\n" for line in train["lmdata"])

    # NOTE(review): the held-out 10% is written to BOTH valid.txt and test.txt,
    # i.e. the two files are identical - confirm this is intended.
    for filename in ("valid.txt", "test.txt"):
        with open(epoch_dir+"/"+filename, "w", encoding="utf-8") as out:
            out.writelines(line+"\n" for line in dev["lmdata"])

## Randomized bins

In [None]:
from sklearn.model_selection import StratifiedKFold
from collections import Counter
import os

In [None]:
# Redirect output to the sibling "<main_dir>_control/" directory used for the
# randomized control bins. The guard keeps the cell idempotent: re-running it
# does not append the suffix a second time.
if not main_dir.endswith("_control/"):
    main_dir = main_dir+"_control/"
main_dir

In [None]:
# Distinct bin labels present in the data; used below to name the control
# `epoch_<label>` directories and to set the number of folds.
labels = set(opslag_pre_1800["label"])

In [None]:
# Create one control output directory per bin label.
# exist_ok=True lets the cell be re-run without raising FileExistsError.
for label in labels:
    os.makedirs(main_dir+"/epoch_{}".format(label), exist_ok=True)

In [None]:
# One fold per bin label: each fold later plays the role of one "epoch" in the
# randomized control setting. Shuffling is seeded for reproducibility.
n_groups = len(labels)
splitter = StratifiedKFold(
    n_splits=n_groups,
    shuffle=True,
    random_state=42,
)

In [None]:
# Randomized control: pair every bin label with one stratified fold (so each
# fold mixes rows from all bins in proportion), split the fold 90/10 and write
# it to epoch_<label>/{train,valid,test}.txt under the control directory.
for label, (_, test_idx) in zip(labels, splitter.split(opslag_pre_1800["label"], opslag_pre_1800["label"])):
    # split() yields positional indices, hence .iloc
    items = opslag_pre_1800.iloc[test_idx]
    print("Generating random data for label <{}>".format(label))
    print("Number of data points: ", len(items))
    print("Distribution: ")
    print(Counter(items["label"]))

    train, dev = train_test_split(items, test_size=0.1, random_state=42)

    epoch_dir = main_dir+"/epoch_{}".format(label)

    # Each file is opened once in "w" mode: re-running the cell overwrites the
    # files instead of appending duplicate lines to them. UTF-8 is set
    # explicitly so the output does not depend on the platform's default
    # encoding.
    with open(epoch_dir+"/train.txt", "w", encoding="utf-8") as out:
        out.writelines(line+"\n" for line in train["lmdata"])

    # NOTE(review): the held-out 10% is written to BOTH valid.txt and test.txt,
    # i.e. the two files are identical - confirm this is intended.
    for filename in ("valid.txt", "test.txt"):
        with open(epoch_dir+"/"+filename, "w", encoding="utf-8") as out:
            out.writelines(line+"\n" for line in dev["lmdata"])