From a5afe2b1dfa6403154eda8f7cbf3236e53b5bc51 Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Fri, 1 Mar 2024 11:12:54 +0100
Subject: [PATCH 01/40] refactor: get local alto files from alto path

---
 src/pipeline.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/pipeline.py b/src/pipeline.py
index 27e470f..e4bf640 100644
--- a/src/pipeline.py
+++ b/src/pipeline.py
@@ -25,8 +25,11 @@ def files():
         return filenames, files()
 
 def main(args):
-    if args.local_alto is not None:
-        package_ids = args.local_alto
+    if args.alto_path is not None:
+        if args.local_alto is not None:
+            package_ids = args.local_alto
+        else:
+            package_ids = os.listdir(args.alto_path)
         archive = None
     else:
         if args.protocol_ids is not None:
@@ -38,7 +41,7 @@ def main(args):
         archive = LazyArchive()
     for package_id in progressbar.progressbar(list(package_ids)):
         data = infer_metadata(package_id)
-        print("metadata", data)
+        print("\n", package_id, "\n metadata", data)
         data["authority"] = args.authority
         data["session"] = data["sitting"]
         data["protocol_id"] = data["protocol"]
@@ -52,7 +55,6 @@ def main(args):
         else:
             filenames, files = fetch_local_package(args.alto_path, package_id)
         paragraphs = convert_alto(filenames, files)
-        print()
         print(paragraphs[0])
 
         data["paragraphs"] = paragraphs

From a062874815c294241731e799044128056091a75f Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Fri, 1 Mar 2024 11:13:50 +0100
Subject: [PATCH 02/40] refactor: operate on a single protocol, fetch data
 location via pyriksdagen.utils

---
 src/classify_intros.py | 45 ++++++++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/src/classify_intros.py b/src/classify_intros.py
index e0a4416..41bdb57 100644
--- a/src/classify_intros.py
+++ b/src/classify_intros.py
@@ -4,7 +4,7 @@
 import pandas as pd
 from lxml import etree
 from transformers import AutoModelForSequenceClassification, BertTokenizerFast
-from pyriksdagen.utils import protocol_iterators, elem_iter
+from pyriksdagen.utils import protocol_iterators, elem_iter, get_data_location
 import torch
 from tqdm import tqdm
 from torch.utils.data import DataLoader
@@ -52,25 +52,32 @@ def predict_intro(df, cuda):
     return pd.DataFrame(intros, columns=['file_path', 'id'])
 
 def main(args):
-    # Create folder iterator for reasonably large batches
-    protocols = protocol_iterators("corpus/protocols/", start=args.start, end=args.end)
-    protocols = [os.path.split(p) for p in protocols]
-    protocol_df = pd.DataFrame(protocols, columns=['folder', 'file'])
-    protocol_df = protocol_df.sort_values(by=['folder', 'file'])
-    folders = sorted(set(protocol_df['folder']))
-    intros = []
-    for folder in folders:
-        files = protocol_df.loc[protocol_df['folder'] == folder, 'file'].tolist()
-        data = []
-        for file in tqdm(files, total=len(files)):
-            data.extend(extract_note_seg(os.path.join(folder, file)))
-        df = pd.DataFrame(data, columns=['text', 'id', 'file_path'])
-        print(df)
+    if args.protocol:
+        df = pd.DataFrame(
+            extract_note_seg(args.protocol),
+            columns=['text', 'id', 'file_path'])
         df = predict_intro(df, cuda=args.cuda)
-        intros.append(df)
+        print(df)
+    else:
+        # Create folder iterator for reasonably large batches
+        protocols = protocol_iterators(get_data_location("records"), start=args.start, end=args.end)
+        protocols = [os.path.split(p) for p in protocols]
+        protocol_df = pd.DataFrame(protocols, columns=['folder', 'file'])
+        protocol_df = protocol_df.sort_values(by=['folder', 'file'])
+        folders = sorted(set(protocol_df['folder']))
+
+        for folder in folders:
+            files = protocol_df.loc[protocol_df['folder'] == folder, 'file'].tolist()
+            data = []
+            for file in tqdm(files, total=len(files)):
+                data.extend(extract_note_seg(os.path.join(folder, file)))
+            df = pd.DataFrame(data, columns=['text', 'id', 'file_path'])
+            print(df)
+            df = predict_intro(df, cuda=args.cuda)
+            intros.append(df)
 
-    df = pd.concat(intros)
+        df = pd.concat(intros)
 
     df.to_csv(args.outpath, index=False)
 
@@ -78,6 +85,10 @@
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument("-s", "--start", type=int, default=1920, help="Start year")
     parser.add_argument("-e", "--end", type=int, default=2022, help="End year")
+    parser.add_argument("-p", "--protocol",
+                        type=str,
+                        default=None,
+                        help="operate on a single protocol")
     parser.add_argument("--cuda", action="store_true", help="Set this flag to run with cuda.")
     parser.add_argument("--outpath", default="input/segmentation/intros.csv")
     args = parser.parse_args()

From 2428fab028ada5fca21be7e3e0f10c6649b979de Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Fri, 1 Mar 2024 11:48:36 +0100
Subject: [PATCH 03/40] refactor: operate on a single protocol, fetch data
 location via pyriksdagen.utils

---
 src/resegment.py | 40 ++++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/src/resegment.py b/src/resegment.py
index 7156274..1acd471 100644
--- a/src/resegment.py
+++ b/src/resegment.py
@@ -9,21 +9,33 @@
     find_introductions,
     update_ids,
 )
-from pyriksdagen.utils import infer_metadata
-from pyriksdagen.utils import protocol_iterators
-
+from pyriksdagen.utils import (
+    infer_metadata,
+    get_data_location,
+    protocol_iterators,
+    write_protocol,
+)
 from lxml import etree
 import pandas as pd
 import os, progressbar, argparse
 
-def main(args):
-    start_year = args.start
-    end_year = args.end
+
+
+
+def main(args):
+    if args.protocol:
+        protocols = [args.protocol]
+    else:
+        start_year = args.start
+        end_year = args.end
+        protocols = list(protocol_iterators(
+            get_data_location("records"),
+            start=args.start,
+            end=args.end))
     parser = etree.XMLParser(remove_blank_text=True)
     intro_df = pd.read_csv(args.segmentation_file)
 
-    for protocol in progressbar.progressbar(list(protocol_iterators(args.records_folder, start=args.start, end=args.end))):
+    for protocol in progressbar.progressbar(protocols):
         intro_ids = intro_df.loc[intro_df['file_path'] == protocol, 'id'].tolist()
 
         metadata = infer_metadata(protocol)
@@ -46,20 +58,20 @@ def main(args):
             (pattern_db["start"] <= year) & (pattern_db["end"] >= year)
         ]
         root = find_introductions(root, pattern_db, intro_ids, minister_db=None)
-        root = format_texts(root)
-        b = etree.tostring(
-            root, pretty_print=True, encoding="utf-8", xml_declaration=True
-        )
-        with open(protocol, "wb") as f:
-            f.write(b)
+        write_protocol(root, protocol)
+
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("--records_folder", type=str, default="corpus/protocols")
     parser.add_argument("--segmentation_file", type=str, default="input/segmentation/intros.csv")
     parser.add_argument("-s", "--start", type=int, default=1920, help="Start year")
     parser.add_argument("-e", "--end", type=int, default=2022, help="End year")
+    parser.add_argument("-p", "--protocol",
+                        type=str,
+                        default=None,
+                        help="operate on a single protocol")
     args = parser.parse_args()
     main(args)

From 9fa56b0936cba8e6bc8f9a43dd3a610ad001e35c Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Fri, 1 Mar 2024 12:38:29 +0100
Subject: [PATCH 04/40] refactor: operate on single protocol, get prot path
 from pyriksdagen.utils, rm outrrrraaageous package imports

---
 src/add_uuid.py | 77 ++++++++++++++++++++++++-------------------------
 1 file changed, 38 insertions(+), 39 deletions(-)

diff --git a/src/add_uuid.py b/src/add_uuid.py
index 19e7ea8..70254a9 100644
--- a/src/add_uuid.py
+++ b/src/add_uuid.py
@@ -2,71 +2,64 @@
 Add a randomly generated UUID to all elements in the XML ID field that are
 currently missing one.
 """
 from lxml import etree
-import pandas as pd
-import json, math
-import os, argparse
-from datetime import datetime
-from pyparlaclarin.refine import (
-    format_texts,
+from pyriksdagen.utils import (
+    elem_iter,
+    get_formatted_uuid,
+    get_data_location,
+    parse_protocol,
+    protocol_iterators,
+    write_protocol,
 )
-
-from pyriksdagen.db import filter_db, load_patterns, load_metadata
-from pyriksdagen.refine import (
-    redetect_protocol,
-    detect_mps,
-    find_introductions,
-    update_ids,
-)
-from pyriksdagen.utils import infer_metadata, parse_date, elem_iter, protocol_iterators, get_formatted_uuid
-from pyriksdagen.match_mp import clean_names, multiple_replace
 from tqdm import tqdm
+import argparse
 import multiprocessing
-import uuid
-import base58
+
+
+
 
 def add_protocol_id(protocol):
-    ids = set()
-    tei_ns = ".//{http://www.tei-c.org/ns/1.0}"
-    xml_ns = "{http://www.w3.org/XML/1998/namespace}"
-    parser = etree.XMLParser(remove_blank_text=True)
-    root = etree.parse(protocol, parser).getroot()
-
-    tei = root.find(f"{tei_ns}TEI")
-    tei.attrib[f"{xml_ns}id"] = protocol.split("/")[-1][:-4]
+    ids = set()
     num_ids = 0
+
+    root, ns = parse_protocol(protocol, get_ns=True)
     for tag, elem in elem_iter(root):
         if tag == "u":
             for subelem in elem:
-                x = subelem.attrib.get(f'{xml_ns}id', get_formatted_uuid())
-                subelem.attrib[f'{xml_ns}id'] = x
+                x = subelem.attrib.get(f"{ns['xml_ns']}id", get_formatted_uuid())
+                subelem.attrib[f"{ns['xml_ns']}id"] = x
                 ids.add(x)
                 num_ids += 1
-            x = elem.attrib.get(f'{xml_ns}id', get_formatted_uuid())
-            elem.attrib[f'{xml_ns}id'] = x
+            x = elem.attrib.get(f"{ns['xml_ns']}id", get_formatted_uuid())
+            elem.attrib[f"{ns['xml_ns']}id"] = x
             ids.add(x)
             num_ids += 1
-
         elif tag in ["note"]:
-            x = elem.attrib.get(f'{xml_ns}id', get_formatted_uuid())
-            elem.attrib[f'{xml_ns}id'] = x
+            x = elem.attrib.get(f"{ns['xml_ns']}id", get_formatted_uuid())
+            elem.attrib[f"{ns['xml_ns']}id"] = x
             ids.add(x)
             num_ids += 1
-    b = etree.tostring(
-        root, pretty_print=True, encoding="utf-8", xml_declaration=True
-    )
-    f = open(protocol, "wb")
-    f.write(b)
+    write_protocol(root, protocol)
 
     assert len(ids) == num_ids
     return ids, num_ids
+
+
+
 
 def main(args):
-    protocols = sorted(list(protocol_iterators(args.records_folder, start=args.start, end=args.end)))
+
     num_ids = 0
     ids = []
+
+    if args.protocol:
+        protocols = [args.protocol]
+    else:
+        protocols = sorted(list(protocol_iterators(
+            get_data_location("records"),
+            start=args.start, end=args.end)))
+
     with multiprocessing.Pool() as pool:
         for i, n in tqdm(pool.imap(add_protocol_id, protocols), total=len(protocols)):
             ids += i
@@ -75,10 +68,16 @@ def main(args):
 
     assert len(set(ids)) == num_ids
 
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument("--records_folder", type=str, default="corpus/records")
     parser.add_argument("-s", "--start", type=int, default=1920, help="Start year")
     parser.add_argument("-e", "--end", type=int, default=2022, help="End year")
+    parser.add_argument("-p", "--protocol",
+                        type=str,
+                        default=None,
protocol") args = parser.parse_args() main(args) From 392c5104f8b876cc683110eb42c6758cd2b6cc59 Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Fri, 1 Mar 2024 12:49:53 +0100 Subject: [PATCH 05/40] refactor: operate on single protocol, get prot path from pyriksdagen.utils, rm outrrrraaageous package imports --- src/find_dates.py | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/src/find_dates.py b/src/find_dates.py index b9f28de..e23188b 100644 --- a/src/find_dates.py +++ b/src/find_dates.py @@ -2,32 +2,45 @@ Find margin notes with a date in protocols and include them as metadata. """ from lxml import etree -import pandas as pd -import os +from pyriksdagen.refine import detect_date +from pyriksdagen.utils import ( + infer_metadata, + parse_protocol, + protocol_iterators, + write_protocol, +) import progressbar import argparse -from pyriksdagen.db import filter_db, load_patterns -from pyriksdagen.refine import detect_date -from pyriksdagen.utils import infer_metadata, protocol_iterators + + def main(args): - parser = etree.XMLParser(remove_blank_text=True) - for protocol_path in progressbar.progressbar(list(protocol_iterators("corpus/", start=args.start, end=args.end))): + + if args.protocol: + protocols = [args.protocol] + else: + protocols = sorted(list(protocol_iterators( + get_data_location("records"), + start=args.start, end=args.end))) + + for protocol_path in progressbar.progressbar(protocols): metadata = infer_metadata(protocol_path) - root = etree.parse(protocol_path, parser) + root = parse_protocol(protocol_path) root, dates = detect_date(root, metadata) - b = etree.tostring( - root, pretty_print=True, encoding="utf-8", xml_declaration=True - ) - f = open(protocol_path, "wb") - f.write(b) - f.close() + write_protocol(root, protocol_path) + + + if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("-s", "--start", type=int, default=1920, help="Start year") parser.add_argument("-e", "--end", type=int, default=2022, help="End year") + parser.add_argument("-p", "--protocol", + type=str, + default=None, + help="operate on a single protocol") args = parser.parse_args() main(args) From 9917bfa627bf21ac5a58607ab7b1b083d3824afd Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Fri, 1 Mar 2024 13:16:58 +0100 Subject: [PATCH 06/40] refactor: operate on single protocol, get prot path from pyriksdagen.utils, rm outrrrraaageous package imports --- src/reclassify.py | 49 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/src/reclassify.py b/src/reclassify.py index 982695b..ada75cf 100755 --- a/src/reclassify.py +++ b/src/reclassify.py @@ -1,18 +1,27 @@ """ Run the classification into utterances and notes. 
""" -from pyparlaclarin.refine import reclassify, format_texts, random_classifier - -from pyriksdagen.db import filter_db, load_patterns -from pyriksdagen.utils import infer_metadata, protocol_iterators +from pyparlaclarin.refine import reclassify, random_classifier + +#from pyriksdagen.db import filter_db, load_patterns +from pyriksdagen.utils import ( + #infer_metadata, + get_data_location, + parse_protocol, + protocol_iterators, + write_protocol, +) from lxml import etree import pandas as pd -import os, progressbar, sys +import progressbar import argparse import numpy as np TEI_NS = "{http://www.tei-c.org/ns/1.0}" + + + def classify_paragraph(s, model, ft, dim, prior=np.log([0.8, 0.2]), prob_dict={}, cache_preds=True): if s is None: return "note" @@ -44,6 +53,7 @@ def classify_paragraph(s, model, ft, dim, prior=np.log([0.8, 0.2]), prob_dict={} else: return "u" + def get_neural_classifier(model, ft, dim): prob_dict = {} return (lambda paragraph: classify_paragraph(paragraph.text, model, ft, dim, prob_dict=prob_dict)) @@ -62,6 +72,7 @@ def preclassified(d, elem): xml_id = elem.attrib[xml_id] return d.get(xml_id, default) + def get_filename_classifier(filename): df = pd.read_csv(filename) print("Generate dict...") @@ -69,6 +80,9 @@ def get_filename_classifier(filename): print("done") return (lambda paragraph: preclassified(d, paragraph)) + + + def main(args): parser = etree.XMLParser(remove_blank_text=True) @@ -86,22 +100,31 @@ def main(args): model = keras.models.load_model('input/segment-classifier/') classifier = get_neural_classifier(model, ft, dim) - for protocol_path in progressbar.progressbar(list(protocol_iterators("corpus/protocols/", start=args.start, end=args.end))): - print(protocol_path) - metadata = infer_metadata(protocol_path) - root = etree.parse(protocol_path, parser).getroot() + if args.protocol: + protocols = [args.protocol] + else: + protocols = list(protocol_iterators(get_data_location("records"), start=args.start, end=args.end)) + for protocol_path in progressbar.progressbar(protocols): + print(protocol_path) + #metadata = infer_metadata(protocol_path) + root = parse_protocol(protocol_path) root = reclassify(root, classifier, exclude=["date", "speaker"]) - root = format_texts(root) - b = etree.tostring(root, pretty_print=True, encoding="utf-8", xml_declaration=True) - with open(protocol_path, "wb") as f: - f.write(b) + write_protocol(root, protocol_path) + + + + if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("-s", "--start", type=int, default=1920, help="Start year") parser.add_argument("-e", "--end", type=int, default=2022, help="End year") + parser.add_argument("-p", "--protocol", + type=str, + default=None, + help="operate on a single protocol") parser.add_argument("--method", type=str, default="w2v", help="default: w2w") parser.add_argument("--classfile", type=str, default=None, help="What's this? default=None") args = parser.parse_args() From f0315c56bc94c958c844b55fbb975b06b6ec8ef4 Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Fri, 1 Mar 2024 13:21:19 +0100 Subject: [PATCH 07/40] refactor: operate on single protocol, get prot path from pyriksdagen.utils, rm outrrrraaageous package imports --- src/reclassify.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/reclassify.py b/src/reclassify.py index ada75cf..09956b9 100755 --- a/src/reclassify.py +++ b/src/reclassify.py @@ -1,21 +1,19 @@ """ Run the classification into utterances and notes. 
""" +from lxml import etree from pyparlaclarin.refine import reclassify, random_classifier - -#from pyriksdagen.db import filter_db, load_patterns from pyriksdagen.utils import ( - #infer_metadata, get_data_location, parse_protocol, protocol_iterators, write_protocol, ) -from lxml import etree -import pandas as pd -import progressbar import argparse import numpy as np +import pandas as pd +import progressbar + TEI_NS = "{http://www.tei-c.org/ns/1.0}" @@ -84,7 +82,6 @@ def get_filename_classifier(filename): def main(args): - parser = etree.XMLParser(remove_blank_text=True) if args.classfile is not None: classifier = get_filename_classifier(args.classfile) @@ -107,7 +104,6 @@ def main(args): for protocol_path in progressbar.progressbar(protocols): print(protocol_path) - #metadata = infer_metadata(protocol_path) root = parse_protocol(protocol_path) root = reclassify(root, classifier, exclude=["date", "speaker"]) From a93044f4a7e1a3e853216074b44e88a89be2deec Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Fri, 1 Mar 2024 13:23:24 +0100 Subject: [PATCH 08/40] style --- src/reclassify.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/reclassify.py b/src/reclassify.py index 09956b9..5e2dae6 100755 --- a/src/reclassify.py +++ b/src/reclassify.py @@ -100,7 +100,10 @@ def main(args): if args.protocol: protocols = [args.protocol] else: - protocols = list(protocol_iterators(get_data_location("records"), start=args.start, end=args.end)) + protocols = list(protocol_iterators( + get_data_location("records"), + start=args.start, + end=args.end)) for protocol_path in progressbar.progressbar(protocols): print(protocol_path) From 796837897020b056243cf72676e33c699d6409da Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Fri, 1 Mar 2024 13:31:40 +0100 Subject: [PATCH 09/40] refactor: operate on single protocol, get prot path from pyriksdagen.utils, rm outrrrraaageous package imports --- src/dollar_sign_replace.py | 61 ++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/src/dollar_sign_replace.py b/src/dollar_sign_replace.py index 55a3adb..0415266 100644 --- a/src/dollar_sign_replace.py +++ b/src/dollar_sign_replace.py @@ -1,22 +1,23 @@ """ Fix a common OCR error: § is replaced with $. Only do this if we are sure of the error. 
""" -import argparse -from pyparlaclarin.refine import format_texts - -from pyriksdagen.db import load_metadata -from pyriksdagen.refine import redetect_protocol -from pyriksdagen.utils import protocol_iterators -from pyriksdagen.segmentation import join_text -from tqdm import tqdm -from multiprocessing import Pool -from functools import partial from lxml import etree +from pyriksdagen.utils import ( + get_data_location, + parse_protocol, + protocol_iterators, + write_protocol, +) +from tqdm import tqdm +import argparse import re tei_ns ="{http://www.tei-c.org/ns/1.0}" xml_ns = "{http://www.w3.org/XML/1998/namespace}" + + + def dollar_signs(root, exp_dollar_1, exp_dollar_2): for body in root.findall(f".//{tei_ns}body"): for div in body.findall(f"{tei_ns}div"): @@ -37,14 +38,15 @@ def dollar_signs(root, exp_dollar_1, exp_dollar_2): m = exp_dollar_2.search(elemtext).group(0) m_new = m.replace("$", "§") elem.text = elem.text.replace(m, m_new) - return root + def join_soft_hyphens_p(t): t = " ".join(t.split()) t = re.sub(' ?\u00ad ?', '', t) return t + def join_soft_hyphens(root, soft_hyphen): for body in root.findall(f".//{tei_ns}body"): for div in body.findall(f"{tei_ns}div"): @@ -55,37 +57,46 @@ def join_soft_hyphens(root, soft_hyphen): seg.text = join_soft_hyphens_p(seg.text) elif elem.text is not None: elem.text = join_soft_hyphens_p(elem.text) - root = format_texts(root, padding=10) + #root = format_texts(root, padding=10) return root + + + def main(args): - protocols = sorted(list(protocol_iterators("corpus/protocols/", start=args.start, end=args.end))) - print(protocols) - parser = etree.XMLParser(remove_blank_text=True) + + if args.protocol: + protocols = [args.protocol] + else: + protocols = sorted(list(protocol_iterators( + get_data_location("records"), + start=args.start, + end=args.end))) exp_dollar_1 = re.compile("^8 [0-9]{1,2}\.") exp_dollar_2 = re.compile("^[0-9]{1,2} ?\$") - soft_hyphen = re.compile("^[0-9]{1,2} ?\$") for protocol in tqdm(protocols, total=len(protocols)): - with open(protocol) as f: - root = etree.parse(f, parser).getroot() - + root = parse_protocol(protocol) root = dollar_signs(root, exp_dollar_1, exp_dollar_2) root = join_soft_hyphens(root, soft_hyphen) - b = etree.tostring( - root, pretty_print=True, encoding="utf-8", xml_declaration=True - ) + write_protocol(root, protocol) + - with open(protocol, "wb") as f: - f.write(b) if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("-s", "--start", type=int, default=1920, help="Start year") parser.add_argument("-e", "--end", type=int, default=2022, help="End year") - parser.add_argument("--parallel", type=int, default=1, help="type=int, default=1: nymber of parallel...doesn't seem to do anything.") + parser.add_argument("-p", "--protocol", + type=str, + default=None, + help="operate on a single protocol") + parser.add_argument("--parallel", + type=int, + default=1, + help="type=int, default=1: nymber of parallel...doesn't seem to do anything.") args = parser.parse_args() main(args) From c65c4e51ac3f2f125b3a52b4f7bb0a3f2abd001a Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Fri, 1 Mar 2024 14:26:46 +0100 Subject: [PATCH 10/40] refactor: operate on single protocol, get prot path from pyriksdagen.utils, rm outrrrraaageous package imports --- src/fix_capitalized_dashes.py | 58 ++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/src/fix_capitalized_dashes.py b/src/fix_capitalized_dashes.py index 3349f34..efbeabf 100755 --- 
--- a/src/fix_capitalized_dashes.py
+++ b/src/fix_capitalized_dashes.py
@@ -1,48 +1,58 @@
 """
 Concatenate split names of format "PERS- SON" into "PERSSON"
 """
-from lxml import etree
-import pandas as pd
-import os, progressbar, re
-import argparse
 from pyparlaclarin.read import paragraph_iterator
-from pyparlaclarin.refine import format_texts
-from pyriksdagen.utils import protocol_iterators
+from pyriksdagen.utils import (
+    get_data_location,
+    parse_protocol,
+    protocol_iterators,
+    write_protocol,
+)
 from tqdm import tqdm
+import argparse, re
+
+
+
 
 def main(args):
+    # NB: [A-ZÀ-Þ] is UPPERCASE LETTERS + Accented UPPERCASE letters, ÅÄÖ etc
     pattern = "([A-ZÀ-Þ]{2,10})(- )([A-ZÀ-Þ]{2,10})"
     e = re.compile(pattern)
-    protocols = sorted(list(protocol_iterators("corpus/protocols/", start=args.start, end=args.end)))
-    #print(protocols)
-    parser = etree.XMLParser(remove_blank_text=True)
+    if args.protocol:
+        protocols = [args.protocol]
+    else:
+        if args.records_folder is not None:
+            data_location = args.records_folder
+        else:
+            data_location = get_data_location("records")
+        protocols = sorted(list(protocol_iterators(data_location,
+                                                   start=args.start,
+                                                   end=args.end)))
 
     for protocol in tqdm(protocols, total=len(protocols)):
-        with open(protocol) as f:
-            root = etree.parse(f, parser).getroot()
+        root = parse_protocol(protocol)
+        for elem in paragraph_iterator(root, output="lxml"):
+            txt = elem.text
+            if txt is not None and len(e.findall(txt)) > 0:
+                elem.text = re.sub(pattern, r"\1\3", txt)
 
-        for elem in paragraph_iterator(root, output="lxml"):
-            pass # if elem.text is not None:
-            #    print(elem.text)
-            txt = elem.text
-            if txt is not None and len(e.findall(txt)) > 0:
-                elem.text = re.sub(pattern, r"\1\3", txt)
-            # e.match(string)
+        write_protocol(root, protocol)
 
-        root = format_texts(root)
-        b = etree.tostring(
-            root, pretty_print=True, encoding="utf-8", xml_declaration=True
-        )
-        with open(protocol, "wb") as f:
-            f.write(b)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument("-s", "--start", type=int, default=1920, help="Start year")
     parser.add_argument("-e", "--end", type=int, default=2022, help="End year")
+    parser.add_argument("-r", "--records-folder",
+                        type=str,
+                        default=None,
+                        help="(optional) Path to records folder, defaults to environment var or `data/`")
+    parser.add_argument("-p", "--protocol",
+                        type=str,
+                        default=None,
+                        help="operate on a single protocol")
     args = parser.parse_args()
     main(args)

From dbd71a043120c1cba5bc3c55b5c85b0d9ba3729b Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Fri, 1 Mar 2024 15:13:50 +0100
Subject: [PATCH 11/40] refactor: operate on single protocol, get prot path
 from pyriksdagen.utils

---
 src/redetect.py | 53 +++++++++++++++++++++++++++++++++++++------------
 1 file changed, 40 insertions(+), 13 deletions(-)

diff --git a/src/redetect.py b/src/redetect.py
index 286cc88..1e73e71 100644
--- a/src/redetect.py
+++ b/src/redetect.py
@@ -5,25 +5,38 @@
 import argparse
 from pyriksdagen.db import load_metadata
 from pyriksdagen.refine import redetect_protocol
-from pyriksdagen.utils import protocol_iterators
-from pyriksdagen.segmentation import join_text
+from pyriksdagen.utils import protocol_iterators, get_data_location
 from tqdm import tqdm
 from multiprocessing import Pool
 from functools import partial
+
+
+
 
 def main(args):
-    party_mapping, *dfs = load_metadata(metadata_folder=f"{args.metadata_root}/corpus/metadata", processed_metadata_folder=f"{args.metadata_root}/input/matching")
-    ## DEPRECIATED ##join_intros['text'] = join_intros.apply(lambda x: join_text(x['text1'], x['text2']), axis=1)
-    ## DEPRECIATED ##join_intros = join_intros.drop(['text1', 'text2'], axis=1)
-
+    if args.metadata_root is not None:
+        metadata_location = args.metadata_root
+    else:
+        metadata_location = get_data_location("metadata")
+    party_mapping, *dfs = load_metadata(metadata_location=metadata_location,
+                                        processed_metadata_folder=args.processed_metadata_folder)
     for df in dfs:
         df[["start", "end"]] = df[["start", "end"]].apply(pd.to_datetime, format='%Y-%m-%d')
     metadata = [party_mapping] + dfs
-
-    redetect_fun = partial(redetect_protocol, metadata)
-    protocols = sorted(list(protocol_iterators(args.records_folder, start=args.start, end=args.end)))
+
+    if args.protocol:
+        protocols = [args.protocol]
+    else:
+        if args.records_folder is not None:
+            data_location = args.records_folder
+        else:
+            data_location = get_data_location("records")
+        protocols = sorted(list(protocol_iterators(data_location,
+                                                   start=args.start,
+                                                   end=args.end)))
     unknowns = []
+    redetect_fun = partial(redetect_protocol, metadata)
     if args.parallel == 1:
         pool = Pool()
         for unk in tqdm(pool.imap(redetect_fun, protocols), total=len(protocols)):
@@ -36,16 +49,30 @@ def main(args):
     unknowns = pd.DataFrame(unknowns, columns=['protocol_id', 'uuid']+["gender", "party", "other"])
     print('Proportion of metadata identified for unknowns:')
     print((unknowns[["gender", "party", "other"]] != '').sum() / len(unknowns))
-    unknowns.drop_duplicates().to_csv(args.outfile, index=False)
+    unknowns.drop_duplicates().to_csv(f"{args.processed_metadata_folder}/unknowns.csv", index=False)
+
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("--records_folder", type=str, default="corpus/records")
-    parser.add_argument("--metadata_root", type=str, default=".")
+
+
     parser.add_argument("-s", "--start", type=int, default=1867, help="Start year")
     parser.add_argument("-e", "--end", type=int, default=2022, help="End year")
+    parser.add_argument("-r", "--records-folder",
+                        type=str,
+                        default=None,
+                        help="(optional) Path to records folder, defaults to environment var or `data/`")
+    parser.add_argument("-m", "--metadata-root",
+                        type=str,
+                        default=None,
+                        help="(optional) Path to metadata root folder, defaults to environment var or `data/`")
+    parser.add_argument("-p", "--protocol",
+                        type=str,
+                        default=None,
+                        help="operate on a single protocol")
     parser.add_argument("--parallel", type=int, default=1, help="N parallel processes (default=1)")
-    parser.add_argument("--outfile", type=str, default="input/matching/unknowns.csv")
+    parser.add_argument("--processed-metadata-folder", type=str, default="input/matching")
     args = parser.parse_args()
     main(args)

From e8c2fc7600776dbc82973e5218e4cffc0357594d Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Fri, 1 Mar 2024 15:25:05 +0100
Subject: [PATCH 12/40] refactor: operate on single protocol, get prot path
 from pyriksdagen.utils

---
 src/split_into_sections.py | 64 ++++++++++++++++++++------------------
 1 file changed, 33 insertions(+), 31 deletions(-)

diff --git a/src/split_into_sections.py b/src/split_into_sections.py
index 496f4c2..53282be 100755
--- a/src/split_into_sections.py
+++ b/src/split_into_sections.py
@@ -1,13 +1,19 @@
 """
 Split protocol into sections based on the paragraph sign '§' and other heuristics
 """
-from pyriksdagen.utils import protocol_iterators, infer_metadata
 from lxml import etree
-import numpy as np
-import pandas as pd
+from pyriksdagen.utils import (
+    get_data_location,
+    infer_metadata,
+    parse_protocol,
+    protocol_iterators,
+    write_protocol,
+)
 from tqdm import tqdm
 import argparse
-from multiprocessing import Pool
+import numpy as np
+import pandas as pd
+
 
 TEI_NS = "{http://www.tei-c.org/ns/1.0}"
 XML_NS = "{http://www.w3.org/XML/1998/namespace}"
@@ -40,8 +46,6 @@ def clean_next_prev(div, DEBUG):
 
     return div
 
-
-
 def create_divs(root, metadata, DEBUG):
     bodies = root.findall(f".//{TEI_NS}body")
     assert len(bodies) == 1
@@ -68,8 +72,6 @@ def create_divs(root, metadata, DEBUG):
 
     return root
 
-
-
 def clean_divs(root, DEBUG):
     divs = list(root.findall(f".//{TEI_NS}body/{TEI_NS}div"))
     if DEBUG: print(f"Cleaning {len(divs)} divs")
@@ -91,8 +93,6 @@ def clean_divs(root, DEBUG):
 
     return root
 
-
-
 def convert_u_heuristic(root):
     rows = []
     for div in list(root.findall(f".//{TEI_NS}div")) + list(root.findall(".//div")):
@@ -108,8 +108,6 @@ def convert_u_heuristic(root):
 
     return rows
 
-
-
 def nextprev_clean(root, DEBUG):
     divs = list(root.findall(f".//{TEI_NS}body/{TEI_NS}div"))
     if DEBUG: print(f"Cleaning {len(divs)} divs")
@@ -119,9 +117,7 @@ def nextprev_clean(root, DEBUG):
 
     return root
 
-
-
-def flow(root, rows, DEBUG):
+def flow(root, rows, metadata, DEBUG):
     if args.nextprev_only:
         if DEBUG: print("Only cleaning next/prev attribs")
         root = nextprev_clean(root, DEBUG)
@@ -143,34 +139,32 @@ def main(args):
     rows = []
     skip_counter = 0
     failures = []
-    parser = etree.XMLParser(remove_blank_text=True)
 
     if args.protocol:
         protocols = [args.protocol]
     else:
-        protocols = list(protocol_iterators(args.records_path, start=args.start, end=args.end))
-
-    for protocol in tqdm(protocols):
+        if args.records_folder is not None:
+            data_location = args.records_folder
+        else:
+            data_location = get_data_location("records")
+        protocols = sorted(list(protocol_iterators(data_location,
+                                                   start=args.start,
+                                                   end=args.end)))
+    for protocol in tqdm(protocols, total=len(protocols)):
        if DEBUG: print(protocol)
-
         if protocol in skip:
             if DEBUG: print("!!! SKIPPING")
             continue
-        root = etree.parse(protocol, parser).getroot()
-
+        root = parse_protocol(protocol)
         metadata = infer_metadata(protocol)
         try:
-            root, rows = flow(root, rows, DEBUG)
+            root, rows = flow(root, rows, metadata, DEBUG)
         except Exception:
             skip_counter += 1
             failures.append(protocol)
Skipping ...") else: - b = etree.tostring( - root, pretty_print=True, encoding="utf-8", xml_declaration=True - ) - with open(protocol, "wb") as f: - f.write(b) + write_protocol(root, protocol) if len(rows) > 0: df = pd.DataFrame(rows, columns=["id", "preds"]) @@ -184,12 +178,20 @@ def main(args): if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--records_path", type=str, default="corpus/records") parser.add_argument("-s", "--start", type=int, default=1867, help="Start year") parser.add_argument("-e", "--end", type=int, default=2022, help="End year") - parser.add_argument("-p", "--protocol", type=str, help="Provide a specific protocol") + parser.add_argument("-r", "--records-folder", + type=str, + default=None, + help="(optional) Path to records folder, defaults to environment var or `data/`") + parser.add_argument("-p", "--protocol", + type=str, + default=None, + help="operate on a single protocol") parser.add_argument("-d", "--debug", action="store_true", help="Print debug statements") - parser.add_argument("-c", "--nextprev-only", action="store_true", help="Only clean up next-prev attrs.") + parser.add_argument("-c", "--nextprev-only", + action="store_true", + help="Only clean up next-prev attrs.") parser.add_argument("--outpath", type=str, default="input/segmentation/section_heuristic_preds.csv") args = parser.parse_args() main(args) From d1d946001222261643c37e2ab1c7792a1a2c8ac3 Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Fri, 1 Mar 2024 15:43:19 +0100 Subject: [PATCH 13/40] feat: single script to add ID to u, seg, note and div --- src/add_uuid.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/src/add_uuid.py b/src/add_uuid.py index 70254a9..0fd1eb1 100644 --- a/src/add_uuid.py +++ b/src/add_uuid.py @@ -2,6 +2,7 @@ Add a randomly generated UUID to all elements in the XML ID field that are currently missing one. 
""" from lxml import etree +from pathlib import Path from pyriksdagen.utils import ( elem_iter, get_formatted_uuid, @@ -18,11 +19,25 @@ def add_protocol_id(protocol): - + print("add IDs") ids = set() num_ids = 0 root, ns = parse_protocol(protocol, get_ns=True) + + body = root.find(f".//{ns['tei_ns']}body") + if body is None: + print(protocol) + else: + divs = body.findall(f"{ns['tei_ns']}div") + for div in divs: + protocol_id = Path(protocol).stem + seed_str = f"{protocol_id}\n{' '.join(div.itertext())}" + x = div.attrib.get(f"{ns['xml_ns']}id", get_formatted_uuid(seed_str)) + div.attrib[f"{ns['xml_ns']}id"] = x + num_ids += 1 + ids.add(x) + for tag, elem in elem_iter(root): if tag == "u": for subelem in elem: @@ -56,9 +71,12 @@ def main(args): if args.protocol: protocols = [args.protocol] else: - protocols = sorted(list(protocol_iterators( - get_data_location("records"), - start=args.start, end=args.end))) + if args.records_folder is not None: + data_location = args.records_folder + else: + protocols = sorted(list(protocol_iterators( + get_data_location("records"), + start=args.start, end=args.end))) with multiprocessing.Pool() as pool: for i, n in tqdm(pool.imap(add_protocol_id, protocols), total=len(protocols)): @@ -75,6 +93,10 @@ def main(args): parser.add_argument("--records_folder", type=str, default="corpus/records") parser.add_argument("-s", "--start", type=int, default=1920, help="Start year") parser.add_argument("-e", "--end", type=int, default=2022, help="End year") + parser.add_argument("-r", "--records-folder", + type=str, + default=None, + help="(optional) Path to records folder, defaults to environment var or `data/`") parser.add_argument("-p", "--protocol", type=str, default=None, From 92c5ae4dfc00ac7441a92076d2db26f97d714bd2 Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Fri, 1 Mar 2024 15:46:11 +0100 Subject: [PATCH 14/40] feat: add --records-folder arg --- src/dollar_sign_replace.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/dollar_sign_replace.py b/src/dollar_sign_replace.py index 0415266..093e370 100644 --- a/src/dollar_sign_replace.py +++ b/src/dollar_sign_replace.py @@ -68,11 +68,14 @@ def main(args): if args.protocol: protocols = [args.protocol] else: - protocols = sorted(list(protocol_iterators( - get_data_location("records"), - start=args.start, - end=args.end))) - + if args.records_folder is not None: + data_location = args.records_folder + else: + data_location = get_data_location("records") + protocols = sorted(list(protocol_iterators(data_location, + start=args.start, + end=args.end))) + exp_dollar_1 = re.compile("^8 [0-9]{1,2}\.") exp_dollar_2 = re.compile("^[0-9]{1,2} ?\$") soft_hyphen = re.compile("^[0-9]{1,2} ?\$") @@ -90,6 +93,10 @@ def main(args): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("-s", "--start", type=int, default=1920, help="Start year") parser.add_argument("-e", "--end", type=int, default=2022, help="End year") + parser.add_argument("-r", "--records-folder", + type=str, + default=None, + help="(optional) Path to records folder, defaults to environment var or `data/`") parser.add_argument("-p", "--protocol", type=str, default=None, From c185fbdc49007bc7f7722c1dc2eba41c225d6a8b Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Fri, 1 Mar 2024 15:50:21 +0100 Subject: [PATCH 15/40] feat: add --records-folder arg --- src/reclassify.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/reclassify.py b/src/reclassify.py index 
index 5e2dae6..1490cda 100755
--- a/src/reclassify.py
+++ b/src/reclassify.py
@@ -100,10 +100,13 @@ def main(args):
     if args.protocol:
         protocols = [args.protocol]
     else:
-        protocols = list(protocol_iterators(
-            get_data_location("records"),
-            start=args.start,
-            end=args.end))
+        if args.records_folder is not None:
+            data_location = args.records_folder
+        else:
+            data_location = get_data_location("records")
+        protocols = list(protocol_iterators(data_location,
+                                            start=args.start,
+                                            end=args.end))
 
     for protocol_path in progressbar.progressbar(protocols):
         print(protocol_path)
@@ -120,6 +123,10 @@ def main(args):
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument("-s", "--start", type=int, default=1920, help="Start year")
     parser.add_argument("-e", "--end", type=int, default=2022, help="End year")
+    parser.add_argument("-r", "--records-folder",
+                        type=str,
+                        default=None,
+                        help="(optional) Path to records folder, defaults to environment var or `data/`")
     parser.add_argument("-p", "--protocol",
                         type=str,
                         default=None,

From d5b18b1cbb745a33f7bc8ba7b7d5d70bcb32a32b Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Fri, 1 Mar 2024 15:51:21 +0100
Subject: [PATCH 16/40] feat: add --records-folder arg

---
 src/find_dates.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/find_dates.py b/src/find_dates.py
index e23188b..df5a634 100644
--- a/src/find_dates.py
+++ b/src/find_dates.py
@@ -20,9 +20,13 @@ def main(args):
     if args.protocol:
         protocols = [args.protocol]
     else:
-        protocols = sorted(list(protocol_iterators(
-            get_data_location("records"),
-            start=args.start, end=args.end)))
+        if args.records_folder is not None:
+            data_location = args.records_folder
+        else:
+            data_location = get_data_location("records")
+        protocols = sorted(list(protocol_iterators(data_location,
+                                                   start=args.start,
+                                                   end=args.end)))
 
     for protocol_path in progressbar.progressbar(protocols):
         metadata = infer_metadata(protocol_path)
@@ -38,6 +42,10 @@ def main(args):
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument("-s", "--start", type=int, default=1920, help="Start year")
     parser.add_argument("-e", "--end", type=int, default=2022, help="End year")
+    parser.add_argument("-r", "--records-folder",
+                        type=str,
+                        default=None,
+                        help="(optional) Path to records folder, defaults to environment var or `data/`")
     parser.add_argument("-p", "--protocol",
                         type=str,
                         default=None,

From 521cd3551276f716265772b653bf78a32e4a4b5a Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Fri, 1 Mar 2024 15:55:40 +0100
Subject: [PATCH 17/40] feat: add --records-folder arg

---
 src/resegment.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/resegment.py b/src/resegment.py
index 1acd471..7a4a5bb 100644
--- a/src/resegment.py
+++ b/src/resegment.py
@@ -12,6 +12,7 @@
 from pyriksdagen.utils import (
     infer_metadata,
     get_data_location,
+    parse_protocol,
     protocol_iterators,
     write_protocol,
 )
@@ -26,13 +27,14 @@ def main(args):
     if args.protocol:
         protocols = [args.protocol]
     else:
-        start_year = args.start
-        end_year = args.end
-        protocols = list(protocol_iterators(
-            get_data_location("records"),
-            start=args.start,
-            end=args.end))
-    parser = etree.XMLParser(remove_blank_text=True)
+        if args.records_folder is not None:
+            data_location = args.records_folder
+        else:
+            data_location = get_data_location("records")
+        protocols = list(protocol_iterators(data_location,
+                                            start=args.start,
+                                            end=args.end))
+
     intro_df = pd.read_csv(args.segmentation_file)
 
     for protocol in progressbar.progressbar(protocols):
 
         intro_ids = intro_df.loc[intro_df['file_path'] == protocol, 'id'].tolist()
 
         metadata = infer_metadata(protocol)
         protocol_id = protocol.split("/")[-1]
         year = metadata["year"]
-        root = etree.parse(protocol, parser).getroot()
+
+        root = parse_protocol(protocol)
 
         years = [
             int(elem.attrib.get("when").split("-")[0])
@@ -66,9 +69,15 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("--segmentation_file", type=str, default="input/segmentation/intros.csv")
+    parser.add_argument("--segmentation_file",
+                        type=str,
+                        default="input/segmentation/intros.csv")
     parser.add_argument("-s", "--start", type=int, default=1920, help="Start year")
     parser.add_argument("-e", "--end", type=int, default=2022, help="End year")
+    parser.add_argument("-r", "--records-folder",
+                        type=str,
+                        default=None,
+                        help="(optional) Path to records folder, defaults to environment var or `data/`")
     parser.add_argument("-p", "--protocol",
                         type=str,
                         default=None,

From c15c557998705acab2c0daabdbb978e277102d01 Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Fri, 1 Mar 2024 15:59:00 +0100
Subject: [PATCH 18/40] feat: add --records-folder arg

---
 src/classify_intros.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/classify_intros.py b/src/classify_intros.py
index 41bdb57..6477adb 100644
--- a/src/classify_intros.py
+++ b/src/classify_intros.py
@@ -13,8 +13,10 @@
 from functools import partial
 import os
 
+
 def extract_elem(protocol, elem):
-    return elem.text, elem.get("{http://www.w3.org/XML/1998/namespace}id"), protocol
+    return elem.text, elem.get("{http://www.w3.org/XML/1998/namespace}id"), protocol
+
 
 def extract_note_seg(protocol):
     parser = etree.XMLParser(remove_blank_text=True)
@@ -27,6 +29,7 @@ def extract_note_seg(protocol):
         data.extend(list(map(partial(extract_elem, protocol), elem)))
     return data
 
+
 def predict_intro(df, cuda):
     model = AutoModelForSequenceClassification.from_pretrained("jesperjmb/parlaBERT")
     if cuda:
@@ -51,6 +54,8 @@ def predict_intro(df, cuda):
         intros.extend([[file_path, xml_id] for file_path, xml_id, pred in zip(file_path, xml_ids, preds) if pred == 1])
     return pd.DataFrame(intros, columns=['file_path', 'id'])
 
+
+
 def main(args):
     intros = []
     if args.protocol:
@@ -61,7 +66,11 @@ def main(args):
         print(df)
     else:
         # Create folder iterator for reasonably large batches
-        protocols = protocol_iterators(get_data_location("records"), start=args.start, end=args.end)
+        if args.records_folder is not None:
+            data_location = args.records_folder
+        else:
+            data_location = get_data_location("records")
+        protocols = protocol_iterators(data_location, start=args.start, end=args.end)
         protocols = [os.path.split(p) for p in protocols]
         protocol_df = pd.DataFrame(protocols, columns=['folder', 'file'])
         protocol_df = protocol_df.sort_values(by=['folder', 'file'])
@@ -81,10 +90,16 @@ def main(args):
 
     df.to_csv(args.outpath, index=False)
 
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument("-s", "--start", type=int, default=1920, help="Start year")
     parser.add_argument("-e", "--end", type=int, default=2022, help="End year")
+    parser.add_argument("-r", "--records-folder",
+                        type=str,
+                        default=None,
+                        help="(optional) Path to records folder, defaults to environment var or `data/`")
     parser.add_argument("-p", "--protocol",
                         type=str,
                         default=None,

From 567c3108bd975e4a7f1d6813586a4061a16b0e37 Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Fri, 1 Mar 2024 16:35:16 +0100
Subject: [PATCH 19/40] chore: organize scripts dir

---
 src/{ => cur-prot}/add_uuid.py               | 0
 src/{ => cur-prot}/classify_intros.py        | 0
 src/{ => cur-prot}/dollar_sign_replace.py    | 0
 src/{ => cur-prot}/find_dates.py             | 0
 src/{ => cur-prot}/fix_capitalized_dashes.py | 0
 src/{ => cur-prot}/pipeline.py               | 0
 src/{ => cur-prot}/reclassify.py             | 0
 src/{ => cur-prot}/redetect.py               | 0
 src/{ => cur-prot}/resegment.py              | 0
 src/{ => cur-prot}/split_into_sections.py    | 0
 10 files changed, 0 insertions(+), 0 deletions(-)
 rename src/{ => cur-prot}/add_uuid.py (100%)
 rename src/{ => cur-prot}/classify_intros.py (100%)
 rename src/{ => cur-prot}/dollar_sign_replace.py (100%)
 rename src/{ => cur-prot}/find_dates.py (100%)
 rename src/{ => cur-prot}/fix_capitalized_dashes.py (100%)
 rename src/{ => cur-prot}/pipeline.py (100%)
 rename src/{ => cur-prot}/reclassify.py (100%)
 rename src/{ => cur-prot}/redetect.py (100%)
 rename src/{ => cur-prot}/resegment.py (100%)
 rename src/{ => cur-prot}/split_into_sections.py (100%)

diff --git a/src/add_uuid.py b/src/cur-prot/add_uuid.py
similarity index 100%
rename from src/add_uuid.py
rename to src/cur-prot/add_uuid.py
diff --git a/src/classify_intros.py b/src/cur-prot/classify_intros.py
similarity index 100%
rename from src/classify_intros.py
rename to src/cur-prot/classify_intros.py
diff --git a/src/dollar_sign_replace.py b/src/cur-prot/dollar_sign_replace.py
similarity index 100%
rename from src/dollar_sign_replace.py
rename to src/cur-prot/dollar_sign_replace.py
diff --git a/src/find_dates.py b/src/cur-prot/find_dates.py
similarity index 100%
rename from src/find_dates.py
rename to src/cur-prot/find_dates.py
diff --git a/src/fix_capitalized_dashes.py b/src/cur-prot/fix_capitalized_dashes.py
similarity index 100%
rename from src/fix_capitalized_dashes.py
rename to src/cur-prot/fix_capitalized_dashes.py
diff --git a/src/pipeline.py b/src/cur-prot/pipeline.py
similarity index 100%
rename from src/pipeline.py
rename to src/cur-prot/pipeline.py
diff --git a/src/reclassify.py b/src/cur-prot/reclassify.py
similarity index 100%
rename from src/reclassify.py
rename to src/cur-prot/reclassify.py
diff --git a/src/redetect.py b/src/cur-prot/redetect.py
similarity index 100%
rename from src/redetect.py
rename to src/cur-prot/redetect.py
diff --git a/src/resegment.py b/src/cur-prot/resegment.py
similarity index 100%
rename from src/resegment.py
rename to src/cur-prot/resegment.py
diff --git a/src/split_into_sections.py b/src/cur-prot/split_into_sections.py
similarity index 100%
rename from src/split_into_sections.py
rename to src/cur-prot/split_into_sections.py

From b7280bdba30b5c6ddb15c5586d4003ceb60c3923 Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Fri, 1 Mar 2024 16:41:31 +0100
Subject: [PATCH 20/40] feat: script to run everything at once

---
 src/cur-prot/README.md        |  14 +++
 src/cur-prot/post-pipeline.py | 184 ++++++++++++++++++++++++++++++++++
 2 files changed, 198 insertions(+)
 create mode 100644 src/cur-prot/README.md
 create mode 100644 src/cur-prot/post-pipeline.py

diff --git a/src/cur-prot/README.md b/src/cur-prot/README.md
new file mode 100644
index 0000000..2129926
--- /dev/null
+++ b/src/cur-prot/README.md
@@ -0,0 +1,14 @@
+# cur-prot
+
+All the scripts used in curation of records should live here.
+
+New records are curated by running the following from the root of the project (one dir up from `scripts`):
+
+```
+RiksdagenCorpus $ python scripts/src/cur-prot/pipeline.py
+RiksdagenCorpus $ python scripts/src/cur-prot/post-pipeline.py
+```
+
+Important: pay attention to the required args and/or environment variables. Run each script with `-h` to see.
+
+Alternatively, look at the curation steps in post-pipeline and run them one at a time.
\ No newline at end of file
diff --git a/src/cur-prot/post-pipeline.py b/src/cur-prot/post-pipeline.py
new file mode 100644
index 0000000..53ef4ad
--- /dev/null
+++ b/src/cur-prot/post-pipeline.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+"""
+Do the whole post-`pipeline.py` curation in order.
+
+    classify_intros
+    resegment
+    add_uuid
+    find_dates
+    reclassify
+    add_uuid
+    dollar_sign_replace
+    fix_capitalized_dashes
+    redetect
+    split_into_sections
+    add_uuid
+
+"""
+import argparse, os, subprocess, sys
+
+
+SCR = os.environ.get("SCRIPTS", 'scripts/source')
+PIP = os.environ.get("PIPENV")
+CONDA = os.environ.get("CONDAENV")
+
+
+
+def list_args(l, args):
+    if args.protocol:
+        l.extend(["-p", args.protocol])
+    else:
+        l.extend(["-s", args.start, "-e", args.end])
+    return l
+
+
+def classify_intros(args):
+    print("\n\n\n Classifying Introductions \n\n\n")
+    l = [args.condaenv, f"{SCR}/cur-prot/classify_intros.py"]
+    l = list_args(l, args)
+    l.append("--cuda")
+    with subprocess.Popen(l, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process:
+        for line in process.stdout:
+            print(line.decode('utf8'))
+
+
+def resegment(args):
+    print("\n\n\n Resegmenting \n\n\n")
+    l = [args.pipenv, f"{SCR}/cur-prot/resegment.py"]
+    l = list_args(l, args)
+    with subprocess.Popen(l, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process:
+        for line in process.stdout:
+            print(line.decode('utf8'))
+
+
+def add_uuid(args):
+    print("\n\n\n Add UUID \n\n\n")
+    l = [args.pipenv, f"{SCR}/cur-prot/add_uuid.py"]
+    l = list_args(l, args)
+    with subprocess.Popen(l, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process:
+        for line in process.stdout:
+            print(line.decode('utf8'))
+
+
+def find_dates(args):
+    print("\n\n\n Find Dates \n\n\n")
+    l = [args.pipenv, f"{SCR}/cur-prot/find_dates.py"]
+    l = list_args(l, args)
+    with subprocess.Popen(l, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process:
+        for line in process.stdout:
+            print(line.decode('utf8'))
+
+
+def reclassify(args):
+    print("\n\n\n Reclassifying Intros \n\n\n")
+    l = [args.condaenv, f"{SCR}/cur-prot/reclassify.py"]
+    l = list_args(l, args)
+    with subprocess.Popen(l, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process:
+        for line in process.stdout:
+            print(line.decode('utf8'))
+
+
+def dollar_sign_replace(args):
+    print("\n\n\n Dollar Sign Replace \n\n\n")
+    l = [args.pipenv, f"{SCR}/cur-prot/dollar_sign_replace.py"]
+    l = list_args(l, args)
+    with subprocess.Popen(l, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process:
+        for line in process.stdout:
+            print(line.decode('utf8'))
+
+
+def fix_capitalized_dashes(args):
+    print("\n\n\n Fix Capitalized Dashes \n\n\n")
+    l = [args.pipenv, f"{SCR}/cur-prot/fix_capitalized_dashes.py"]
+    l = list_args(l, args)
+    with subprocess.Popen(l, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process:
+        for line in process.stdout:
+            print(line.decode('utf8'))
+
+
+def redetect(args):
+    print("\n\n\n Redetect Speakers \n\n\n")
+    l = [args.pipenv, f"{SCR}/cur-prot/redetect.py"]
+    l = list_args(l, args)
+    with subprocess.Popen(l, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process:
+        for line in process.stdout:
+            print(line.decode('utf8'))
+
+
+def split_into_sections(args):
+    print("\n\n\n Split into sections \n\n\n")
+    l = [args.pipenv, f"{SCR}/cur-prot/split_into_sections.py"]
+    l = list_args(l, args)
+    with subprocess.Popen(l, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process:
+        for line in process.stdout:
+            print(line.decode('utf8'))
+
+
+def add_uuid_to_divs(args):
+    print("\n\n\n Add UUID to divs \n\n\n")
+    l = [args.pipenv, f"{SCR}/cur-prot/add_uuid_to_divs.py"]
+    l = list_args(l, args)
+    with subprocess.Popen(l, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process:
+        for line in process.stdout:
+            print(line.decode('utf8'))
+
+
+def empty_subp():
+    print("\n\n\n \n\n\n")
+    with subprocess.Popen([], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process:
+        for line in process.stdout:
+            print(line.decode('utf8'))
+
+
+
+
+def main(args):
+    classify_intros(args)
+    resegment(args)
+    add_uuid(args)
+    find_dates(args)
+    reclassify(args)
+    add_uuid(args)
+    dollar_sign_replace(args)
+    fix_capitalized_dashes(args)
+    redetect(args)
+    split_into_sections(args)
+    add_uuid(args)
+    # to do -- update corpus docs
+
+
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("-s", "--start", type=str, default="1867", help="Start year")
+    parser.add_argument("-e", "--end", type=str, default="2022", help="End year")
+    parser.add_argument("-r", "--records-folder",
+                        type=str,
+                        default=None,
+                        help="(optional) Path to records folder, defaults to environment var or `data/`")
+    parser.add_argument("-p", "--protocol",
+                        type=str,
+                        help="operate on a single protocol")
+    parser.add_argument("--pipenv",
+                        type=str,
+                        default=None,
+                        help="Path to pip env. If unset, looks for environment variable, else fails.")
+    parser.add_argument("--condaenv",
+                        type=str,
+                        default=None,
+                        help="Path to conda env. If unset, looks for environment variable, else fails.")
+    args = parser.parse_args()
+    if args.pipenv == None:
+        if PIP == None:
+            print("You need to set a pip env.")
+            sys.exit()
+        else:
+            args.pipenv = PIP
+    if args.condaenv == None:
+        if CONDA == None:
+            print("You need to set a conda env.")
+            sys.exit()
+        else:
+            args.condaenv = CONDA
+    main(args)

From 0753be751228025cc12e779c94faaf89c9493fbc Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Tue, 5 Mar 2024 12:36:56 +0100
Subject: [PATCH 21/40] fix: test all scripts

---
 .github/workflows/push.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index 2835b32..32baa86 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -20,7 +20,7 @@ jobs:
         pip install -r requirements.txt
     - name: Test scripts
       run: |
-        for filename in src/*.py; do
+        for filename in src/**.py; do
           echo "$filename"
           python "$filename" --help
-        done
\ No newline at end of file
+        done

From 821d962efd34d940ae6aa356ee82fce2e41f4ee4 Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Thu, 7 Mar 2024 12:54:02 +0100
Subject: [PATCH 22/40] fix: rm double arg

---
 src/cur-prot/add_uuid.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/cur-prot/add_uuid.py b/src/cur-prot/add_uuid.py
index 0fd1eb1..b304f36 100644
--- a/src/cur-prot/add_uuid.py
+++ b/src/cur-prot/add_uuid.py
@@ -90,7 +90,6 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("--records_folder", type=str, default="corpus/records")
     parser.add_argument("-s", "--start", type=int, default=1920, help="Start year")
     parser.add_argument("-e", "--end", type=int, default=2022, help="End year")
     parser.add_argument("-r", "--records-folder",

From d76d07c2df1bb7e26efa7cb4b9dd33bfaf69b793 Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Thu, 21 Mar 2024 14:22:43 +0100
Subject: [PATCH 23/40] fix: update default start year

---
 src/cur-prot/dollar_sign_replace.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cur-prot/dollar_sign_replace.py b/src/cur-prot/dollar_sign_replace.py
index 093e370..386b0bc 100644
--- a/src/cur-prot/dollar_sign_replace.py
+++ b/src/cur-prot/dollar_sign_replace.py
@@ -91,7 +91,7 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("-s", "--start", type=int, default=1920, help="Start year")
+    parser.add_argument("-s", "--start", type=int, default=1867, help="Start year")
     parser.add_argument("-e", "--end", type=int, default=2022, help="End year")
     parser.add_argument("-r", "--records-folder",
                         type=str,

From bbf461277fd1badb6e1b27131ed05d19ad7e0188 Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Thu, 21 Mar 2024 14:24:56 +0100
Subject: [PATCH 24/40] doc: specify more clearly in help str

---
 src/cur-prot/add_uuid.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/cur-prot/add_uuid.py b/src/cur-prot/add_uuid.py
index b304f36..f2fb8bd 100644
--- a/src/cur-prot/add_uuid.py
+++ b/src/cur-prot/add_uuid.py
@@ -19,7 +19,6 @@
 
 def add_protocol_id(protocol):
-    print("add IDs")
     ids = set()
     num_ids = 0
 
@@ -99,6 +98,6 @@ def main(args):
     parser.add_argument("-p", "--protocol",
                         type=str,
                         default=None,
-                        help="operate on a single protocol")
Set the full path -- this option doesn't cooperate with `-r`.") args = parser.parse_args() main(args) From 7b2e679b066086ee125868e260eb799a145fc82a Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Thu, 21 Mar 2024 14:26:29 +0100 Subject: [PATCH 25/40] chore: add requirement --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a3b9aa8..b826ea7 100755 --- a/requirements.txt +++ b/requirements.txt @@ -20,4 +20,5 @@ torch tqdm transformers Unidecode -Wikidata \ No newline at end of file +Wikidata +pytest-cfg-fetcher From ffbce4af09a5e230739e63ee5d83d674deb1ea95 Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Thu, 21 Mar 2024 14:28:05 +0100 Subject: [PATCH 26/40] refactor: work w/ new structure, use pyriksdagen better, add functionality (query from file) --- src/KWIC-iter-search.py | 241 ++++++++++++++++++++++++---------------- 1 file changed, 143 insertions(+), 98 deletions(-) diff --git a/src/KWIC-iter-search.py b/src/KWIC-iter-search.py index d7cafc3..f388351 100644 --- a/src/KWIC-iter-search.py +++ b/src/KWIC-iter-search.py @@ -6,121 +6,166 @@ from argparse import RawTextHelpFormatter from datetime import datetime from lxml import etree -from pyriksdagen.utils import elem_iter, protocol_iterators +from pyriksdagen.utils import ( + elem_iter, + parse_protocol, + protocol_iterators, +) from tqdm import tqdm -import argparse, re +import argparse, os, re import pandas as pd -tei_ns = ".//{http://www.tei-c.org/ns/1.0}" -xml_ns = "{http://www.w3.org/XML/1998/namespace}" - +class NoRecordsAbspath(Exception): + def __init__(self): + self.message = "No RECORDS_ABSPATH environment variable." + def __str__(self): + return self.message def format_text(text): - lines = text.split('\n') - return ' '.join([l.strip() for l in lines]) - - + lines = text.split('\n') + return ' '.join([l.strip() for l in lines]) -def append_matches(matches, counter, rows, protocol, elem_id, elem_type, who, txt, context, facs, line_number): - for m in matches: - counter += 1 - s = m.start() - e = m.end() - left = txt[s-context:s] - right = txt[e:e+context] - prot = protocol.split('/')[-1][:-4] - gh = f"https://github.com/welfare-state-analytics/riksdagen-corpus/blob/{args.branch}/{protocol}/#L{line_number}" - if args.print or args.print_only: - tqdm.write(f'{prot}: {txt[s-context:s]} --| {m.group(0)} |-- {txt[e:e+context]}') - row = [protocol, elem_id, elem_type, who, s, e, left, m.group(0), right, facs, gh] - rows.append(row) - return rows, counter +def append_matches(matches, counter, rows, + protocol, elem_id, elem_type, + who, txt, context, facs, line_number): + for m in matches: + counter += 1 + s = m.start() + e = m.end() + left = txt[s-context:s] + right = txt[e:e+context] + prot = protocol.split('/')[-1][:-4] + gh = f"https://github.com/swerik-project/riksdagen-records/blob/{args.branch}/{protocol}/#L{line_number}" + if args.print or args.print_only: + tqdm.write(f'{prot}: {txt[s-context:s]} --| {m.group(0)} |-- {txt[e:e+context]}') + row = [protocol, elem_id, elem_type, who, s, e, left, m.group(0), right, facs, gh] + rows.append(row) + return rows, counter def main(args): - dts = datetime.now().strftime("%Y%m%d-%H%M%S") - if args.keyword: - pattern = re.compile(rf'\b\S*{args.keyword}\S*\b', re.IGNORECASE) - if args.regex_keyword: - pattern = re.compile(rf'{args.regex_keyword}') - - search_u = True - search_note = True - - if args.segment != False and args.note != False: - search_u = args.segment - search_note = args.note - - protocols = 
sorted(list(protocol_iterators("corpus/protocols/", start=args.start, end=args.end))) - if args.chamber: - protocols = [p for p in protocols if args.chamber in p] - - rows = [] - match_counter = 0 - - for protocol in tqdm(protocols, total=len(protocols)): - parser = etree.XMLParser(remove_blank_text=True) - root = etree.parse(protocol, parser).getroot() - facs = None - for tag, elem in elem_iter(root): - if tag == "u" and search_u: - who = elem.attrib.get("who") - for subelem in elem: - line_number = subelem.sourceline - subelem_id = subelem.attrib.get(f'{xml_ns}id') - txt = format_text(subelem.text) - matches = re.finditer(pattern, txt) - rows, match_counter = append_matches(matches, match_counter, rows, protocol, elem_id, "seg", who, txt, args.context, facs, line_number) - elif tag == "note" and search_note: - line_number = elem.sourceline - elem_id = elem.attrib.get(f'{xml_ns}id') - txt = format_text(elem.text) - matches = re.finditer(pattern, txt) - rows, match_counter = append_matches(matches, match_counter, rows, protocol, elem_id, "note", None, txt, args.context, facs, line_number) - elif tag == "pb": - facs = elem.attrib.get("facs") - - if not args.print_only: - print("Writing file...") - df = pd.DataFrame(rows, columns = ["protocol", "elem_id", "elem_type", "who", "match_start", - "match_end", "left_context", "match", "right_context", "facs", "github"]) - df.to_csv(f"input/KWIC/{args.out_file}_{dts}.csv", index=False) - - - print(f"\n\n\tFinito -- {match_counter} matches\n\n") + try: + records_path = os.environ.get("RECORDS_ABSPATH", None) + assert records_path != None + except NoRecordsAbspath: + records_path = os.environ.get("RECORDS_PATH", None) + assert records_path != None + except: + records_path = "data" + + dts = datetime.now().strftime("%Y%m%d-%H%M%S") + if args.keyword: + pattern = re.compile(rf'\b\S*{args.keyword}\S*\b', re.IGNORECASE) + elif args.regex_keyword: + pattern = re.compile(rf'{args.regex_keyword}') + elif args.regex_fromfile: + with open(args.regex_fromfile, 'r') as rq: + pattern = re.compile(rf"{rq.read().strip()}") + + search_u = True + search_note = True + + if args.segment != False and args.note != False: + search_u = args.segment + search_note = args.note + + protocols = sorted(list(protocol_iterators(corpus_root=records_path, start=args.start, end=args.end))) + + if args.chamber: + protocols = [p for p in protocols if args.chamber in p] + + rows = [] + match_counter = 0 + + for protocol in tqdm(protocols, total=len(protocols)): + root, ns = parse_protocol(protocol, get_ns=True) + facs = None + for tag, elem in elem_iter(root): + if tag == "u" and search_u: + who = elem.attrib.get("who") + for subelem in elem: + line_number = subelem.sourceline + subelem_id = subelem.attrib.get(f'{ns["xml_ns"]}id') + txt = format_text(subelem.text) + matches = re.finditer(pattern, txt) + rows, match_counter = append_matches(matches, match_counter, rows, + protocol, elem_id, "seg", who, + txt, args.context, facs, line_number) + elif tag == "note" and search_note: + line_number = elem.sourceline + elem_id = elem.attrib.get(f'{ns["xml_ns"]}id') + txt = format_text(elem.text) + matches = re.finditer(pattern, txt) + rows, match_counter = append_matches(matches, match_counter, rows, + protocol, elem_id, "note", None, + txt, args.context, facs, line_number) + elif tag == "pb": + facs = elem.attrib.get("facs") + + if not args.print_only: + print("Writing file...") + df = pd.DataFrame(rows, columns = ["protocol", "elem_id", "elem_type", + "who", "match_start", "match_end", + 
"left_context", "match", "right_context", + "facs", "github"]) + df.to_csv(f"{args.out_path}/{args.out_file}_{dts}.csv", index=False) + + + print(f"\n\n\tFinito -- {match_counter} matches\n\n") if __name__ == '__main__': - parser = argparse.ArgumentParser(description=__doc__, formatter_class=RawTextHelpFormatter) - parser.add_argument("-s", "--start", type=int, default=1867, help="Start year") - parser.add_argument("-e", "--end", type=int, default=2022, help="End year") - parser.add_argument("-c", "--chamber", type=str, choices=["fk", "ak"], default=None, - help="Search return results from a specific chamber in the bicameral period.\n(Default:None, means you search both chambers. If you set this, no ek results will be returned.)") - parser.add_argument("-S", "--segment", action="store_true", help="Search only in utterance segments.") - parser.add_argument("-n", "--note", action="store_true", help="Search only in notes.") - parser.add_argument("-k", "--keyword", type=str, default=None, help="Search term.") - parser.add_argument("-r", "--regex-keyword", type=str, default=None, help="Regular expression search term.\nWrap expression in single quotes, e.g. '\\bHerr\\S*\\b'.") - parser.add_argument("-C", "--context", type=int, default=45, - help="N characters to the left & right of match in results file.") - parser.add_argument("-o", "--out-file", type=str, default="KWIC-results", - help="Name of output file @ input/KWIC/") - parser.add_argument("-b", "--branch", type=str, default="dev", help="Github branch (for links in the output csv).") - parser.add_argument("-p", "--print", action="store_true", help="Print matches to stdout.") - parser.add_argument("-P", "--print-only", action="store_true", help="Print matches to stdout; no output file.") - args = parser.parse_args() - if args.keyword == None and args.regex_keyword == None: - print("\n\tYou have to search for a keyword or a regex-keyword.\n\n") - parser.print_help() - elif args.keyword != None and args.regex_keyword != None: - print("\n\tYou can't set keyword *AND* a regex-keyword\n\n") - parser.print_help() - else: - main(args) + parser = argparse.ArgumentParser(description=__doc__, formatter_class=RawTextHelpFormatter) + parser.add_argument("-s", "--start", type=int, default=1867, help="Start year") + parser.add_argument("-e", "--end", type=int, default=2022, help="End year") + parser.add_argument("-c", "--chamber", + type=str, choices=["fk", "ak"], + default=None, + help="Search return results from a specific chamber in the bicameral period.\n(Default:None, means you search both chambers. If you set this, no ek results will be returned.)") + parser.add_argument("-S", "--segment", + action="store_true", + help="Search only in utterance segments.") + parser.add_argument("-n", "--note", action="store_true", help="Search only in notes.") + parser.add_argument("-k", "--keyword", type=str, default=None, help="Search term.") + parser.add_argument("-r", "--regex-keyword", + type=str, + default=None, + help="Regular expression search term.\nWrap expression in single quotes, e.g. 
'\\bHerr\\S*\\b'.") + parser.add_argument("-Q", "--regex-fromfile", default=None, help="Read in a regex query from a file.") + parser.add_argument("-C", "--context", + type=int, + default=45, + help="N characters to the left & right of match in results file.") + parser.add_argument("-O", "--out-path", + type=str, + default=".", + help="output folder") + parser.add_argument("-o", "--out-file", + type=str, + default="KWIC-results", + help="Name of output file @ --out-path") + parser.add_argument("-b", "--branch", + type=str, + default="dev", + help="Github branch (for links in the output csv).") + parser.add_argument("-p", "--print", + action="store_true", + help="Print matches to stdout.") + parser.add_argument("-P", "--print-only", + action="store_true", + help="Print matches to stdout; no output file.") + args = parser.parse_args() + test = [args.keyword == None, args.regex_keyword == None, args.regex_fromfile == None] + if test.count(True) != 2: + print("\n\tYou have to EITHER provide a keyword, a regex-keyword, or read a regex query from file.\n\n") + parser.print_help() + else: + main(args) From 1d82a69294e5c9da859e5627121e25254d1ad0ba Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Thu, 21 Mar 2024 14:29:33 +0100 Subject: [PATCH 27/40] refactor: work with new structure of corpus --- src/mapping_accuracy_estimate.py | 60 ++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 22 deletions(-) diff --git a/src/mapping_accuracy_estimate.py b/src/mapping_accuracy_estimate.py index a80777c..5308d7b 100644 --- a/src/mapping_accuracy_estimate.py +++ b/src/mapping_accuracy_estimate.py @@ -1,27 +1,33 @@ """ Calculate an upper bound for introduction mapping accuracy """ -from pyriksdagen.utils import protocol_iterators -from lxml import etree -import numpy as np -import pandas as pd +from multiprocessing import Pool +from pyriksdagen.utils import ( + get_data_location, + parse_protocol, + protocol_iterators, +) from tqdm import tqdm import argparse -from multiprocessing import Pool +import numpy as np +import pandas as pd -def get_date(root): - for docDate in root.findall(".//{http://www.tei-c.org/ns/1.0}docDate"): + + + +def get_date(root, ns): + for docDate in root.findall(f".//{ns['tei_ns']}docDate"): date_string = docDate.text break return date_string + # Fix parallellization def accuracy(protocol): - parser = etree.XMLParser(remove_blank_text=True) - root = etree.parse(protocol, parser).getroot() - year = int(get_date(root).split("-")[0]) + root, ns = parse_protocol(protocol, get_ns=True) + year = int(get_date(root, ns).split("-")[0]) known, unknown = 0, 0 - for div in root.findall(".//{http://www.tei-c.org/ns/1.0}div"): + for div in root.findall(f".//{ns['tei_ns']}div"): for elem in div: if "who" in elem.attrib: who = elem.attrib["who"] @@ -31,29 +37,39 @@ def accuracy(protocol): known += 1 return year, known, unknown + + + def main(args): - protocols = list(protocol_iterators("corpus/")) + protocols = sorted(list(protocol_iterators(get_data_location('records')))) if args.start is not None: - protocols = list(protocol_iterators("corpus/", start=args.start, end=args.end)) + protocols = sorted(list(protocol_iterators( + get_data_location('records'), + start=args.start, + end=args.end))) years = sorted(set([int(p.split('/')[2][:4]) for p in protocols])) years.append(max(years)+1) - df = pd.DataFrame(np.zeros((len(years), 2), dtype=int), index=years, columns=['known', 'unknown']) + df = pd.DataFrame( + np.zeros((len(years), 2), dtype=int), + index=years, columns=['known', 
'unknown']) pool = Pool() for year, known, unknown in tqdm(pool.imap(accuracy, protocols), total=len(protocols)): df.loc[year, 'known'] += known df.loc[year, 'unknown'] += unknown df['accuracy_upper_bound'] = df.div(df.sum(axis=1), axis=0)['known'] - return df + print(df) + print("Average:", df['accuracy_upper_bound'].mean()) + print("Weighted average:", df["known"].sum() / (df["known"] + df["unknown"]).sum()) + print("Minimum: {} ({})".format(*[getattr(df['accuracy_upper_bound'], f)() for f in ['min', 'idxmin']])) + df.to_csv("input/accuracy/upper_bound.csv", index_label='year') + + + if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--start", type=int, default=None) - parser.add_argument("--end", type=int, default=None) + parser.add_argument("-s", "--start", type=int, default=None) + parser.add_argument("-e", "--end", type=int, default=None) args = parser.parse_args() df = main(args) - print(df) - print("Average:", df['accuracy_upper_bound'].mean()) - print("Weighted average:", df["known"].sum() / (df["known"] + df["unknown"]).sum()) - print("Minimum: {} ({})".format(*[getattr(df['accuracy_upper_bound'], f)() for f in ['min', 'idxmin']])) - df.to_csv("input/accuracy/upper_bound.csv", index_label='year') \ No newline at end of file From a86a97a917b3dd26f3c5a737710dd7171851c414 Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Wed, 17 Apr 2024 13:35:40 +0200 Subject: [PATCH 28/40] fix: find path to queries --- src/wikidata_query.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/wikidata_query.py b/src/wikidata_query.py index b93a6d0..edc0833 100755 --- a/src/wikidata_query.py +++ b/src/wikidata_query.py @@ -4,9 +4,8 @@ from SPARQLWrapper import SPARQLWrapper, JSON import numpy as np import pandas as pd -import os, argparse -import time -import re +import argparse, os, re, time +from pyriksdagen.data import queries as pyriksdagen_queries from pyriksdagen.wikidata import ( query2df, separate_name_location, @@ -39,6 +38,17 @@ def track_missing_id(df, l, id_map=None): def main(args): # Change query path to be from module! 
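
The comment above is the TODO that this patch and the next begin to address: resolve the bundled SPARQL queries from the installed pyriksdagen package rather than from a hard-coded repository path. A minimal sketch of that lookup, assuming pyriksdagen ships its .rq files as package data under pyriksdagen.data.queries (the lines added below reach the same folder via the namespace-package attribute __path__._path):

    from pathlib import Path
    from pyriksdagen.data import queries as pyriksdagen_queries

    # __path__ lists the directories a package was loaded from; the
    # bundled .rq query files sit alongside the package modules.
    query_folder = Path(pyriksdagen_queries.__path__[0])
    queries = sorted(q.stem for q in query_folder.glob("*.rq"))
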
+ + if args.metadata_folder: + metadata_folder = args.metadata_folder + else: + metadata_folder = get_data_location("metadata") + + if args.query_folder: + query_folder = args.query_folder + else: + query_folder = pyriksdagen_queries.__path__._path[0] + if args.queries: queries = args.queries else: From d90b437336064d2ecd69f3215d45474715b50c26 Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Tue, 14 May 2024 10:42:17 +0200 Subject: [PATCH 29/40] refactor: cleanup script - queries from pyriksdagen --- src/wikidata_query.py | 250 ++++++++++++++++++++++-------------------- 1 file changed, 133 insertions(+), 117 deletions(-) diff --git a/src/wikidata_query.py b/src/wikidata_query.py index edc0833..3309873 100755 --- a/src/wikidata_query.py +++ b/src/wikidata_query.py @@ -1,134 +1,150 @@ """ Query wikidata for metadata, process it and save it in corpus/metadata """ -from SPARQLWrapper import SPARQLWrapper, JSON -import numpy as np -import pandas as pd -import argparse, os, re, time +from pathlib import Path from pyriksdagen.data import queries as pyriksdagen_queries +from pyriksdagen.db import clean_person_duplicates +from pyriksdagen.utils import get_data_location from pyriksdagen.wikidata import ( - query2df, - separate_name_location, - move_party_to_party_df, - elongate_external_ids, + query2df, + separate_name_location, + move_party_to_party_df, + elongate_external_ids, ) -from pyriksdagen.db import clean_person_duplicates -from pathlib import Path +from SPARQLWrapper import SPARQLWrapper, JSON +import argparse +import numpy as np +import os +import pandas as pd +import re, time + + + def track_missing_id(df, l, id_map=None): - no_id = df.loc[pd.isna(df["person_id"])] - for i, r in no_id.iterrows(): - tmpdf = df.loc[(df['government'] == r['government']) & (df['role'] == r['role']) & (df['wiki_id'] == r['wiki_id']) & (pd.notnull(df["person_id"]))] - if tmpdf.empty: - found = False - if not id_map.empty: - tmpidmap = id_map.loc[id_map['wiki_id'] == r['wiki_id']].copy() - if not tmpidmap.empty: - tmpidmap.reset_index(drop=True, inplace=True) - swerik_id = tmpidmap.at[0, "person_id"] - df.at[i, "person_id"] = swerik_id - found = True - if found == False: - if r['wiki_id'] not in l: - l.append(r['wiki_id']) - - df = df.loc[pd.notnull(df['person_id'])].copy() - df.drop(columns=["wiki_id"], inplace=True) - return df.reset_index(drop=True), l + no_id = df.loc[pd.isna(df["person_id"])] + for i, r in no_id.iterrows(): + tmpdf = df.loc[(df['government'] == r['government']) & (df['role'] == r['role']) & (df['wiki_id'] == r['wiki_id']) & (pd.notnull(df["person_id"]))] + if tmpdf.empty: + found = False + if not id_map.empty: + tmpidmap = id_map.loc[id_map['wiki_id'] == r['wiki_id']].copy() + if not tmpidmap.empty: + tmpidmap.reset_index(drop=True, inplace=True) + swerik_id = tmpidmap.at[0, "person_id"] + df.at[i, "person_id"] = swerik_id + found = True + if found == False: + if r['wiki_id'] not in l: + l.append(r['wiki_id']) + + df = df.loc[pd.notnull(df['person_id'])].copy() + df.drop(columns=["wiki_id"], inplace=True) + return df.reset_index(drop=True), l + + + def main(args): - # Change query path to be from module! 
- - if args.metadata_folder: - metadata_folder = args.metadata_folder - else: - metadata_folder = get_data_location("metadata") - - if args.query_folder: - query_folder = args.query_folder - else: - query_folder = pyriksdagen_queries.__path__._path[0] - - if args.queries: - queries = args.queries - else: - queries = sorted([q.stem for q in Path(args.query_folder).glob('*.rq')]) - input_folders = ['name_location_specifier', 'alias', "member_of_parliament", "party_affiliation"] - - # Query for and store cleaned versions of metadata - d = {} - no_swerik_id = [] - id_map = None - if "wiki_id" in queries: - print(f"Query Wiki ID started.") - id_map = query2df("wiki_id", args.source) - print(type(id_map)) - id_map = id_map.drop_duplicates() - id_map.to_csv(f'{args.metadata_folder}/wiki_id.csv', index=False) - - for q in queries: - if q == "wiki_id": - continue - print(f"Query {q} started.") - df = query2df(q, args.source) - print(df, len(df)) - # Format values - if 'riksdagen_id' in df.columns: - df['riksdagen_id'] = df['riksdagen_id'].astype(str) - - if 'gender' in df.columns: - df["gender"] = df["gender"].map({'kvinna':'woman', 'man':'man'}) - - if q == 'minister': - df["role"] = df["role"].str.replace('Sveriges', '').str.strip() - df, no_swerik_id = track_missing_id(df, no_swerik_id, id_map=id_map) - - if q == 'member_of_parliament': - df["role"] = df["role"].str.extract(r'([A-Za-zÀ-ÿ]*ledamot)') - df, no_swerik_id = track_missing_id(df, no_swerik_id, id_map=id_map) - - if q == 'speaker': - df, no_swerik_id = track_missing_id(df, no_swerik_id, id_map=id_map) - - if q == "external_identifiers": - df = elongate_external_ids(df) - - # Store files needing additional preprocessing in input - folder = args.metadata_folder if not q in input_folders else args.input_metadata_folder - if folder == args.input_metadata_folder: - d[q] = df - - if q == 'person': - df = clean_person_duplicates(df) - - df = df.drop_duplicates() - df.to_csv(f'{folder}/{q}.csv', index=False) - - # Process name and location files - if d: - for key in d.keys(): - if key not in queries: - d['key'] = pd.read_csv(f'{args.input_metadata_folder}/{key}.csv') - name, loc = separate_name_location(d['name_location_specifier'], d['alias']) - name.to_csv(f'{args.metadata_folder}/name.csv', index=False) - loc.to_csv(f'{args.metadata_folder}/location_specifier.csv', index=False) - - mp_df, party_df = move_party_to_party_df(d['member_of_parliament'], d['party_affiliation']) - mp_df.to_csv(f'{args.metadata_folder}/member_of_parliament.csv', index=False) - party_df.to_csv(f'{args.metadata_folder}/party_affiliation.csv', index=False) - - if len(no_swerik_id) > 0: - print("Some entities returned in the queries seem not to have a swerik ID. 
Check and add an ID, then requery.")
-        with open(f"{args.input_metadata_folder}/no_swerik_id_query_results.txt", "w+") as outf:
-            [outf.write(f"{_}\n") for _ in no_swerik_id]
+    if args.metadata_folder is None:
+        metadata_folder = get_data_location("metadata")
+    else:
+        metadata_folder = args.metadata_folder
+
+    if args.query_folder is None:
+        query_folder = pyriksdagen_queries.__path__._path[0]
+    else:
+        query_folder = args.query_folder
+
+    if args.queries is None:
+        queries = sorted([q.stem for q in Path(query_folder).glob('*.rq')])
+    else:
+        queries = args.queries
+    input_folders = ['name_location_specifier', 'alias', "member_of_parliament", "party_affiliation"]
+
+    # Query for and store cleaned versions of metadata
+    d = {}
+    no_swerik_id = []
+    id_map = None
+    if "wiki_id" in queries:
+        print(f"Query Wiki ID started.")
+        id_map = query2df("wiki_id", args.source)
+        print(type(id_map))
+        id_map = id_map.drop_duplicates()
+        id_map.to_csv(f'{metadata_folder}/wiki_id.csv', index=False)
+
+    for q in queries:
+        print(q)
+        if q == "wiki_id":
+            continue
+        print(f"Query {q} started.")
+        df = query2df(q, args.source)
+        print(df, len(df))
+        # Format values
+        if 'riksdagen_id' in df.columns:
+            df['riksdagen_id'] = df['riksdagen_id'].astype(str)
+
+        if 'gender' in df.columns:
+            df["gender"] = df["gender"].map({'kvinna':'woman', 'man':'man'})
+
+        if q == 'minister':
+            df["role"] = df["role"].str.replace('Sveriges', '').str.strip()
+            df, no_swerik_id = track_missing_id(df, no_swerik_id, id_map=id_map)
+
+        if q == 'member_of_parliament':
+            df["role"] = df["role"].str.extract(r'([A-Za-zÀ-ÿ]*ledamot)')
+            df, no_swerik_id = track_missing_id(df, no_swerik_id, id_map=id_map)
+
+        if q == 'speaker':
+            df, no_swerik_id = track_missing_id(df, no_swerik_id, id_map=id_map)
+
+        if q == "external_identifiers":
+            df = elongate_external_ids(df)
+
+        # Store files needing additional preprocessing in input
+        folder = metadata_folder if not q in input_folders else args.input_metadata_folder
+        if folder == args.input_metadata_folder:
+            d[q] = df
+
+        if q == 'person':
+            df = clean_person_duplicates(df)
+
+        df = df.drop_duplicates()
+        df.to_csv(f'{folder}/{q}.csv', index=False)
+
+    # Process name and location files
+    if d:
+        for key in d.keys():
+            if key not in queries:
+                d[key] = pd.read_csv(f'{args.input_metadata_folder}/{key}.csv')
+        name, loc = separate_name_location(d['name_location_specifier'], d['alias'])
+        name.to_csv(f'{metadata_folder}/name.csv', index=False)
+        loc.to_csv(f'{metadata_folder}/location_specifier.csv', index=False)
+
+        mp_df, party_df = move_party_to_party_df(d['member_of_parliament'], d['party_affiliation'])
+        mp_df.to_csv(f'{metadata_folder}/member_of_parliament.csv', index=False)
+        party_df.to_csv(f'{metadata_folder}/party_affiliation.csv', index=False)
+
+    if len(no_swerik_id) > 0:
+        print("Some entities returned in the queries seem not to have a swerik ID. 
Check and add an ID, then requery.") + with open(f"{args.input_metadata_folder}/no_swerik_id_query_results.txt", "w+") as outf: + [outf.write(f"{_}\n") for _ in no_swerik_id] + + if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('--input_metadata_folder', type=str, default="input/metadata") - parser.add_argument('--metadata_folder', type=str, default="corpus/metadata") - parser.add_argument('--query_folder', type=str, default="pyriksdagen/data/queries") - parser.add_argument('-q', '--queries', default=None, nargs='+', help='One or more sparql query files (separated by space)') - parser.add_argument('-s', '--source', default=None, nargs='+', help='One or more of member_of_parliament | minister | speaker (separated by space)') + parser.add_argument('--metadata_folder', type=str, default=None) + parser.add_argument('--query_folder', type=str, default=None) + parser.add_argument('-q', '--queries', + default=None, + nargs='+', + help='One or more sparql query files (separated by space)') + parser.add_argument('-s', '--source', + default=None, + nargs='+', + help='One or more of member_of_parliament | minister | speaker (separated by space)') args = parser.parse_args() main(args) From 01628bfaf3641effb0e3460696c2a38c531e2004 Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Mon, 20 May 2024 15:03:51 +0200 Subject: [PATCH 30/40] docs: add note about what to do next --- src/cur-prot/redetect.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/cur-prot/redetect.py b/src/cur-prot/redetect.py index 1e73e71..ea8103a 100644 --- a/src/cur-prot/redetect.py +++ b/src/cur-prot/redetect.py @@ -52,6 +52,10 @@ def main(args): unknowns.drop_duplicates().to_csv(f"{args.processed_metadata_folder}/unknowns.csv", index=False) + print("redetect seems to have finished successfully. 
Now run `cur-prot/split_into_sections --nextprev-only`.")
+    # TODO: move abovementioned --nextprev-only function to pyriksdagen; import and run here
+
+
 
 if __name__ == "__main__":

From c4fe1b41177b4299d0899a14c76c4c2a9189d130 Mon Sep 17 00:00:00 2001
From: ninpnin
Date: Wed, 28 Aug 2024 12:56:43 +0200
Subject: [PATCH 31/40] fix: support zero padded protocol IDs; support new repo structure

---
 src/segmentation_accuracy_estimate.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/segmentation_accuracy_estimate.py b/src/segmentation_accuracy_estimate.py
index 259a4fc..9c3967e 100644
--- a/src/segmentation_accuracy_estimate.py
+++ b/src/segmentation_accuracy_estimate.py
@@ -72,8 +72,19 @@ def estimate_accuracy(protocol, df):
     return correct, incorrect
 
 def main(args):
-    protocols = list(protocol_iterators("corpus/", start=args.start, end=args.end))
+    protocols = list(protocol_iterators(args.records_folder, start=args.start, end=args.end))
     df = pd.read_csv(args.path_goldstandard)
+    def pad_id(pid):
+        protocol_number = infer_metadata(pid)["number"]
+        protocol_id = infer_metadata(pid)["protocol"]
+        protocol_id = protocol_id.replace("_", "-")
+        protocol_number_str_old = str(protocol_number)
+        protocol_number_str = f"{protocol_number:0>3}"
+        protocol_id = protocol_id[:-len(protocol_number_str_old)] + protocol_number_str
+        print(protocol_id, protocol_number, protocol_number_str)
+        return protocol_id
+
+    df['protocol_id'] = df['protocol_id'].apply(lambda x: pad_id(x))
 
     rows = []
     correct, incorrect = 0, 0
@@ -84,6 +95,7 @@
         #print(p, protocol_id)
         df_p = df[df["protocol_id"] == protocol_id]
         if len(df_p) >= 1:
+            print(p)
             metadata = infer_metadata(p)
 
             acc = estimate_accuracy(path, df_p)
@@ -118,6 +130,7 @@
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument("--start", type=int, default=1867)
     parser.add_argument("--end", type=int, default=2022)
+    parser.add_argument("--records_folder", type=str, default="corpus/records")
     parser.add_argument("--path_goldstandard", type=str, default="corpus/quality_assesment/segment_classification/prot-segment-classification.csv")
     args = parser.parse_args()
     df = main(args)

From cd4d6254bfaeebd07658e62c636529461ee70550 Mon Sep 17 00:00:00 2001
From: ninpnin
Date: Wed, 28 Aug 2024 12:57:53 +0200
Subject: [PATCH 32/40] refactor: remove unnecessary prints

---
 src/segmentation_accuracy_estimate.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/segmentation_accuracy_estimate.py b/src/segmentation_accuracy_estimate.py
index 9c3967e..a6c07cc 100644
--- a/src/segmentation_accuracy_estimate.py
+++ b/src/segmentation_accuracy_estimate.py
@@ -81,7 +81,6 @@ def pad_id(pid):
         protocol_number_str_old = str(protocol_number)
         protocol_number_str = f"{protocol_number:0>3}"
         protocol_id = protocol_id[:-len(protocol_number_str_old)] + protocol_number_str
-        print(protocol_id, protocol_number, protocol_number_str)
         return protocol_id
 
     df['protocol_id'] = df['protocol_id'].apply(lambda x: pad_id(x))
@@ -95,7 +94,6 @@ def pad_id(pid):
         #print(p, protocol_id)
         df_p = df[df["protocol_id"] == protocol_id]
         if len(df_p) >= 1:
-            print(p)
             metadata = infer_metadata(p)
 
             acc = estimate_accuracy(path, df_p)

From 359aab66b4d2697de1f5f378c03e86cc50e707ed Mon Sep 17 00:00:00 2001
From: ninpnin
Date: Thu, 29 Aug 2024 14:05:53 +0200
Subject: [PATCH 33/40] feat: estimate date quality based on an annotated sample

---
 date-sample-corrected.csv      | 262 +++++++++++++++++++++++++++++++++
 src/date_accurcacy_estimate.py | 112 
++++++++++++++ 2 files changed, 374 insertions(+) create mode 100644 date-sample-corrected.csv create mode 100644 src/date_accurcacy_estimate.py diff --git a/date-sample-corrected.csv b/date-sample-corrected.csv new file mode 100644 index 0000000..fb4bbcc --- /dev/null +++ b/date-sample-corrected.csv @@ -0,0 +1,262 @@ +path,link-prot,true-dates,comment +data/1867/prot-1867--ak--0510.xml,https://betalab.kb.se/prot-1867--ak--0510/_view,1867-05-10, +data/1867/prot-1867--fk--0426.xml,https://betalab.kb.se/prot-1867--fk--0426/_view,1867-04-26, +data/1868/prot-1868--ak--0313.xml,https://betalab.kb.se/prot-1868--ak--0313/_view,1868-03-13 - 1868-03-14,fixed +data/1868/prot-1868--fk--0229.xml,https://betalab.kb.se/prot-1868--fk--0229/_view,1868-02-29 - 1868-03-02,fixed +data/1869/prot-1869--ak--0310.xml,https://betalab.kb.se/prot-1869--ak--0310/_view,1869-03-10, +data/1869/prot-1869--fk--0507.xml,https://betalab.kb.se/prot-1869--fk--0507/_view,1869-05-07, +data/1870/prot-1870--ak--0504.xml,https://betalab.kb.se/prot-1870--ak--0504/_view,1870-05-04, +data/1870/prot-1870--fk--0330.xml,https://betalab.kb.se/prot-1870--fk--0330/_view,1870-03-30, +data/1871/prot-1871--ak--0512.xml,https://betalab.kb.se/prot-1871--ak--0512/_view,1871-05-12, +data/1871/prot-1871--fk--0218.xml,https://betalab.kb.se/prot-1871--fk--0218/_view,1871-02-18 - 1871-02-21,fixed +data/1872/prot-1872--ak--0207.xml,https://betalab.kb.se/prot-1872--ak--0207/_view,1872-02-07, +data/1872/prot-1872--fk--0320.xml,https://betalab.kb.se/prot-1872--fk--0320/_view,1872-03-20, +data/1873/prot-1873--ak--0322.xml,https://betalab.kb.se/prot-1873--ak--0322/_view,1873-03-22, +data/1873/prot-1873--fk--0523.xml,https://betalab.kb.se/prot-1873--fk--0523/_view,1873-05-23, +data/1874/prot-1874--ak--0411.xml,https://betalab.kb.se/prot-1874--ak--0411/_view,1874-04-11, +data/1874/prot-1874--fk--0120.xml,https://betalab.kb.se/prot-1874--fk--0120/_view,1874-01-20, +data/1875/prot-1875--ak--019.xml,https://betalab.kb.se/prot-1875--ak--19/_view,1875-03-13 - 1875-03-15,fixed +data/1875/prot-1875--fk--042.xml,https://betalab.kb.se/prot-1875--fk--42/_view,1875-05-18, +data/1876/prot-1876--ak--009.xml,https://betalab.kb.se/prot-1876--ak--9/_view,1876-02-19, +data/1876/prot-1876--fk--004.xml,https://betalab.kb.se/prot-1876--fk--4/_view,1876-02-05 - 1876-02-12,fixed +data/1877/prot-1877--ak--024.xml,https://betalab.kb.se/prot-1877--ak--24/_view,1877-03-16 - 1877-03-20,fixed +data/1877/prot-1877--fk--032.xml,https://betalab.kb.se/prot-1877--fk--32/_view,1877-04-28, +data/1878/prot-1878--ak--057.xml,https://betalab.kb.se/prot-1878--ak--57/_view,1878-05-11, +data/1878/prot-1878--fk--016.xml,https://betalab.kb.se/prot-1878--fk--16/_view,1878-03-11 - 1878-03-13,fixed +data/1879/prot-1879--ak--059.xml,https://betalab.kb.se/prot-1879--ak--59/_view,1879-05-11 - 1879-05-12,fixed +data/1879/prot-1879--fk--036.xml,https://betalab.kb.se/prot-1879--fk--36/_view,1879-05-06, +data/1880/prot-1880--ak--003.xml,https://betalab.kb.se/prot-1880--ak--3/_view,1880-01-21 - 1880-01-24,fixed +data/1880/prot-1880--fk--042.xml,https://betalab.kb.se/prot-1880--fk--42/_view,1880-05-10, +data/1881/prot-1881--ak--003.xml,https://betalab.kb.se/prot-1881--ak--3/_view,1881-01-26, +data/1881/prot-1881--fk--021.xml,https://betalab.kb.se/prot-1881--fk--21/_view,1881-03-19, +data/1882/prot-1882--ak--055.xml,https://betalab.kb.se/prot-1882--ak--55/_view,1882-05-11, +data/1882/prot-1882--fk--033.xml,https://betalab.kb.se/prot-1882--fk--33/_view,1882-04-27, 
+data/1883/prot-1883--ak--049.xml,https://betalab.kb.se/prot-1883--ak--49/_view,1883-05-21, +data/1883/prot-1883--fk--032.xml,https://betalab.kb.se/prot-1883--fk--32/_view,1883-05-05, +data/1884/prot-1884--ak--007.xml,https://betalab.kb.se/prot-1884--ak--7/_view,1884-02-08 - 1884-02-09,fixed +data/1884/prot-1884--fk--006.xml,https://betalab.kb.se/prot-1884--fk--6/_view,1884-02-12 - 1884-02-13,fixed +data/1885/prot-1885--ak--019.xml,https://betalab.kb.se/prot-1885--ak--19/_view,1885-03-06 - 1885-03-07,fixed +data/1885/prot-1885--fk--016.xml,https://betalab.kb.se/prot-1885--fk--16/_view,1885-03-14, +data/1886/prot-1886--ak--060.xml,https://betalab.kb.se/prot-1886--ak--60/_view,1886-05-15 - 1886-06-02,fixed +data/1886/prot-1886--fk--009.xml,https://betalab.kb.se/prot-1886--fk--9/_view,1886-02-20 - 1886-02-21,fixed +data/1887/prot-1887-janmar-ak--016.xml,https://betalab.kb.se/prot-1887-janmar-ak--16/_view,1887-03-02, +data/1887/prot-1887-majjul-fk--022.xml,https://betalab.kb.se/prot-1887-majjul-fk--22/_view,1887-06-28, +data/1888/prot-1888--ak--023.xml,https://betalab.kb.se/prot-1888--ak--23/_view,1888-03-17 - 1888-03-20,fixed +data/1888/prot-1888--fk--027.xml,https://betalab.kb.se/prot-1888--fk--27/_view,1888-04-20, +data/1889/prot-1889--ak--047.xml,https://betalab.kb.se/prot-1889--ak--47/_view,1889-05-08, +data/1889/prot-1889--fk--035.xml,https://betalab.kb.se/prot-1889--fk--35/_view,1889-05-01, +data/1890/prot-1890--ak--011.xml,https://betalab.kb.se/prot-1890--ak--11/_view,1890-03-01, +data/1890/prot-1890--fk--038.xml,https://betalab.kb.se/prot-1890--fk--38/_view,1890-05-16, +data/1891/prot-1891--ak--044.xml,https://betalab.kb.se/prot-1891--ak--44/_view,1891-05-04, +data/1891/prot-1891--fk--020.xml,https://betalab.kb.se/prot-1891--fk--20/_view,1891-04-18 - 1891-04-21,fixed +data/1892/prot-1892--ak--043.xml,https://betalab.kb.se/prot-1892--ak--43/_view,1892-05-12, +data/1892/prot-1892--fk--019.xml,https://betalab.kb.se/prot-1892--fk--19/_view,1892-03-29 - 1892-04-01,fixed +data/1893/prot-1893--ak--040.xml,https://betalab.kb.se/prot-1893--ak--40/_view,1893-04-29 - 1893-05-01,fixed +data/1893/prot-1893--fk--029.xml,https://betalab.kb.se/prot-1893--fk--29/_view,1893-04-19 - 1893-04-21,fixed +data/1894/prot-1894--ak--018.xml,https://betalab.kb.se/prot-1894--ak--18/_view,1894-03-15 - 1894-03-16,fixed +data/1894/prot-1894--fk--005.xml,https://betalab.kb.se/prot-1894--fk--5/_view,1894-02-09 - 1894-02-18,fixed +data/1895/prot-1895--ak--001.xml,https://betalab.kb.se/prot-1895--ak--1/_view,1895-01-16 - 1895-01-17,fixed +data/1895/prot-1895--fk--031.xml,https://betalab.kb.se/prot-1895--fk--31/_view,1895-05-08, +data/1896/prot-1896--ak--025-01.xml,https://betalab.kb.se/prot-1896--ak--25/_view,1896-03-28 - 1896-04-02,fixed +data/1896/prot-1896--fk--002.xml,https://betalab.kb.se/prot-1896--fk--2/_view,1896-01-21 - 1896-01-22,fixed +data/1897/prot-1897--ak--044.xml,https://betalab.kb.se/prot-1897--ak--44/_view,1897-05-12, +data/1897/prot-1897--fk--013.xml,https://betalab.kb.se/prot-1897--fk--13/_view,1897-03-17 - 1897-03-19,fixed +data/1898/prot-1898--ak--019.xml,https://betalab.kb.se/prot-1898--ak--19/_view,1898-03-22 - 1898-03-25,fixed +data/1898/prot-1898--fk--034.xml,https://betalab.kb.se/prot-1898--fk--34/_view,1898-05-08 - 1898-05-09,fixed +data/1899/prot-1899--ak--004.xml,https://betalab.kb.se/prot-1899--ak--4/_view,1899-01-30 - 1899-02-04,fixed +data/1899/prot-1899--fk--006.xml,https://betalab.kb.se/prot-1899--fk--6/_view,1899-02-10 - 1899-02-17,fixed 
+data/1900/prot-1900--ak--023.xml,https://betalab.kb.se/prot-1900--ak--23/_view,1900-03-16, +data/1900/prot-1900--fk--020.xml,https://betalab.kb.se/prot-1900--fk--20/_view,1900-03-21, +data/1901/prot-1901--ak--038.xml,https://betalab.kb.se/prot-1901--ak--38/_view,1901-05-17, +data/1901/prot-1901--fk--007.xml,https://betalab.kb.se/prot-1901--fk--7/_view,1901-02-12 - 1901-02-13,fixed +data/1902/prot-1902--ak--048.xml,https://betalab.kb.se/prot-1902--ak--48/_view,1902-05-10, +data/1902/prot-1902--fk--027.xml,https://betalab.kb.se/prot-1902--fk--27/_view,1902-04-26, +data/1903/prot-1903--ak--009.xml,https://betalab.kb.se/prot-1903--ak--9/_view,1903-02-03 - 1903-03-04,fixed +data/1903/prot-1903--fk--016.xml,https://betalab.kb.se/prot-1903--fk--16/_view,1903-02-21, +data/1904/prot-1904--ak--042.xml,https://betalab.kb.se/prot-1904--ak--42/_view,1904-04-20, +data/1904/prot-1904--fk--017.xml,https://betalab.kb.se/prot-1904--fk--17/_view,1904-02-17, +data/1905/prot-1905--ak--008.xml,https://betalab.kb.se/prot-1905--ak--8/_view,1905-02-04, +data/1905/prot-1905--fk--047.xml,https://betalab.kb.se/prot-1905--fk--47/_view,1905-05-16, +data/1906/prot-1906--ak--004.xml,https://betalab.kb.se/prot-1906--ak--4/_view,1906-01-23 - 1906-01-24,fixed +data/1906/prot-1906--fk--019.xml,https://betalab.kb.se/prot-1906--fk--19/_view,1906-03-02 - 1906-03-03,fixed +data/1907/prot-1907--ak--062.xml,https://betalab.kb.se/prot-1907--ak--62/_view,1907-05-21, +data/1907/prot-1907--fk--022.xml,https://betalab.kb.se/prot-1907--fk--22/_view,1907-03-28, +data/1908/prot-1908--ak--027.xml,https://betalab.kb.se/prot-1908--ak--27/_view,1908-03-14, +data/1908/prot-1908--fk--053.xml,https://betalab.kb.se/prot-1908--fk--53/_view,1908-05-15, +data/1909/prot-1909--ak--034.xml,https://betalab.kb.se/prot-1909--ak--34/_view,1909-03-25 - 1909-03-27,fixed +data/1909/prot-1909--fk--025.xml,https://betalab.kb.se/prot-1909--fk--25/_view,1909-04-02, +data/1910/prot-1910--ak--040.xml,https://betalab.kb.se/prot-1910--ak--40/_view,1910-05-04 - 1910-05-06,fixed +data/1910/prot-1910--fk--006.xml,https://betalab.kb.se/prot-1910--fk--6/_view,1910-02-10 - 1910-02-12,fixed +data/1911/prot-1911--ak--045.xml,https://betalab.kb.se/prot-1911--ak--45/_view,1911-05-11, +data/1911/prot-1911--fk--014.xml,https://betalab.kb.se/prot-1911--fk--14/_view,1911-03-18 - 1911-03-21,fixed +data/1912/prot-1912--ak--031.xml,https://betalab.kb.se/prot-1912--ak--31/_view,1912-04-18, +data/1912/prot-1912--fk--016.xml,https://betalab.kb.se/prot-1912--fk--16/_view,1912-03-13 - 1912-03-15,fixed +data/1913/prot-1913--ak--026.xml,https://betalab.kb.se/prot-1913--ak--26/_view,1913-04-02 - 1913-04-03,fixed +data/1913/prot-1913--fk--003.xml,https://betalab.kb.se/prot-1913--fk--3/_view,1913-01-29 - 1913-02-05,fixed +data/1914/prot-1914-b-ak--074.xml,https://betalab.kb.se/prot-1914-b-ak--74/_view,1914-08-28, +data/1914/prot-1914-b-fk--030.xml,https://betalab.kb.se/prot-1914-b-fk--30/_view,1914-07-08, +data/1915/prot-1915--ak--009.xml,https://betalab.kb.se/prot-1915--ak--9/_view,1915-01-26, +data/1915/prot-1915--fk--055.xml,https://betalab.kb.se/prot-1915--fk--55/_view,1915-04-27, +data/1916/prot-1916--ak--021.xml,https://betalab.kb.se/prot-1916--ak--21/_view,1916-02-15, +data/1916/prot-1916--fk--077.xml,https://betalab.kb.se/prot-1916--fk--77/_view,1916-05-27, +data/1917/prot-1917--ak--085.xml,https://betalab.kb.se/prot-1917--ak--85/_view,1917-12-18, +data/1917/prot-1917--fk--024.xml,https://betalab.kb.se/prot-1917--fk--24/_view,1917-03-21, 
+data/1918/prot-1918--ak--048.xml,https://betalab.kb.se/prot-1918--ak--48/_view,1918-05-02, +data/1918/prot-1918--fk--054.xml,https://betalab.kb.se/prot-1918--fk--54/_view,1918-06-14, +data/1919/prot-1919--ak--038.xml,https://betalab.kb.se/prot-1919--ak--38/_view,1919-04-15, +data/1919/prot-1919--fk--037.xml,https://betalab.kb.se/prot-1919--fk--37/_view,1919-05-14 - 1919-05-16,fixed +data/1920/prot-1920--ak--030.xml,https://betalab.kb.se/prot-1920--ak--30/_view,1920-03-17, +data/1920/prot-1920--fk--045.xml,https://betalab.kb.se/prot-1920--fk--45/_view,1920-05-11, +data/1921/prot-1921--ak--038.xml,https://betalab.kb.se/prot-1921--ak--38/_view,1921-05-04, +data/1921/prot-1921--fk--026.xml,https://betalab.kb.se/prot-1921--fk--26/_view,1921-04-20 - 1921-04-22,fixed +data/1922/prot-1922--ak--018.xml,https://betalab.kb.se/prot-1922--ak--18/_view,1922-03-15 - 1922-03-17,fixed +data/1922/prot-1922--fk--022.xml,https://betalab.kb.se/prot-1922--fk--22/_view,1922-04-01, +data/1923/prot-1923--ak--039.xml,https://betalab.kb.se/prot-1923--ak--39/_view,1923-05-23, +data/1923/prot-1923--fk--023.xml,https://betalab.kb.se/prot-1923--fk--23/_view,1923-04-05 - 1923-04-06,fixed +data/1924/prot-1924--ak--034.xml,https://betalab.kb.se/prot-1924--ak--34/_view,1924-05-10 - 1924-05-13,fixed +data/1924/prot-1924--fk--040.xml,https://betalab.kb.se/prot-1924--fk--40/_view,1924-05-24 - 1924-05-27,fixed +data/1925/prot-1925--ak--002.xml,https://betalab.kb.se/prot-1925--ak--2/_view,1925-01-12 - 1925-01-14,fixed +data/1925/prot-1925--fk--006.xml,https://betalab.kb.se/prot-1925--fk--6/_view,1925-01-28 - 1925-02-01,fixed +data/1926/prot-1926--ak--003.xml,https://betalab.kb.se/prot-1926--ak--3/_view,1926-01-18, +data/1926/prot-1926--fk--028.xml,https://betalab.kb.se/prot-1926--fk--28/_view,1926-04-23 - 1926-04-24,fixed +data/1927/prot-1927--ak--031.xml,https://betalab.kb.se/prot-1927--ak--31/_view,1927-05-14, +data/1927/prot-1927--fk--032.xml,https://betalab.kb.se/prot-1927--fk--32/_view,1927-05-19, +data/1928/prot-1928--ak--034.xml,https://betalab.kb.se/prot-1928--ak--34/_view,1928-05-12, +data/1928/prot-1928--fk--037.xml,https://betalab.kb.se/prot-1928--fk--37/_view,1928-05-29, +data/1929/prot-1929--ak--038.xml,https://betalab.kb.se/prot-1929--ak--38/_view,1929-05-21 - 1929-05-22,fixed +data/1929/prot-1929--fk--006.xml,https://betalab.kb.se/prot-1929--fk--6/_view,1929-01-31 - 1929-02-06,fixed +data/1930/prot-1930--ak--026.xml,https://betalab.kb.se/prot-1930--ak--26/_view,1930-04-09 - 1930-04-17,fixed +data/1930/prot-1930--fk--044.xml,https://betalab.kb.se/prot-1930--fk--44/_view,1930-06-03, +data/1931/prot-1931--ak--015.xml,https://betalab.kb.se/prot-1931--ak--15/_view,1931-02-27 - 1931-02-28,fixed +data/1931/prot-1931--fk--039.xml,https://betalab.kb.se/prot-1931--fk--39/_view,1931-05-20, +data/1932/prot-1932--ak--031.xml,https://betalab.kb.se/prot-1932--ak--31/_view,1932-04-06, +data/1932/prot-1932--fk--047.xml,https://betalab.kb.se/prot-1932--fk--47/_view,1932-06-08, +data/1933/prot-1933--ak--044.xml,https://betalab.kb.se/prot-1933--ak--44/_view,1933-06-03 - 1933-06-07,fixed +data/1933/prot-1933--fk--034.xml,https://betalab.kb.se/prot-1933--fk--34/_view,1933-05-06 - 1933-05-09,fixed +data/1934/prot-1934--ak--024.xml,https://betalab.kb.se/prot-1934--ak--24/_view,1934-04-10 - 1934-04-11,fixed +data/1934/prot-1934--fk--007.xml,https://betalab.kb.se/prot-1934--fk--7/_view,1934-02-07 - 1934-02-09,fixed +data/1935/prot-1935--ak--007.xml,https://betalab.kb.se/prot-1935--ak--7/_view,1935-01-26 - 1935-01-29,fixed 
+data/1935/prot-1935--fk--032.xml,https://betalab.kb.se/prot-1935--fk--32/_view,1935-05-15 - 1935-05-7,fixed +data/1936/prot-1936--ak--021.xml,https://betalab.kb.se/prot-1936--ak--21/_view,1936-04-01, +data/1936/prot-1936--fk--024.xml,https://betalab.kb.se/prot-1936--fk--24/_view,1936-04-16, +data/1937/prot-1937--ak--014.xml,https://betalab.kb.se/prot-1937--ak--14/_view,1937-02-27, +data/1937/prot-1937--fk--006.xml,https://betalab.kb.se/prot-1937--fk--6/_view,1937-01-26 - 1937-01-27,fixed +data/1938/prot-1938--ak--026.xml,https://betalab.kb.se/prot-1938--ak--26/_view,1938-04-05 - 1938-04-08,fixed +data/1938/prot-1938--fk--021.xml,https://betalab.kb.se/prot-1938--fk--21/_view,1938-03-18, +data/1939/prot-1939--ak--001.xml,https://betalab.kb.se/prot-1939--ak--1/_view,1939-01-10, +data/1939/prot-1939--fk--041.xml,https://betalab.kb.se/prot-1939--fk--41/_view,1939-06-13 - 1939-06-17,fixed +data/1940/prot-1940--ak--031.xml,https://betalab.kb.se/prot-1940--ak--31/_view,1940-05-14 - 1940-05-18,fixed +data/1940/prot-1940--fk--004.xml,https://betalab.kb.se/prot-1940--fk--4/_view,1940-01-20 - 1940-01-23,fixed +data/1941/prot-1941--ak--050.xml,https://betalab.kb.se/prot-1941-hšst-ak--50/_view,1941-12-03 - 1941-12-10,fixed +data/1941/prot-1941--fk--009.xml,https://betalab.kb.se/prot-1941--fk--9/_view,1941-02-18 - 1941-02-19,fixed +data/1942/prot-1942--ak--007.xml,https://betalab.kb.se/prot-1942--ak--7/_view,1942-02-10 - 1942-02-14,fixed +data/1942/prot-1942--fk--028.xml,https://betalab.kb.se/prot-1942-hšst-fk--28/_view,1942-11-02 - 1942-11-03,fixed +data/1943/prot-1943--ak--012.xml,https://betalab.kb.se/prot-1943--ak--12/_view,1943-03-30 - 1943-03-31,fixed +data/1943/prot-1943--fk--031.xml,https://betalab.kb.se/prot-1943-hšst-fk--31/_view,1943-11-09 - 1943-11-17,fixed +data/1944/prot-1944--ak--009.xml,https://betalab.kb.se/prot-1944--ak--9/_view,1944-03-07 - 1944-03-11,fixed +data/1944/prot-1944--fk--015.xml,https://betalab.kb.se/prot-1944--fk--15/_view,1944-04-25 - 1944-05-02,fixed +data/1945/prot-1945--ak--017.xml,https://betalab.kb.se/prot-1945--ak--17/_view,1945-04-19, +data/1945/prot-1945--fk--004.xml,https://betalab.kb.se/prot-1945--fk--4/_view,1945-01-18 - 1945-01-22,fixed +data/1946/prot-1946--ak--025.xml,https://betalab.kb.se/prot-1946--ak--25/_view,1946-06-14 - 1946-06-18,fixed +data/1946/prot-1946--fk--001.xml,https://betalab.kb.se/prot-1946--fk--1/_view,1946-01-10 - 1946-01-16,fixed +data/1947/prot-1947--ak--002.xml,https://betalab.kb.se/prot-1947--ak--2/_view,1947-01-17, +data/1947/prot-1947--fk--019.xml,https://betalab.kb.se/prot-1947--fk--19/_view,1947-04-26 - 1947-05-02,fixed +data/1948/prot-1948--ak--011.xml,https://betalab.kb.se/prot-1948--ak--11/_view,1948-03-13 - 1948-03-16,fixed +data/1948/prot-1948--fk--011.xml,https://betalab.kb.se/prot-1948--fk--11/_view,1948-03-13 - 1948-03-17,fixed +data/1949/prot-1949--ak--024.xml,https://betalab.kb.se/prot-1949-hšst-ak--24/_view,1949-10-17 - 1949-10-18, +data/1949/prot-1949--fk--031.xml,https://betalab.kb.se/prot-1949-hšst-fk--31/_view,1949-12-06 - 1949-12-07, +data/1950/prot-1950--ak--012.xml,https://betalab.kb.se/prot-1950--ak--12/_view,1950-03-23 - 1950-03-29, +data/1950/prot-1950--fk--006.xml,https://betalab.kb.se/prot-1950--fk--6/_view,1950-02-11 - 1950-02-15, +data/1951/prot-1951--ak--029.xml,https://betalab.kb.se/prot-1951-extrahšst-ak--29/_view,1951-10-27 - 1951-10-31, +data/1951/prot-1951--fk--023.xml,https://betalab.kb.se/prot-1951--fk--23/_view,1951-05-26, 
+data/1952/prot-1952--ak--004.xml,https://betalab.kb.se/prot-1952--ak--4/_view,1952-01-30 - 1952-02-06, +data/1952/prot-1952--fk--031.xml,https://betalab.kb.se/prot-1952-hšst-fk--31/_view,1952-12-06 - 1952-12-10, +data/1953/prot-1953--ak--027.xml,https://betalab.kb.se/prot-1953-hšst-ak--27/_view,1953-11-03 - 1953-11-04, +data/1953/prot-1953--fk--004.xml,https://betalab.kb.se/prot-1953--fk--4/_view,1953-01-31 - 1953-02-04, +data/1954/prot-1954--ak--031.xml,https://betalab.kb.se/prot-1954-hšst-ak--31/_view,1954-11-27 - 1954-12-01, +data/1954/prot-1954--fk--001.xml,https://betalab.kb.se/prot-1954--fk--1/_view,1954-01-11 - 1954-01-15, +data/1955/prot-1955--ak--016.xml,https://betalab.kb.se/prot-1955--ak--16/_view,1955-04-29 - 1955-05-04, +data/1955/prot-1955--fk--004.xml,https://betalab.kb.se/prot-1955--fk--4/_view,1955-02-04 - 1955-02-09, +data/1956/prot-1956--ak--019.xml,https://betalab.kb.se/prot-1956--ak--19/_view,1956-05-18 - 1956-05-23, +data/1956/prot-1956--fk--009.xml,https://betalab.kb.se/prot-1956--fk--9/_view,1956-03-02 - 1956-03-07, +data/1957/prot-1957--ak--002.xml,https://betalab.kb.se/prot-1957--ak--2/_view,1957-01-21 - 1957-01-22, +data/1957/prot-1957--fk--027.xml,https://betalab.kb.se/prot-1957-hšst-fk--27/_view,1957-11-22 - 1957-11-27, +data/1958/prot-1958-a-fk--002.xml,https://betalab.kb.se/prot-1958-a-fk--2/_view,1958-01-21 - 1958-01-22, +data/1958/prot-1958-b-ak--013.xml,https://betalab.kb.se/prot-1958-b-ak--13/_view,1958-11-28 - 1958-12-03, +data/1959/prot-1959--ak--007.xml,https://betalab.kb.se/prot-1959--ak--7/_view,1959-03-06 - 1959-03-11, +data/1959/prot-1959--fk--006.xml,https://betalab.kb.se/prot-1959--fk--6/_view,1959-02-27 - 1959-03-04, +data/1960/prot-1960--ak--025.xml,https://betalab.kb.se/prot-1960-hšst-ak--25/_view,1960-11-08 - 1960-11-09, +data/1960/prot-1960--fk--019.xml,https://betalab.kb.se/prot-1960--fk--19/_view,1960-05-23 - 1960-05-25, +data/1961/prot-1961--ak--030.xml,https://betalab.kb.se/prot-1961-hšst-ak--30/_view,1961-11-10 - 1961-11-15, +data/1961/prot-1961--fk--020.xml,https://betalab.kb.se/prot-1961--fk--20/_view,1961-05-17, +data/1962/prot-1962--ak--030.xml,https://betalab.kb.se/prot-1962-hšst-ak--30/_view,1962-11-06 - 1962-11-09, +data/1962/prot-1962--fk--014.xml,https://betalab.kb.se/prot-1962--fk--14/_view,1962-03-30 - 1962-04-04, +data/1963/prot-1963--ak--024.xml,https://betalab.kb.se/prot-1963--ak--24/_view,1963-05-20 - 1963-05-22, +data/1963/prot-1963--fk--011.xml,https://betalab.kb.se/prot-1963--fk--11/_view,1963-03-15 - 1963-03-20, +data/1964/prot-1964--ak--003.xml,https://betalab.kb.se/prot-1964--ak--3/_view,1964-01-21 - 1964-01-22, +data/1964/prot-1964--fk--008.xml,https://betalab.kb.se/prot-1964--fk--8/_view,1964-02-18 - 1964-02-21, +data/1965/prot-1965--ak--013.xml,https://betalab.kb.se/prot-1965--ak--13/_view,1965-03-23 - 1965-03-24, +data/1965/prot-1965--fk--003.xml,https://betalab.kb.se/prot-1965--fk--3/_view,1965-01-19, +data/1966/prot-1966--ak--016.xml,https://betalab.kb.se/prot-1966--ak--16/_view,1966-04-13, +data/1966/prot-1966--fk--037.xml,https://betalab.kb.se/prot-1966-hšst-fk--37/_view,1966-12-08 - 1966-12-09, +data/1967/prot-1967--ak--038.xml,https://betalab.kb.se/prot-1967-hšst-ak--38/_view,1967-10-26 - 1967-10-27, +data/1967/prot-1967--fk--046.xml,https://betalab.kb.se/prot-1967-hšst-fk--46/_view,1967-11-23 - 1967-11-24, +data/1968/prot-1968--ak--041.xml,https://betalab.kb.se/prot-1968-hšst-ak--41/_view,1968-12-03 - 1968-12-04, +data/1968/prot-1968--fk--021.xml,https://betalab.kb.se/prot-1968--fk--21/_view,1968-05-02 - 
1968-05-03, +data/1969/prot-1969--ak--014.xml,https://betalab.kb.se/prot-1969--ak--14/_view,1969-03-27 - 1969-03-28, +data/1969/prot-1969--fk--039.xml,https://betalab.kb.se/prot-1969-hšst-fk--39/_view,1969-12-02 - 1969-12-03, +data/1970/prot-1970--ak--037.xml,https://betalab.kb.se/prot-1970-hšst-ak--37/_view,1970-11-10 - 1970-11-13, +data/1970/prot-1970--fk--012.xml,https://betalab.kb.se/prot-1970--fk--12/_view,1970-03-19 - 1970-03-20, +data/1971/prot-1971--074.xml,https://betalab.kb.se/prot-1971--74/_view,1971-04-29 - 1971-04-30, +data/1972/prot-1972--006.xml,https://betalab.kb.se/prot-1972--6/_view,1972-01-18 - 1972-01-21, +data/1973/prot-1973--037.xml,https://betalab.kb.se/prot-1973--37/_view,1973-03-06 - 1973-03-07, +data/1974/prot-1974--101.xml,https://betalab.kb.se/prot-1974--101/_view,1974-10-16 - 1974-10-18, +data/1975/prot-1975--036.xml,https://betalab.kb.se/prot-1975--36/_view,1975-03-13 - 1975-03-18, +data/197576/prot-197576--117.xml,https://betalab.kb.se/prot-197576--117/_view,1976-05-05, +data/197677/prot-197677--046.xml,https://betalab.kb.se/prot-197677--46/_view,1976-12-15, +data/197778/prot-197778--125.xml,https://betalab.kb.se/prot-197778--125/_view,1978-04-20, +data/197879/prot-197879--099.xml,https://betalab.kb.se/prot-197879--99/_view,1979-03-08 - 1979-03-13, +data/197980/prot-197980--158.xml,https://betalab.kb.se/prot-197980--158/_view,1980-05-30 - 1980-06-02, +data/1980/prot-1980-urtima-005.xml,https://betalab.kb.se/prot-1980-urtima-5/_view,1980-09-04 - 1980-09-05, +data/198081/prot-198081--024.xml,https://betalab.kb.se/prot-198081--24/_view,1980-11-10 - 1980-11-14, +data/198182/prot-198182--097.xml,https://betalab.kb.se/prot-198182--97/_view,1982-03-11 - 1982-03-16, +data/198283/prot-198283--073.xml,https://betalab.kb.se/prot-198283--73/_view,1983-02-03, +data/198384/prot-198384--122.xml,https://betalab.kb.se/prot-198384--122/_view,1984-04-11, +data/198485/prot-198485--113.xml,https://betalab.kb.se/prot-198485--113/_view,1985-04-10, +data/198586/prot-198586--032.xml,https://betalab.kb.se/prot-198586--32/_view,1985-11-20, +data/198687/prot-198687--045.xml,https://betalab.kb.se/prot-198687--45/_view,1986-12-08 - 1986-12-09, +data/198788/prot-198788--048.xml,https://betalab.kb.se/prot-198788--48/_view,1988-01-11 - 1988-01-15, +data/198889/prot-198889--063.xml,https://betalab.kb.se/prot-198889--63/_view,1989-02-06 - 1989-02-08, +data/198990/prot-198990--116.xml,https://betalab.kb.se/prot-198990--116/_view,1990-05-07, +data/199091/prot-199091--124.xml,,,missing +data/199192/prot-199192--086.xml,,,missing +data/199293/prot-199293--127.xml,https://betalab.kb.se/prot-199293--127/_view,,missing +data/199394/prot-199394--073.xml,,,missing +data/199495/prot-199495--032.xml,http://data.riksdagen.se/fil/DF0F601E-79EB-4311-AAAC-77A4011EEB88#page=1,1994-11-30, +data/199596/prot-199596--039.xml,http://data.riksdagen.se/fil/E092044C-BA72-406E-9477-63B3E07903EF#page=1,1995-12-15, +data/199697/prot-199697--010.xml,http://data.riksdagen.se/fil/F53CD70F-D454-44E1-9B53-9C8449F98200#page=1,1996-10-04, +data/199798/prot-199798--079.xml,http://data.riksdagen.se/fil/4888C0E2-49E6-41FE-9E4F-20EABD9ED427#page=1,1998-03-12, +data/199899/prot-199899--109.xml,http://data.riksdagen.se/fil/DE8FEE49-9E47-4DE7-A87A-F65D9972B209#page=1,1999-06-14, +data/19992000/prot-19992000--071.xml,http://data.riksdagen.se/fil/11D0A087-D45B-468A-98FD-BFDF4AC19CEF#page=1,2000-02-23, +data/200001/prot-200001--035.xml,http://data.riksdagen.se/fil/8408DBDC-02E5-4010-A512-B68DEE244026#page=1,2000-11-28, 
+data/200102/prot-200102--113.xml,http://data.riksdagen.se/fil/A15B349D-4A57-4D6E-BA60-CC60D5F694F1#page=1,2002-05-27,
+data/200203/prot-200203--035.xml,http://data.riksdagen.se/fil/2C25FA23-E2FD-4A58-9C29-7A10034A3EE3#page=1,2002-12-12,
+data/200304/prot-200304--134.xml,http://data.riksdagen.se/fil/2C8CA51D-1500-4527-B54D-09E0D19B9D6C#page=1,2003-07-08,
+data/200405/prot-200405--005.xml,http://data.riksdagen.se/fil/1C5D8C0A-8F19-46FC-B5C9-8692CC5DAC8B#page=1,2004-09-20,
+data/200506/prot-200506--065.xml,http://data.riksdagen.se/fil/9306D1FA-5E84-4726-A320-4DC7E94781D8#page=1,2006-02-01,
+data/200607/prot-200607--006.xml,http://data.riksdagen.se/fil/76CFB38E-EC0B-4B69-A4AE-D6EA14F5B846#page=1,2006-10-06,
+data/200708/prot-200708--123.xml,http://data.riksdagen.se/fil/4A6A23DE-8BE8-441E-AD0A-32C727A1A9ED#page=1,2008-06-05,
+data/200809/prot-200809--052.xml,http://data.riksdagen.se/fil/7D701EA0-AA0B-4576-89D6-9CA1E24CC566#page=1,2008-12-18,
+data/200910/prot-200910--072.xml,http://data.riksdagen.se/fil/E2205F18-EC1C-406D-B7D0-AF506124C568#page=1,2010-02-10,
+data/201011/prot-201011--067.xml,http://data.riksdagen.se/fil/B4F3E90C-CB03-4A8D-849E-A102CB588C50#page=1,2011-03-04,
+data/201112/prot-201112--024.xml,http://data.riksdagen.se/fil/2AB241D3-4999-400F-AF8E-2D0AB3A52833#page=1,2011-10-27,
+data/201213/prot-201213--036.xml,http://data.riksdagen.se/fil/717A907B-D5F2-4CC1-B1DF-F20F9690E378#page=1,2012-12-04,
+data/201314/prot-201314--055.xml,http://data.riksdagen.se/fil/3823D10A-92D5-44FE-A31C-3CFED613269A#page=1,2014-01-16,
+data/201415/prot-201415--010.xml,http://data.riksdagen.se/fil/9144082F-C534-4ED6-A2FD-FC0128084219#page=1,2014-10-16,
+data/201516/prot-201516--103.xml,http://data.riksdagen.se/fil/67C9796C-D0ED-4B5D-871C-016E1496A886#page=1,2016-05-11,
+data/201617/prot-201617--065.xml,http://data.riksdagen.se/fil/1B5912DC-5248-4042-93DA-69CA45C0E6A8#page=1,2017-02-02,
+data/201718/prot-201718--054.xml,http://data.riksdagen.se/fil/9AD3BA05-C2AA-4237-A5A1-49030A853263#page=1,2017-12-20,
+data/201819/prot-201819--016.xml,http://data.riksdagen.se/fil/6C1C1DCC-216C-43E3-B5A9-FE1A148C479A#page=1,2018-11-14,
+data/201920/prot-201920--065.xml,http://data.riksdagen.se/fil/D4032E83-5F14-4D77-AE77-E9D8504D84D2#page=1,2020-01-29,
+data/202021/prot-202021--140.xml,http://data.riksdagen.se/fil/D62BF200-413F-446F-B644-B499603D815A#page=1,2021-06-14,
+data/202122/prot-202122--093.xml,http://data.riksdagen.se/fil/57A5D444-042A-4B6A-9650-11DCBA47962B#page=1,2022-04-05,
\ No newline at end of file
diff --git a/src/date_accurcacy_estimate.py b/src/date_accurcacy_estimate.py
new file mode 100644
index 0000000..a764ad2
--- /dev/null
+++ b/src/date_accurcacy_estimate.py
@@ -0,0 +1,112 @@
+"""
+Calculate an upper bound for date metadata accuracy.
+Based on the gold standard annotations. 
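
A worked sketch of the interval Jaccard the script reports, with illustrative dates taken from the sample above (get_jaccard below compares full datetimes, padding start dates to 00:00:01 and end dates to 23:59:59):

    annotated range: 1868-03-13 .. 1868-03-14   (~2 days)
    inferred range:  1868-03-13 .. 1868-03-16   (~4 days)
    intersection ~ 2 days, union ~ 4 days, so J ~ 2/4 = 0.5
    J == 1 only when the two ranges coincide exactly; the "contains"
    statistic is 1 when the annotated range lies inside the inferred
    one, as it does here.
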
diff --git a/src/date_accurcacy_estimate.py b/src/date_accurcacy_estimate.py
new file mode 100644
index 0000000..a764ad2
--- /dev/null
+++ b/src/date_accurcacy_estimate.py
@@ -0,0 +1,112 @@
+"""
+Calculate an upper bound for date metadata accuracy.
+Based on the gold standard annotations.
+"""
+from pyriksdagen.utils import protocol_iterators, elem_iter, infer_metadata
+from lxml import etree
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+import argparse
+from multiprocessing import Pool
+from pathlib import Path
+import warnings
+import progressbar
+from scipy.stats import beta
+import seaborn as sns
+from matplotlib import pyplot as plt
+from pyriksdagen.utils import TEI_NS
+from datetime import datetime
+
+XML_NS = "{http://www.w3.org/XML/1998/namespace}"
+
+
+# TODO: fix parallelization
+def date_range_from_protocol(protocol):
+    parser = etree.XMLParser(remove_blank_text=True)
+    root = etree.parse(protocol, parser).getroot()
+    mindate, maxdate = None, None
+    for docDate in root.findall(f".//{TEI_NS}docDate"):
+        if mindate is None or mindate > docDate.get("when"):
+            mindate = docDate.get("when")
+        if maxdate is None or maxdate < docDate.get("when"):
+            maxdate = docDate.get("when")
+    return mindate, maxdate
+
+def get_jaccard(startdate, enddate, startdate_hat, enddate_hat):
+    startdate = datetime.fromisoformat(f'{startdate} 00:00:01')
+    startdate_hat = datetime.fromisoformat(f'{startdate_hat} 00:00:01')
+    enddate = datetime.fromisoformat(f'{enddate} 23:59:59')
+    enddate_hat = datetime.fromisoformat(f'{enddate_hat} 23:59:59')
+
+    union = min(startdate, startdate_hat), max(enddate, enddate_hat)
+    intersection = max(startdate, startdate_hat), min(enddate, enddate_hat)
+    if intersection[1] <= intersection[0]:
+        return 0.0, 0.0, 0.0
+
+    unionlen = (union[1] - union[0]).total_seconds()
+    intersectionlen = (intersection[1] - intersection[0]).total_seconds()
+
+    return intersectionlen / unionlen, int(intersectionlen == unionlen), int(intersectionlen / unionlen > 0.0)
+
+def main(args):
+    protocols = list(protocol_iterators(args.records_folder, start=args.start, end=args.end))
+    print(args.path_goldstandard)
+    df = pd.read_csv(args.path_goldstandard)
+    print(df)
+    df = df[df["path"].notnull()]
+    df = df[df["path"].str.contains("/")]
+    df["protocol_id"] = df["path"].str.split("/").str[-1].str.split(".").str[0]
+    print(df)
+    rows = []
+    correct, incorrect = 0, 0
+    jaccs, perfects, overlaps, contains = [], [], [], []
+    zero_overlaps = []
+    for p in progressbar.progressbar(protocols):
+        path = Path(p)
+        protocol_id = path.stem
+        df_p = df[df["protocol_id"] == protocol_id]
+        if len(df_p) == 1:
+            startdate, enddate = None, None
+            datestr = list(df_p["true-dates"])[0]
+            startdate_hat, enddate_hat = date_range_from_protocol(path)
+            try:
+                if " - " in datestr:
+                    startdate, enddate = datestr.split(" - ")
+                else:
+                    startdate, enddate = datestr, datestr
+
+                jacc, perfect, overlap, contain = get_jaccard(startdate, enddate, startdate_hat, enddate_hat)
+                jaccs.append(jacc)
+                perfects.append(perfect)
+                overlaps.append(overlap)
+                if overlap == 0:
+                    print()
+                    print(protocol_id)
+                    print("true: " + startdate + " - " + enddate, startdate_hat + " - " + enddate_hat)
+                    zero_overlaps.append(protocol_id)
+
+            except:
+                print("Problem with", protocol_id, datestr)
+            #print(jacc)
+
+        elif len(df_p) > 1:
+            print("Problem with", protocol_id)
+
+    print("E[J]", np.mean(jaccs))
+    print("P(J == 1)", np.mean(perfects))
+    print("P(J > 0)", np.mean(overlaps))
+
+    zero_overlaps = "\n".join(zero_overlaps)
+    print(f"Zero overlap in: {zero_overlaps}")
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--start", type=int, default=1867)
+    parser.add_argument("--end", type=int, default=2022)
+    parser.add_argument("--records_folder", type=str,
+                        default="corpus/records")
+    parser.add_argument("--path_goldstandard", type=str, default="date-sample.csv")
+    args = parser.parse_args()
+    df = main(args)
+
+    print(df)
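The script's core metric is a Jaccard index over date intervals: both the annotated range and the range inferred from the protocol's docDate elements are widened to whole days, and the score is intersection length over union length in seconds. A day-granularity restatement of the same idea, as a self-contained sketch (the function name and sample dates are illustrative, not part of the repo):

from datetime import date

def interval_jaccard(start, end, start_hat, end_hat):
    # Jaccard index of two inclusive day ranges.
    inter_lo, inter_hi = max(start, start_hat), min(end, end_hat)
    inter_days = (inter_hi - inter_lo).days + 1  # inclusive, hence +1
    if inter_days <= 0:
        return 0.0
    union_lo, union_hi = min(start, start_hat), max(end, end_hat)
    return inter_days / ((union_hi - union_lo).days + 1)

# True range 1979-03-08..1979-03-13 vs. inferred 1979-03-08..1979-03-10:
# 3 shared days out of 6, so J = 0.5.
print(interval_jaccard(date(1979, 3, 8), date(1979, 3, 13),
                       date(1979, 3, 8), date(1979, 3, 10)))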
From aecf379160b9272c880259c7c459d5ec675d95c2 Mon Sep 17 00:00:00 2001
From: ninpnin
Date: Thu, 29 Aug 2024 14:12:58 +0200
Subject: [PATCH 34/40] feat: also estimate subset proportion

---
 src/date_accurcacy_estimate.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/date_accurcacy_estimate.py b/src/date_accurcacy_estimate.py
index a764ad2..33ef794 100644
--- a/src/date_accurcacy_estimate.py
+++ b/src/date_accurcacy_estimate.py
@@ -42,12 +42,12 @@ def get_jaccard(startdate, enddate, startdate_hat, enddate_hat):
     union = min(startdate, startdate_hat), max(enddate, enddate_hat)
     intersection = max(startdate, startdate_hat), min(enddate, enddate_hat)
     if intersection[1] <= intersection[0]:
-        return 0.0, 0.0, 0.0
+        return 0.0, 0.0, 0.0, 0.0
 
     unionlen = (union[1] - union[0]).total_seconds()
     intersectionlen = (intersection[1] - intersection[0]).total_seconds()
-
-    return intersectionlen / unionlen, int(intersectionlen == unionlen), int(intersectionlen / unionlen > 0.0)
+    contain = int(startdate >= startdate_hat and enddate <= enddate_hat)
+    return intersectionlen / unionlen, int(intersectionlen == unionlen), int(intersectionlen / unionlen > 0.0), contain
 
 def main(args):
     protocols = list(protocol_iterators(args.records_folder, start=args.start, end=args.end))
@@ -80,6 +80,7 @@
         jaccs.append(jacc)
         perfects.append(perfect)
         overlaps.append(overlap)
+        contains.append(contain)
         if overlap == 0:
             print()
             print(protocol_id)
@@ -96,6 +97,7 @@
     print("E[J]", np.mean(jaccs))
     print("P(J == 1)", np.mean(perfects))
     print("P(J > 0)", np.mean(overlaps))
+    print("Contains", np.mean(contains))
 
     zero_overlaps = "\n".join(zero_overlaps)
     print(f"Zero overlap in: {zero_overlaps}")
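Patch 34 reports the containment proportion, like P(J == 1) and P(J > 0), as a raw sample mean. The script imports scipy.stats.beta without using it; one plausible use would be a credible interval around such a proportion. A sketch under a uniform Beta(1, 1) prior; the helper name and the 52-of-60 figures are invented for illustration:

from scipy.stats import beta

def proportion_interval(successes, trials, mass=0.95):
    # Equal-tailed credible interval for a proportion, Beta(1, 1) prior.
    tail = (1 - mass) / 2
    posterior = beta(1 + successes, 1 + trials - successes)
    return posterior.ppf(tail), posterior.ppf(1 - tail)

print(proportion_interval(52, 60))  # e.g. 52 of 60 records fully contained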
From 65df985b0375d098eef999b39d9f41d2670de4d8 Mon Sep 17 00:00:00 2001
From: ninpnin
Date: Thu, 29 Aug 2024 14:13:39 +0200
Subject: [PATCH 35/40] chore: fix typo in filename

---
 src/{date_accurcacy_estimate.py => date_accuracy_estimate.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename src/{date_accurcacy_estimate.py => date_accuracy_estimate.py} (100%)

diff --git a/src/date_accurcacy_estimate.py b/src/date_accuracy_estimate.py
similarity index 100%
rename from src/date_accurcacy_estimate.py
rename to src/date_accuracy_estimate.py

From 933360e10ee2b294d5a1cf5936719c4a26396ca6 Mon Sep 17 00:00:00 2001
From: ninpnin
Date: Thu, 12 Sep 2024 11:35:46 +0200
Subject: [PATCH 36/40] refactor: remove default argument

---
 src/segmentation_accuracy_estimate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/segmentation_accuracy_estimate.py b/src/segmentation_accuracy_estimate.py
index a6c07cc..488a94a 100644
--- a/src/segmentation_accuracy_estimate.py
+++ b/src/segmentation_accuracy_estimate.py
@@ -128,7 +128,7 @@ def pad_id(pid):
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument("--start", type=int, default=1867)
     parser.add_argument("--end", type=int, default=2022)
-    parser.add_argument("--records_folder", type=str, default="corpus/records")
+    parser.add_argument("--records_folder", type=str, default=None)
     parser.add_argument("--path_goldstandard", type=str, default="corpus/quality_assesment/segment_classification/prot-segment-classification.csv")
     args = parser.parse_args()
     df = main(args)

From fe6ce9f64c891593b755c86b3b2002d649a57391 Mon Sep 17 00:00:00 2001
From: ninpnin
Date: Thu, 12 Sep 2024 11:50:10 +0200
Subject: [PATCH 37/40] fix: broken import and missing env var fetching

---
 src/cur-prot/find_dates.py            | 1 +
 src/segmentation_accuracy_estimate.py | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/cur-prot/find_dates.py b/src/cur-prot/find_dates.py
index df5a634..63106c0 100644
--- a/src/cur-prot/find_dates.py
+++ b/src/cur-prot/find_dates.py
@@ -8,6 +8,7 @@
     parse_protocol,
     protocol_iterators,
     write_protocol,
+    get_data_location,
 )
 import progressbar
 import argparse
diff --git a/src/segmentation_accuracy_estimate.py b/src/segmentation_accuracy_estimate.py
index 488a94a..c356f2f 100644
--- a/src/segmentation_accuracy_estimate.py
+++ b/src/segmentation_accuracy_estimate.py
@@ -2,7 +2,7 @@
 Calculate an upper bound for segment classification accuracy.
 Based on the gold standard annotations.
 """
-from pyriksdagen.utils import protocol_iterators, elem_iter, infer_metadata
+from pyriksdagen.utils import protocol_iterators, elem_iter, infer_metadata, get_data_location
 from lxml import etree
 import numpy as np
 import pandas as pd
@@ -131,6 +131,8 @@ def pad_id(pid):
     parser.add_argument("--records_folder", type=str, default=None)
     parser.add_argument("--path_goldstandard", type=str, default="corpus/quality_assesment/segment_classification/prot-segment-classification.csv")
     args = parser.parse_args()
+    if args.records_folder is None:
+        args.records_folder = get_data_location("records")
     df = main(args)
 
     print(df)
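Patch 37 defers the records location to get_data_location("records") whenever --records_folder is left unset; the pre-refactor help text in find_dates.py describes the same helper as defaulting "to environment var or `data/`". A rough stand-in for that resolution order; the environment-variable naming scheme is an assumption, not pyriksdagen's documented interface:

import os

def get_data_location_sketch(doctype):
    # Callers pass an explicit folder on the CLI or fall back to this:
    # an environment variable per document type, else a conventional path.
    env_var = f"{doctype.upper()}_PATH"   # assumed naming scheme
    return os.environ.get(env_var, f"data/{doctype}")

records_folder = get_data_location_sketch("records")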
From 02e443325479e427dd1dd3c23fa452de6e85b0a3 Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Thu, 19 Sep 2024 14:00:18 +0200
Subject: [PATCH 38/40] fix: swerik_id --> person_id

---
 src/generate_website_catalog.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/generate_website_catalog.py b/src/generate_website_catalog.py
index 011d1ee..d2f93c0 100644
--- a/src/generate_website_catalog.py
+++ b/src/generate_website_catalog.py
@@ -42,7 +42,7 @@ def main(args):
     last_catalog_list = None
     ids_need_attention = []
     last_pulled_ids = pd.read_csv(f"{args.metadata_folder}/wiki_id.csv")
-    this_catalog_list = last_pulled_ids["swerik_id"].unique()
+    this_catalog_list = last_pulled_ids["person_id"].unique()
     if last_catalog_list:
         [ids_need_attention.append(_) for _ in last_catalog_list if _ not in this_catalog_list]
     if len(ids_need_attention) > 0:
@@ -61,10 +61,10 @@
     issue_counter = 0
     for swerik_id in tqdm(this_catalog_list, total=len(this_catalog_list)):
         #print(">>>---", swerik_id)
-        filtered_Corpus = corpus_metadata.loc[corpus_metadata["swerik_id"] == swerik_id].copy()
+        filtered_Corpus = corpus_metadata.loc[corpus_metadata["person_id"] == swerik_id].copy()
         peripheral_metadata = {}
         for key, df in additional_metadata.items():
-            df = df.loc[df["swerik_id"] == swerik_id]
+            df = df.loc[df["person_id"] == swerik_id]
             df = df.fillna(np.nan).replace([np.nan], [None])
             df.reset_index(inplace=True)
             peripheral_metadata[key] = df.copy()
@@ -84,10 +84,10 @@
 
     idname = []
     primary_names = corpus_metadata.loc[corpus_metadata["primary_name"]==True]
-    primary_names.drop_duplicates(["swerik_id", "name", "born"], inplace=True)
+    primary_names.drop_duplicates(["person_id", "name", "born"], inplace=True)
     primary_names.sort_values(by='name', key=lambda x: x.str.split('\s+').str[-1], inplace=True)
     for i, r in primary_names.iterrows():
-        idname.append({"swerik_id": r["swerik_id"], "name": r["name"], "born": r["born"]})
+        idname.append({"person_id": r["person_id"], "name": r["name"], "born": r["born"]})
     names_list = {
         "version": args.version,
         "last-updated": now,
@@ -110,11 +110,11 @@
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("--metadata_folder", type=str, default="corpus/metadata")
-    parser.add_argument("--input_metadata_folder", type=str, default="input/metadata")
+    parser.add_argument("--metadata_folder", type=str, default="./riksdagen-persons/data")
+    parser.add_argument("--input_metadata_folder", type=str, default="./input/metadata")
     parser.add_argument("-w", "--website_root",
                         type=str,
-                        default="../swerik-project.github.io/",
+                        default="./swerik-project.github.io/",
                         help="Root path to the website's local files")
     parser.add_argument("-v", "--version",
                         type=str,
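The catalog in patch 38 orders primary names with sort_values(key=lambda x: x.str.split('\s+').str[-1]), that is, by the last whitespace-separated token of each name. A toy sketch of the idiom (the frame contents are invented); note that multi-token surnames such as "von Sydow" would file under the final token only:

import pandas as pd

names = pd.DataFrame({
    "person_id": ["i-1", "i-2", "i-3"],
    "name": ["Anna Andersson", "Bo Ek", "Cecilia Bergström"],
})

# Sort by the final whitespace-separated token, approximating the surname.
names = names.sort_values(by="name", key=lambda x: x.str.split(r"\s+").str[-1])
print(names)  # Andersson, Bergström, Ek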
From c16ce7a2d09c9f4d200744048ac4765ff62167ed Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Thu, 12 Dec 2024 09:31:08 +0100
Subject: [PATCH 39/40] refactor: use pyriksdagen argparse helper and doctype-agnostic parse/write fns

---
 src/cur-prot/find_dates.py | 47 +++++++++++---------------------------
 1 file changed, 13 insertions(+), 34 deletions(-)

diff --git a/src/cur-prot/find_dates.py b/src/cur-prot/find_dates.py
index 63106c0..8d6d710 100644
--- a/src/cur-prot/find_dates.py
+++ b/src/cur-prot/find_dates.py
@@ -3,14 +3,16 @@
 """
 from lxml import etree
 from pyriksdagen.refine import detect_date
+from pyriksdagen.args import (
+    fetch_parser,
+    impute_args,
+)
 from pyriksdagen.utils import (
     infer_metadata,
-    parse_protocol,
-    protocol_iterators,
-    write_protocol,
-    get_data_location,
+    parse_tei,
+    write_tei,
 )
-import progressbar
+from tqdm import tqdm
 import argparse
 
 
@@ -18,38 +20,15 @@ def main(args):
-    if args.protocol:
-        protocols = [args.protocol]
-    else:
-        if args.records_folder is not None:
-            data_location = args.records_folder
-        else:
-            data_location = get_data_location("records")
-        protocols = sorted(list(protocol_iterators(data_location,
-                                                   start=args.start,
-                                                   end=args.end)))
-
-    for protocol_path in progressbar.progressbar(protocols):
-        metadata = infer_metadata(protocol_path)
-        root = parse_protocol(protocol_path)
+    for record in tqdm(args.records):
+        metadata = infer_metadata(record)
+        root, ns = parse_tei(record)
         root, dates = detect_date(root, metadata)
-
-        write_protocol(root, protocol_path)
+        write_tei(root, record)
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("-s", "--start", type=int, default=1920, help="Start year")
-    parser.add_argument("-e", "--end", type=int, default=2022, help="End year")
-    parser.add_argument("-r", "--records-folder",
-                        type=str,
-                        default=None,
-                        help="(optional) Path to records folder, defaults to environment var or `data/`")
-    parser.add_argument("-p", "--protocol",
-                        type=str,
-                        default=None,
-                        help="operate on a single protocol")
-    args = parser.parse_args()
-    main(args)
+    parser = fetch_parser("records")
+    main(impute_args(parser.parse_args()))

From 6361dce55d5931fecf12497cd64649dc00f27285 Mon Sep 17 00:00:00 2001
From: Bob Borges
Date: Fri, 13 Dec 2024 17:31:53 +0100
Subject: [PATCH 40/40] feat: add flag for skip sicknotes

---
 src/cur-prot/find_dates.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/cur-prot/find_dates.py b/src/cur-prot/find_dates.py
index 8d6d710..c3bc66c 100644
--- a/src/cur-prot/find_dates.py
+++ b/src/cur-prot/find_dates.py
@@ -23,7 +23,7 @@ def main(args):
     for record in tqdm(args.records):
         metadata = infer_metadata(record)
         root, ns = parse_tei(record)
-        root, dates = detect_date(root, metadata)
+        root, dates = detect_date(root, metadata, skip_doctors_notes=args.skip_doctors_notes)
         write_tei(root, record)
 
 
@@ -31,4 +31,5 @@
 
 if __name__ == "__main__":
     parser = fetch_parser("records")
+    parser.add_argument("--skip-doctors-notes", action='store_true')
     main(impute_args(parser.parse_args()))
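Patches 39 and 40 lean on pyriksdagen.args.fetch_parser and impute_args, whose internals the series does not show. The apparent contract: fetch_parser builds a doctype-specific argument parser, and impute_args expands the parsed year range into a concrete args.records list. A rough stand-in under assumed flag names and folder layout; every identifier here is a sketch, not pyriksdagen's actual API:

import argparse
from glob import glob

def fetch_parser_sketch(doctype):
    # Stand-in for pyriksdagen.args.fetch_parser (assumed behavior).
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--start", type=int, default=1867)
    parser.add_argument("-e", "--end", type=int, default=2022)
    parser.add_argument("--data-folder", type=str, default=f"data/{doctype}")
    return parser

def impute_args_sketch(args):
    # Stand-in for pyriksdagen.args.impute_args: materialize args.records
    # as the XML files whose year directory falls within [start, end].
    args.records = sorted(
        p for p in glob(f"{args.data_folder}/*/*.xml")
        if args.start <= int(p.split("/")[-2][:4]) <= args.end
    )
    return args

args = impute_args_sketch(fetch_parser_sketch("records").parse_args([]))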