## Setup
Download *marc\_extractor.py* and *almasru.py* from https://labs.onb.ac.at/gitlab/labs-team/catalogue so that they can be imported below.

In [1]:
import almasru
import marc_extractor
import pandas as pd
from collections import OrderedDict

## Load Mapping
Load *mapping.csv*, the mapping table used in *marc\_extractor.py*.

In [2]:
# Read the mapping table; with keep_default_na=False empty cells stay ''
# instead of being parsed as NaN.
mapping = pd.read_csv(
    'mapping.csv',
    keep_default_na=False,
)
# Display three randomly chosen rows as a quick look at the table.
mapping.sample(n=3)

Unnamed: 0,MARC controlfield,MARC extra selector,Liste,Label
25,505 8 0 $$a >505 8 0 $$t,,,Inhalt
23,700 1 _ $$a,$$4 dte,;,Widmungsempfänger
22,700 1 _ $$a ; 700 1 _ $$a,$$4 egr ; $$4 ill,;,Illustratoren


## Build Extractors
Create one extractor for each mapping entry.

In [3]:
# Maps each mapping row's fourth field (used as the output column name) to
# the extractor built from that row's first three fields.
column_extractors = OrderedDict()
for _, row in mapping.iterrows():
    # Each row is a Series labelled by the CSV column names, so plain
    # row[0] only works through pandas' deprecated integer-position
    # fallback. .iloc states the positional access explicitly.
    column_extractors[row.iloc[3]] = marc_extractor.build_extractor(
        row.iloc[0], row.iloc[1], row.iloc[2]
    )

Initializing SingleCommand for pattern "009"
Initializing SingleCommand for pattern "001"
Initializing SingleCommand for pattern "856 4 0 $$u"
Initializing SingleCommand for pattern "100 1 _ $$a"
Initializing SingleCommand for pattern "100 1 _ $$0"
Initializing SingleCommand for pattern "240 1 0 $$a"
Initializing SingleCommand for pattern "130 0 _ $$a"
Initializing SingleCommand for pattern "490 1 _ $$a"
Initializing SingleCommand for pattern "490 1 _ $$v"
Initializing SingleCommand for pattern "245 0 0 $$a"
Initializing SingleCommand for pattern "245 0 0 $$b"
Initializing SingleCommand for pattern "245 0 0 $$n"
Initializing SingleCommand for pattern "245 0 0 $$p"
Initializing SingleCommand for pattern "264 _ 1 $$a"
Initializing SingleCommand for pattern "751 _ _ $$0"
   collecting all results separated by ";"
Initializing SingleCommand for pattern "264 _ 1 $$b"
Initializing SingleCommand for pattern "700 1 _ $$a"
   with selector $$4 pbl
  Initializing SelectorPredicate for selector "

## Prepare Postprocessing
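- split the combined notes column (*col\_to\_split*) into its four separate columns
- remove `;` from *Signatur*
- add a *Barcode* column derived from the *Volltext* URL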

In [4]:
# Separator on which post() splits the values of the combined column.
split_char = ' ;;; '
# Name of the combined column that post() splits apart and then drops.
col_to_split = 'Anm. zu Illustrationen ;;;\xa0Anm. zu Kollation ;;;\xa0Bibliografie ;;;\xa0Anmerkungen'
# Column names for the split-off parts, keyed by their position in the split.
column_replacement = dict(enumerate([
    'Anm. zu Illustrationen',
    'Anm. zu Kollation',
    'Bibliografie',
    'Anmerkungen',
]))

def post(df):
    """Post-process the table of extracted catalogue data.

    Splits the combined column named by ``col_to_split`` on ``split_char``
    into separate columns (renamed via ``column_replacement``), drops the
    combined column, removes every ``;`` from ``Signatur`` and adds a
    ``Barcode`` column built from the ``Volltext`` URL.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``col_to_split``, ``Signatur`` and
        ``Volltext``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is left unchanged.
    """
    split_columns = (
        df[col_to_split]
        .str.split(split_char, expand=True)
        .rename(columns=column_replacement)
    )
    df_out = pd.concat([df, split_columns], axis='columns')
    df_out = df_out.drop(labels=[col_to_split], axis='columns').copy()
    # ';' is meant literally; pass regex=False explicitly because the
    # default of ``regex`` differs between pandas versions.
    df_out['Signatur'] = df_out['Signatur'].str.replace(';', '', regex=False)
    # expand=False returns a Series for the single capture group (the default
    # returns a one-column DataFrame), which is the natural right-hand side
    # for a single-column assignment. Rows whose URL does not match stay
    # missing.
    barcode_suffix = df_out['Volltext'].str.extract(
        r'http://data.onb.ac.at/ABO/%2BZ(.*)', expand=False
    )
    df_out['Barcode'] = '+Z' + barcode_suffix
    return df_out

## Prepare Extraction

In [5]:
import datetime
import sys
import re

def now():
    """Return the current local time as a string suitable for file names.

    The result is the ISO 8601 timestamp truncated to whole seconds with the
    colons removed, i.e. ``YYYY-MM-DDTHHMMSS``.
    """
    current = datetime.datetime.now()
    # timespec='seconds' leaves out the fractional part entirely.
    return current.isoformat(timespec='seconds').replace(':', '')

def ac_to_dict(ac):
    """Fetch the MARC record for one AC number and extract all columns.

    Parameters
    ----------
    ac
        AC number passed to ``RecordRetriever.by_marc_009``.

    Returns
    -------
    OrderedDict
        One entry per key of ``column_extractors``, in the same order. Each
        value is the result of that extractor's ``parse`` on the retrieved
        record, or ``None`` for every column when ``almasru.NoRecord`` is
        raised (a message is then printed to stderr).
    """
    try:
        print(f' {ac} processing')
        retriever = almasru.RecordRetriever('obv-at-oenb', '43ACC_ONB', 'marcxml')
        marc_xml = retriever.by_marc_009(ac)
    except almasru.NoRecord:
        print(f'No record for AC number {ac} found.', file=sys.stderr)
        # Keep the row in the output, with every column empty.
        return OrderedDict.fromkeys(column_extractors)
    return OrderedDict(
        (column, extractor.parse(marc_xml))
        for column, extractor in column_extractors.items()
    )

def ac_list_to_excel(ac_list, excel_file_name_stem):
    """Extract the data for every AC number and write it to an Excel file.

    Each entry of ``ac_list`` is converted with ``ac_to_dict``; the resulting
    table is passed through ``post`` and saved as
    ``'<excel_file_name_stem> <timestamp>.xlsx'``, where the timestamp comes
    from ``now()``.
    """
    records = list(map(ac_to_dict, ac_list))
    enriched = post(pd.DataFrame(records))
    # The timestamp is taken only once the table is ready to be written.
    enriched.to_excel(f'{excel_file_name_stem} {now()}.xlsx')

# An AC number: the literal "AC" followed by seven or eight digits.
ac_pattern = re.compile(r'(AC\d{7,8})')
def load_ac_list(file_name):
    """Read the AC numbers from the ``Datensatznummer`` column of an Excel file.

    Parameters
    ----------
    file_name
        Path of the Excel file; passed to ``pandas.read_excel``.

    Returns
    -------
    pandas.Series
        For every row, the first AC number found in ``Datensatznummer``.

    Raises
    ------
    ValueError
        If a cell is not a string or contains no AC number. The message
        names the offending value and the file, instead of the bare
        ``IndexError``/``TypeError`` a failed lookup would otherwise cause.
    """
    record_numbers = pd.read_excel(file_name)['Datensatznummer']

    def first_ac_number(value):
        # Non-string cells (e.g. NaN from an empty cell) cannot be searched.
        match = ac_pattern.search(value) if isinstance(value, str) else None
        if match is None:
            raise ValueError(
                f'No AC number found in Datensatznummer value {value!r} '
                f'of {file_name!r}'
            )
        return match.group(1)

    return record_numbers.apply(first_ac_number)

## Extraction
1. Load the AC numbers from the input Excel file (*load\_ac\_list*).
2. Enrich them with catalogue data and write the result to a new Excel file (*ac\_list\_to\_excel*).


In [6]:
# Read the AC numbers from the input workbook, then write the enriched workbook.
japan_ac_numbers = load_ac_list('TravelogueD17_Japan.xlsx')
ac_list_to_excel(japan_ac_numbers, 'TravelogueD17_Japan_Enriched')

 AC05861671 processing
 AC13796079 processing
 AC14473154 processing
 AC15257577 processing
 AC15353248 processing
 AC07530148 processing
 AC10134690 processing
 AC10356133 processing
 AC10472044 processing
