In [1]:
from pathlib import Path

# Destination: the main hOCR directory. Source: the second re-OCR batch.
p = Path("reocr_hocr/")
pp = Path("reocr_hocr_batch_2/")

# Move every entry of the batch directory into the main directory, keeping
# its name unchanged.
for entry in pp.iterdir():
    entry.rename(p / entry.name)

In [4]:
def get_code(filename):
    """Return the second underscore-delimited field of ``filename``.

    Raises ``IndexError`` when ``filename`` contains no underscore.
    """
    # Only the first two separators matter, so stop splitting after them.
    fields = filename.split("_", 2)
    return fields[1]

# Distinct codes (as extracted by get_code) of the regular files directly under `p`.
unique_books = {get_code(entry.name) for entry in p.iterdir() if entry.is_file()}

In [28]:
# Sort every loose file under `p` into a per-book subdirectory named
# `digibok_<code>`, where <code> is what get_code extracts from the file name.
for f in p.iterdir():
    if not f.is_file():
        continue
    code = get_code(f.name)
    dp = p / f"digibok_{code}" / f.name
    # Path.rename does not create missing parent directories, so make sure the
    # book's directory exists before moving the file into it.
    dp.parent.mkdir(parents=True, exist_ok=True)
    f.rename(dp)


In [40]:
from pathlib import Path
import pandas as pd

# Directory whose entries are named `digibok_<code>`.
p = Path("reocr_batch_2/digibok/")

# Tab-separated table describing batch 2.
df = pd.read_csv("batch2_info.tsv", sep="\t")

# The code of each entry is whatever follows "digibok_" in its name.
codes = []
for entry in p.iterdir():
    codes.append(entry.name.split("digibok_")[1])


(156, 138)

In [88]:
import json 
import requests

def get_metadata(codes, timeout=30):
    """Fetch the catalogue metadata of each book code from the api.nb.no items endpoint.

    Args:
        codes: Iterable of digibok codes; each one is appended to the item URN URL.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        dict mapping each code to the ``"metadata"`` object of its JSON response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If a request exceeds ``timeout``.
        KeyError: If a response body has no ``"metadata"`` key.
    """
    metadata = {}
    base_api = "https://api.nb.no/catalog/v1/items/URN:NBN:no-nb_digibok_"
    # One session for the whole batch so the connection is reused between requests.
    with requests.Session() as session:
        for code in codes:
            response = session.get(f"{base_api}{code}", timeout=timeout)
            # Fail loudly on an HTTP error instead of letting it surface later
            # as a JSON-decoding or missing-key error.
            response.raise_for_status()
            metadata[code] = response.json()["metadata"]
    return metadata

def metadata_to_df(metadata):
    """Flatten the per-book metadata mapping into a DataFrame with one row per book.

    Args:
        metadata: dict mapping a filename code to that book's metadata dict.
            Each metadata dict must provide ``languages``, ``originInfo``,
            ``title`` and ``identifiers``; ``people`` and
            ``statementOfResponsibility`` are optional.

    Returns:
        pandas.DataFrame with the columns ``year``, ``author``, ``title``,
        ``filename code``, ``api code`` and ``lang``. ``author`` is missing
        (None) for a book that has neither ``people`` nor
        ``statementOfResponsibility``.

    Side effects:
        Prints one line for every book whose URN code differs from the code it
        was requested under.
    """
    rows = []
    for code, data in metadata.items():
        lang = data["languages"][0]["code"]
        year = int(data["originInfo"]["issued"])
        title = data["title"]

        # The code is the last underscore-separated field of the URN.
        data_code = data["identifiers"]["urn"].split("_")[-1]
        if code != data_code:
            print(f"Received code: {code}, metadata code: {data_code}")

        # Decide the author afresh for every record. Without an explicit
        # fallback, a book lacking both fields would reuse the previous book's
        # author (or fail with UnboundLocalError on the first record).
        people = data.get("people")
        statements = data.get("statementOfResponsibility")
        if people:
            author = people[0]["name"]
        elif statements:
            author = statements[0]
        else:
            author = None

        rows.append({"year": year, "author": author, "title": title, "filename code": code, "api code": data_code, "lang": lang})
    return pd.DataFrame(rows)


# metadata = get_metadata(codes)

# with open("metadata_b2.json", "w+") as f:
#     json.dump(metadata, f)

In [91]:
# Inspect the raw metadata mapping.
# NOTE(review): the get_metadata call and the JSON dump in the previous cell
# are commented out, so `metadata` must already exist in the kernel — confirm
# where it is loaded before relying on a fresh run.
metadata

{'2012082408040': {'title': 'Samlede dikt',
  'titleInfos': [{'title': 'Samlede dikt'}],
  'typeOfResource': 'text',
  'people': [{'name': 'Grieg, Nordahl',
    'date': '1902-1943',
    'roles': [{'name': 'cre', 'description': 'Opphavsmann'}],
    'usage': 'primary',
    'identifier': 'bibsys.no:authority:90057909'}],
  'geographic': {'placeString': 'Oslo', 'city': 'Oslo'},
  'originInfo': {'publisher': 'Gyldendal',
   'issued': '1947',
   'issuedUntouched': '1947',
   'firstIndexTime': '2021-11-22T09:03:32.272Z',
   'firstDigitalContentTime': '2016-01-21T22:24:46.403Z'},
  'recordInfo': {'identifier': '999314449264702202',
   'identifierSource': 'nb.bibsys.no',
   'created': '150420'},
  'classification': {'udc': ['839.6']},
  'identifiers': {'sesamId': '98cd556a3d5235732cb58a6eb3fdc889',
   'oaiId': 'oai:nb.bibsys.no:999314449264702202',
   'urn': 'URN:NBN:no-nb_digibok_2012082408040'},
  'notes': ['Elektronisk reproduksjon [Norge] Nasjonalbiblioteket Digital 2012-09-06'],
  'subject

In [73]:
# Build the per-book table from the metadata. metadata_to_df prints one
# "Received code ..." line for each book whose URN code differs from its key.
df2 = metadata_to_df(metadata)
df2

Received code: 2010051412002, metadata code: 2010060420002
Received code: 2010041612002, metadata code: 2010041612003
Received code: 2010040912004, metadata code: 2010041212002
Received code: 2007073010003, metadata code: 2011041820007
Received code: 2007121410004, metadata code: 2008040800024
Received code: 2006120701085, metadata code: 2007080110001


Unnamed: 0,year,author,title,filename code,api code,lang
0,1947,"Grieg, Nordahl",Samlede dikt,2012082408040,2012082408040,nob
1,1946,"Grieg, Nordahl",Håbet : dikt,2013012806125,2013012806125,nob
2,1904,"Ibsen, Henrik","Kæmpehøjen, dramatisk digtning i en akt ; og O...",2010082422026,2010082422026,nob
3,1921,"Welhaven, J.S. (Johan Sebastian)",Samlede digterverker. 3 : [Halvhundrede Digte],2007092102002,2007092102002,nob
4,1922,"Hægstad, Marius",Norsk diktning efter 1814,2006120101048,2006120101048,mul
...,...,...,...,...,...,...
151,1913,"Løchen, Arne",Digtning og videnskap,2006090100024,2006090100024,nob
152,1919,"Kinck, Hans E.",Stammens røst : fem italienske digtere fra vor...,2006082800020,2006082800020,nob
153,1926,"Eskeland, Lars",Ungdom : dikt i utval,2010053106046,2010053106046,nno
154,1943,"Welhaven, J.S. (Johan Sebastian)",Samlede digterverker. B. 2,2008041500052,2008041500052,nob


## Find the files we want to remove

In [78]:
# Cast the code column of both tables to pandas' string dtype so the codes of
# the two tables can be compared as text.
for frame in (df2, df):
    frame["filename code"] = frame["filename code"].astype("string")


0      2012082408040
1      2013012806125
2      2010082422026
3      2007092102002
4      2006120101048
           ...      
151    2006090100024
152    2006082800020
153    2010053106046
154    2008041500052
155    2011051604088
Name: filename code, Length: 156, dtype: string

In [87]:
# Codes that occur in df2 but not in df.
remove_these = set(df2["filename code"]).difference(set(df["filename code"]))
remove_these

{'2006120101048',
 '2006120500055',
 '2007060701064',
 '2008040704116',
 '2008040904047',
 '2008041100031',
 '2009072801107',
 '2010052806041',
 '2010053106058',
 '2010061806033',
 '2010070106216',
 '2010070205055',
 '2011050604018',
 '2011121308005',
 '2012011110001',
 '2012052505023',
 '2013061408118',
 '2014102306006'}

In [90]:
# Move every entry of `p` whose code is listed in `remove_these` into a
# separate holding directory.
pp = Path("iranai_hocr/")
# Path.rename does not create the destination's parent, so make sure the
# holding directory exists before anything is moved into it.
pp.mkdir(parents=True, exist_ok=True)
for x in p.iterdir():
    # The code is whatever follows "digibok_" in the entry name.
    code = x.name.split("digibok_")[1]
    if code in remove_these:
        dp = pp / x.name
        x.rename(dp)


In [52]:
# Display the table ordered by the "year" column; the result is only shown,
# not assigned back to `df`.
df.sort_values("year")

Unnamed: 0.1,Unnamed: 0,year,author,title,filename code,api code,lang
28,173,1902,"Sibbern, Georg",Efterladte Digte,2011102524009,2011102524009,nob
97,453,1904,"Vogt, Nils Collett",Fra Vaar til Høst : Digte,2008022904013,2008022904013,nob
53,294,1904,"Ibsen, Henrik","Kæmpehøjen, dramatisk digtning i en akt ; og O...",2010082422026,2010082422026,nob
54,296,1905,"Ibsen, Henrik",Brand : et dramatisk digt,2010051412002,2010060420002,nob
23,151,1905,"Hovden, Anders",Salme og Song,2008040201033,2008040201033,nno
...,...,...,...,...,...,...,...
20,146,1959,"Hovden, Anders",Dikt i utval,2012012308310,2012012308310,nno
36,235,1962,"Garborg, Hulda",Den store freden : dramatisk dikt,2008052004044,2008052004044,nno
95,451,1964,"Vogt, Nils Collett",Et liv i dikt,2010120606023,2010120606023,nob
105,494,1976,"Landstad, M.B.",M.B. Landstads Kirkesalmebok : revidert og for...,2013012206081,2013012206081,nob


In [64]:
# NOTE(review): this compares an integer literal with itself, so it is always
# True; it looks like a leftover scratch check — confirm it can be dropped.
2011102524009 == 2011102524009

True

{2006081000055,
 2006082800020,
 2006082800028,
 2006082800111,
 2006083000125,
 2006083100012,
 2006083100021,
 2006090100024,
 2006113000043,
 2006120101012,
 2006120401055,
 2006120501131,
 2006120701085,
 2006121301001,
 2007011001018,
 2007011701030,
 2007062904043,
 2007073010003,
 2007092102002,
 2007121410004,
 2008012203003,
 2008021403002,
 2008022704011,
 2008022904012,
 2008022904013,
 2008032702005,
 2008040200033,
 2008040200103,
 2008040201033,
 2008040204086,
 2008040404048,
 2008040704071,
 2008040800028,
 2008040800036,
 2008040804104,
 2008041004045,
 2008041104013,
 2008041400023,
 2008041401006,
 2008041500052,
 2008041500060,
 2008042104034,
 2008052004044,
 2008082104036,
 2009013000065,
 2009062204062,
 2009070601030,
 2009070601033,
 2009070601034,
 2009071403009,
 2009082500010,
 2009100610003,
 2009111012004,
 2010010713001,
 2010011113001,
 2010011213001,
 2010011313001,
 2010011413001,
 2010011512001,
 2010011512002,
 2010040812003,
 2010040912004,
 2010040

### TODO: 
Stille sammen data slik at vi har en all_data og en post_1900 med alle tekstfilene.  
Denne skal legges ut 

## Remove the unneeded books from the XML files

In [None]:
from pathlib import Path

# Count the entries under the batch-2 digibok directory and print the total.
p = Path("reocr_batch_2/digibok/")
i = 0  # stays 0 when the directory is empty
for i, x in enumerate(p.iterdir(), start=1):
    # Evaluated for every entry, so a name without "digibok_" raises IndexError.
    code = x.name.split("digibok_")[1]
print(i)