## Extract poems based on filename

In [1]:
import re
from pathlib import Path

dirname = "old/20150526_nbdig_txt01/idetfrikorpus-dist-txt"
p = Path(dirname)

# List the corpus directory once and reuse the listing for both filters.
entries = list(p.iterdir())
txts = [e for e in entries if e.suffix == ".txt"]
dirs = [e for e in entries if e.is_dir()]

# Books may also sit one level down in sub-directories. Apply the same ".txt"
# filter there so that stray non-text entries are not counted as books.
txts += [t for d in dirs for t in d.iterdir() if t.suffix == ".txt"]

print(f"Number of books in corpus {len(txts)} (but website says 26.344)")

# Filename keywords: digt/dikt, lyrik, salme, sang (first letter in either case).
# "[gk]" is the intended class; "[g|k]" would also accept a literal "|",
# because "|" is not an alternation operator inside a character class.
pattern = r".*[dD]i[gk]t.*|.*[lL]yrik.*|.*[Ss]alme.*|.*[Ss]ang.*"

poems = [t for t in txts if re.match(pattern, t.name)]

print(f"# of books where filename matches: {len(poems)}")

def get_publication_year_and_sort(poems):
    """Return ``(filename, year)`` pairs for ``poems``, sorted by ascending year.

    The year is read from the filename: the text after ``"digibok"`` is split
    on ``"-"`` and the second field is taken. An empty year field is mapped
    to ``0``, so such books sort first.

    Args:
        poems: a list of objects with a ``name`` attribute (e.g. ``Path``).

    Returns:
        A list of ``(name, year)`` tuples. The sort is stable, so books from
        the same year keep their input order.
    """
    def _year_of(filename):
        year_field = filename.split("digibok")[1].split("-")[1]
        return int(year_field) if year_field else 0

    pairs = [(poem.name, _year_of(poem.name)) for poem in poems]
    return sorted(pairs, key=lambda pair: pair[1])

name_y = get_publication_year_and_sort(poems)
# Years in sorted order, kept as a tuple for inspection.
y = tuple(year for _, year in name_y)

# Filter on the year value itself. Looking up the slice start with
# y.index(1900) raises ValueError whenever no book in the selection was
# published in exactly that year. The cut-off year itself is included.
newer_books = [(name, year) for name, year in name_y if year >= 1900]
print(f"# of books where filename matches and newer than 1900: {len(newer_books)}")

even_newer_books = [(name, year) for name, year in name_y if year >= 1930]
print(f"# of books where filename matches and newer than 1930: {len(even_newer_books)}")

Number of books in corpus 21483 (but website says 26.344)
# of books where filename matches: 593
# of books where filename matches and newer than 1900: 181
# of books where filename matches and newer than 1930: 57


## Same but without unpacking corpus (tarfile)

In [2]:
import tarfile
import re

# Same keyword pattern as for the unpacked corpus. "[gk]" is the intended
# class; "[g|k]" would also accept a literal "|".
pattern = r".*[dD]i[gk]t.*|.*[lL]yrik.*|.*[Ss]alme.*|.*[Ss]ang.*"

total_files = 0
matching_files = 0

# The context manager closes the archive even if iteration fails.
with tarfile.open('20150526_nbdig_txt01.tar.gz') as tar:
    for file in tar:
        # Only regular files count; directory members are skipped.
        if file.isfile():
            total_files += 1
            # NOTE(review): file.name is the member's full path inside the
            # archive, so a keyword in a directory name would match too.
            if re.match(pattern, file.name):
                matching_files += 1

print(f"Total number of files: {total_files}\nNumber of files that match with pattern {matching_files}")

Total number of files: 21483
Number of files that match with pattern 593


## Find out for how many of the 593 the title (not just the filename) matches

In [3]:
import json 
import requests

def get_code(filename):
    """Extract the book code from a corpus filename.

    The code is the text between the first underscore and the following
    ``"-"``, e.g. ``"digibok_2008100603009-1851-..."`` -> ``"2008100603009"``.
    Raises ``IndexError`` if the filename contains no underscore.
    """
    after_prefix = filename.split("_", 2)[1]
    code = after_prefix.split("-", 1)[0]
    return code

def get_metadata(codes):
    """Fetch the catalogue metadata for each book code from the api.nb.no API.

    Args:
        codes: iterable of book code strings; each one is appended to the
            item URL.

    Returns:
        dict mapping each requested code to the ``"metadata"`` object of the
        JSON response.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
        requests.Timeout: if a request takes longer than the timeout.
    """
    metadata = {}
    base_api = "https://api.nb.no/catalog/v1/items/URN:NBN:no-nb_digibok_"
    # A single session reuses the HTTP connection across all lookups.
    with requests.Session() as session:
        for code in codes:
            # Without a timeout one stalled request would hang the whole run.
            response = session.get(base_api + code, timeout=30)
            # Fail on HTTP errors here rather than with a confusing
            # JSON/KeyError on an error page below.
            response.raise_for_status()
            metadata[code] = response.json()["metadata"]
    return metadata

# # uncomment this to call the API (commented out to avoid running it unnecessarily many times)
# codes = [get_code(poem.name) for poem in poems]
# metadata = get_metadata(codes)

# with open("metadata_593.json", "w+") as f:
#     json.dump(metadata, f)

In [4]:
import pandas as pd

def metadata_to_df(metadata):
    """Flatten the per-book API metadata into a DataFrame, one row per book.

    Args:
        metadata: dict mapping the requested book code to its metadata dict.
            Each metadata dict must provide ``languages[0]["code"]``,
            ``originInfo["issued"]``, ``title`` and ``identifiers["urn"]``.

    Returns:
        DataFrame with columns ``year``, ``author``, ``title``,
        ``filename code``, ``api code`` and ``lang``. ``author`` is taken from
        the first ``people`` entry, else from the first
        ``statementOfResponsibility`` entry, else it is ``None``.

    Side effects:
        Prints a line for every book whose URN code differs from the
        requested code.
    """
    rows = []
    for code, data in metadata.items():
        lang = data["languages"][0]["code"]
        year = int(data["originInfo"]["issued"])
        title = data["title"]

        # The code is the last "_"-separated part of the URN.
        data_code = data["identifiers"]["urn"].split("_")[-1]
        if code != data_code:
            print(f"Received code: {code}, metadata code: {data_code}")

        # Reset per record. Otherwise a book with neither field would inherit
        # the previous row's author, or raise UnboundLocalError if it came first.
        author = None
        if "people" in data:
            author = data["people"][0]["name"]
        elif "statementOfResponsibility" in data:
            author = data["statementOfResponsibility"][0]
        rows.append({"year": year, "author": author, "title": title, "filename code": code, "api code": data_code, "lang": lang})
    return pd.DataFrame(rows)

# Read the cached API responses from disk and flatten them into a frame.
metadata = json.loads(Path("metadata_593.json").read_text())

df_593 = metadata_to_df(metadata)

Received code: 2008042303004, metadata code: 2021120826007
Received code: 2009013010001, metadata code: 2010050510003
Received code: 2012011012011, metadata code: 2012011712004
Received code: 2008102303002, metadata code: 2015071427011
Received code: 2012011012013, metadata code: 2012011712005
Received code: 2006111600061, metadata code: 2017011248051
Received code: 2008010704054, metadata code: 2014102306006
Received code: 2012101224024, metadata code: 2012101224027
Received code: 2009040212006, metadata code: 2009040212008
Received code: 2007121410004, metadata code: 2008040800024
Received code: 2012030212009, metadata code: 2013032526002
Received code: 2008071410004, metadata code: 2014071728003
Received code: 2008071610001, metadata code: 2019120928001
Received code: 2009051513004, metadata code: 2011032412001
Received code: 2013050224034, metadata code: 2013050224102
Received code: 2010040912003, metadata code: 2017020748579
Received code: 2010041612002, metadata code: 20100416120

In [5]:
df_593

Unnamed: 0,year,author,title,filename code,api code,lang
0,1830,"Guldberg, O. Høegh (Ove)","Psalmebog, eller En Samling af gamle og nye Ps...",2010063003020,2010063003020,dan
1,1869,"Guldberg, O. Høegh (Ove)",Fra nordiske Digtere : et Album,2006112101006,2006112101006,dan
2,1862,"Barfod, Frederik","Ingemanns Jordefærd : Taler, Digte og Bladarti...",2009090403002,2009090403002,dan
3,1856,"Barfod, Frederik",Guds Man : en Psalme-Bog,2006111600006,2006111600006,nob
4,1834,"Grøndahl, Chr. (Christopher)","Evangelisk-christelig Psalmebog, tilligemed Co...",2008051403001,2008051403001,dan
...,...,...,...,...,...,...
588,1925,"Grieg, Nordahl",Stene i strømmen : digte,2013070308043,2013070308043,nob
589,1947,"Grieg, Nordahl",Samlede dikt,2012082408040,2012082408040,nob
590,1950,"Grieg, Nordahl",Samlede dikt,2011102708060,2011102708060,nob
591,1912,"Munthe, Margrethe",Aase fiskerpike : eventyrkomedie med sange til...,2013073008139,2013073008139,nob


## Count files where codes differ and compare titles

In [18]:
api_codes = set(df_593["api code"])
filename_codes = set(df_593["filename code"])

# d1: codes seen only in the API answers; d2: codes seen only in filenames.
d1 = api_codes.difference(filename_codes)
d2 = filename_codes.difference(api_codes)
len(d1), len(d2)

(38, 38)

In [15]:
# Filenames whose code does not appear among the API codes.
diff_code_poems = [e.name for e in poems if get_code(e.name) in d2]

poem_codes, poem_titles, poem_authors = [], [], []
for filename in diff_code_poems:
    fields = filename.split("-")
    poem_codes.append(get_code(filename))
    # Field 5 is the title: drop the 4-character extension, underscores -> spaces.
    poem_titles.append(fields[5][:-4].replace("_", " "))
    # Field 4 is the author part of the filename.
    poem_authors.append(fields[4])

check1_df = pd.DataFrame({"title":poem_titles, "author":poem_authors, "filename code": poem_codes })
#check1_df

In [21]:
# Rows whose API code does not appear among the filename codes.
api_only_mask = df_593["api code"].isin(d1)
check2_df = df_593.loc[api_only_mask]
#check2_df

Looks like they are the same!

In [22]:
# Pair the two frames on the filename code instead of by row position.
# check1_df follows the order of `poems` (a directory listing), while
# check2_df follows the order of df_593 (the cached metadata), so zipping
# them positionally is only correct if both happen to be in the same order.
paired_titles = check1_df.merge(
    check2_df, on="filename code", suffixes=("_filename", "_api")
)
for (title1, title2) in zip(paired_titles["title_filename"], paired_titles["title_api"]):
    print(f"{title1}\n{title2}\n")

Psalme Skat et Udvalg af gamle Psalmer uddrag
Psalme-Skat : Et Udvalg af gamle Psalmer, uddraget af Kingos, Pontoppidans og Brorsons Samlinger

En Ny Danske oc Norske fuldkommen Psalme Bog 
En Ny Danske oc Norske fuldkommen Psalme-Bog : hvor udi findis et Tusinde Psalmer oc Sange, hvilcke med største Flijd ere samlede af Mester Hans Thomesøns, samt alleandre fornemste gamle oc beste nye, tryckte Danske aandeligePsalme-Bøger

Melodier til Sangene i Lesebog for Folkeskole
Melodier til Sangene i Læsebog for Folkeskolen og Folkehjemmet : udsatte for Sopran, Alt og Bass, tilligemed en Tenor efter Behag, : for Skoler og Sangforeninger : 1. Skoletrin

Evangelisk christelig Psalmebog tilligemed Co
Evangelisk-christelig Psalmebog, tilligemed Collecter, Epistler og Evangelier, Vor Herres Jesu Christi Lidelses-Historie, Kirke- samt andre Bønner, til Brug ved Kirke- og Huus-Andagt

Melodier til Sangene i Lesebog for Folkeskole
Melodier til Sangene i Læsebog for Folkeskolen og Folkehjemmet : udsatt

## Only match on title instead of filename
6 books did not match on the title, only on the author name

In [25]:
# Titles (from the API metadata) that match the keyword pattern.
title_matcher = re.compile(pattern)
actual_poems = [title for title in df_593["title"] if title_matcher.match(title)]

In [26]:
# Keep only the books whose title matched the pattern.
title_match_mask = df_593["title"].isin(actual_poems)
df_ = df_593.loc[title_match_mask]
df_

Unnamed: 0,year,author,title,filename code,api code,lang
0,1830,"Guldberg, O. Høegh (Ove)","Psalmebog, eller En Samling af gamle og nye Ps...",2010063003020,2010063003020,dan
1,1869,"Guldberg, O. Høegh (Ove)",Fra nordiske Digtere : et Album,2006112101006,2006112101006,dan
2,1862,"Barfod, Frederik","Ingemanns Jordefærd : Taler, Digte og Bladarti...",2009090403002,2009090403002,dan
3,1856,"Barfod, Frederik",Guds Man : en Psalme-Bog,2006111600006,2006111600006,nob
4,1834,"Grøndahl, Chr. (Christopher)","Evangelisk-christelig Psalmebog, tilligemed Co...",2008051403001,2008051403001,dan
...,...,...,...,...,...,...
588,1925,"Grieg, Nordahl",Stene i strømmen : digte,2013070308043,2013070308043,nob
589,1947,"Grieg, Nordahl",Samlede dikt,2012082408040,2012082408040,nob
590,1950,"Grieg, Nordahl",Samlede dikt,2011102708060,2011102708060,nob
591,1912,"Munthe, Margrethe",Aase fiskerpike : eventyrkomedie med sange til...,2013073008139,2013073008139,nob


## Delete unwanted files

In [27]:
# Books that matched on filename only, not on title.
title_match_mask = df_593["title"].isin(actual_poems)
files_to_delete = df_593.loc[~title_match_mask]
files_to_delete

Unnamed: 0,year,author,title,filename code,api code,lang
17,1860,"Ahnfelt, Oscar","Melodierna i sifferskrift till ""Andeliga sånger""",2009101303013,2009101303013,swe
37,1786,"Bach, Johann Sebastian",Vierstimmige alte und neue Choralgesänge mit P...,2012012424002,2012012424002,ger
41,1814,"Sohm, Peter.","Sång på Årsdagen af Bataillen vid Leipzig, den...",2013062726004,2013062726004,swe
256,1861,"Runeberg, Johan Ludvig",Julqvällen : tre sånger,2009090203012,2009090203012,swe
512,1966,"Jøsang, Ottar",Geologiske og petrografiske undersøkelser i Mo...,2007042001008,2007042001008,nob
548,1814,"Valerius, Johan David",Förenings-sång,2011112224037,2011112224037,mul


In [48]:
p = Path("old/dikt-idetfri/")
n = Path("old/not_poetry/")
# rename() does not create missing directories, so make sure the target exists.
n.mkdir(parents=True, exist_ok=True)
# A set gives constant-time membership tests inside the loop.
l = set(files_to_delete["filename code"])
# Snapshot the listing first: entries are moved out of `p` during the loop.
for f in list(p.iterdir()):
    code = get_code(f.name)
    if code in l:
        f.rename(n / f.name)
print(len(list(p.iterdir())))

587


## Remove poems that are not Norwegian by filename

Of the 593 poems selected on filename, 452 have "nob", "nno" or "nor" in the filename.
That means 141 are in other languages. However, several of the books tagged as Norwegian in the filename are listed as Danish in the dataframe!

In [128]:
# The language tag is the third "-"-separated field of the filename.
langs = []
norw = []
for poem in poems:
    filename = poem.name
    lang_tag = filename.split("-")[2]
    langs.append((filename, lang_tag))
    if lang_tag in ("nob", "nno", "nor"):
        norw.append((filename, lang_tag))
len(norw)

452

In [186]:
# Codes of the books whose filename carries a Norwegian language tag.
norw_codes = [get_code(name) for name, _ in norw]
norw_filename_mask = df_593["filename code"].isin(norw_codes)
norw_df = df_593.loc[norw_filename_mask]
norw_df

Unnamed: 0,year,author,title,filename code,api code,lang
0,1830,"Guldberg, O. Høegh (Ove)","Psalmebog, eller En Samling af gamle og nye Ps...",2010063003020,2010063003020,dan
3,1856,"Barfod, Frederik",Guds Man : en Psalme-Bog,2006111600006,2006111600006,nob
4,1834,"Grøndahl, Chr. (Christopher)","Evangelisk-christelig Psalmebog, tilligemed Co...",2008051403001,2008051403001,dan
5,1852,"Fearnley, Henry Emil",Smaadigte til Udenadslæsning : til Brug ved de...,2010031003059,2010031003059,nob
6,1852,"Hauge, A. Andreas","100 Missions-Psalmer ; med et Tillæg, indehold...",2006111400042,2006111400042,nob
...,...,...,...,...,...,...
588,1925,"Grieg, Nordahl",Stene i strømmen : digte,2013070308043,2013070308043,nob
589,1947,"Grieg, Nordahl",Samlede dikt,2012082408040,2012082408040,nob
590,1950,"Grieg, Nordahl",Samlede dikt,2011102708060,2011102708060,nob
591,1912,"Munthe, Margrethe",Aase fiskerpike : eventyrkomedie med sange til...,2013073008139,2013073008139,nob


If we also remove the ones that don't have a title match we get 451 books

In [202]:
norw_df = df_.loc[df_["filename code"].isin(norw_codes)]
norw_df

Unnamed: 0,year,author,title,filename code,api code,lang
0,1830,"Guldberg, O. Høegh (Ove)","Psalmebog, eller En Samling af gamle og nye Ps...",2010063003020,2010063003020,dan
3,1856,"Barfod, Frederik",Guds Man : en Psalme-Bog,2006111600006,2006111600006,nob
4,1834,"Grøndahl, Chr. (Christopher)","Evangelisk-christelig Psalmebog, tilligemed Co...",2008051403001,2008051403001,dan
5,1852,"Fearnley, Henry Emil",Smaadigte til Udenadslæsning : til Brug ved de...,2010031003059,2010031003059,nob
6,1852,"Hauge, A. Andreas","100 Missions-Psalmer ; med et Tillæg, indehold...",2006111400042,2006111400042,nob
...,...,...,...,...,...,...
588,1925,"Grieg, Nordahl",Stene i strømmen : digte,2013070308043,2013070308043,nob
589,1947,"Grieg, Nordahl",Samlede dikt,2012082408040,2012082408040,nob
590,1950,"Grieg, Nordahl",Samlede dikt,2011102708060,2011102708060,nob
591,1912,"Munthe, Margrethe",Aase fiskerpike : eventyrkomedie med sange til...,2013073008139,2013073008139,nob


Removing all books that are not listed as nob, nno or nor in the metadata. These are 46 books.

In [218]:
not_norw = norw_df.loc[~norw_df["lang"].isin(("nob", "nno", "nor"))]
print(len(not_norw))
not_norw

46


Unnamed: 0,year,author,title,filename code,api code,lang
0,1830,"Guldberg, O. Høegh (Ove)","Psalmebog, eller En Samling af gamle og nye Ps...",2010063003020,2010063003020,dan
4,1834,"Grøndahl, Chr. (Christopher)","Evangelisk-christelig Psalmebog, tilligemed Co...",2008051403001,2008051403001,dan
8,1903,"Collin, Chr.",Engelske digte og sange,2012052505023,2012052505023,mul
9,1922,"Hægstad, Marius",Norsk diktning efter 1814,2006120101048,2006120101048,mul
12,1843,"Cappelen, Jørgen Wright","Evangelisk-christelig Psalmebog, tilligemed Co...",2008102303005,2008102303005,dan
14,1834,"Kingo, Thomas","Psalme-Skat : Et Udvalg af gamle Psalmer, uddr...",2008042303004,2021120826007,dan
34,1796,"Madsen, Johan",Zions Chor-Sange mod Verdens Synge-Chor. Der e...,2009031610002,2009031610002,dan
49,1848,"Cappelen, Jørgen Wright","Evangelisk-christelig Psalmebog, tilligemed Co...",2008102303002,2015071427011,dan
50,1799,"Brorson, Hans Adolph","De Sande Christnes udvalgte Psalmebog, uddrage...",2009030610001,2009030610001,dan
55,1844,"Cappelen, Jørgen Wright","Evangelisk-christelig Psalmebog, tilligemed Co...",2008102303001,2008102303001,dan


In the end, selecting by filename first, and then by language, yields 405 books

In [205]:
small_norw = norw_df.loc[norw_df["lang"].isin(("nob", "nno", "nor"))]
small_norw

Unnamed: 0,year,author,title,filename code,api code,lang
3,1856,"Barfod, Frederik",Guds Man : en Psalme-Bog,2006111600006,2006111600006,nob
5,1852,"Fearnley, Henry Emil",Smaadigte til Udenadslæsning : til Brug ved de...,2010031003059,2010031003059,nob
6,1852,"Hauge, A. Andreas","100 Missions-Psalmer ; med et Tillæg, indehold...",2006111400042,2006111400042,nob
10,1863,"Hægstad, Marius",Evangelisk-christelig Psalmebog : tilligemed C...,2008121003001,2008121003001,nob
13,1898,"Skard, Matias",Norsk Digtning efter 1814 : et Udvalg til Brug...,2010081103001,2010081103001,nob
...,...,...,...,...,...,...
588,1925,"Grieg, Nordahl",Stene i strømmen : digte,2013070308043,2013070308043,nob
589,1947,"Grieg, Nordahl",Samlede dikt,2012082408040,2012082408040,nob
590,1950,"Grieg, Nordahl",Samlede dikt,2011102708060,2011102708060,nob
591,1912,"Munthe, Margrethe",Aase fiskerpike : eventyrkomedie med sange til...,2013073008139,2013073008139,nob


But filtering df_ on the metadata language alone, from the beginning, yields 414!
This suggests that 9 of the books that were removed based on the filename are actually Norwegian according to the API.

In [53]:
bigger_norw = df_.loc[df_["lang"].isin(("nob", "nno", "nor"))]
bigger_norw

Unnamed: 0,year,author,title,filename code,api code,lang
3,1856,"Barfod, Frederik",Guds Man : en Psalme-Bog,2006111600006,2006111600006,nob
5,1852,"Fearnley, Henry Emil",Smaadigte til Udenadslæsning : til Brug ved de...,2010031003059,2010031003059,nob
6,1852,"Hauge, A. Andreas","100 Missions-Psalmer ; med et Tillæg, indehold...",2006111400042,2006111400042,nob
10,1863,"Hægstad, Marius",Evangelisk-christelig Psalmebog : tilligemed C...,2008121003001,2008121003001,nob
13,1898,"Skard, Matias",Norsk Digtning efter 1814 : et Udvalg til Brug...,2010081103001,2010081103001,nob
...,...,...,...,...,...,...
588,1925,"Grieg, Nordahl",Stene i strømmen : digte,2013070308043,2013070308043,nob
589,1947,"Grieg, Nordahl",Samlede dikt,2012082408040,2012082408040,nob
590,1950,"Grieg, Nordahl",Samlede dikt,2011102708060,2011102708060,nob
591,1912,"Munthe, Margrethe",Aase fiskerpike : eventyrkomedie med sange til...,2013073008139,2013073008139,nob


In [212]:
# Of the books in bigger_norw, which ones are not in small_norw?
bigger_codes = set(bigger_norw["filename code"])
diff_codes = bigger_codes.difference(small_norw["filename code"])

In [214]:
# (code, filename) for every selected file whose code is in diff_codes.
code_names = []
for poem in poems:
    poem_code = get_code(poem.name)
    if poem_code in diff_codes:
        code_names.append((poem_code, poem.name))
code_names

[('2008100603009',
  'digibok_2008100603009-1851-und-946--Psalmernes_Bog_oversat_efter_Foranstaltning_a.txt'),
 ('2009041603027',
  'digibok_2009041603027-1898-und-891--Udvalgte_digte_for_middelskolen_og_gymnasiet_.txt'),
 ('2011060804031',
  'digibok_2011060804031-1922-und-743-Lindeman_Ludv_M_-Koralbog_indeholdende_de_i_Landstads_Salmebog.txt'),
 ('2006112000053',
  'digibok_2006112000053-1905-mul-955-Lassen_Helene-Selma_Lagerlof_og_hendes_digtning_.txt'),
 ('2009080603025',
  'digibok_2009080603025-1862-und-948-Sinding_Otto-Skovstjerner_en_liden_Digtsamling.txt'),
 ('2013081206032',
  'digibok_2013081206032-1921-und-917-Bull_Jacob_B_-Digte_.txt'),
 ('2008032702005',
  'digibok_2008032702005-1928-und-462-Backe_Stein-Gamle_tufter_dikt.txt'),
 ('2008040404048',
  'digibok_2008040404048-1912-und-970-Eskeland_Lars-Stykke_til_diktat_og_attforteljing_samla_og_t.txt'),
 ('2012071208112',
  'digibok_2012071208112-1911-und-882-Janson_Kristofer-Digte_utgivne_paa_hans_syttiaarige_fodselsdag.txt'

In [54]:
keep_df = bigger_norw
keep_df

Unnamed: 0,year,author,title,filename code,api code,lang
3,1856,"Barfod, Frederik",Guds Man : en Psalme-Bog,2006111600006,2006111600006,nob
5,1852,"Fearnley, Henry Emil",Smaadigte til Udenadslæsning : til Brug ved de...,2010031003059,2010031003059,nob
6,1852,"Hauge, A. Andreas","100 Missions-Psalmer ; med et Tillæg, indehold...",2006111400042,2006111400042,nob
10,1863,"Hægstad, Marius",Evangelisk-christelig Psalmebog : tilligemed C...,2008121003001,2008121003001,nob
13,1898,"Skard, Matias",Norsk Digtning efter 1814 : et Udvalg til Brug...,2010081103001,2010081103001,nob
...,...,...,...,...,...,...
588,1925,"Grieg, Nordahl",Stene i strømmen : digte,2013070308043,2013070308043,nob
589,1947,"Grieg, Nordahl",Samlede dikt,2012082408040,2012082408040,nob
590,1950,"Grieg, Nordahl",Samlede dikt,2011102708060,2011102708060,nob
591,1912,"Munthe, Margrethe",Aase fiskerpike : eventyrkomedie med sange til...,2013073008139,2013073008139,nob


In [61]:
import numpy as np

years = [int(year) for year in keep_df["year"]]

def print_year_statistics(years):
    """Print median, mean, max, min and count of a sequence of years.

    Args:
        years: non-empty sequence of publication years (list or Series).
    """
    median_year = np.median(years)
    mean_year = np.mean(years)
    latest_year = np.max(years)
    earliest_year = np.min(years)
    n_books = len(years)
    print(f"""
    Median: {median_year}
    Mean: {mean_year}
    Max: {latest_year}
    Min: {earliest_year}
    # of books: {n_books}
    """)

print_year_statistics(years)


    Median: 1890.0
    Mean: 1886.9468599033817
    Max: 2005
    Min: 1790
    # of books: 414
    


## Delete unwanted files

In [49]:
files_to_delete = df_.loc[~df_["lang"].isin(("nob", "nno", "nor"))]
len(files_to_delete)

173

In [50]:
# Set the paths before the first count, so that count does not depend on
# whatever `p` was left pointing at by an earlier cell.
p = Path("old/dikt-idetfri/")
n = Path("old/not_norwegian/")
print(len(list(p.iterdir())))
# rename() does not create missing directories, so make sure the target exists.
n.mkdir(parents=True, exist_ok=True)
# A set gives constant-time membership tests inside the loop.
l = set(files_to_delete["filename code"])
# Snapshot the listing first: entries are moved out of `p` during the loop.
for f in list(p.iterdir()):
    code = get_code(f.name)
    if code in l:
        f.rename(n / f.name)
print(len(list(p.iterdir())))

587
414


## Count language distribution

In [100]:
def print_count_lang_dist(df):
    """Print the count and percentage of rows per Norwegian language code.

    Args:
        df: DataFrame with a ``lang`` column. Rows with the codes ``nob``,
            ``nno`` and ``nor`` are counted; percentages are relative to
            ``len(df)``.

    An empty frame is reported as 0 files / 0.0% for every code.
    """
    total = len(df)

    def _share(count):
        # An empty frame would otherwise raise ZeroDivisionError.
        return count / total * 100 if total else 0.0

    nob = df.loc[df["lang"] == "nob"]
    nno = df.loc[df["lang"] == "nno"]
    nor = df.loc[df["lang"] == "nor"]

    print(f"""
    Number of bokmål files {len(nob)}\tor {_share(len(nob))}%
    Number of nynorsk files {len(nno)}\tor {_share(len(nno))}%
    Number of norsk files {len(nor)}\tor {_share(len(nor))}%

    """)
    
print_count_lang_dist(keep_df)


    Number of bokmål files 380	or 91.78743961352657%
    Number of nynorsk files 30	or 7.246376811594203%
    Number of norsk files 4	or 0.966183574879227%

    


## Get metadata for first batch
This batch was selected by Språkbanken. Because we got it before we had sorted out irrelevant books, the number of received books is higher than the number we use in our data set.

In [59]:
p = Path("old/ny-ocr-dikt-idetfri/")

# The book code is the entry name without its last four characters.
codes = []
for entry in p.iterdir():
    codes.append(entry.name[:-4])
print(len(codes))

# Kept books that are part of this first batch.
in_first_batch = keep_df["filename code"].isin(codes)
fb_df = keep_df.loc[in_first_batch]
fb_df

265


Unnamed: 0,year,author,title,filename code,api code,lang
3,1856,"Barfod, Frederik",Guds Man : en Psalme-Bog,2006111600006,2006111600006,nob
5,1852,"Fearnley, Henry Emil",Smaadigte til Udenadslæsning : til Brug ved de...,2010031003059,2010031003059,nob
6,1852,"Hauge, A. Andreas","100 Missions-Psalmer ; med et Tillæg, indehold...",2006111400042,2006111400042,nob
10,1863,"Hægstad, Marius",Evangelisk-christelig Psalmebog : tilligemed C...,2008121003001,2008121003001,nob
13,1898,"Skard, Matias",Norsk Digtning efter 1814 : et Udvalg til Brug...,2010081103001,2010081103001,nob
...,...,...,...,...,...,...
575,1894,"Nilsen, Karen",Gransus : nye Digte til beslægtede Sjæle,2013061308130,2013061308130,nob
576,1859,"Sundt, Johan L.",Christkirken i Nidaros : digtning,2009082103026,2009082103026,nob
579,1866,"Blom, P.","Aandeligt Strengespil : en Samling Sange, Sukk...",2009090203007,2009090203007,nob
581,1854,"Bentsen, H.",Lyriske Smaadigte : en Prøvesamling,2009021812003,2009021812003,nob


In [62]:
years = [int(year) for year in fb_df["year"]]
print_year_statistics(years)


    Median: 1875.0
    Mean: 1875.2072538860104
    Max: 1905
    Min: 1814
    # of books: 193
    


## Delete unwanted files 
(We received re-OCR for some files we didn't need)

In [63]:
# Codes received in the first batch that are not among the kept books.
received_codes = set(codes)
diff_ = received_codes.difference(fb_df["filename code"])
print(f"We received {len(diff_)} books we didn't need")

not_needed_mask = df_593["filename code"].isin(diff_)
diff_df = df_593.loc[not_needed_mask]
diff_df

We received 72 books we didn't need (and one is missing from the df)
72


Unnamed: 0,year,author,title,filename code,api code,lang
0,1830,"Guldberg, O. Høegh (Ove)","Psalmebog, eller En Samling af gamle og nye Ps...",2010063003020,2010063003020,dan
1,1869,"Guldberg, O. Høegh (Ove)",Fra nordiske Digtere : et Album,2006112101006,2006112101006,dan
2,1862,"Barfod, Frederik","Ingemanns Jordefærd : Taler, Digte og Bladarti...",2009090403002,2009090403002,dan
4,1834,"Grøndahl, Chr. (Christopher)","Evangelisk-christelig Psalmebog, tilligemed Co...",2008051403001,2008051403001,dan
12,1843,"Cappelen, Jørgen Wright","Evangelisk-christelig Psalmebog, tilligemed Co...",2008102303005,2008102303005,dan
...,...,...,...,...,...,...
539,1862,"Wessel, Johan Herman",J.H. Wessels samlede Digte,2009092103009,2009092103009,dan
544,1855,"Staffeldt, A. W.",Digte,2010021003005,2010021003005,dan
549,1862,"Ploug, Carl",Samlede Digte,2009080703029,2009080703029,dan
556,1867,"Bergsøe, Vilhelm",I Ny og Næ : Digte,2009080703028,2009080703028,dan


In [64]:
p = Path("old/ny-ocr-dikt-idetfri/")
n = Path("old/ny-ocr-neitakk/")
# rename() does not create missing directories, so make sure the target exists.
n.mkdir(parents=True, exist_ok=True)

print(len(list(p.iterdir())))
# Snapshot the listing first: entries are moved out of `p` during the loop.
for f in list(p.iterdir()):
    # Same code extraction as used to build `codes`: name minus last 4 characters.
    code = f.name[:-4]
    if code in diff_:
        f.rename(n / f.name)
print(len(list(p.iterdir())))

265
193


## Sort on years and get all after 1900
Selecting books for batch 2 of re-OCR

In [66]:
after_1900 = keep_df.loc[keep_df["year"] >= 1900]
after_1900

Unnamed: 0,year,author,title,filename code,api code,lang
30,1920,"Klewe, Andreas","Religiøse korsange for sopran, alt, tenor og b...",2012052908151,2012052908151,nob
36,1900,"Brun, Yngvar",Europæisk Digtning,2010070206172,2010070206172,nob
46,1914,udgivet af Den Internasjonale bibelstudieforening,Norsk daggry-sangbog,2011051212004,2011051212004,nob
65,1901,"Aarrestad, Sven",Sangbog for afholdsforeninger,2012051612001,2012051612001,nob
68,1906,udgivne af Kirke- og undervisningsdepartementet,Fædrelandssange,2006120101031,2006120101031,nob
...,...,...,...,...,...,...
587,1946,"Grieg, Nordahl",Håbet : dikt,2013012806125,2013012806125,nob
588,1925,"Grieg, Nordahl",Stene i strømmen : digte,2013070308043,2013070308043,nob
589,1947,"Grieg, Nordahl",Samlede dikt,2012082408040,2012082408040,nob
590,1950,"Grieg, Nordahl",Samlede dikt,2011102708060,2011102708060,nob


## Count language distribution pre- and post-1900

In [103]:
before_1900 = keep_df.loc[keep_df["year"] < 1900]

print("AFTER 1900")
print_count_lang_dist(after_1900)

print("BEFORE 1900")
print_count_lang_dist(before_1900)

AFTER 1900

    Number of bokmål files 138	or 83.63636363636363%
    Number of nynorsk files 27	or 16.363636363636363%
    Number of norsk files 0	or 0.0%

    
BEFORE 1900

    Number of bokmål files 242	or 97.18875502008032%
    Number of nynorsk files 3	or 1.2048192771084338%
    Number of norsk files 4	or 1.6064257028112447%

    


## Subtract the ones we already have

In [70]:
# Books from 1900 onwards that were not already part of the first batch.
already_have = after_1900["filename code"].isin(fb_df["filename code"])
we_need = after_1900.loc[~already_have]
print(f"Of the {len(after_1900)} books we want, we already have {len(after_1900)-len(we_need)}")
we_need

Of the 165 books we want, we already have 17


Unnamed: 0,year,author,title,filename code,api code,lang
30,1920,"Klewe, Andreas","Religiøse korsange for sopran, alt, tenor og b...",2012052908151,2012052908151,nob
46,1914,udgivet af Den Internasjonale bibelstudieforening,Norsk daggry-sangbog,2011051212004,2011051212004,nob
68,1906,udgivne af Kirke- og undervisningsdepartementet,Fædrelandssange,2006120101031,2006120101031,nob
79,1925,"Støylen, Bernt",Nynorsk salmebok : for kyrkja og heim og møte,2010102606030,2010102606030,nno
84,1931,"Luther, Martin",Luthers lille katekisme med bibelord og salmevers,2009013000065,2009013000065,nob
...,...,...,...,...,...,...
587,1946,"Grieg, Nordahl",Håbet : dikt,2013012806125,2013012806125,nob
588,1925,"Grieg, Nordahl",Stene i strømmen : digte,2013070308043,2013070308043,nob
589,1947,"Grieg, Nordahl",Samlede dikt,2012082408040,2012082408040,nob
590,1950,"Grieg, Nordahl",Samlede dikt,2011102708060,2011102708060,nob


In [68]:
years = we_need["year"]
print_year_statistics(years)


    Median: 1920.5
    Mean: 1926.1756756756756
    Max: 2005
    Min: 1900
    # of books: 148
    


## Check diff we_need and actually received re-OCR
We received 138 of the books we wanted

In [69]:
from pathlib import Path

p = Path("reocr_hocr/digibok/")
pp = Path("reocr_hocr_batch_2/")

# The book code is the second "_"-separated part of each entry name.
first_delivery = {e.name.split("_")[1] for e in p.iterdir()}
rec_codes_2 = {e.name.split("_")[1] for e in pp.iterdir()}
rec_codes = first_delivery | rec_codes_2

received_mask = we_need["filename code"].isin(rec_codes)
rec_df = we_need.loc[received_mask]
print(f"Of the {len(we_need)} books published after 1900, we got {len(rec_df)}, meaning we miss {len(we_need)-len(rec_df)}.")
rec_df

Of the 148 books published after 1900, we got 138, meaning we miss 10.


Unnamed: 0,year,author,title,filename code,api code,lang
46,1914,udgivet af Den Internasjonale bibelstudieforening,Norsk daggry-sangbog,2011051212004,2011051212004,nob
79,1925,"Støylen, Bernt",Nynorsk salmebok : for kyrkja og heim og møte,2010102606030,2010102606030,nno
84,1931,"Luther, Martin",Luthers lille katekisme med bibelord og salmevers,2009013000065,2009013000065,nob
87,1914,"Jansen, Henrik",Bergen 1814-1914 : digte,2012022212004,2012022212004,nob
91,1926,"Støylen, Bernt",Utgreiding um nynorsk salmebok,2008022704011,2008022704011,nno
...,...,...,...,...,...,...
587,1946,"Grieg, Nordahl",Håbet : dikt,2013012806125,2013012806125,nob
588,1925,"Grieg, Nordahl",Stene i strømmen : digte,2013070308043,2013070308043,nob
589,1947,"Grieg, Nordahl",Samlede dikt,2012082408040,2012082408040,nob
590,1950,"Grieg, Nordahl",Samlede dikt,2011102708060,2011102708060,nob


In [107]:
rec_df.to_csv("batch2_info.tsv", sep="\t")

In [62]:
# Received codes that are not among the books we asked for.
diff = set(rec_codes).difference(rec_df["filename code"])
print(f"We got {len(diff)} books that we didn't need. (and one is missing from the df (??))")

unneeded_mask = df_593["filename code"].isin(diff)
diff_df = df_593.loc[unneeded_mask]
print(len(diff_df))
diff_df

We got 18 books that we didn't need. (and one is missing from the df (??))
17


Unnamed: 0,year,author,title,filename code,api code,lang
8,1903,"Collin, Chr.",Engelske digte og sange,2012052505023,2012052505023,mul
9,1922,"Hægstad, Marius",Norsk diktning efter 1814,2006120101048,2006120101048,mul
183,1946,"Aasen, Ivar",Dikting,2010061806033,2010061806033,mul
323,1911,"Munch, A.",Kongedatterens Brudefart : et Digt i tolv Roma...,2008040704116,2008040704116,dan
338,1907,"Nyrop, Kristoffer",Fortids sagn og sange. 2 : Den evige jøde,2010070205055,2010070205055,dan
339,1909,"Nyrop, Kristoffer",Fortids sagn og sange. 5 : Grevinden med de 36...,2011050604018,2011050604018,dan
340,1908,"Nyrop, Kristoffer",Fortids sagn og sange. 3 : Sangerens hjerte,2010052806041,2010052806041,dan
358,1915,"Baumann, Julius Berg",Fra vidderne : nye digte,2006120500055,2006120500055,mul
360,1922,"Oehlenschläger, Adam",Udvalgte Digte,2008040904047,2008040904047,dan
424,1957,"Jacobsen, J.P. (Jens Peter)",Dikt i utvalg,2007060701064,2007060701064,dan


In [71]:
years = rec_df["year"]
print_year_statistics(years)


    Median: 1920.5
    Mean: 1925.2173913043478
    Max: 1981
    Min: 1902
    # of books: 138
    


## Check if any of the books we already annotated are part of the ones we don't need
Luckily, this is not the case

In [63]:
p = Path("../../norwegian_rhyme_scheme_corpus/annotation_tool/poems/bokmål")
# The book code is the part of each entry name before the first underscore.
codes = {e.name.split("_")[0] for e in p.iterdir()}
annotated_but_unneeded = diff_df["api code"].isin(codes)
diff_df.loc[annotated_but_unneeded]

Unnamed: 0,year,author,title,filename code,api code,lang


In [64]:
# Annotated books that are among the ones we need.
annotated_mask = we_need["api code"].isin(codes)
hmm = we_need.loc[annotated_mask]
print(f"We annotated {len(codes)} books. Of those, {len(hmm)} are in the ones we need")
hmm

We annotated 11 books. Of those, 11 are the ones we need


Unnamed: 0,year,author,title,filename code,api code,lang
84,1931,"Luther, Martin",Luthers lille katekisme med bibelord og salmevers,2009013000065,2009013000065,nob
115,1944,"Welhaven, J.S. (Johan Sebastian)",Norges demring : og andre dikt,2012032024065,2012032024065,nob
190,1956,"Larsen, Alf",Utvalgte dikt,2008010704054,2014102306006,nob
386,1914,"Bjørnson, Bjørnstjerne",Digte og Sange,2012102608111,2012102608111,nob
405,1950,"Bjørnson, Bjørnstjerne",Dikte og sange : i utvalg,2006081000055,2006081000055,nob
446,1950,"Nilsen, Rudolf",Samlede dikt,2011102708082,2011102708082,nob
553,1941,"Zwilgmeyer, Dagfinn",Nordlandssanger : i herr Petters fotefar,2007011001018,2007011001018,nob
558,1934,"Holme, Edin",Dikterverker. 4 : Høvdinger ; Tonen fra himlen,2011041408047,2011041408047,nob
559,1934,"Holme, Edin",Dikterverker. 1 : Ildprofeten ; Menneskesønnen,2011042608056,2011042608056,nob
561,1934,"Holme, Edin",Dikterverker. 2 : Mannen fra Tarsus,2011041408091,2011041408091,nob


## Check that examples from thesis are actually in our selection

In [69]:
hustru = "2009022603004" 
voler = "2010030903071"
keep_df.loc[keep_df["api code"].isin([hustru, voler])]

Unnamed: 0,year,author,title,filename code,api code,lang
89,1893,"Aagaard, Gustav",Sjøluft : billeder i digt,2009022603004,2009022603004,nob
109,1857,"Heyerdahl, Halvor",Historiske Sange fra Norges Fortid,2010030903071,2010030903071,nob


In [70]:
fb_df.loc[fb_df["api code"].isin([hustru, voler])]

Unnamed: 0,year,author,title,filename code,api code,lang
89,1893,"Aagaard, Gustav",Sjøluft : billeder i digt,2009022603004,2009022603004,nob
109,1857,"Heyerdahl, Halvor",Historiske Sange fra Norges Fortid,2010030903071,2010030903071,nob


## Gather received re-OCR

In [92]:
# All books we hold re-OCR for: the second batch followed by the first batch.
re_ocr = pd.concat([rec_df, fb_df])

published_before_1900 = re_ocr["year"] < 1900
published_from_1900 = re_ocr["year"] >= 1900
pre_1900 = re_ocr.loc[published_before_1900]
post_1900 = re_ocr.loc[published_from_1900]

In [108]:
re_ocr.to_csv("re-ocr_info.tsv", sep="\t")

In [104]:
print("FULL re-OCR")
years = re_ocr["year"]
print_year_statistics(years)
print_count_lang_dist(re_ocr)

FULL re-OCR

    Median: 1897.0
    Mean: 1896.0574018126888
    Max: 1981
    Min: 1814
    # of books: 331
    

    Number of bokmål files 300	or 90.6344410876133%
    Number of nynorsk files 30	or 9.06344410876133%
    Number of norsk files 1	or 0.3021148036253776%

    


In [105]:
print("PRE-1900 re-OCR")
years = pre_1900["year"]
print_year_statistics(years)
print_count_lang_dist(pre_1900)

PRE-1900 re-OCR

    Median: 1871.5
    Mean: 1872.6761363636363
    Max: 1899
    Min: 1814
    # of books: 176
    

    Number of bokmål files 172	or 97.72727272727273%
    Number of nynorsk files 3	or 1.7045454545454544%
    Number of norsk files 1	or 0.5681818181818182%

    


In [106]:
print("POST-1900 re-OCR")
years = post_1900["year"]
print_year_statistics(years)
print_count_lang_dist(post_1900)

POST-1900 re-OCR

    Median: 1920.0
    Mean: 1922.6064516129031
    Max: 1981
    Min: 1900
    # of books: 155
    

    Number of bokmål files 128	or 82.58064516129032%
    Number of nynorsk files 27	or 17.419354838709676%
    Number of norsk files 0	or 0.0%

    
