In [1]:
def extract_filename(path):
    """Return the identifier at the start of a file name.

    The result is whatever follows the last '/' in ``path``, cut at the first
    '.' and then at the first '_'.  For example
    '/a/b/0001246757_text.txt' yields '0001246757'.
    """
    basename = path.rpartition('/')[2]
    stem = basename.partition('.')[0]
    return stem.partition('_')[0]

In [2]:
import os
def getListOfFiles(dirName):
    """Recursively collect the paths of all non-directory entries below ``dirName``.

    Args:
        dirName: Directory to walk.

    Returns:
        A list of paths, each built by joining ``dirName`` (or one of its
        sub-directories) with the entry name.  Entries appear in the order
        ``os.listdir`` reports them; directories themselves are not listed.

    Raises:
        OSError: propagated from ``os.listdir`` (e.g. when ``dirName`` does
            not exist).
    """
    allFiles = []
    for entry in os.listdir(dirName):
        fullPath = os.path.join(dirName, entry)
        if os.path.isdir(fullPath):
            # Extend in place: concatenating with `+` would copy the whole
            # accumulated list once per sub-directory.
            allFiles.extend(getListOfFiles(fullPath))
        else:
            allFiles.append(fullPath)
    return allFiles

In [3]:
import pandas as pd


In [4]:
# Load the tab-separated file inventory (appears to hold one row per scanned image).
# NOTE(review): the 'filename' column is presumably parsed as a number here, which drops
# its leading zeros; recover_filename is used afterwards to pad it back — confirm the dtype.
df_files = pd.read_csv('archive_files_with_paths_and_descriptions.tsv',sep='\t')

In [5]:
def recover_filename(f_raw):
    """Return ``f_raw`` as a string, left-padded with '0' to at least 10 characters.

    Ten is the length of a full identifier such as '0001246757'.  Values whose
    string form is already that long (or longer) come back unchanged.
    """
    target_width = len('0001246757')
    return str(f_raw).rjust(target_width, '0')

In [6]:
# Replace every 'filename' value with its zero-padded string form, row by row.
df_files['filename'] = df_files.apply(
    lambda record: recover_filename(record['filename']),
    axis=1,
)

In [7]:
# Preview the first rows of the inventory.
df_files.head()

Unnamed: 0.1,Unnamed: 0,part_of.title,images.reference,filename,is_missing,path_to_image
0,1,"Correspondence, editorial office 'Goed Wonen' ...",..\images\jpg\757\0001246757.jpg,1246757,False,/ivi/ilps/projects/ArtDATIS/valkema/MMRKD07_00...
1,2,"Correspondence, editorial office 'Goed Wonen' ...",..\images\jpg\758\0001246758.jpg,1246758,False,/ivi/ilps/projects/ArtDATIS/valkema/MMRKD07_00...
2,3,"Correspondence, editorial office 'Goed Wonen' ...",..\images\jpg\759\0001246759.jpg,1246759,False,/ivi/ilps/projects/ArtDATIS/valkema/MMRKD07_00...
3,4,"Correspondence, editorial office 'Goed Wonen' ...",..\images\jpg\760\0001246760.jpg,1246760,False,/ivi/ilps/projects/ArtDATIS/valkema/MMRKD07_00...
4,5,"Correspondence, editorial office 'Goed Wonen' ...",..\images\jpg\761\0001246761.jpg,1246761,False,/ivi/ilps/projects/ArtDATIS/valkema/MMRKD07_00...


In [8]:
# TODO: classify typed vs. handwritten pages (or look up the labels of pages that were already classified)

In [9]:
# Two directories that are scanned below for the OCR output of typed pages
# (an older and a newer location, going by the variable names).
path_typed_old = '/ivi/ilps/projects/ArtDATIS/artdatis/tagging/OCRed/typed/'
path_typed_new = '/ivi/ilps/personal/vprovat/ocr/newest_results/typed/'

In [10]:
# All file paths found under either typed-output directory.  Passing through set() drops
# duplicate paths, so the order of the resulting list is arbitrary.
typed_files = list(set(getListOfFiles(path_typed_old)+getListOfFiles(path_typed_new)))

In [11]:
# Keep only the files whose name ends in 'text.txt'.
texts_typed = [
    file_path
    for file_path in typed_files
    if file_path.endswith('text.txt')
]

In [12]:
# Number of '...text.txt' files found for typed pages.
len(texts_typed)

34196

In [13]:
# filenames_typed = set([f.split('/')[-1].split('.')[0] for f in texts_typed])

In [14]:
# Directories that are scanned below for the results of pages not classified as typed
# (handwritten, no text, and a newer "non-typed" location, going by the path names).
path_handwritten = '/ivi/ilps/personal/vprovat/backup_ocr/handwritten/'
path_no_text = '/ivi/ilps/personal/vprovat/backup_ocr/no_text/'
path_no_text_new = '/ivi/ilps/personal/vprovat/backup_ocr/new_results/non-typed/'


In [15]:
# Unique files under the three non-typed directories whose name ends in 'path.txt'.
# NOTE(review): the cell that builds non_typed_file_osds walks the same three directories
# again; the combined listing could be computed once and filtered twice.
non_typed_file_paths = [f for f in list(set(getListOfFiles(path_handwritten)+getListOfFiles(path_no_text)+getListOfFiles(path_no_text_new)))
                        if f.endswith('path.txt')]

In [16]:
# Unique files under the three non-typed directories whose name ends in 'osd.txt'.
# NOTE(review): this repeats the directory walk already done for non_typed_file_paths.
non_typed_file_osds = [f for f in list(set(getListOfFiles(path_handwritten)+getListOfFiles(path_no_text)+getListOfFiles(path_no_text_new)))
                        if f.endswith('osd.txt')]

In [17]:
# Identifier -> file path lookups.  When several paths yield the same identifier,
# the one iterated last is the one that stays in the mapping.
dict_typed_text_paths = {
    extract_filename(text_path): text_path for text_path in texts_typed
}
dict_non_typed_paths = {
    extract_filename(path_file): path_file for path_file in non_typed_file_paths
}

In [18]:
# sanity check: identifiers that occur in both the typed and the non-typed mapping
# (an empty set would mean the two groups are disjoint).
# NOTE(review): the recorded output is {'NL-HaRKD'}, i.e. one identifier is shared.
# extract_filename cuts at the first '.', so presumably every file named 'NL-HaRKD.<...>'
# collapses onto this single key — verify and adjust the key extraction if so.
set(dict_typed_text_paths.keys()).intersection(set(dict_non_typed_paths.keys()))

{'NL-HaRKD'}

In [19]:
# another sanity check: do we have them all?
# Evaluates to 0 when the two mappings together hold exactly as many keys as df_files has
# rows.  This compares counts only; it does not check that the keys match the rows.
len(set(dict_typed_text_paths.keys()))+len(set(dict_non_typed_paths.keys()))-len(df_files)

0

Now, let's try to detect handwritten files among the non-typed ones (and label others as likely drawings):

In [20]:
# Identifiers of non-typed pages whose OSD file turns out to be empty.
drawing_files = set()
# Identifiers of non-typed pages whose OSD file has content.
handwritten_files = set()
# Identifier -> path of the (non-empty) OSD file.
dict_handwritten_osd_paths = {}

In [21]:
from tqdm import tqdm

In [22]:
# Split the non-typed pages by the content of their OSD file:
# an empty file marks a likely drawing, anything else a likely handwritten page.
for path in tqdm(non_typed_file_osds):
    name = extract_filename(path)
    # Read through a context manager so every handle is closed immediately;
    # the loop touches tens of thousands of files.
    with open(path, 'r') as osd_file:
        osd = osd_file.read()
    if not osd:
        drawing_files.add(name)
    else:
        handwritten_files.add(name)
        dict_handwritten_osd_paths[name] = path

100%|██████████| 90881/90881 [07:24<00:00, 204.67it/s]


In [23]:
# Rebinds typed_files: from here on it is the set of identifiers that have a typed text
# file (it previously held the list of file paths found in the typed directories).
typed_files = set(dict_typed_text_paths.keys())

In [24]:
# Give every identifier in the table an entry in both lookups (None when nothing
# was found) so that indexing the dictionaries later cannot raise KeyError.
for f in df_files['filename'].tolist():
    dict_typed_text_paths.setdefault(f, None)
    dict_handwritten_osd_paths.setdefault(f, None)

In [25]:
def get_page_class(f):
    """Return the page class for identifier ``f``.

    The module-level sets are consulted in order of priority:
    ``typed_files`` -> "typed", ``handwritten_files`` -> "likely handwritten",
    ``drawing_files`` -> "likely drawing".  ``None`` is returned when ``f`` is
    in none of them.
    """
    return (
        "typed" if f in typed_files
        else "likely handwritten" if f in handwritten_files
        else "likely drawing" if f in drawing_files
        else None
    )

In [26]:
# Label every row with the class of its identifier (None when it is still unknown).
df_files['page_type'] = df_files.apply(
    lambda record: get_page_class(record['filename']),
    axis=1,
)

In [27]:
# Attach the path of the typed OCR text file for each row (None when there is none).
df_files['path_to_text'] = df_files.apply(
    lambda record: dict_typed_text_paths[record['filename']],
    axis=1,
)

In [28]:
# Attach the path of the non-empty OSD file for each row (None when there is none).
df_files['path_to_handwritten_info'] = df_files.apply(
    lambda record: dict_handwritten_osd_paths[record['filename']],
    axis=1,
)

In [29]:
# Preview the table with the three new columns.
df_files.head()

Unnamed: 0.1,Unnamed: 0,part_of.title,images.reference,filename,is_missing,path_to_image,page_type,path_to_text,path_to_handwritten_info
0,1,"Correspondence, editorial office 'Goed Wonen' ...",..\images\jpg\757\0001246757.jpg,1246757,False,/ivi/ilps/projects/ArtDATIS/valkema/MMRKD07_00...,typed,/ivi/ilps/projects/ArtDATIS/artdatis/tagging/O...,
1,2,"Correspondence, editorial office 'Goed Wonen' ...",..\images\jpg\758\0001246758.jpg,1246758,False,/ivi/ilps/projects/ArtDATIS/valkema/MMRKD07_00...,likely handwritten,,/ivi/ilps/personal/vprovat/backup_ocr/handwrit...
2,3,"Correspondence, editorial office 'Goed Wonen' ...",..\images\jpg\759\0001246759.jpg,1246759,False,/ivi/ilps/projects/ArtDATIS/valkema/MMRKD07_00...,likely handwritten,,/ivi/ilps/personal/vprovat/backup_ocr/new_resu...
3,4,"Correspondence, editorial office 'Goed Wonen' ...",..\images\jpg\760\0001246760.jpg,1246760,False,/ivi/ilps/projects/ArtDATIS/valkema/MMRKD07_00...,likely handwritten,,/ivi/ilps/personal/vprovat/backup_ocr/new_resu...
4,5,"Correspondence, editorial office 'Goed Wonen' ...",..\images\jpg\761\0001246761.jpg,1246761,False,/ivi/ilps/projects/ArtDATIS/valkema/MMRKD07_00...,likely handwritten,,/ivi/ilps/personal/vprovat/backup_ocr/new_resu...


In [30]:
# Image paths of the rows that have no page class yet.
unclassified_mask = df_files['page_type'].isna()
remaining_paths_to_classify = df_files.loc[unclassified_mask, 'path_to_image'].tolist()

In [32]:
# Number of images that still need a page class.
len(remaining_paths_to_classify)

6505

In [31]:
# Classify the missing files with tesseract

In [38]:
import pytesseract
import json

try:
    from PIL import Image
except ImportError:
    import Image

# Root directory below which process_document writes its result files
# (sub-folders: typed/, likely_handwritten/, likely_drawing/).
RES_PATH = '/ivi/ilps/personal/vprovat/OCR_for_missing_folders'
    
    
def process_document(path):
    """Classify one scanned page with Tesseract and record the outcome on disk.

    Result files are written below ``RES_PATH``; ``<name>`` is
    ``extract_filename(path)``:

    * ``likely_drawing/<name>_path.txt`` when orientation/script detection
      fails (too little text: most likely blank or a drawing, possibly
      handwriting);
    * ``typed/<name>_path.txt`` and ``typed/<name>_text.txt`` when the detected
      script is 'Latin' (the text file holds the OCR output);
    * ``likely_handwritten/<name>_path.txt`` and
      ``likely_handwritten/<name>_osd.txt`` otherwise (the OSD result is stored
      as JSON).

    Every ``*_path.txt`` file contains ``path`` itself.

    Args:
        path: Path of the image file to classify.

    Returns:
        None.  Nothing is written when the image cannot be opened.
    """
    name = extract_filename(path)
    try:
        img = Image.open(path)
    except Exception:  # path not found (or not readable as an image)
        # `except Exception` keeps the best-effort skip but, unlike a bare
        # `except:`, lets KeyboardInterrupt / SystemExit stop a long batch run.
        return

    # The image is used as a context manager so its file handle is released on
    # every exit path of the function.
    with img:
        try:
            osd_dict = pytesseract.image_to_osd(img, output_type='dict')
        except Exception:
            # Too few characters for osd -> most likely blank or drawing,
            # could also be handwritten.  Any other failure of the OSD call
            # ends up in this branch as well.
            with open(f"{RES_PATH}/likely_drawing/{name}_path.txt", "w") as f:
                f.write(path)
            return

        if osd_dict['script'] == 'Latin':  # Is a typed document
            with open(f"{RES_PATH}/typed/{name}_path.txt", "w") as f:
                f.write(path)

            text = pytesseract.image_to_string(img)
            with open(f"{RES_PATH}/typed/{name}_text.txt", "w") as f:
                f.write(text)
        else:  # We think it's handwritten, saving osd just in case
            with open(f"{RES_PATH}/likely_handwritten/{name}_path.txt", "w") as f:
                f.write(path)
            with open(f"{RES_PATH}/likely_handwritten/{name}_osd.txt", "w") as f:
                f.write(json.dumps(osd_dict))

In [105]:
# import parmap

In [None]:
# parmap.map(process_document, remaining_paths_to_classify[-1000:], pm_processes=8, pm_pbar=True)

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

In [33]:
# Identifier -> result file for the pages handled by process_document, one mapping per
# output folder.  The drawing folder is not filtered by suffix.
new_typed_dict = {
    extract_filename(result_file): result_file
    for result_file in getListOfFiles('/ivi/ilps/personal/vprovat/OCR_for_missing_folders/typed')
    if result_file.endswith('text.txt')
}
new_handwritten_dict = {
    extract_filename(result_file): result_file
    for result_file in getListOfFiles('/ivi/ilps/personal/vprovat/OCR_for_missing_folders/likely_handwritten')
    if result_file.endswith('osd.txt')
}
new_drawing_dict = {
    extract_filename(result_file): result_file
    for result_file in getListOfFiles('/ivi/ilps/personal/vprovat/OCR_for_missing_folders/likely_drawing')
}

In [34]:
def path_already_processed(path):
    """Tell whether the identifier of ``path`` already has a result.

    Returns True when ``extract_filename(path)`` is a key of
    ``new_typed_dict``, ``new_handwritten_dict`` or ``new_drawing_dict``,
    and False otherwise.
    """
    identifier = extract_filename(path)
    if identifier in new_typed_dict:
        return True
    if identifier in new_handwritten_dict:
        return True
    return identifier in new_drawing_dict

In [35]:
# in case we need to update: keep only the images that have no result file yet
still_remaining_paths_to_classify = [
    image_path
    for image_path in remaining_paths_to_classify
    if not path_already_processed(image_path)
]

In [36]:
# Number of images for which no result file exists yet.
len(still_remaining_paths_to_classify)

2

In [39]:
import parmap
# Run process_document on the images that are still unprocessed, using 2 worker
# processes and a progress bar.  process_document returns None, so the value of this
# call is a list of None; the results live in the files it writes.
parmap.map(process_document, still_remaining_paths_to_classify, pm_processes=2, pm_pbar=True)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




[None, None]

In [42]:
import pickle
# Save the list of still-unprocessed image paths.  The context manager makes sure
# the file is flushed and closed once the dump is complete.
with open('paths_to_classify.p', 'wb') as paths_file:
    pickle.dump(still_remaining_paths_to_classify, paths_file)

In [40]:
def update_classification(row):
    """Return the page class for ``row``, filling gaps from the newer OCR results.

    An existing ``row['page_type']`` is kept.  When it is missing (None, NaN
    or empty), ``row['filename']`` is looked up in ``new_typed_dict``,
    ``new_handwritten_dict`` and ``new_drawing_dict``, in that order.

    Args:
        row: Mapping-like record with 'page_type' and 'filename' entries
            (e.g. a DataFrame row).

    Returns:
        "typed", "likely handwritten", "likely drawing", the existing label,
        or None when no label is available.
    """
    current = row['page_type']
    # NaN is truthy, so a plain truthiness test would mistake a NaN-marked gap
    # (e.g. after reloading the table from a file) for an existing label.
    if not pd.isna(current) and current:
        return current
    f = row['filename']
    if f in new_typed_dict:
        return "typed"
    if f in new_handwritten_dict:
        return "likely handwritten"
    if f in new_drawing_dict:
        return "likely drawing"

    return None

In [41]:
def update_osd(row):
    """Return the OSD file path for ``row``, filling gaps from the newer results.

    An existing ``row['path_to_handwritten_info']`` is kept.  When it is
    missing (None, NaN or empty), the path stored for ``row['filename']`` in
    ``new_handwritten_dict`` is returned, or None when there is none.

    Args:
        row: Mapping-like record with 'path_to_handwritten_info' and
            'filename' entries (e.g. a DataFrame row).
    """
    current = row['path_to_handwritten_info']
    # NaN is truthy, so a plain truthiness test would mistake a NaN-marked gap
    # for an existing path.
    if not pd.isna(current) and current:
        return current

    f = row['filename']
    if f in new_handwritten_dict:
        return new_handwritten_dict[f]
    return None
    

In [42]:
def update_text(row):
    """Return the OCR text file path for ``row``, filling gaps from the newer results.

    An existing ``row['path_to_text']`` is kept.  When it is missing (None,
    NaN or empty), the path stored for ``row['filename']`` in
    ``new_typed_dict`` is returned, or None when there is none.

    Args:
        row: Mapping-like record with 'path_to_text' and 'filename' entries
            (e.g. a DataFrame row).
    """
    current = row['path_to_text']
    # NaN is truthy, so a plain truthiness test would mistake a NaN-marked gap
    # for an existing path.
    if not pd.isna(current) and current:
        return current

    f = row['filename']
    if f in new_typed_dict:
        return new_typed_dict[f]
    return None
    

In [43]:
# Fill the gaps in the three derived columns from the newer OCR results.
# Each helper receives the full row and returns the value for its own column.
df_files['page_type'] = df_files.apply(update_classification, axis=1)

df_files['path_to_handwritten_info'] = df_files.apply(update_osd, axis=1)

df_files['path_to_text'] = df_files.apply(update_text, axis=1)

In [47]:
import csv

In [48]:
# Write the enriched table as a tab-separated file; QUOTE_NONNUMERIC quotes every
# non-numeric field.
# NOTE(review): the index is written as well (to_csv default).  That is presumably the
# origin of the 'Unnamed: 0' / 'Unnamed: 0.1' columns visible in the loaded table —
# consider index=False if downstream readers do not need it.
df_files.to_csv('archive_files_with_paths_descriptions_and_page_classifications.tsv',sep='\t', quoting=csv.QUOTE_NONNUMERIC)

In [49]:
# Show the final table (the notebook display is truncated by pandas).
df_files

Unnamed: 0.1,Unnamed: 0,part_of.title,images.reference,filename,is_missing,path_to_image,page_type,path_to_text,path_to_handwritten_info
0,1,"Correspondence, editorial office 'Goed Wonen' ...",..\images\jpg\757\0001246757.jpg,0001246757,False,/ivi/ilps/projects/ArtDATIS/valkema/MMRKD07_00...,typed,/ivi/ilps/projects/ArtDATIS/artdatis/tagging/O...,
1,2,"Correspondence, editorial office 'Goed Wonen' ...",..\images\jpg\758\0001246758.jpg,0001246758,False,/ivi/ilps/projects/ArtDATIS/valkema/MMRKD07_00...,likely handwritten,,/ivi/ilps/personal/vprovat/backup_ocr/handwrit...
2,3,"Correspondence, editorial office 'Goed Wonen' ...",..\images\jpg\759\0001246759.jpg,0001246759,False,/ivi/ilps/projects/ArtDATIS/valkema/MMRKD07_00...,likely handwritten,,/ivi/ilps/personal/vprovat/backup_ocr/new_resu...
3,4,"Correspondence, editorial office 'Goed Wonen' ...",..\images\jpg\760\0001246760.jpg,0001246760,False,/ivi/ilps/projects/ArtDATIS/valkema/MMRKD07_00...,likely handwritten,,/ivi/ilps/personal/vprovat/backup_ocr/new_resu...
4,5,"Correspondence, editorial office 'Goed Wonen' ...",..\images\jpg\761\0001246761.jpg,0001246761,False,/ivi/ilps/projects/ArtDATIS/valkema/MMRKD07_00...,likely handwritten,,/ivi/ilps/personal/vprovat/backup_ocr/new_resu...
...,...,...,...,...,...,...,...,...,...
102156,103029,1943-1944,..\images\jpg\329\0001447329.jpg,0001447329,False,/ivi/ilps/personal/vprovat/missing_folders_Art...,likely drawing,,
102157,103030,1943-1944,..\images\jpg\330\0001447330.jpg,0001447330,False,/ivi/ilps/personal/vprovat/missing_folders_Art...,typed,/ivi/ilps/personal/vprovat/OCR_for_missing_fol...,
102158,103031,1943-1944,..\images\jpg\331\0001447331.jpg,0001447331,False,/ivi/ilps/personal/vprovat/missing_folders_Art...,likely drawing,,
102159,103032,1943-1944,..\images\jpg\332\0001447332.jpg,0001447332,False,/ivi/ilps/personal/vprovat/missing_folders_Art...,likely drawing,,


In [None]:
# prepare for network analysis on letters: find letters, extract names (to, from)
# 

In [10]:
# Distinct 'part_of.title' values, sorted.
unique_titles = set(df_files['part_of.title'].tolist())
descriptions = sorted(unique_titles)

In [11]:
# Show the sorted list of distinct titles.
descriptions

["'10 jaar Vrij Glas op de Rietveld Academie', Centrum De Vaart, Hilversum, 23 June - 22 July 1979",
 "'Encyclopédie, ou Dictionnaire Universel Raisonné des Connoissance Humaines'",
 "'Vrij Glas', Hengelose Kunstzaal, 18 February - 18 May 1978",
 "10th Jubilee 'Werkgroep Glas', 14 September 1979",
 '1934',
 '1935, or undated',
 '1936',
 '1936 (Part 1)',
 '1936 (Part 2)',
 '1937',
 '1937 (Part 3)',
 '1938',
 '1939',
 '1939, part 2',
 '1940',
 '1941',
 '1942',
 '1943',
 '1943-1944',
 '1943-1947',
 '1944',
 '1945',
 '1946',
 '1947',
 '1948',
 '1949',
 '1950',
 '1951',
 '1952',
 '1953',
 '1954',
 '1955',
 '1956',
 '1957',
 '1958',
 '1959',
 '1960',
 '1961',
 '1962',
 '1963',
 '1963-1964',
 '1963-1993',
 '1964',
 '1964-1965',
 '1965',
 '1965-1966',
 '1966',
 '1966-1967',
 '1967',
 '1968',
 '1970',
 '1971',
 '1972',
 '1973',
 '1973-1975, 1992-1993',
 '1974',
 '1975',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1980-1981',
 '1981',
 '1982',
 '1983',
 '1983-1984',
 '1984',
 '1985',
 '1985-1