In [None]:
import pandas as pd

import os
from tqdm import tqdm

In [None]:
# Folder that holds the gzip-compressed CSV tables read by this notebook.
tblpath = "./tables/"
# Number interpolated into chunked table file names such as Objects_{k}.csv.gz.
k = 1_000_000

---
# NEW


Fix:
 - columns `RelatedWorks` and `Provenance` have mixed data types (both should be `str`); pass explicit dtypes when loading:  
     `pd.read_csv("v0.csv.gz", dtype=dict(Provenance="string", RelatedWorks="string"))`
     
 - get all data types right (`ObjectID` is saved/loaded as float)



# 1. Objects

In [None]:
# Columns kept from the Objects table, grouped by the role they play.
id_fields = [
    "DepartmentID",
    "ClassificationID",
    "ObjectName",  # this is the type of object
    "Medium",
    "Provenance",
    "Exhibitions",
]

date_fields = ["BeginISODate", "EndISODate", "Dated"]

txt_fields = [
    "Title",
    "Description",
    "Notes",
    "CuratorialRemarks",
    "RelatedWorks",
    "HistAttributions",
]

relevant_fields = [*id_fields, *date_fields, *txt_fields]

# Read the Objects table, index it by ObjectID and keep only the columns above.
objs_path = f"{tblpath}Objects_{k}.csv.gz"
objs = pd.read_csv(objs_path).set_index("ObjectID")[relevant_fields]

# 2. Dates

TODO:

 - get rid of objects with `BeginISODate == EndISODate == 0` -> 372k out of 1M
 - what about objects with `BeginISODate > EndISODate`?  --> remove, 4k out of 1M
 - for objects whose `Dated == "[NI]"`, replace `Dated` with `"{BeginISODate} -- {EndISODate}"` (or `"voor {EndISODate}"` if `BeginISODate == 0`, and likewise `"na {BeginISODate}"` if `EndISODate == 0`) -> 190k out of 1M
 - do something with objects which have dates > 2021

In [None]:
def iso_date_to_int(date_col):
    """Convert a column of ISO-like date values to integer years.

    Missing values are filled with 0. Every other value is converted to a
    string, all occurrences of "1000000" and ".0" are removed, and the first
    four characters of what remains are parsed with ``int``.

    Values that leave nothing to parse after stripping (for example the bare
    string "1000000") are mapped to 0, the same value used for missing dates,
    instead of raising ``ValueError``.

    Parameters
    ----------
    date_col : pandas.Series
        Column of raw date values (strings and/or numbers, possibly NaN).

    Returns
    -------
    pandas.Series
        One integer per input row, indexed like ``date_col``.

    Raises
    ------
    ValueError
        If the first four remaining characters are non-empty but not a valid
        integer literal.
    """
    def d2int(s):
        year_txt = s.replace("1000000", "").replace(".0", "")[:4]
        # int("") raises ValueError; treat "nothing left" like a missing date.
        return int(year_txt) if year_txt else 0

    # NOTE(review): the [:4] slice truncates negative years with more than
    # three digits (e.g. "-1500" -> -150) -- confirm whether such values occur.
    return date_col.fillna(0).apply(lambda x: d2int(str(x)))

In [None]:
# Integer-year versions of the two ISO date columns of objs.
new_begin_dates, new_end_dates = (
    iso_date_to_int(objs[date_col]) for date_col in ("BeginISODate", "EndISODate")
)

In [None]:
import matplotlib.pyplot as plt

n_objs = objs.shape[0]

# Objects where BOTH years are 0. Testing the two columns explicitly (rather
# than `begin + end == 0`) avoids counting pairs such as (-5, 5) whose sum
# happens to be 0.
both_zero = (new_begin_dates == 0) & (new_end_dates == 0)
print(both_zero.sum(), new_begin_dates.shape[0])

# Objects with a begin year of 0, objects with an end year of 0, total.
print((new_begin_dates == 0).sum(), (new_end_dates == 0).sum(), n_objs)

# Objects whose begin year lies after their end year, total.
print((new_begin_dates > new_end_dates).sum(), n_objs)

# Objects whose textual date is the "[NI]" placeholder, total.
print((objs.Dated == "[NI]").sum(), n_objs)

In [None]:
# Overwrite the raw ISO date columns with their integer-year versions.
for date_col, years in (
    ("BeginISODate", new_begin_dates),
    ("EndISODate", new_end_dates),
):
    objs[date_col] = years

In [None]:
# Keep only objects whose begin year AND end year are both below the limit.
year_limit = 2020
dated_before_limit = (objs["BeginISODate"] < year_limit) & (objs["EndISODate"] < year_limit)
objs = objs[dated_before_limit]

In [None]:
# Frequency of each end year among the remaining objects.
objs["EndISODate"].value_counts()

# 3. Classifications

In [None]:
# SubClassification2 is left out: when present it is always the same as AATCN.
cls_fields = ["ClassificationID", "Classification", "AATCN", "SubClassification"]

# Read the classification table, keep the fields above, index by ClassificationID.
cls_path = f"{tblpath}Classifications.csv.gz"
cls = pd.read_csv(cls_path)[cls_fields].set_index("ClassificationID")
# clsxr = pd.read_csv(tblpath + f"ClassificationXRefs_{k}.csv.gz")

In [None]:
# The join key has to be unique on the right-hand side, otherwise every
# repeated ClassificationID duplicates the matching rows in objs.
# DataFrame.drop_duplicates() compares column values only and ignores the
# index, so it neither guarantees unique ClassificationIDs nor keeps distinct
# IDs that share the same values. De-duplicate on the index instead, keeping
# the last row for each ClassificationID.
cls_unique = cls[~cls.index.duplicated(keep="last")]
objs = objs.join(cls_unique, how="left", on="ClassificationID")

# 4. Titles

TODO:

  - some objects in the `Objects` table have no entry in the `Titles` table (their `ObjectID` doesn't exist there)  
    --> use `Title` from `Titles` if it exists, and `Title` from `Objects` otherwise

In [None]:
title_fields = []
# Title table, indexed by the ObjectID each title row belongs to.
titles = pd.read_csv(f"{tblpath}ObjTitles.csv.gz").set_index("ObjectID")

In [None]:
def merge_titles(obj_tbl, title_tbl=None):
    """Add a ``vale_Title`` column that prefers titles from the title table.

    For every row of ``obj_tbl`` the new column holds the title found in
    ``title_tbl`` under the same index value; rows without a (non-null) entry
    there keep the value of ``obj_tbl["Title"]``.

    Parameters
    ----------
    obj_tbl : pandas.DataFrame
        Object table indexed by ObjectID, with a ``Title`` column. It is
        modified in place (the ``vale_Title`` column is added or overwritten).
    title_tbl : pandas.DataFrame, optional
        Title table indexed by ObjectID. When omitted it is read from
        ``tblpath + "ObjTitles.csv.gz"``.
        NOTE(review): assumes the title text lives in a column named
        ``Title`` -- confirm against the ObjTitles schema.

    Returns
    -------
    pandas.DataFrame
        The same ``obj_tbl`` object, for convenient chaining.
    """
    if title_tbl is None:
        title_tbl = pd.read_csv(tblpath + "ObjTitles.csv.gz").set_index("ObjectID")

    # Build a lookup with one title per ObjectID: drop empty titles, then keep
    # the first remaining row for each id (a mapping needs a unique index).
    # NOTE(review): "first row wins" is an assumption -- confirm which title
    # should be preferred when an object has several.
    title_lookup = title_tbl["Title"].dropna()
    title_lookup = title_lookup[~title_lookup.index.duplicated(keep="first")]

    # Titles aligned to obj_tbl's rows; NaN where the object has no entry.
    from_titles = pd.Series(obj_tbl.index.map(title_lookup), index=obj_tbl.index)

    # Assign the whole column at once instead of chained `.loc` assignment.
    obj_tbl["vale_Title"] = from_titles.where(from_titles.notna(), obj_tbl["Title"])

    return obj_tbl

# 5. Locations?

### 5.1 Departments

In [None]:
dept_fields = ["DepartmentID", "Department"]

# Read the department table, drop the GSRowVersion column, remove rows that
# are then exact duplicates (keeping the first), and index by DepartmentID.
depts = (
    pd.read_csv(f"{tblpath}Departments.csv.gz")
    .drop(columns="GSRowVersion")
    .drop_duplicates(keep="first")
    .set_index("DepartmentID")
)

In [None]:
# Compare the ids known to the department table with the ids used by objects.
(depts.index.unique(), objs["DepartmentID"].unique())

In [None]:
def dept_id_to_int(id_col, invalid=-10):
    """Yield each value of ``id_col`` as an ``int``, or a sentinel if impossible.

    Parameters
    ----------
    id_col : iterable
        Raw department ids (numbers, strings, NaN, ...).
    invalid : int, optional
        Value yielded for entries that ``int()`` cannot convert. Defaults to
        -10, which is below any valid (non-negative) id.

    Yields
    ------
    int
        ``int(x)`` for convertible values, ``invalid`` otherwise.
    """
    for raw_id in id_col:
        try:
            yield int(raw_id)
        # ValueError: non-numeric strings and NaN; TypeError: None and other
        # non-numeric objects; OverflowError: +/- infinity.
        except (ValueError, TypeError, OverflowError):
            yield invalid

# Materialise the converted ids, then write them back over the raw column.
dept_ids = [*dept_id_to_int(objs["DepartmentID"])]

objs["DepartmentID"] = dept_ids

In [None]:
# Keep only rows whose DepartmentID is greater than -1.
has_valid_dept = objs["DepartmentID"].gt(-1)
objs = objs[has_valid_dept]

In [None]:
# Number of objects per department id.
objs["DepartmentID"].value_counts()

In [None]:
# Look up each object's department name by its DepartmentID.
# Series.map is used instead of `depts.loc[ids]` because `.loc` raises
# KeyError for ids missing from depts and returns extra rows for ids that
# occur more than once in depts (which then no longer match objs in length).
# With map, unknown ids simply get NaN as their Department.
dept_names = depts["Department"]
# map() needs a uniquely indexed lookup; keep the first name per DepartmentID.
dept_names = dept_names[~dept_names.index.duplicated(keep="first")]
objs["Department"] = objs["DepartmentID"].map(dept_names)

# Drop Objects by Criteria & Save

In [None]:
# Remove rows whose column values all repeat an earlier row (first one kept).
# NOTE(review): the index is not part of the comparison, so rows that differ
# only in their index label count as duplicates -- confirm this is intended.
objs = objs.drop_duplicates(keep="first")

In [None]:
# ObjectID is the index of objs when it is loaded with set_index("ObjectID"),
# so there is usually no `ObjectID` column to cast or to re-index on. Only
# move it into the index if it is (still) a regular column.
if "ObjectID" in objs.columns:
    objs = objs.set_index("ObjectID")

# Store the ids as integers so they are not written out as floats.
objs.index = objs.index.astype("int").rename("ObjectID")

In [None]:
# Display the index of objs for inspection.
objs.index

In [None]:
# Write the cleaned object table (including its index) to a CSV file.
output_path = "v0.csv.gz"
objs.to_csv(output_path)

In [None]:
# Spot check: title stored under index label 1140716.
objs.loc[1140716, "Title"]

In [None]:
# Display the final object table for inspection.
objs

# Thesaurus Stuff

In [None]:
# Load the TextEntries table.
terms_path = f"{tblpath}TextEntries.csv.gz"
terms = pd.read_csv(terms_path)

In [None]:
# Display the loaded TextEntries table for inspection.
terms

In [None]:
# Frequency of each distinct value in the Term column.
terms["Term"].value_counts()