In [1]:
import pandas as pd

import os
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime as dt

In [2]:
# Directory (relative to the notebook) that holds the gzipped CSV tables.
tblpath = "./tables/"
# Suffix interpolated into the objects file name ("Objects_{k}.csv.gz").
# NOTE(review): presumably the number of rows in that export -- confirm.
k = 1_000_000

In [3]:
def iso_date_to_int(date_col):
    """Reduce every entry of a date column to an integer taken from its first four characters.

    Missing entries are filled with 0 first. Each value is then rendered with
    ``str``, the literal substrings "1000000" and ".0" are removed (in that
    order), and the first four remaining characters are parsed with ``int``.

    Parameters
    ----------
    date_col : pandas.Series
        Column of date-like values (numbers or strings).

    Returns
    -------
    pandas.Series
        One integer per input row, on the same index.

    Raises
    ------
    ValueError
        If the four-character prefix of a cleaned value is not an integer literal.
    """
    def _leading_year(value):
        text = str(value)
        # Same two removals, same order, as a single chained replace.
        for fragment in ("1000000", ".0"):
            text = text.replace(fragment, "")
        return int(text[:4])

    filled = date_col.fillna(0)
    return filled.apply(_leading_year)

def replaceNI(row):
    """Return the row's ``Dated`` text, substituting a range for the "[NI]" placeholder.

    When ``row.Dated`` equals "[NI]" the result is
    "<BeginISODate> - <EndISODate>" built from the same row; any other value
    of ``row.Dated`` is returned unchanged.
    """
    if row.Dated == "[NI]":
        return f"{row.BeginISODate} - {row.EndISODate}"
    return row.Dated

def to_date_str(year_int):
    """Convert a year into a ``datetime.date``.

    The year is rendered with ``str``, left-padded with zeros to four
    characters and parsed with the "%Y" format. Despite the function's name
    the result is a ``date`` object, not a string.

    Raises
    ------
    ValueError
        If the padded text does not parse as "%Y".
    """
    padded_year = str(year_int).zfill(4)
    parsed = dt.strptime(padded_year, "%Y")
    return parsed.date()

def dept_id_to_int(id_col):
    """Lazily convert department IDs to ``int``, yielding -10 for unusable values.

    Parameters
    ----------
    id_col : iterable
        Raw department IDs (numbers, numeric strings, or missing values).

    Yields
    ------
    int
        ``int(x)`` for each element, or the sentinel -10 when the element
        cannot be converted.
    """
    for x in id_col:
        try:
            yield int(x)
        # ValueError: non-numeric text or NaN; TypeError: None and other
        # non-numeric objects; OverflowError: infinite floats. Every one of
        # them means "no usable ID", so they all map to the same sentinel
        # instead of aborting the generator half-way through the column.
        except (TypeError, ValueError, OverflowError):
            yield -10

In [None]:
# Build the cleaned object table from the raw exports under `tblpath` and write
# it to "NMvW.v0_4.csv.gz".
# Stages: load -> dates -> classifications -> departments -> de-duplicate -> rename -> export.
pbar = tqdm(total=4)

# LOAD TABLES
id_fields = ["DepartmentID", "ClassificationID", "ObjectName", # this is the type of object
             "Medium", "Provenance", "Exhibitions"]
date_fields = ["BeginISODate", "EndISODate", "Dated"]
txt_fields = ["Title", "Description", "Notes", "CuratorialRemarks", "RelatedWorks", "HistAttributions"]
relevant_fields = id_fields + date_fields + txt_fields
# .copy(): the column assignments below must act on an independent frame, not
# on a column slice of the raw table (avoids chained-assignment warnings).
objs = pd.read_csv(tblpath + f"Objects_{k}.csv.gz").set_index("ObjectID")[relevant_fields].copy()

cls_fields = ["ClassificationID","Classification", "AATCN", "SubClassification"] #, "SubClassification2"] -> always the same as AATCN when present
clsss = pd.read_csv(tblpath + "Classifications.csv.gz")[cls_fields].set_index("ClassificationID")

dept_fields = ["DepartmentID", "Department"]
depts = (
    pd.read_csv(tblpath + "Departments.csv.gz")
    .drop("GSRowVersion", axis="columns")
    .drop_duplicates(keep="first")
    .set_index("DepartmentID")
)

pbar.update(1)

### DATES
# Reduce both ISO date columns to integer years (missing -> 0).
new_begin_dates = iso_date_to_int(objs.BeginISODate)
new_end_dates = iso_date_to_int(objs.EndISODate)

objs["BeginISODate"] = new_begin_dates
objs["EndISODate"] = new_end_dates
# Rows whose free-text date is the "[NI]" placeholder get "<begin> - <end>" instead.
new_dated = objs.apply(replaceNI, axis=1)
objs["Dated"] = new_dated

# All date filters as one row-wise mask (equivalent to applying them one after another).
# NOTE(review): the strict "<" also removes rows where begin == end -- confirm this is intended.
valid_dates = (
    (objs.BeginISODate < objs.EndISODate)
    & ((objs.BeginISODate + objs.EndISODate) > 0)
    & (objs.BeginISODate < 2022)
    & (objs.EndISODate < 2022)
    # highly aggressive: only strictly positive years survive
    & (objs.BeginISODate > 0) ### !
    & (objs.EndISODate > 0) ### !
)
objs = objs[valid_dates].copy()

# Integer years -> datetime.date objects.
objs["BeginISODate"] = objs.BeginISODate.apply(to_date_str)
objs["EndISODate"] = objs.EndISODate.apply(to_date_str)

pbar.update(1)

### CLASSIFICATIONS
# De-duplicate on the join key. DataFrame.drop_duplicates() ignores the index,
# so calling it on `clsss` would (a) drop distinct ClassificationIDs that share
# the same labels, leaving their objects without a classification, and
# (b) keep repeated ClassificationIDs, which multiplies object rows in the join.
clsss_by_id = clsss[~clsss.index.duplicated(keep="last")]
objs = objs.join(clsss_by_id, how="left", on="ClassificationID")

pbar.update(1)

### DEPARTMENTS
objs["DepartmentID"] = list(dept_id_to_int(objs.DepartmentID))
# dept_id_to_int marks unusable IDs with a negative sentinel; drop those rows.
objs = objs[objs.DepartmentID > -1].copy()
# Look names up through a uniquely indexed Series: the result stays aligned
# with `objs` row by row, and IDs that are absent from the departments table
# get a missing Department instead of raising KeyError.
dept_names = depts.loc[~depts.index.duplicated(keep="first"), "Department"]
objs["Department"] = objs.DepartmentID.map(dept_names)

pbar.update(1)
pbar.close()


### FILTER OBJECTS
# Compares column values only (the ObjectID index is ignored), so rows with
# identical content collapse into one.
objs = objs.drop_duplicates()


### RENAME COLUMNS
objs = objs.reset_index().rename(columns=dict(
        Title="name",
        Description="description",
        ObjectID="ID",
        BeginISODate="start_date",
        EndISODate="end_date",
        Dated="date_string"
    )
).set_index("ID")
objs.index = objs.index.astype("int")

# Handle URL: fixed prefix followed by the object ID.
objs["source_url"] = "https://hdl.handle.net/20.500.11840/" + objs.index.astype(str)

objs.to_csv("NMvW.v0_4.csv.gz")

---
# Test Load

In [4]:
# Sanity check: read the exported file back with default read_csv settings
# (no index column or date parsing requested).
df = pd.read_csv("NMvW.v0_4.csv.gz")

In [16]:
# Column dtypes after the CSV round trip; in the output below start_date and
# end_date come back as object, not as datetimes.
df.dtypes

ID                     int64
DepartmentID           int64
ClassificationID     float64
ObjectName            object
Medium                object
Provenance            object
Exhibitions           object
start_date            object
end_date              object
date_string           object
name                  object
description           object
Notes                 object
CuratorialRemarks     object
RelatedWorks          object
HistAttributions      object
Classification        object
AATCN                 object
SubClassification     object
Department            object
source_url            object
dtype: object