In [39]:
# Starting to explore stability of commodities using shipping data in the BKB dataset
# 24.07.2024
# vera.provatorova@dh.huc.knaw.nl

## Steps:
* [Read the data](#read-data)
* [Convert shipping quantity values to decimal](#convert-values)
* [Aggregate the data](#aggregate)
* [Connect the commodities with GLOBALISE thesaurus](#globalise)
* [Calculate variation coefficients](#calculate-variation)

### Read the data <a class="anchor" id="read-data"></a>

In [40]:
DATA_DIR = 'data/' # change the path when running to point to your data directory

In [41]:
# First step: read the BKB data
import glob
import pandas as pd
from os.path import splitext, basename

def get_name(path):
    '''
    in: path/to/data/bgb_cargo.tsv
    out: cargo
    '''
    root, ext = splitext(path) # 'path/to/data/bgb_cargo.tsv' -> 'path/to/data/bgb_cargo', '.tsv'
    return basename(root).split('_')[-1] # 'path/to/data/bgb_cargo' -> 'bgb_cargo' -> 'cargo'

bkb_data = {get_name(path): pd.read_csv(path,sep='\t',low_memory=False) 
           for path in glob.glob(DATA_DIR+'bkb_cargo_logs/*')} # we assume the sheets are stored separately as .tsv files

In [42]:
# Display the data
from IPython.display import display
for key, df in bkb_data.items():
    print(key+':')
    display(df.head(3))

ship:


Unnamed: 0,id,naam,added_when,added_by,timestamp
0,3067,Vrouwe Agatha,5/21/08 15:37,mjo,5/21/08 15:37
1,3076,Faam,5/22/08 11:28,mjo,5/22/08 11:28
2,3065,Dordrecht,5/21/08 15:07,mjo,5/21/08 15:07


source:


Unnamed: 0,id,naam,added_when,added_by,timestamp
0,105,10758.0,5/6/08 11:47,admin,4/9/08 16:55
1,104,10757.0,5/6/08 11:47,admin,4/9/08 16:55
2,103,10756.0,5/6/08 11:47,admin,4/9/08 16:55


relVoyageShip:


Unnamed: 0,id,voyId,shipId,timestamp,DAS_voyage,DAS_shipID
0,46,99359.0,3070,5/21/08 16:52,95688.0,DAS_ship0660
1,51,99365.0,3077,5/22/08 11:48,95666.0,DAS_ship1806
2,49,99362.0,3074,5/22/08 10:47,95694.0,DAS_ship0496


regio:


Unnamed: 0,id,naam,added_when,added_by,timestamp
0,3059,Kaap de Goede Hoop,5/6/08 11:46,Admin,5/6/08 11:47
1,3061,Mauritius,5/6/08 11:46,Admin,5/6/08 11:47
2,3062,Mokka,5/6/08 11:46,Admin,5/6/08 11:47


unit:


Unnamed: 0,id,naam,added_when,added_by,timestamp
0,88,pees,5/21/08 10:03,jsc,5/21/08 10:03
1,52,aam,4/9/08 16:38,Admin,4/9/08 16:39
2,54,balie,4/9/08 16:38,Admin,4/9/08 16:39


place:


Unnamed: 0,id,naam,added_when,added_by,timestamp,regio,voc_place_ID,standardized toponym,URI,lat,long
0,902,Kupang,5/6/08 11:43,Admin,5/6/08 11:44,3169.0,vocUniquePlaceID_5164,Kupang ID,http://sws.geonames.org/2057087/,-10.17083,123.60694
1,903,Pontianak,5/6/08 11:43,Admin,5/6/08 11:44,3171.0,vocUniquePlaceID_5165,Pontianak ID,http://sws.geonames.org/1630789/,-0.03194,109.325
2,900,Kisar,5/6/08 11:43,Admin,5/6/08 11:44,3162.0,vocUniquePlaceID_5166,Pulau Kisar ID,http://sws.geonames.org/1639966/,-8.06112,127.182


product:


Unnamed: 0,id,naam,added_when,added_by,timestamp
0,1196,spreien,5/6/08 11:52,Admin,5/6/08 11:54
1,1195,spijker,5/6/08 11:52,Admin,1/9/13 13:36
2,4313,speciestok,9/26/12 14:56,Admin,9/26/12 14:56


voyage:


Unnamed: 0,url,voyId,voyBookingDay,voyBookingMonth,voyBookingYear,voyDeparturePlaceId,voyDepartureDay,voyDepartureMonth,voyDepartureYear,voyArrivalPlaceId,...,timestamp,voySourceId,voynumber,voyImage,voyRemarksForEndUser,voyDepartureRegioId,voyArrivalRegioId,voyFolioNummer,all_fields,first_ship_name
0,https://bgb.huygens.knaw.nl/bgb/voyage/1,99351,,,1790,934.0,,,,861.0,...,2013-09-10 14:30:24,147.0,1,,,3185,3129,3,1 Batavia Batavia Amsterdam Republiek 1789 1...,Juffrouw Johanna
1,https://bgb.huygens.knaw.nl/bgb/voyage/2,99352,,,1790,934.0,,,,861.0,...,2013-09-10 14:30:24,147.0,2,,,3185,3129,3,2 Batavia Batavia Amsterdam Republiek 1789 1...,Draak
2,https://bgb.huygens.knaw.nl/bgb/voyage/3,99353,,,1790,934.0,,,,861.0,...,2013-09-10 14:30:24,147.0,3,,,3185,3129,3,3 Batavia Batavia Amsterdam Republiek 1790 1...,Doggersbank


cargo:


Unnamed: 0,carId,carVoyageId,carProductId,carSpecificationId,carUnit,carQuantity,carQuantityNumeric,carValue,carValueGuldens,carValueStuivers,...,carValueLicht,carValueLichtGuldens,carValueLichtStuivers,carValueLichtPenningen,carRemarks,carOrder,changed_when,changed_by,timestamp,all_fields
0,645880,99353,1290.0,,,,,"1.623,30",1623.0,3.0,...,,,,,,9.0,6/25/08 12:42,jsc,8/5/13 10:50,samen
1,645881,99353,1230.0,848.0,88.0,4.0,4.0,3718,371.0,8.0,...,,,,,,10.0,5/21/08 14:41,mjo,8/5/13 10:50,"zakhorloge zilveren, voor Japan pees"
2,645877,99353,1133.0,16295.0,88.0,4.0,4.0,,,,...,,,,,,7.0,2/25/13 10:09,DorineS,8/5/13 10:50,"moir√© gouden, voor Japan pees"


specification:


Unnamed: 0,id,naam,added_when,added_by,timestamp
0,652,tot inktkokers,5/6/08 11:55,Admin,3/13/13 15:27
1,15877,Constantia rood,2/6/13 16:30,DorineS,2/6/13 16:30
2,654,arduinen,5/6/08 11:55,Admin,5/6/08 11:55


### Convert shipping quantities to decimal <a class="anchor" id="convert-values"></a>

In [43]:
# Preparing the data
df_cargo = bkb_data['cargo'].dropna(subset=['carQuantityNumeric','carProductId','carVoyageId'])
df_product = bkb_data['product'].dropna(subset=['id'])
df_voyage = bkb_data['voyage'].dropna(subset=['voyId','voyDepartureYear'])
df_specification = bkb_data['specification']

In [44]:
def quantity_to_decimal(q, qnum): # Cleaning up the messy BKB quantities data
    if q == '4.501,11/12': # One case like this, fixing manually
        return pd.to_numeric(4501 + 11/12)
    if qnum.count('.') < 2:  # already a numb  er
        return pd.to_numeric(qnum)
    if q.find(' ') == -1:
        try:
            return pd.to_numeric(qnum)
        except:
            print('Failed to convert qnum, using q instead: ',q, qnum)
            return pd.to_numeric(q.replace('.',''))
        
    q = q.replace('  ',' ') # removing extra spaces
    q = q.replace('/ ','/')
    q = q.replace(' /','/')
        
    try:
        num, frac = q.split(' ') # eg. 8 1/4
    except:
        print(q, qnum)
        num, frac = q.split(' ')
    try:
        frac = frac.replace(' ','')
        a, b = frac.split('/')
    except:
        print('Fraction problem with: ',q, qnum)
        a, b = frac.split('.') # data entry error, confused . and /
    return pd.to_numeric(num.replace('.','').replace(',','')) + pd.to_numeric(a)/pd.to_numeric(b)
    

In [46]:
df_specification

Unnamed: 0,id,naam,added_when,added_by,timestamp
0,652,tot inktkokers,5/6/08 11:55,Admin,3/13/13 15:27
1,15877,Constantia rood,2/6/13 16:30,DorineS,2/6/13 16:30
2,654,arduinen,5/6/08 11:55,Admin,5/6/08 11:55
3,655,baar,5/6/08 11:55,Admin,5/6/08 11:55
4,656,Bangkaas,5/6/08 11:55,Admin,5/6/08 11:55
...,...,...,...,...,...
15071,23688,"met zilverbeslag, geschenk van diverse vorsten",7/24/13 13:17,NicolienK,7/24/13 13:17
15072,23689,"met ijzerlaadje, geschenk voor de sultan van Java",7/24/13 15:59,NicolienK,7/24/13 15:59
15073,23690,geschenk voor de sultan van Java,7/24/13 15:59,NicolienK,7/24/13 15:59
15074,23691,"met koperen gevesten en scheden, geschenk voor...",7/24/13 15:59,NicolienK,7/24/13 15:59


In [49]:
import numpy as np

df_product['id'] = df_product['id'].astype(int)
df_voyage['voyId'] = df_voyage['voyId'].astype(str)
df_voyage['voyDepartureYear'] = df_voyage['voyDepartureYear'].astype(int)
df_specification['id'] = pd.to_numeric(df_specification['id'],errors='coerce')
df_cargo['carQuantityNumeric'] = df_cargo.apply(lambda row: quantity_to_decimal(row['carQuantity'],
                                                                               row['carQuantityNumeric']),
                                                axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_voyage['voyId'] = df_voyage['voyId'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_voyage['voyDepartureYear'] = df_voyage['voyDepartureYear'].astype(int)


Failed to convert qnum, using q instead:  8770 8.769.231
Failed to convert qnum, using q instead:  60.905 60.905.375
Fraction problem with:  46.126 1.6 46.126.166
Failed to convert qnum, using q instead:  33761 33.761.003
Failed to convert qnum, using q instead:  30660 30.659.997


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cargo['carQuantityNumeric'] = df_cargo.apply(lambda row: quantity_to_decimal(row['carQuantity'],


### Aggregate the data <a class="anchor" id="aggregate"></a>

In [50]:
# Merge cargo with product and voyage data
df_merged = df_cargo.merge(df_product, left_on='carProductId', right_on='id', how='left')\
                    .merge(df_voyage, left_on='carVoyageId', right_on='voyId', how='left').dropna(subset=['voyDepartureYear'])

df_merged.head()

Unnamed: 0,carId,carVoyageId,carProductId,carSpecificationId,carUnit,carQuantity,carQuantityNumeric,carValue,carValueGuldens,carValueStuivers,...,timestamp,voySourceId,voynumber,voyImage,voyRemarksForEndUser,voyDepartureRegioId,voyArrivalRegioId,voyFolioNummer,all_fields_y,first_ship_name
176,646292,99379,1034.0,731.0,69.0,1.078,1078.0,53312,533.0,12.0,...,2013-09-10 14:30:24,147.0,27.0,,,3129.0,3185.0,65,27 Amsterdam Republiek Batavia Batavia 1789 ...,Valk
177,646293,99379,1128.0,,69.0,141.0,141.0,1768,176.0,8.0,...,2013-09-10 14:30:24,147.0,27.0,,,3129.0,3185.0,65,27 Amsterdam Republiek Batavia Batavia 1789 ...,Valk
178,646294,99379,1140.0,,69.0,4.301,4301.0,70913,709.0,13.0,...,2013-09-10 14:30:24,147.0,27.0,,,3129.0,3185.0,65,27 Amsterdam Republiek Batavia Batavia 1789 ...,Valk
179,646295,99379,1156.0,852.0,69.0,61.984,61984.0,"7.611,14",7611.0,14.0,...,2013-09-10 14:30:24,147.0,27.0,,,3129.0,3185.0,65,27 Amsterdam Republiek Batavia Batavia 1789 ...,Valk
180,646296,99379,1156.0,838.0,69.0,6.25,6250.0,14139,1413.0,9.0,...,2013-09-10 14:30:24,147.0,27.0,,,3129.0,3185.0,65,27 Amsterdam Republiek Batavia Batavia 1789 ...,Valk


In [56]:
def get_specification(cargo_id):
    df_target = df_specification[df_specification['id']==cargo_id]
    if len(df_target)>0:
        return df_target.iloc[0]['naam']
    print('Failed to find specification for id: ',cargo_id)
    return ''
df_merged['nameComplete'] = df_merged.apply(lambda row: row['naam'] if pd.isna(row['carSpecificationId'])
                                           else row['naam'] + ' ' + get_specification(row['carSpecificationId']),
                                           axis=1)

Failed to find specification for id:  920.0
Failed to find specification for id:  920.0
Failed to find specification for id:  920.0
Failed to find specification for id:  920.0
Failed to find specification for id:  920.0
Failed to find specification for id:  751.0
Failed to find specification for id:  751.0
Failed to find specification for id:  767.0
Failed to find specification for id:  920.0
Failed to find specification for id:  841.0
Failed to find specification for id:  920.0
Failed to find specification for id:  1260.0
Failed to find specification for id:  841.0
Failed to find specification for id:  841.0
Failed to find specification for id:  841.0
Failed to find specification for id:  920.0
Failed to find specification for id:  920.0
Failed to find specification for id:  767.0
Failed to find specification for id:  767.0
Failed to find specification for id:  767.0
Failed to find specification for id:  685.0
Failed to find specification for id:  1200.0
Failed to find specification f

Failed to find specification for id:  1200.0
Failed to find specification for id:  1877.0
Failed to find specification for id:  1058.0
Failed to find specification for id:  1200.0
Failed to find specification for id:  1883.0
Failed to find specification for id:  979.0
Failed to find specification for id:  979.0
Failed to find specification for id:  979.0
Failed to find specification for id:  1567.0
Failed to find specification for id:  979.0
Failed to find specification for id:  979.0
Failed to find specification for id:  979.0
Failed to find specification for id:  979.0
Failed to find specification for id:  1851.0
Failed to find specification for id:  1851.0
Failed to find specification for id:  979.0
Failed to find specification for id:  1887.0
Failed to find specification for id:  1877.0
Failed to find specification for id:  1058.0
Failed to find specification for id:  1887.0
Failed to find specification for id:  1877.0
Failed to find specification for id:  1887.0
Failed to find spe

Failed to find specification for id:  1625.0
Failed to find specification for id:  2134.0
Failed to find specification for id:  920.0
Failed to find specification for id:  767.0
Failed to find specification for id:  767.0
Failed to find specification for id:  1865.0
Failed to find specification for id:  841.0
Failed to find specification for id:  767.0
Failed to find specification for id:  1058.0
Failed to find specification for id:  841.0
Failed to find specification for id:  2198.0
Failed to find specification for id:  841.0
Failed to find specification for id:  767.0
Failed to find specification for id:  767.0
Failed to find specification for id:  767.0
Failed to find specification for id:  1887.0
Failed to find specification for id:  767.0
Failed to find specification for id:  1058.0
Failed to find specification for id:  920.0
Failed to find specification for id:  767.0
Failed to find specification for id:  1749.0
Failed to find specification for id:  767.0
Failed to find specifica

Failed to find specification for id:  2324.0
Failed to find specification for id:  767.0
Failed to find specification for id:  979.0
Failed to find specification for id:  1200.0
Failed to find specification for id:  767.0
Failed to find specification for id:  1752.0
Failed to find specification for id:  1887.0
Failed to find specification for id:  767.0
Failed to find specification for id:  1887.0
Failed to find specification for id:  1058.0
Failed to find specification for id:  1058.0
Failed to find specification for id:  1058.0
Failed to find specification for id:  1246.0
Failed to find specification for id:  2324.0
Failed to find specification for id:  1246.0
Failed to find specification for id:  1851.0
Failed to find specification for id:  1752.0
Failed to find specification for id:  1200.0
Failed to find specification for id:  767.0
Failed to find specification for id:  2324.0
Failed to find specification for id:  1246.0
Failed to find specification for id:  1246.0
Failed to find 

Failed to find specification for id:  767.0
Failed to find specification for id:  2318.0
Failed to find specification for id:  2324.0
Failed to find specification for id:  2642.0
Failed to find specification for id:  1877.0
Failed to find specification for id:  2642.0
Failed to find specification for id:  1877.0
Failed to find specification for id:  767.0
Failed to find specification for id:  2324.0
Failed to find specification for id:  767.0
Failed to find specification for id:  767.0
Failed to find specification for id:  767.0
Failed to find specification for id:  2324.0
Failed to find specification for id:  2324.0
Failed to find specification for id:  767.0
Failed to find specification for id:  1260.0
Failed to find specification for id:  1752.0
Failed to find specification for id:  1752.0
Failed to find specification for id:  1752.0
Failed to find specification for id:  1752.0
Failed to find specification for id:  979.0
Failed to find specification for id:  1260.0
Failed to find sp

Failed to find specification for id:  1625.0
Failed to find specification for id:  767.0
Failed to find specification for id:  1865.0
Failed to find specification for id:  1752.0
Failed to find specification for id:  767.0
Failed to find specification for id:  1625.0
Failed to find specification for id:  2048.0
Failed to find specification for id:  767.0
Failed to find specification for id:  1246.0
Failed to find specification for id:  767.0
Failed to find specification for id:  1246.0
Failed to find specification for id:  767.0
Failed to find specification for id:  1887.0
Failed to find specification for id:  767.0
Failed to find specification for id:  2048.0
Failed to find specification for id:  1887.0
Failed to find specification for id:  1877.0
Failed to find specification for id:  767.0
Failed to find specification for id:  1887.0
Failed to find specification for id:  2758.0
Failed to find specification for id:  1246.0
Failed to find specification for id:  767.0
Failed to find spe

Failed to find specification for id:  2324.0
Failed to find specification for id:  2307.0
Failed to find specification for id:  3152.0
Failed to find specification for id:  920.0
Failed to find specification for id:  2134.0
Failed to find specification for id:  1246.0
Failed to find specification for id:  751.0
Failed to find specification for id:  2324.0
Failed to find specification for id:  1851.0
Failed to find specification for id:  2307.0
Failed to find specification for id:  2307.0
Failed to find specification for id:  1851.0
Failed to find specification for id:  3770.0
Failed to find specification for id:  2307.0
Failed to find specification for id:  3770.0
Failed to find specification for id:  1561.0
Failed to find specification for id:  1058.0
Failed to find specification for id:  1625.0
Failed to find specification for id:  767.0
Failed to find specification for id:  1851.0
Failed to find specification for id:  1851.0
Failed to find specification for id:  1851.0
Failed to fin

Failed to find specification for id:  3520.0
Failed to find specification for id:  3400.0
Failed to find specification for id:  2397.0
Failed to find specification for id:  2572.0
Failed to find specification for id:  3400.0
Failed to find specification for id:  3770.0
Failed to find specification for id:  1887.0
Failed to find specification for id:  767.0
Failed to find specification for id:  767.0
Failed to find specification for id:  1887.0
Failed to find specification for id:  2398.0
Failed to find specification for id:  1625.0
Failed to find specification for id:  1625.0
Failed to find specification for id:  1887.0
Failed to find specification for id:  767.0
Failed to find specification for id:  2048.0
Failed to find specification for id:  2324.0
Failed to find specification for id:  2048.0
Failed to find specification for id:  767.0
Failed to find specification for id:  767.0
Failed to find specification for id:  1887.0
Failed to find specification for id:  2048.0
Failed to find 

Failed to find specification for id:  1246.0
Failed to find specification for id:  2048.0
Failed to find specification for id:  3392.0
Failed to find specification for id:  2048.0
Failed to find specification for id:  767.0
Failed to find specification for id:  2572.0
Failed to find specification for id:  2572.0
Failed to find specification for id:  767.0
Failed to find specification for id:  2572.0
Failed to find specification for id:  2572.0
Failed to find specification for id:  2048.0
Failed to find specification for id:  1625.0
Failed to find specification for id:  767.0
Failed to find specification for id:  1625.0
Failed to find specification for id:  1887.0
Failed to find specification for id:  1625.0
Failed to find specification for id:  1887.0
Failed to find specification for id:  767.0
Failed to find specification for id:  1246.0
Failed to find specification for id:  2048.0
Failed to find specification for id:  767.0
Failed to find specification for id:  2324.0
Failed to find 

In [57]:
df_merged['decade'] = df_merged['voyDepartureYear'].map(lambda x: str((x //10)*10)+'s')


In [61]:
df_merged_clean = df_merged[['naam','nameComplete','carQuantityNumeric','carVoyageId',
                            'voyDepartureYear','decade']].drop_duplicates()

df_merged_clean['appearance_count'] = df_merged_clean.groupby('nameComplete')['nameComplete'].transform('size')
df_merged_clean=df_merged_clean.sort_values(by='naam')
df_merged_clean

Unnamed: 0,naam,nameComplete,carQuantityNumeric,carVoyageId,voyDepartureYear,decade,appearance_count
174993,AB-boek,AB-boek,50.0,117173,1726.0,1720.0s,2
114759,AB-boek,AB-boek,30.0,113711,1711.0,1710.0s,2
145223,AB-boek,AB-boek voor Noord-Coromandel,20.0,115463,1713.0,1710.0s,1
116280,AB-bord,AB-bord,25.0,113732,1724.0,1720.0s,6
174736,AB-bord,AB-bord,50.0,117169,1725.0,1720.0s,6
...,...,...,...,...,...,...,...
144412,zwavelaarde,"zwavelaarde geraffineerd Javaans, in 4 halve amen",1013.0,115425,1713.0,1710.0s,1
17122,zwavelaarde,zwavelaarde,10000.0,102602,1777.0,1770.0s,9
70077,zwavelaarde,zwavelaarde,20513.0,110017,1751.0,1750.0s,9
26446,zwavelaarde,zwavelaarde,10000.0,104009,1766.0,1760.0s,9


### Connect the commodities with GLOBALISE thesaurus <a class="anchor" id="globalise"></a>

In [101]:
# Connecting with the GLOBALISE commodities
import json

# First, get a list of all labels of commodities to filter our data
commodities_raw = json.load(open('data/commoditiesV1.json','r')) # Data available at: https://globalise-data.diginfra.net/sparql
commodities_labels_list_flat = sum([[item['prefLabelNL']['value']]+[label for label in item['altLabelsNL']['value'].split('; ') if label]
                                     for item in commodities_raw['results']['bindings']],
                                    [])
commodities_labels_set = set([label.lower() for label in commodities_labels_list_flat])

# Next, get categories per commodity
parent_uri = {item['concept']['value']:
             item['concept_broader']['value']
             for item in commodities_raw['results']['bindings']
             if 'concept_broader' in item}

name_by_uri = {item['concept']['value']:
             item['prefLabelNL']['value']
             for item in commodities_raw['results']['bindings']}

uri_by_name = {label.lower():
             item['concept']['value']
               for item in commodities_raw['results']['bindings']
              for label in [item['prefLabelNL']['value']]+[lbl for lbl in item['altLabelsNL']['value'].split('; ') if lbl]
             }


def get_category_chain(url='', cur_label='', chain=[]):
    '''
    Recursively looking for nested categories in the thesaurus
    '''
    if not url:
        url = uri_by_name[cur_label]
    if url not in parent_uri: # can't go any higher
        return [cur_label]
    new_url = parent_uri[url]
    new_label = name_by_uri[new_url] if new_url in name_by_uri else name_by_uri[url]
    return [cur_label] + get_category_chain(new_url, new_label)

def get_category_chain_by_name(label):
    return get_category_chain(url='',cur_label=label)

def get_category_by_name(label):
    chain = get_category_chain_by_name(label)
    if chain[-1].startswith('Verwerkte'): # too broad, narrow down
        return chain[-2]
    if chain[-1].startswith('Voedsel'): # too broad
        return chain[-2]
    if chain[-1].startswith('Koffie, thee,'):
        return chain[-2]
    return chain[-1]

In [102]:
get_category_by_name('opium')

'Dranken en Tabak'

In [103]:
get_category_chain_by_name('laken')

['laken',
 'wol-zijde',
 'Textiel',
 'Textielgaren, weefsels, geconfectioneerde artikelen, n.e.g., en aanverwante producten',
 'Verwerkte goederen, hoofdzakelijk ingedeeld naar materiaal']

In [104]:
filtered_df = df_merged_clean[df_merged_clean['naam'].isin(commodities_labels_set)]
filtered_df['category'] = filtered_df['naam'].map(get_category_by_name)
filtered_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['category'] = filtered_df['naam'].map(get_category_by_name)


Unnamed: 0,naam,nameComplete,carQuantityNumeric,carVoyageId,voyDepartureYear,decade,appearance_count,category
140207,aam,aam hele,30.0,115118,1715.0,1710.0s,19,"Reisgoederen, handtassen en vergelijkbare cont..."
137098,aam,aam ledig,1.0,114942,1707.0,1700.0s,2,"Reisgoederen, handtassen en vergelijkbare cont..."
138199,aam,aam hele,30.0,115021,1707.0,1700.0s,19,"Reisgoederen, handtassen en vergelijkbare cont..."
173916,aam,aam hele,40.0,117104,1726.0,1720.0s,19,"Reisgoederen, handtassen en vergelijkbare cont..."
140010,aam,"aam ledig tot de azijn, verstrekt aan opperhoo...",1.0,115114,1715.0,1710.0s,1,"Reisgoederen, handtassen en vergelijkbare cont..."
...,...,...,...,...,...,...,...,...
144412,zwavelaarde,"zwavelaarde geraffineerd Javaans, in 4 halve amen",1013.0,115425,1713.0,1710.0s,1,"Ruwe materialen, oneetbaar, behalve brandstoffen"
17122,zwavelaarde,zwavelaarde,10000.0,102602,1777.0,1770.0s,9,"Ruwe materialen, oneetbaar, behalve brandstoffen"
70077,zwavelaarde,zwavelaarde,20513.0,110017,1751.0,1750.0s,9,"Ruwe materialen, oneetbaar, behalve brandstoffen"
26446,zwavelaarde,zwavelaarde,10000.0,104009,1766.0,1760.0s,9,"Ruwe materialen, oneetbaar, behalve brandstoffen"


In [105]:
set(filtered_df['category'].tolist())

{'Andere verwerkte goederen, n.e.g.',
 'Chemicaliën en verwante producten, n.e.g.',
 'Dierlijke en plantaardige oliën, vetten en wassen',
 'Dranken en Tabak',
 'Gereedschap',
 'Granen en graanbereidingen',
 'Groenten en fruit',
 'IJzer en staal',
 'Juwelen, goudsmid en zilversmid artikelen, en andere artikelen van kostbare materialen',
 'Kleding en kledingaccessoires',
 'Koffie, thee, cacao, specerijen, en fabrikaten daarvan',
 'Kurken en houten verwerkte goederen (exclusief meubilair)',
 'Leder, lederwaren, n.e.g., en bontwerk',
 'Levende dieren buiten de categorie zeedieren',
 'Lichtbronnen',
 'Machines en transportmiddelen',
 'Meubelen en delen daarvan; beddengoed, matrassen, matrasbeschermers, kussens en dergelijke gevulde meubelen',
 'Militair materieel',
 'Minerale brandstoffen, smeermiddelen en gerelateerd materieel',
 'NOT YET CLASSIFIED',
 'Niet elders geclasificeerde goederen en transacties',
 'Non-ferrometalen',
 'Onderdelen',
 'Overige eetbare producten en bereidingen, n.e.

### Calculate variation coefficients <a class="anchor" id="calculate-variation"></a>

In [108]:
# Group by product and time period to get quantities by decade

df_grouped = filtered_df.groupby(['nameComplete', 'category', 'decade']).agg({'carQuantityNumeric': ['mean','std']}).reset_index()

print(df_grouped)
# df_grouped = df_grouped.groupby(['naam', 'voyDepartureYear'])['carQuantityNumeric'].sum().reset_index()

# Flatten the column names
df_grouped.columns = ['commodity','category', 'decade', 'mean_quantity', 'std_dev_quantity']

# Fill NaNs in std_dev_quantity with 0
df_grouped['std_dev_quantity'] = df_grouped['std_dev_quantity'].fillna(0)

# Calculate Coefficient of Variation (CV)
df_grouped['cv_quantity'] = df_grouped['std_dev_quantity'] / df_grouped['mean_quantity']
df_grouped

                                            nameComplete  \
                                                           
0                                                    aam   
1                      aam halve tot de Suratse lijnolie   
2                                aam halve tot de duiten   
3                              aam halve tot de lijnolie   
4                                  aam halve tot de olie   
...                                                  ...   
26805                                        zwavelaarde   
26806                                zwavelaarde Javaans   
26807  zwavelaarde geraffineerd Javaans, in 4 halve amen   
26808                      zwavelaarde van de kruitmolen   
26809                                        zwavelbloem   

                                                category   decade  \
                                                                    
0      Reisgoederen, handtassen en vergelijkbare cont...  1780.0s   
1      Reisg

Unnamed: 0,commodity,category,decade,mean_quantity,std_dev_quantity,cv_quantity
0,aam,"Reisgoederen, handtassen en vergelijkbare cont...",1780.0s,12.0,0.000000,0.000000
1,aam halve tot de Suratse lijnolie,"Reisgoederen, handtassen en vergelijkbare cont...",1700.0s,1.0,0.000000,0.000000
2,aam halve tot de duiten,"Reisgoederen, handtassen en vergelijkbare cont...",1750.0s,15.0,0.000000,0.000000
3,aam halve tot de lijnolie,"Reisgoederen, handtassen en vergelijkbare cont...",1700.0s,2.0,0.000000,0.000000
4,aam halve tot de olie,"Reisgoederen, handtassen en vergelijkbare cont...",1780.0s,10.0,0.000000,0.000000
...,...,...,...,...,...,...
26805,zwavelaarde,"Ruwe materialen, oneetbaar, behalve brandstoffen",1770.0s,8164.5,2595.788994,0.317936
26806,zwavelaarde Javaans,"Ruwe materialen, oneetbaar, behalve brandstoffen",1700.0s,150.0,70.710678,0.471405
26807,"zwavelaarde geraffineerd Javaans, in 4 halve amen","Ruwe materialen, oneetbaar, behalve brandstoffen",1710.0s,1013.0,0.000000,0.000000
26808,zwavelaarde van de kruitmolen,"Ruwe materialen, oneetbaar, behalve brandstoffen",1770.0s,3671.0,0.000000,0.000000


In [121]:
df_grouped.groupby('category')['cv_quantity'].mean().reset_index().sort_values('cv_quantity').reset_index().drop('index',axis=1)[::-1]

Unnamed: 0,category,cv_quantity
45,Non-ferrometalen,0.61456
44,Scheepstextiel,0.548247
43,Granen en graanbereidingen,0.541609
42,"Leder, lederwaren, n.e.g., en bontwerk",0.493967
41,"Minerale brandstoffen, smeermiddelen en gerela...",0.444757
40,"Papier, karton, artikelen van papierpulp, van ...",0.437754
39,Niet elders geclasificeerde goederen en transa...,0.40409
38,"Koffie, thee, cacao, specerijen, en fabrikaten...",0.402949
37,"Chemicaliën en verwante producten, n.e.g.",0.399912
36,Groenten en fruit,0.396664


In [122]:
'gewicht' in commodities_labels_set

True

In [115]:
df_volatility = df_grouped.groupby('commodity')['cv_quantity'].mean().reset_index()


In [110]:
df_volatility = df_volatility.sort_values('cv_quantity').reset_index().drop('index',axis=1)

In [111]:
df_volatility.tail(30)[::-1]

Unnamed: 0,commodity,cv_quantity
18459,rijst,4.268298
18458,boter Hollands,3.114819
18457,tarwe Bengaals,2.894697
18456,koffieboon Mauritiaans,2.522523
18455,nootmuskaat gepekeld,2.39251
18454,papier groot formaat Chinees,2.282414
18453,tarwe,2.22623
18452,tarwe Kaaps,2.129953
18451,vlees,2.100351
18450,sekwijn,2.099624


In [122]:
pd.set_option('display.max_colwidth', None)
df_volatility[df_volatility['commodity'].str.startswith('opium')]

Unnamed: 0,commodity,cv_quantity
10693,opium,0.839001
10694,opium Bengaals,0.499623
10695,opium geschenk voor de koning en rijksgroten van Tidore,0.0
10696,opium in 10 kisten,0.0
10697,opium in 152 kisten,0.0
10698,opium in 2 kisten,0.0
10699,opium in 4 kisten,0.0
10700,opium in 449 kisten,0.0
10701,"opium in een doosje, tot geschenk voor de goegoegoe Marafaolij",0.0
10702,opium ruw,0.0


In [127]:
df_volatility[df_volatility['commodity'].str.startswith('thee')].sort_values('cv_quantity')[::-1]

Unnamed: 0,commodity,cv_quantity
16542,thee bing,1.526116
16566,"thee in soort, met emballage",1.084698
16573,thee wit,1.080758
16535,"thee Chinees in soort, met emballage",1.037632
16574,thee wit Chinees,1.036850
...,...,...
16592,theekop geschenk aan radja Sieta Lela tot Baros,0.000000
16557,"thee boei, in 53 canasters",0.000000
16556,"thee boei, in 36 canasters",0.000000
16554,"thee boei, in 132 kannen en kasten",0.000000


In [126]:
df_volatility[df_volatility['cv_quantity']>0].head(30)

Unnamed: 0,commodity,cv_quantity
2,bergcinnaber,0.036262
3,antimonium,0.068182
4,lheimenias,0.08704
5,juchtleer,0.09574
6,roos (bloem),0.098974
7,hevel,0.101105
8,djarak-olie,0.10375
9,bamboe,0.124252
10,zand,0.133333
11,stempel,0.144016
