In [1]:
# Starting to explore stability of commodities using shipping data in the BKB dataset
# 24.07.2024
# vera.provatorova@dh.huc.knaw.nl

## Steps:
* [Read the data](#read-data)
* [Convert shipping quantity values to decimal](#convert-values)

### Read the data <a class="anchor" id="read-data"></a>

In [2]:
DATA_DIR = '../BKB/data/' # change the path when running to point to your data directory

In [4]:
# First step: read the BKB data
import glob
import pandas as pd
from os.path import splitext, basename

def get_name(path):
    '''
    in: path/to/data/bgb_cargo.tsv
    out: cargo
    '''
    root, ext = splitext(path) # 'path/to/data/bgb_cargo.tsv' -> 'path/to/data/bgb_cargo', '.tsv'
    return basename(root).split('_')[-1] # 'path/to/data/bgb_cargo' -> 'bgb_cargo' -> 'cargo'

bkb_data = {get_name(path): pd.read_csv(path,sep='\t',low_memory=False) 
           for path in glob.glob(DATA_DIR+'bkb_cargo_logs/*')} # we assume the sheets are stored separately as .tsv files

In [5]:
# Display the data
from IPython.display import display
for key, df in bkb_data.items():
    print(key+':')
    display(df.head(3))

ship:


Unnamed: 0,id,naam,added_when,added_by,timestamp
0,3067,Vrouwe Agatha,5/21/08 15:37,mjo,5/21/08 15:37
1,3076,Faam,5/22/08 11:28,mjo,5/22/08 11:28
2,3065,Dordrecht,5/21/08 15:07,mjo,5/21/08 15:07


source:


Unnamed: 0,id,naam,added_when,added_by,timestamp
0,105,10758.0,5/6/08 11:47,admin,4/9/08 16:55
1,104,10757.0,5/6/08 11:47,admin,4/9/08 16:55
2,103,10756.0,5/6/08 11:47,admin,4/9/08 16:55


relVoyageShip:


Unnamed: 0,id,voyId,shipId,timestamp,DAS_voyage,DAS_shipID
0,46,99359.0,3070,5/21/08 16:52,95688.0,DAS_ship0660
1,51,99365.0,3077,5/22/08 11:48,95666.0,DAS_ship1806
2,49,99362.0,3074,5/22/08 10:47,95694.0,DAS_ship0496


regio:


Unnamed: 0,id,naam,added_when,added_by,timestamp
0,3059,Kaap de Goede Hoop,5/6/08 11:46,Admin,5/6/08 11:47
1,3061,Mauritius,5/6/08 11:46,Admin,5/6/08 11:47
2,3062,Mokka,5/6/08 11:46,Admin,5/6/08 11:47


unit:


Unnamed: 0,id,naam,added_when,added_by,timestamp
0,88,pees,5/21/08 10:03,jsc,5/21/08 10:03
1,52,aam,4/9/08 16:38,Admin,4/9/08 16:39
2,54,balie,4/9/08 16:38,Admin,4/9/08 16:39


place:


Unnamed: 0,id,naam,added_when,added_by,timestamp,regio,voc_place_ID,standardized toponym,URI,lat,long
0,902,Kupang,5/6/08 11:43,Admin,5/6/08 11:44,3169.0,vocUniquePlaceID_5164,Kupang ID,http://sws.geonames.org/2057087/,-10.17083,123.60694
1,903,Pontianak,5/6/08 11:43,Admin,5/6/08 11:44,3171.0,vocUniquePlaceID_5165,Pontianak ID,http://sws.geonames.org/1630789/,-0.03194,109.325
2,900,Kisar,5/6/08 11:43,Admin,5/6/08 11:44,3162.0,vocUniquePlaceID_5166,Pulau Kisar ID,http://sws.geonames.org/1639966/,-8.06112,127.182


product:


Unnamed: 0,id,naam,added_when,added_by,timestamp
0,1196,spreien,5/6/08 11:52,Admin,5/6/08 11:54
1,1195,spijker,5/6/08 11:52,Admin,1/9/13 13:36
2,4313,speciestok,9/26/12 14:56,Admin,9/26/12 14:56


voyage:


Unnamed: 0,url,voyId,voyBookingDay,voyBookingMonth,voyBookingYear,voyDeparturePlaceId,voyDepartureDay,voyDepartureMonth,voyDepartureYear,voyArrivalPlaceId,...,timestamp,voySourceId,voynumber,voyImage,voyRemarksForEndUser,voyDepartureRegioId,voyArrivalRegioId,voyFolioNummer,all_fields,first_ship_name
0,https://bgb.huygens.knaw.nl/bgb/voyage/1,99351,,,1790,934.0,,,,861.0,...,2013-09-10 14:30:24,147.0,1,,,3185,3129,3,1 Batavia Batavia Amsterdam Republiek 1789 1...,Juffrouw Johanna
1,https://bgb.huygens.knaw.nl/bgb/voyage/2,99352,,,1790,934.0,,,,861.0,...,2013-09-10 14:30:24,147.0,2,,,3185,3129,3,2 Batavia Batavia Amsterdam Republiek 1789 1...,Draak
2,https://bgb.huygens.knaw.nl/bgb/voyage/3,99353,,,1790,934.0,,,,861.0,...,2013-09-10 14:30:24,147.0,3,,,3185,3129,3,3 Batavia Batavia Amsterdam Republiek 1790 1...,Doggersbank


cargo:


Unnamed: 0,carId,carVoyageId,carProductId,carSpecificationId,carUnit,carQuantity,carQuantityNumeric,carValue,carValueGuldens,carValueStuivers,...,carValueLicht,carValueLichtGuldens,carValueLichtStuivers,carValueLichtPenningen,carRemarks,carOrder,changed_when,changed_by,timestamp,all_fields
0,645880,99353,1290.0,,,,,"1.623,30",1623.0,3.0,...,,,,,,9.0,6/25/08 12:42,jsc,8/5/13 10:50,samen
1,645881,99353,1230.0,848.0,88.0,4.0,4.0,3718,371.0,8.0,...,,,,,,10.0,5/21/08 14:41,mjo,8/5/13 10:50,"zakhorloge zilveren, voor Japan pees"
2,645877,99353,1133.0,16295.0,88.0,4.0,4.0,,,,...,,,,,,7.0,2/25/13 10:09,DorineS,8/5/13 10:50,"moir√© gouden, voor Japan pees"


specification:


Unnamed: 0,id,naam,added_when,added_by,timestamp
0,652,tot inktkokers,5/6/08 11:55,Admin,3/13/13 15:27
1,15877,Constantia rood,2/6/13 16:30,DorineS,2/6/13 16:30
2,654,arduinen,5/6/08 11:55,Admin,5/6/08 11:55


### Convert shipping quantities to decimal <a class="anchor" id="convert-values"></a>

In [6]:
df_cargo = bkb_data['cargo'].dropna(subset=['carQuantityNumeric','carProductId','carVoyageId'])
df_product = bkb_data['product'].dropna(subset=['id'])
df_voyage = bkb_data['voyage'].dropna(subset=['voyId','voyDepartureYear'])

In [7]:
def quantity_to_decimal(q, qnum): # Cleaning up the messy BKB quantities data
    if q == '4.501,11/12': # One case like this, fixing manually
        return pd.to_numeric(4501 + 11/12)
    if qnum.count('.') < 2:  # already a numb  er
        return pd.to_numeric(qnum)
    if q.find(' ') == -1:
        try:
            return pd.to_numeric(qnum)
        except:
            print('Failed to convert qnum, using q instead: ',q, qnum)
            return pd.to_numeric(q.replace('.',''))
        
    q = q.replace('  ',' ') # removing extra spaces
    q = q.replace('/ ','/')
    q = q.replace(' /','/')
        
    try:
        num, frac = q.split(' ') # eg. 8 1/4
    except:
        print(q, qnum)
        num, frac = q.split(' ')
    try:
        frac = frac.replace(' ','')
        a, b = frac.split('/')
    except:
        print('Fraction problem with: ',q, qnum)
        a, b = frac.split('.') # data entry error, confused . and /
    return pd.to_numeric(num.replace('.','').replace(',','')) + pd.to_numeric(a)/pd.to_numeric(b)
    

In [8]:
import numpy as np

df_product['id'] = df_product['id'].astype(int)
df_voyage['voyId'] = df_voyage['voyId'].astype(str)
df_voyage['voyDepartureYear'] = df_voyage['voyDepartureYear'].astype(int)
df_cargo['carQuantityNumeric'] = df_cargo.apply(lambda row: quantity_to_decimal(row['carQuantity'],
                                                                               row['carQuantityNumeric']),
                                                axis=1)

# Merge cargo with product and voyage data
df_merged = df_cargo.merge(df_product, left_on='carProductId', right_on='id', how='left')\
                    .merge(df_voyage, left_on='carVoyageId', right_on='voyId', how='left').dropna(subset=['voyDepartureYear'])

df_merged.head()
# Extract year and month for aggregation
# df_merged['voyYearMonth'] = pd.to_datetime(df_merged[['voyBookingYear', 'voyBookingMonth']].astype(str).agg('-'.join, axis=1))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_voyage['voyId'] = df_voyage['voyId'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_voyage['voyDepartureYear'] = df_voyage['voyDepartureYear'].astype(int)


Failed to convert qnum, using q instead:  8770 8.769.231
Failed to convert qnum, using q instead:  60.905 60.905.375
Fraction problem with:  46.126 1.6 46.126.166
Failed to convert qnum, using q instead:  33761 33.761.003
Failed to convert qnum, using q instead:  30660 30.659.997


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cargo['carQuantityNumeric'] = df_cargo.apply(lambda row: quantity_to_decimal(row['carQuantity'],


Unnamed: 0,carId,carVoyageId,carProductId,carSpecificationId,carUnit,carQuantity,carQuantityNumeric,carValue,carValueGuldens,carValueStuivers,...,timestamp,voySourceId,voynumber,voyImage,voyRemarksForEndUser,voyDepartureRegioId,voyArrivalRegioId,voyFolioNummer,all_fields_y,first_ship_name
176,646292,99379,1034.0,731.0,69.0,1.078,1078.0,53312,533.0,12.0,...,2013-09-10 14:30:24,147.0,27.0,,,3129.0,3185.0,65,27 Amsterdam Republiek Batavia Batavia 1789 ...,Valk
177,646293,99379,1128.0,,69.0,141.0,141.0,1768,176.0,8.0,...,2013-09-10 14:30:24,147.0,27.0,,,3129.0,3185.0,65,27 Amsterdam Republiek Batavia Batavia 1789 ...,Valk
178,646294,99379,1140.0,,69.0,4.301,4301.0,70913,709.0,13.0,...,2013-09-10 14:30:24,147.0,27.0,,,3129.0,3185.0,65,27 Amsterdam Republiek Batavia Batavia 1789 ...,Valk
179,646295,99379,1156.0,852.0,69.0,61.984,61984.0,"7.611,14",7611.0,14.0,...,2013-09-10 14:30:24,147.0,27.0,,,3129.0,3185.0,65,27 Amsterdam Republiek Batavia Batavia 1789 ...,Valk
180,646296,99379,1156.0,838.0,69.0,6.25,6250.0,14139,1413.0,9.0,...,2013-09-10 14:30:24,147.0,27.0,,,3129.0,3185.0,65,27 Amsterdam Republiek Batavia Batavia 1789 ...,Valk


In [9]:
set(df_merged['voyDepartureYear'].tolist())

{1700.0,
 1701.0,
 1702.0,
 1703.0,
 1704.0,
 1706.0,
 1707.0,
 1708.0,
 1709.0,
 1710.0,
 1711.0,
 1712.0,
 1713.0,
 1714.0,
 1715.0,
 1721.0,
 1722.0,
 1723.0,
 1724.0,
 1725.0,
 1726.0,
 1727.0,
 1728.0,
 1729.0,
 1730.0,
 1731.0,
 1732.0,
 1734.0,
 1735.0,
 1736.0,
 1737.0,
 1738.0,
 1739.0,
 1740.0,
 1741.0,
 1742.0,
 1743.0,
 1750.0,
 1751.0,
 1752.0,
 1753.0,
 1754.0,
 1755.0,
 1756.0,
 1757.0,
 1758.0,
 1759.0,
 1760.0,
 1761.0,
 1762.0,
 1763.0,
 1764.0,
 1765.0,
 1766.0,
 1767.0,
 1768.0,
 1769.0,
 1771.0,
 1772.0,
 1773.0,
 1774.0,
 1775.0,
 1776.0,
 1777.0,
 1778.0,
 1779.0,
 1780.0,
 1781.0,
 1782.0,
 1783.0,
 1784.0,
 1785.0,
 1786.0,
 1787.0,
 1788.0,
 1789.0,
 1790.0,
 1800.0,
 1801.0}

In [10]:
df_merged['decade'] = df_merged['voyDepartureYear'].map(lambda x: str((x //10)*10)+'s')


In [11]:
df_merged.columns

Index(['carId', 'carVoyageId', 'carProductId', 'carSpecificationId', 'carUnit',
       'carQuantity', 'carQuantityNumeric', 'carValue', 'carValueGuldens',
       'carValueStuivers', 'carValuePenningen', 'carValueLicht',
       'carValueLichtGuldens', 'carValueLichtStuivers',
       'carValueLichtPenningen', 'carRemarks', 'carOrder', 'changed_when_x',
       'changed_by_x', 'timestamp_x', 'all_fields_x', 'id', 'naam',
       'added_when', 'added_by', 'timestamp_y', 'url', 'voyId',
       'voyBookingDay', 'voyBookingMonth', 'voyBookingYear',
       'voyDeparturePlaceId', 'voyDepartureDay', 'voyDepartureMonth',
       'voyDepartureYear', 'voyArrivalPlaceId', 'voyArrivalDay',
       'voyArrivalMonth', 'voyArrivalYear', 'voyInvoiceValue',
       'voyInvoiceValueLicht', 'voyRemarksForEditor', 'voyageDAS',
       'created_when', 'created_by', 'changed_when_y', 'changed_by_y',
       'timestamp', 'voySourceId', 'voynumber', 'voyImage',
       'voyRemarksForEndUser', 'voyDepartureRegioId', 'voy

In [None]:
df_merged_clean = df_merged[['naam','carQuantityNumeric','carVoyageId',
                            'voyDepartureYear','decade']].drop_duplicates()

df_merged_clean['appearance_count'] = df_merged_clean.groupby('naam')['naam'].transform('size')
df_merged_clean.sort_values(by='naam')

In [None]:
# Connecting with the GLOBALISE commodities
import json

commodities_raw = json.load(open('../DHB_extended/data/commoditiesV1.json','r'))

In [None]:
df_merged_clean.describe()

df_merged_clean = df_merged_clean[df_merged_clean['appearance_count']>99].reset_index().drop('index',axis=1)
df_merged_clean.sort_values(by='naam')

In [None]:
# Group by product and time period to get monthly quantities
df_grouped = df_merged_clean.groupby(['naam', 'decade']).agg({'carQuantityNumeric': ['mean','std']}).reset_index()
# df_grouped = df_grouped.groupby(['naam', 'voyDepartureYear'])['carQuantityNumeric'].sum().reset_index()

# Flatten the column names
df_grouped.columns = ['commodity', 'decade', 'mean_quantity', 'std_dev_quantity']

# Fill NaNs in std_dev_quantity with 0
df_grouped['std_dev_quantity'] = df_grouped['std_dev_quantity'].fillna(0)

# Calculate Coefficient of Variation (CV)
df_grouped['cv_quantity'] = df_grouped['std_dev_quantity'] / df_grouped['mean_quantity']
df_grouped

In [None]:
df_volatility = df_grouped.groupby('commodity')['cv_quantity'].mean().reset_index()


In [None]:
df_volatility.sort_values('cv_quantity').tail(20)

In [None]:
df_grouped.sort_values(by='carQuantityNumeric')

In [None]:
# Calculate standard deviation and CV per commodity
commodity_stats = df_grouped.groupby('naam').agg({
    'carQuantityNumeric': ['mean', 'std']
}).reset_index()

commodity_stats


In [None]:
# Flatten the column names
commodity_stats.columns = ['commodity', 'mean_quantity', 'std_dev_quantity']

# Calculate Coefficient of Variation (CV)
commodity_stats['cv_quantity'] = commodity_stats['std_dev_quantity'] / commodity_stats['mean_quantity']

print(commodity_stats)

In [None]:
bkb_data['cargo'].columns

In [None]:
sorted(set(bkb_data['product']['naam'].tolist())) # all different commodities shipped