In [25]:
import json
import re
import os

import boto3
import awswrangler as wr
import pandas as pd
import numpy as np

from dotenv import load_dotenv
from dataclasses import dataclass
load_dotenv()

True

# Define a class for the ship particulars

In [2]:
@dataclass
class ShipTechnicalSpecs:
    """Record holding the subset of ship particulars kept for one vessel.

    Fields are annotated as ``str`` but dataclasses do not enforce this, so a
    field holds whatever the extraction helpers returned for it.
    Commented-out fields are particulars that are currently not populated.
    """
    IMO_number: str
    # call_sign: str
    # mmsi: str
    # flag: str
    built_year: str
    length: str
    beam: str
    # depth: str
    # draft: str
    gross_tonnage: str
    # net_tonnage: str
    # engines: str
    # propulsion: str
    # speed: str
    dwt: str
    # capacity: str

In [64]:
# A number is a run of digits, optionally followed by comma-separated
# thousands groups and a decimal part. The leading run is unbounded so that
# ungrouped values such as "21545 GT" are captured whole rather than being
# truncated to their last three digits.
_NUMBER = r"(\d+(?:,\d{3})*(?:\.\d+)?)"
# Optional whitespace between number and unit, including non-breaking spaces.
_GAP = r"[\s\u00A0]*"

_FIELD_PATTERNS = {
    "IMO": r"IMO number:\s*(\d+)",
    "Call Sign": r"Call\s?sign[:\s]*([A-Za-z0-9]+)",
    "MMSI": r"MMSI number[:\s]*([A-Za-z0-9]+)",
    "GT": _NUMBER + _GAP + r"(?:GT|GRT)",
    "NT": _NUMBER + _GAP + r"NT",
    "DWT": _NUMBER + _GAP + r"(?:t" + _GAP + r")?DWT",
}


def extract_field(value, field_name):
    """Extract one named field from a free-text ship description.

    Args:
        value: Text to search. Anything that is not a ``str`` is treated as
            "no data" instead of raising.
        field_name: One of "IMO", "Call Sign", "MMSI", "GT", "NT" or "DWT".

    Returns:
        The captured value as a string with commas removed, or "N/A" when the
        field name is unknown, the input is not text, or nothing matches.
    """
    pattern = _FIELD_PATTERNS.get(field_name)
    # An unknown field must not fall back to an empty pattern: "" matches
    # every string but has no capture group, so group(1) would raise.
    if pattern is None or not isinstance(value, str):
        return "N/A"

    match = re.search(pattern, value, re.IGNORECASE)
    if match:
        return match.group(1).replace(",", "")  # strip thousands separators
    return "N/A"

In [35]:
def get_year(ship_details, name1:str, name2:str):
    """Return the value stored under ``name1``, falling back to ``name2``.

    The ``name1`` value wins whenever it is truthy. Otherwise the ``name2``
    value is returned as-is, or "Missing" when that key is absent.
    """
    preferred = ship_details.get(name1, "")
    return preferred if preferred else ship_details.get(name2, "Missing")

In [8]:
def get_similar_values(ship_details, name1:str, name2:str):
    """Return a metre measurement found under ``name1``, else the ``name2`` value.

    The ``name1`` text is searched for the first "<number> m" fragment
    (case-insensitive) and that fragment is returned verbatim. When the key
    is absent, empty, or holds no such fragment, the raw ``name2`` value is
    returned, or "Missing" when that key is absent too.
    """
    raw_text = ship_details.get(name1, "")
    found = re.search(r"\d+\.?\d*\s*m", raw_text, re.IGNORECASE) if raw_text else None
    if found is not None:
        return found.group(0)

    # No usable free-text measurement: fall back to the alternative key.
    return ship_details.get(name2, "Missing")

In [5]:
def get_tonnage_value(ship_details, primary_field, backup_field, regex_pattern):
    """Look up a tonnage figure, preferring a direct value over parsed text.

    The value under ``backup_field`` is returned as a string when it is truthy
    and not the literal "Missing". Otherwise the free text stored under the
    "Tonnage" key is passed to ``extract_field`` with ``regex_pattern`` as the
    field name. Returns "Missing" when neither source yields a value.

    NOTE(review): ``primary_field`` is accepted but never read; the text
    source is always the "Tonnage" key. Confirm whether that is intended.
    """
    explicit = ship_details.get(backup_field)
    if explicit and explicit != "Missing":
        return str(explicit)

    tonnage_text = ship_details.get("Tonnage", "")
    parsed = extract_field(tonnage_text, regex_pattern) if tonnage_text else "N/A"
    return "Missing" if parsed == "N/A" else parsed

In [65]:
def process_ship_data(ship_data):
    """Convert raw ship details into a list of ``ShipTechnicalSpecs`` records.

    Args:
        ship_data: Mapping whose keys are used as the IMO number and whose
            values are per-ship detail mappings.

    Returns:
        One ``ShipTechnicalSpecs`` per entry, in the mapping's iteration order.
        Only build year, length, beam, gross tonnage and deadweight are
        populated; the other particulars are not extracted.
    """

    def _to_specs(imo_number, ship_details):
        # Each particular is looked up under two alternative keys.
        return ShipTechnicalSpecs(
            IMO_number=imo_number,
            built_year=get_year(ship_details, "Completed", "Built year"),
            length=get_similar_values(ship_details, "Length", "Length (m)"),
            beam=get_similar_values(ship_details, "Beam", "Breadth (m)"),
            gross_tonnage=get_tonnage_value(ship_details, "Tonnage", "grossTonnage", "GT"),
            dwt=get_tonnage_value(ship_details, "DWT", "deadweight", "DWT"),
        )

    return [_to_specs(imo, details) for imo, details in ship_data.items()]

In [90]:
file_names = ["wikipedia_ship_data_v2", "pleiades_fleet_v2"]
dataframes = []

for name in file_names:
    # Read as UTF-8 explicitly: without it, decoding depends on the platform's
    # default encoding and non-ASCII characters in the JSON may be garbled or
    # fail to decode.
    with open(f'../data/raw/ship_particulars/{name}.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    # The file is only needed for parsing; build the frame after it is closed.
    processed_data = process_ship_data(data)
    output_data = [ship.__dict__ for ship in processed_data]
    dataframes.append(pd.DataFrame(output_data))

In [91]:
df = pd.concat(dataframes, ignore_index=True)

In [None]:
df

In [2]:
# NOTE(review): no visible cell writes 'ship_specs_full.csv' (the combined `df`
# above is never saved), so this file must already exist next to the notebook
# for a fresh Restart & Run All to succeed -- confirm how it is produced.
full_data = pd.read_csv('ship_specs_full.csv')

In [3]:
full_data

Unnamed: 0,IMO_number,built_year,length,beam,gross_tonnage,dwt
0,7361312,1975,184.55 m,26.40 m,21545,3335.0
1,7361324,1976,184.55 m,26.40 m,528,3335.0
2,7528611,1978,154 m,21.67 m,5466,8661.0
3,7826790,1981,168.49 m,26.04 m,30317,3515.0
4,7827213,1992,158.90 m,25.20 m,25076,1703.0
...,...,...,...,...,...,...
214,9892690,7 December 2022,212.4 m,30.6 m,50629,5936.0
215,9208629,2001,215.44 m,31.85 m,59925,8850.0
216,9283186,28 July 2004,272 m,40 m,69132,71373.0
217,9283239,9 August 2005,272 m,40 m,69132,71415.0


In [6]:
# Pull the numeric part out of "<number> m" strings; capture group 0 is the
# full number including any decimals.
regex_pattern = r'(\d+(\.\d+)?)\s*m'
for dimension in ('length', 'beam'):
    full_data[f'{dimension} (m)'] = full_data[dimension].str.extract(regex_pattern)[0]

In [8]:
# 'built_year' mixes bare years ("1975") with full dates ("7 December 2022").
# Parsing the whole column in one pd.to_datetime call raises on pandas >= 2.0,
# which infers a single format from the first value. Each value is therefore
# parsed on its own, which behaves the same across pandas versions.
full_data['built_year'] = pd.to_datetime(
    full_data['built_year'].astype(str).map(pd.Timestamp)
)
full_data['year'] = full_data['built_year'].dt.year

In [9]:
# Replace the raw text columns with their cleaned counterparts.
full_data = (
    full_data
    .drop(columns=['built_year', 'length', 'beam'])
    .rename(columns={'year': 'built_year', 'dwt': 'dwt (tonnes)'})
)

In [11]:
# The extracted dimensions are still strings; convert both to floats.
full_data = full_data.astype({'length (m)': float, 'beam (m)': float})

In [12]:
full_data.dtypes

IMO_number         int64
gross_tonnage      int64
dwt (tonnes)     float64
length (m)       float64
beam (m)         float64
built_year         int64
dtype: object

In [13]:
full_data

Unnamed: 0,IMO_number,gross_tonnage,dwt (tonnes),length (m),beam (m),built_year
0,7361312,21545,3335.0,184.55,26.40,1975
1,7361324,528,3335.0,184.55,26.40,1976
2,7528611,5466,8661.0,154.00,21.67,1978
3,7826790,30317,3515.0,168.49,26.04,1981
4,7827213,25076,1703.0,158.90,25.20,1992
...,...,...,...,...,...,...
214,9892690,50629,5936.0,212.40,30.60,2022
215,9208629,59925,8850.0,215.44,31.85,2001
216,9283186,69132,71373.0,272.00,40.00,2004
217,9283239,69132,71415.0,272.00,40.00,2005


## Get the list of distinct IMO numbers from Athena

In [26]:
# Athena settings come from environment variables; a missing variable raises
# KeyError here. OUTPUT_LOCATION is not referenced by any visible cell.
DATABASE = os.environ['DATABASE']
TABLE = os.environ['TABLE']
OUTPUT_LOCATION = os.environ['QUERY_LOCATION']

In [27]:
# Explicit boto3 session built from the region and credentials held in
# environment variables, so nothing secret is hard-coded in the notebook.
my_session = boto3.session.Session(
    region_name=os.environ['REGION'], 
    aws_access_key_id=os.environ['ACCESS_KEY'], 
    aws_secret_access_key=os.environ['SECRET_KEY']
)

In [46]:
# Distinct IMO numbers from the most recent version of each year's data:
#   latest_versions -> highest `version` per `year`
#   latest_data     -> table rows whose (year, version) match that maximum
# DATABASE and TABLE are interpolated into the SQL as quoted identifiers.
query = f"""
    WITH latest_versions AS (
        SELECT CAST(year AS INTEGER) AS year, MAX(CAST(version AS INTEGER)) AS latest_version
        FROM "{DATABASE}"."{TABLE}"
        GROUP BY CAST(year AS INTEGER)
    ),

    latest_data AS (
        SELECT *
        FROM "{DATABASE}"."{TABLE}" se
        JOIN latest_versions lv
        ON CAST(se.year AS INT) = lv.year
        AND CAST(se.version AS INT) = lv.latest_version
    )
    
    SELECT DISTINCT imo_number FROM latest_data;
"""

In [47]:
# Run the query on Athena using the explicit boto3 session.
# NOTE(review): the variable name is misspelled ("distinc"); it is kept
# because later cells reference it under this name.
distinc_imo_numbers = wr.athena.read_sql_query(query, database=DATABASE, boto3_session=my_session)

In [48]:
distinc_imo_numbers.head()

Unnamed: 0,imo_number
0,7358755
1,7360605
2,7360681
3,7361324
4,7422881


In [49]:
distinc_imo_numbers.shape

(21014, 1)

In [63]:
# Keep only the IMO numbers that have no real technical specs yet.
already_covered = distinc_imo_numbers['imo_number'].isin(full_data['IMO_number'].to_list())
imo_numbers_for_synthetic_data = distinc_imo_numbers.loc[~already_covered].reset_index(drop=True)
imo_numbers_for_synthetic_data.shape

(20795, 1)

# Generate synthetic data

In [14]:
import sdv
from sdv.metadata import Metadata

In [15]:
# Let SDV infer the column metadata from the real specs table.
# NOTE(review): `full_data` still contains the IMO_number column here, so it
# is part of the detected metadata -- confirm that is intended.
metadata = Metadata.detect_from_dataframe(
    data=full_data,
    table_name='ship_specs')

In [70]:
from sdv.single_table import GaussianCopulaSynthesizer

# Step 1: Create the synthesizer from the detected metadata
synthesizer = GaussianCopulaSynthesizer(metadata)

# Step 2: Train the synthesizer
synthesizer.fit(full_data)

# Step 3: Generate synthetic data -- one row per IMO number without real
# specs. The count is derived from the data instead of being hard-coded, so
# it stays correct when the Athena result changes.
synthetic_data = synthesizer.sample(num_rows=len(imo_numbers_for_synthetic_data))

In [71]:
synthetic_data.head()

Unnamed: 0,IMO_number,gross_tonnage,dwt (tonnes),length (m),beam (m),built_year
0,12499215,148633,184679.8,377.11,41.201,2006
1,15733211,19057,22.0,231.04,27.546,2001
2,11006768,93868,86920.0,326.49,42.174,1986
3,14630690,40968,14110.1,211.34,27.567,2009
4,15479617,79858,8.9,288.07,22.534,2008


In [72]:
synthetic_data.tail()

Unnamed: 0,IMO_number,gross_tonnage,dwt (tonnes),length (m),beam (m),built_year
20790,9241455,89496,9.7,357.3,32.874,2004
20791,12206667,20261,8.8,133.79,23.291,1987
20792,2587706,17492,1729.2,238.08,21.66,2009
20793,6954265,94272,226518.8,344.39,44.088,2002
20794,8906628,45361,14.0,260.47,36.459,2017


In [73]:
# Attach a real IMO number to every synthetic row. `join` aligns on the index,
# so this relies on both frames sharing the same 0..n-1 index; the sampled
# 'IMO_number' column is still present afterwards.
synthetic_data = imo_numbers_for_synthetic_data.join(synthetic_data)

In [82]:
synthetic_data['synthetic'] = True

In [83]:
synthetic_data

Unnamed: 0,imo_number,gross_tonnage,dwt (tonnes),length (m),beam (m),built_year,synthetic
0,7358755,148633,184679.8,377.11,41.201,2006,True
1,7360605,19057,22.0,231.04,27.546,2001,True
2,7360681,93868,86920.0,326.49,42.174,1986,True
3,7422881,40968,14110.1,211.34,27.567,2009,True
4,7527306,79858,8.9,288.07,22.534,2008,True
...,...,...,...,...,...,...,...
20790,9960306,89496,9.7,357.30,32.874,2004,True
20791,9962524,20261,8.8,133.79,23.291,1987,True
20792,9969144,17492,1729.2,238.08,21.660,2009,True
20793,9976070,94272,226518.8,344.39,44.088,2002,True


In [75]:
# The sampled IMO numbers are meaningless; only the real `imo_number` is kept.
synthetic_data = synthetic_data.drop(columns=['IMO_number'])

In [76]:
synthetic_data

Unnamed: 0,imo_number,gross_tonnage,dwt (tonnes),length (m),beam (m),built_year
0,7358755,148633,184679.8,377.11,41.201,2006
1,7360605,19057,22.0,231.04,27.546,2001
2,7360681,93868,86920.0,326.49,42.174,1986
3,7422881,40968,14110.1,211.34,27.567,2009
4,7527306,79858,8.9,288.07,22.534,2008
...,...,...,...,...,...,...
20790,9960306,89496,9.7,357.30,32.874,2004
20791,9962524,20261,8.8,133.79,23.291,1987
20792,9969144,17492,1729.2,238.08,21.660,2009
20793,9976070,94272,226518.8,344.39,44.088,2002


In [84]:
# Align the key column name with the synthetic frame and flag these rows as real.
full_data = (
    full_data
    .rename(columns={'IMO_number': 'imo_number'})
    .assign(synthetic=False)
)

In [85]:
full_data

Unnamed: 0,imo_number,gross_tonnage,dwt (tonnes),length (m),beam (m),built_year,synthetic
0,7361312,21545,3335.0,184.55,26.40,1975,False
1,7361324,528,3335.0,184.55,26.40,1976,False
2,7528611,5466,8661.0,154.00,21.67,1978,False
3,7826790,30317,3515.0,168.49,26.04,1981,False
4,7827213,25076,1703.0,158.90,25.20,1992,False
...,...,...,...,...,...,...,...
214,9892690,50629,5936.0,212.40,30.60,2022,False
215,9208629,59925,8850.0,215.44,31.85,2001,False
216,9283186,69132,71373.0,272.00,40.00,2004,False
217,9283239,69132,71415.0,272.00,40.00,2005,False


In [86]:
# Stack synthetic rows on top of the real ones; ignore_index already yields a
# fresh 0..n-1 index.
frames_to_stack = [synthetic_data, full_data]
ship_specs_final = pd.concat(frames_to_stack, ignore_index=True)

In [87]:
ship_specs_final

Unnamed: 0,imo_number,gross_tonnage,dwt (tonnes),length (m),beam (m),built_year,synthetic
0,7358755,148633,184679.8,377.11,41.201,2006,True
1,7360605,19057,22.0,231.04,27.546,2001,True
2,7360681,93868,86920.0,326.49,42.174,1986,True
3,7422881,40968,14110.1,211.34,27.567,2009,True
4,7527306,79858,8.9,288.07,22.534,2008,True
...,...,...,...,...,...,...,...
21009,9892690,50629,5936.0,212.40,30.600,2022,False
21010,9208629,59925,8850.0,215.44,31.850,2001,False
21011,9283186,69132,71373.0,272.00,40.000,2004,False
21012,9283239,69132,71415.0,272.00,40.000,2005,False


In [88]:
ship_specs_final.to_csv('../data/processed/ship_technical_specs.csv', index=False)