In [16]:
import json
import re
import os

import boto3
import awswrangler as wr
import pandas as pd
import numpy as np

from dataclasses import dataclass

# Define a class for the ship particulars

In [6]:
@dataclass
class ShipTechnicalSpecs:
    IMO_number: str
    call_sign: str
    mmsi: str
    flag: str
    built_year: str
    length: str
    beam: str
    depth: str
    draft: str
    gross_tonnage: str
    net_tonnage: str
    engines: str
    propulsion: str
    speed: str
    dwt: str
    capacity: str

In [3]:
def extract_field(value, field_name):
    pattern = {
        "IMO": r"IMO number:\s*(\d+)",
        "Call Sign": r"Call\s?sign[:\s]*([A-Za-z0-9]+)",
        "MMSI": r"MMSI number[:\s]*([A-Za-z0-9]+)",
        "GT": r"(\d{1,3}(?:,\d{3})*(?:\.\d+)?)[\s\u00A0]*GT",
        "NT": r"(\d{1,3}(?:,\d{3})*(?:\.\d+)?)[\s\u00A0]*NT",
        "DWT": r"(\d{1,3}(?:,\d{3})*(?:\.\d+)?)[\s\u00A0]*DWT"
    }
    match = re.search(pattern.get(field_name, ""), value, re.IGNORECASE)
    if match:
        return match.group(1).replace(",", "")  # Remove commas from the numeric value
    return "N/A"

In [7]:
def process_ship_data(ship_data):
    processed_data = []

    for imo_number, ship_details in ship_data.items():
        imo_number = imo_number
        
        identification = ship_details.get("Identification", "Missing")
        mmsi = extract_field(identification, "MMSI")
        call_sign = extract_field(identification, "Call Sign")
        
        flag = ship_details.get("Flag", "Missing")
        built_year = ship_details.get("Built year", "Missing")
        # displacement = ship_details.get("Displacement", "Missing")
        length = ship_details.get("Length", "Missing")
        beam = ship_details.get("Beam", "Missing") or ship_details.get("Breadth", "Missing")
        # breadth = ship_details.get("Breadth", "Missing")
        depth = ship_details.get("Depth", "Missing")
        draft = ship_details.get("Draft", "Missing") or ship_details.get("Draught", "Missing")
        # draught = ship_details.get("Draught", "Missing")
        
        tonnage = ship_details.get("Tonnage", "Missing")
        gross_tonnage = extract_field(tonnage, "GT") or ship_details.get("grossTonnage", "Missing")
        net_tonnage = extract_field(tonnage, "NT") or ship_details.get("netTonnage", "Missing")
        deadweight = extract_field(tonnage, "DWT") or ship_details.get("DWT", "Missing")
        
        engines = ship_details.get("Installed power", "Missing")
        propulsion = ship_details.get("Propulsion", "Missing")
        speed = ship_details.get("Speed", "Missing")
        capacity = ship_details.get("Capacity", "Missing")
        
        ship = ShipTechnicalSpecs(
            IMO_number=imo_number,
            mmsi=mmsi,
            call_sign=call_sign,
            flag=flag,
            built_year=built_year,
            # displacement=displacement,
            length=length,
            beam=beam,
            depth=depth,
            draft=draft,
            gross_tonnage=gross_tonnage,
            net_tonnage=net_tonnage,
            dwt=deadweight,
            engines=engines,
            propulsion=propulsion,
            speed=speed,
            capacity=capacity
        )
        processed_data.append(ship)
    return processed_data

In [8]:
file_names = ["wikipedia_ship_data_v2", "sflcorp_fleet", "pleiades_fleet", "kaggle_ship_data", "cmb-tech-fleet"]
dataframes = []

for name in file_names:
    print(name)
    with open(f'../data/raw/ship_particulars/{name}.json', 'r') as f:
        data = json.load(f)
        processed_data = process_ship_data(data)
        output_data = [ship.__dict__ for ship in processed_data]
        dataframes.append(pd.DataFrame(output_data))

wikipedia_ship_data_v2
sflcorp_fleet
pleiades_fleet
kaggle_ship_data
cmb-tech-fleet


In [9]:
df = pd.concat(dataframes, ignore_index=True)

In [10]:
df

Unnamed: 0,IMO_number,call_sign,mmsi,flag,built_year,length,beam,depth,draft,gross_tonnage,net_tonnage,engines,propulsion,speed,dwt,capacity
0,1007213,ZCIS,319866000,Missing,Missing,126.20 m (414 ft 1 in),21.00 m (68 ft 11 in),Missing,5.76 m (18 ft 11 in),9932,2979,"8 diesel engines\ntotal 19,200 hp (14,300 kW)",2 propellers,19 knots (35 km/h; 22 mph) max,,26
1,6602898,CQSC,255717000,Missing,Missing,135 metres (443 ft),15.8 metres (52 ft),Missing,Missing,,,Missing,Missing,18 knots,,535 (normal)\n621 (maximum)
2,7037806,,,Missing,Missing,123.40 m (404 ft 10 in),19.20 m (63 ft 0 in),Missing,Missing,9149,,8 × NOHAB SF 112 diesels[2],2 × controllable pitch propellers\n1 × bow thr...,21 kn (38.89 km/h; 24.17 mph),,1165 passengers\n240 cars
3,7207451,FNKC,247322000,Missing,Missing,114.59 m (375 ft 11 in),18.62 m (61 ft 1 in),Missing,Missing,,,Two Pielstick 16PC2-2V-400 diesel engines,Missing,21.8 knots (40.4 km/h),,"1,000 passengers, 48 berths, 160 cars. 35 rail..."
4,7350090,5BWZ3,210832000,Missing,Missing,Missing,Missing,Missing,Missing,4774,,MAK 9M453AK diesel engine,Missing,Missing,,"1,200 passengers, 404 cars (120 lorries or 700..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4371,185337,,,UK,2022,23.05,Missing,Missing,1.8,,,Missing,Missing,Missing,,Missing
4372,186553,,,UK,2022,23.05,Missing,Missing,1.8,,,Missing,Missing,Missing,,Missing
4373,913329,,,UK,2007,15.87,Missing,Missing,1.8,,,Missing,Missing,Missing,,Missing
4374,913037,,,UK,2007,15.86,Missing,Missing,1.8,,,Missing,Missing,Missing,,Missing


In [12]:
df['beam'].value_counts(dropna=False)

Missing                                             3927
32.2 m (106 ft)                                        8
30.4 m (99 ft 9 in)                                    7
29.00 m (95 ft 2 in)                                   5
32 m (105 ft)                                          5
                                                    ... 
25.46 m (83.5 ft)                                      1
23.8 m (78.1 ft)                                       1
25.70 m (84 ft)                                        1
32.2 m (106 ft) (moulded)\n38.1 m (125 ft) (max)       1
42.0 m (137 ft 10 in)                                  1
Name: beam, Length: 345, dtype: int64

In [13]:
regex_pattern = r'(\d+(\.\d+)?)\s*m'
df['length (m)'] = df['length'].str.extract(regex_pattern)[0]
df['beam (m)'] = df['beam'].str.extract(regex_pattern)[0]
df['depth (m)'] = df['depth'].str.extract(regex_pattern)[0]
df['draft (m)'] = df['draft'].str.extract(regex_pattern)[0]

In [None]:
df

In [14]:
df.drop(['length', 'beam', 'depth', 'draft'], axis=1, inplace=True)

In [18]:
df.replace('Missing', np.nan, inplace=True)

In [19]:
df.replace('N/A', np.nan, inplace=True)

In [20]:
df

Unnamed: 0,IMO_number,call_sign,mmsi,flag,built_year,gross_tonnage,net_tonnage,engines,propulsion,speed,dwt,capacity,length (m),beam (m),depth (m),draft (m)
0,1007213,ZCIS,319866000,,,9932,2979,"8 diesel engines\ntotal 19,200 hp (14,300 kW)",2 propellers,19 knots (35 km/h; 22 mph) max,,26,126.20,21.00,,5.76
1,6602898,CQSC,255717000,,,,,,,18 knots,,535 (normal)\n621 (maximum),135,15.8,,
2,7037806,,,,,9149,,8 × NOHAB SF 112 diesels[2],2 × controllable pitch propellers\n1 × bow thr...,21 kn (38.89 km/h; 24.17 mph),,1165 passengers\n240 cars,123.40,19.20,,
3,7207451,FNKC,247322000,,,,,Two Pielstick 16PC2-2V-400 diesel engines,,21.8 knots (40.4 km/h),,"1,000 passengers, 48 berths, 160 cars. 35 rail...",114.59,18.62,,
4,7350090,5BWZ3,210832000,,,4774,,MAK 9M453AK diesel engine,,,,"1,200 passengers, 404 cars (120 lorries or 700...",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4371,185337,,,UK,2022,,,,,,,,,,,
4372,186553,,,UK,2022,,,,,,,,,,,
4373,913329,,,UK,2007,,,,,,,,,,,
4374,913037,,,UK,2007,,,,,,,,,,,


In [21]:
df['IMO_number'] = df['IMO_number'].astype(int)

In [22]:
df.dtypes

IMO_number        int64
call_sign        object
mmsi             object
flag             object
built_year       object
gross_tonnage    object
net_tonnage      object
engines          object
propulsion       object
speed            object
dwt              object
capacity         object
length (m)       object
beam (m)         object
depth (m)        object
draft (m)        object
dtype: object

In [24]:
import sdv
from sdv.metadata import Metadata

In [25]:
metadata = Metadata.detect_from_dataframe(
    data=df,
    table_name='ship_specs')

In [31]:
from sdv.single_table import GaussianCopulaSynthesizer

synthesizer = GaussianCopulaSynthesizer(metadata)

# Step 2: Train the synthesizer
synthesizer.fit(df)

# Step 3: Generate synthetic data
synthetic_data = synthesizer.sample(num_rows=100)



In [33]:
synthetic_data.head()

Unnamed: 0,IMO_number,call_sign,mmsi,flag,built_year,gross_tonnage,net_tonnage,engines,propulsion,speed,dwt,capacity,length (m),beam (m),depth (m),draft (m)
0,8746561,,,,,,,,,,,,,,,
1,9555012,,,,,,,,,,,,,,,
2,9714734,,,,,,,,,,,,,,,
3,9497837,,,,,,,,,21 knots (39 km/h; 24 mph) (service)\n22 knots...,,,,30.4,,
4,9186575,,,,,,,,,,,"2,506 passengers",330.0,,,
