In [1]:
import json
import re
import os

import boto3
import awswrangler as wr
import pandas as pd
import numpy as np

from dataclasses import dataclass

# Define a class for the ship particulars

In [19]:
@dataclass
class ShipTechnicalSpecs:
    IMO_number: str
    call_sign: str
    mmsi: str
    flag: str
    built_year: str
    length: str
    beam: str
    depth: str
    draft: str
    gross_tonnage: str
    # net_tonnage: str
    # engines: str
    # propulsion: str
    # speed: str
    dwt: str
    # capacity: str

In [3]:
def extract_field(value, field_name):
    pattern = {
        "IMO": r"IMO number:\s*(\d+)",
        "Call Sign": r"Call\s?sign[:\s]*([A-Za-z0-9]+)",
        "MMSI": r"MMSI number[:\s]*([A-Za-z0-9]+)",
        "GT": r"(\d{1,3}(?:,\d{3})*(?:\.\d+)?)[\s\u00A0]*GT",
        "NT": r"(\d{1,3}(?:,\d{3})*(?:\.\d+)?)[\s\u00A0]*NT",
        "DWT": r"(\d{1,3}(?:,\d{3})*(?:\.\d+)?)[\s\u00A0]*DWT"
    }
    match = re.search(pattern.get(field_name, ""), value, re.IGNORECASE)
    if match:
        return match.group(1).replace(",", "")  # Remove commas from the numeric value
    return "N/A"

In [17]:
def get_similar_values(ship_details, name1:str, name2:str):
    beam = ship_details.get(name1, "")
    print(beam)
    if beam:
        print(beam)
        match = re.search(r"\d+\.?\d*\s*m", beam, re.IGNORECASE)
        if match:
            return match.group(0)
    
    # Try Breadth dictionary format
    return ship_details.get(name2, "Missing")

In [5]:
def get_tonnage_value(ship_details, primary_field, backup_field, regex_pattern):
    direct_value = ship_details.get(backup_field)
    if direct_value and direct_value != "Missing":
        return str(direct_value)
    
    tonnage = ship_details.get("Tonnage", "")
    if tonnage:
        extracted = extract_field(tonnage, regex_pattern)
        if extracted != "N/A":
            return extracted
            
    return "Missing"

In [24]:
def process_ship_data(ship_data):
    processed_data = []

    for imo_number, ship_details in ship_data.items():
        imo_number = imo_number
        
        identification = ship_details.get("Identification", "Missing")
        mmsi = extract_field(identification, "MMSI")
        call_sign = extract_field(identification, "Call Sign")
        
        flag = ship_details.get("Flag", "Missing")
        built_year = ship_details.get("Built year", "Missing")
        # displacement = ship_details.get("Displacement", "Missing")
        # length = ship_details.get("Length", "Missing")
        length = get_similar_values(ship_details, "Length", "Length (m)")
        # beam = ship_details.get("Beam", "Missing") or ship_details.get("Breadth", "Missing")
        beam = get_similar_values(ship_details, "Beam", "Breadth (m)")
        # breadth = ship_details.get("Breadth", "Missing")
        depth = ship_details.get("Depth", "Missing")
        # draft = ship_details.get("Draft", "Missing") or ship_details.get("Draught", "Missing")
        draft = get_similar_values(ship_details, "Draft", "Draught")
        # draught = ship_details.get("Draught", "Missing")
        
        # tonnage = ship_details.get("Tonnage", "Missing")
        # gross_tonnage = extract_field(tonnage, "GT") or ship_details.get("grossTonnage", "Missing")
        # net_tonnage = extract_field(tonnage, "NT") or ship_details.get("netTonnage", "Missing")
        # deadweight = extract_field(tonnage, "DWT") or ship_details.get("DWT", "Missing")
        
        gross_tonnage = get_tonnage_value(ship_details, "Tonnage", "grossTonnage", "GT")
        # net_tonnage = get_tonnage_value(ship_details, "netTonnage", "NT", "NT")
        deadweight = get_tonnage_value(ship_details, "DWT", "deadweight", "DWT")
        
        # engines = ship_details.get("Installed power", "Missing")
        # propulsion = ship_details.get("Propulsion", "Missing")
        # speed = ship_details.get("Speed", "Missing")
        # capacity = ship_details.get("Capacity", "Missing")
        
        ship = ShipTechnicalSpecs(
            IMO_number=imo_number,
            mmsi=mmsi,
            call_sign=call_sign,
            flag=flag,
            built_year=built_year,
            length=length,
            beam=beam,
            depth=depth,
            draft=draft,
            gross_tonnage=gross_tonnage,
            # net_tonnage=net_tonnage,
            dwt=deadweight,
            # engines=engines,
            # propulsion=propulsion,
            # speed=speed,
            # capacity=capacity
        )
        processed_data.append(ship)
    return processed_data

In [None]:
file_names = ["wikipedia_ship_data_v2", "pleiades_fleet_v2"]
dataframes = []

for name in file_names:
    with open(f'../data/raw/ship_particulars/{name}.json', 'r') as f:
        data = json.load(f)
        processed_data = process_ship_data(data)
        output_data = [ship.__dict__ for ship in processed_data]
        dataframes.append(pd.DataFrame(output_data))

wikipedia_ship_data_v2
126.20 m (414 ft 1 in)
126.20 m (414 ft 1 in)
21.00 m (68 ft 11 in)
21.00 m (68 ft 11 in)
5.76 m (18 ft 11 in)
5.76 m (18 ft 11 in)
135 metres (443 ft)
135 metres (443 ft)
15.8 metres (52 ft)
15.8 metres (52 ft)

123.40 m (404 ft 10 in)
123.40 m (404 ft 10 in)
19.20 m (63 ft 0 in)
19.20 m (63 ft 0 in)

114.59 m (375 ft 11 in)
114.59 m (375 ft 11 in)
18.62 m (61 ft 1 in)
18.62 m (61 ft 1 in)




184.55 meters
184.55 meters
26.40 meters
26.40 meters

184.55 meters
184.55 meters
26.40 meters
26.40 meters

154 m (505 ft); 143 m (469 ft)
154 m (505 ft); 143 m (469 ft)
21.67 m (71.1 ft)
21.67 m (71.1 ft)

122.36 m (401 ft 5 in)
122.36 m (401 ft 5 in)
18.82 m (61 ft 9 in)
18.82 m (61 ft 9 in)

152.91 m (501 ft 8 in)
152.91 m (501 ft 8 in)
26.26 m (86 ft 2 in)
26.26 m (86 ft 2 in)

128.42 metres (421.3 ft)
128.42 metres (421.3 ft)
21.52 metres (70.6 ft)
21.52 metres (70.6 ft)

133.07 m (436 ft 7 in)
133.07 m (436 ft 7 in)
21.65 m (71 ft 0 in)
21.65 m (71 ft 0 in)

152.00

In [26]:
df = pd.concat(dataframes, ignore_index=True)

In [34]:
df[df['IMO_number'] == '7361312']

Unnamed: 0,IMO_number,call_sign,mmsi,flag,built_year,length,beam,depth,draft,gross_tonnage,net_tonnage,dwt
5,7361312,IBQI,247162200,Missing,Missing,184.55 m,26.40 m,Missing,6.20 meters,21545,Missing,3335


In [None]:
df['beam'].value_counts(dropna=False)

In [13]:
regex_pattern = r'(\d+(\.\d+)?)\s*m'
df['length (m)'] = df['length'].str.extract(regex_pattern)[0]
df['beam (m)'] = df['beam'].str.extract(regex_pattern)[0]
df['depth (m)'] = df['depth'].str.extract(regex_pattern)[0]
df['draft (m)'] = df['draft'].str.extract(regex_pattern)[0]

In [None]:
df

In [14]:
df.drop(['length', 'beam', 'depth', 'draft'], axis=1, inplace=True)

In [18]:
df.replace('Missing', np.nan, inplace=True)

In [19]:
df.replace('N/A', np.nan, inplace=True)

In [None]:
df

In [21]:
df['IMO_number'] = df['IMO_number'].astype(int)

In [None]:
df.dtypes

In [24]:
import sdv
from sdv.metadata import Metadata

In [25]:
metadata = Metadata.detect_from_dataframe(
    data=df,
    table_name='ship_specs')

In [None]:
from sdv.single_table import GaussianCopulaSynthesizer

synthesizer = GaussianCopulaSynthesizer(metadata)

# Step 2: Train the synthesizer
synthesizer.fit(df)

# Step 3: Generate synthetic data
synthetic_data = synthesizer.sample(num_rows=100)

In [None]:
synthetic_data.head()