In [1]:
import json
import re
import pandas as pd

from dataclasses import dataclass

# Load the data 

In [31]:
# Read the fleet particulars file into a dict. The encoding is pinned to UTF-8
# (the encoding JSON is specified to use) so decoding does not depend on the
# platform's locale default.
with open('../data/raw/ship_particulars/sflcorp_fleet.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [33]:
# Number of top-level entries in the loaded JSON mapping.
len(data)

72

# Define a class for the ship particulars

In [23]:
@dataclass
class ShipTechnicalSpecs:
    """Flat record holding one vessel's technical particulars.

    Every field is annotated as ``str``; the dataclass itself performs no
    validation or conversion. In this notebook instances are filled from the
    loaded fleet JSON, where absent keys are recorded as ``"Missing"`` and
    values the regex extraction cannot find are recorded as ``"N/A"``.

    The field order below determines both the positional constructor
    signature and the column order of a DataFrame built from ``__dict__``.
    """
    # --- identifiers ---
    IMO_number: str  # the key of the vessel's entry in the loaded JSON
    call_sign: str  # parsed from the "Identification" text
    mmsi: str  # parsed from the "Identification" text
    # --- registry and build ---
    flag: str
    built_year: str
    # --- dimensions (kept as the raw source strings, e.g. "229m") ---
    displacement: str
    length: str
    beam: str
    depth: str
    draft: str
    # --- tonnage figures parsed from the "Tonnage" text ---
    gross_tonnage: str
    net_tonnage: str
    # --- machinery and performance ---
    engines: str  # taken from the "Installed power" entry
    propulsion: str
    speed: str
    # --- carrying capacity ---
    dwt: str  # deadweight figure parsed from the "Tonnage" text
    capacity: str

In [14]:
def extract_field(value, field_name):
    """Extract one identifier or tonnage figure from a free-text string.

    Parameters
    ----------
    value : str
        Text to search. Any non-string value (for example ``None`` coming
        from a JSON ``null``) is treated as "nothing to extract".
    field_name : str
        Which field to look for. Supported names are ``"IMO"``,
        ``"Call Sign"``, ``"MMSI"``, ``"GT"``, ``"NT"`` and ``"DWT"``.

    Returns
    -------
    str
        The captured text with commas removed, or ``"N/A"`` when the field
        does not occur in ``value``.

    Raises
    ------
    ValueError
        If ``field_name`` is not one of the supported names. (Previously an
        unknown name fell back to an empty pattern, which always matched and
        then failed with an opaque ``IndexError``.)
    """
    # A number with optional comma thousands groups and optional decimals.
    # The integer part is `\d+` rather than `\d{1,3}` so that an ungrouped
    # figure such as "82000" is captured whole instead of as its last three
    # digits.
    number = r"(\d+(?:,\d{3})*(?:\.\d+)?)"
    # Optional whitespace (including non-breaking space) before the unit.
    gap = r"[\s\u00A0]*"

    patterns = {
        "IMO": r"IMO number:\s*(\d+)",
        "Call Sign": r"Call\s?sign[:\s]*([A-Za-z0-9]+)",
        "MMSI": r"MMSI number[:\s]*([A-Za-z0-9]+)",
        "GT": number + gap + "GT",
        "NT": number + gap + "NT",
        "DWT": number + gap + "DWT",
    }

    if field_name not in patterns:
        raise ValueError(
            f"Unknown field_name {field_name!r}; expected one of {sorted(patterns)}"
        )

    # Guard against non-text input so a missing/null entry yields "N/A"
    # instead of a TypeError from the regex engine.
    if not isinstance(value, str):
        return "N/A"

    match = re.search(patterns[field_name], value, re.IGNORECASE)
    if match is None:
        return "N/A"
    # Strip thousands separators so the result is a plain digit string.
    return match.group(1).replace(",", "")

In [34]:
# Build one ShipTechnicalSpecs record per vessel in the loaded JSON.
# Keys that are absent from a vessel's entry are recorded as "Missing".
processed_data = []

for imo_number, ship_details in data.items():
    # MMSI and call sign are embedded in the free-text "Identification" entry.
    identification = ship_details.get("Identification", "Missing")
    mmsi = extract_field(identification, "MMSI")
    call_sign = extract_field(identification, "Call Sign")

    flag = ship_details.get("Flag", "Missing")
    built_year = ship_details.get("Built year", "Missing")
    displacement = ship_details.get("Displacement", "Missing")
    length = ship_details.get("Length", "Missing")
    depth = ship_details.get("Depth", "Missing")

    # The earlier version of this cell looked up "Breadth" and "Draught" but
    # never used them, so entries using those spellings lost the value.
    # They are treated here as fallbacks for "Beam" and "Draft".
    # NOTE(review): confirm against the raw JSON that these are alternative
    # labels for the same quantities.
    beam = ship_details.get("Beam", ship_details.get("Breadth", "Missing"))
    draft = ship_details.get("Draft", ship_details.get("Draught", "Missing"))

    # Gross, net and deadweight tonnage all come from the "Tonnage" text.
    tonnage = ship_details.get("Tonnage", "Missing")
    gross_tonnage = extract_field(tonnage, "GT")
    net_tonnage = extract_field(tonnage, "NT")
    deadweight = extract_field(tonnage, "DWT")

    engines = ship_details.get("Installed power", "Missing")
    propulsion = ship_details.get("Propulsion", "Missing")
    speed = ship_details.get("Speed", "Missing")
    capacity = ship_details.get("Capacity", "Missing")

    # Create an instance of ShipTechnicalSpecs
    ship = ShipTechnicalSpecs(
        IMO_number=imo_number,
        call_sign=call_sign,
        mmsi=mmsi,
        flag=flag,
        built_year=built_year,
        displacement=displacement,
        length=length,
        beam=beam,
        depth=depth,
        draft=draft,
        gross_tonnage=gross_tonnage,
        net_tonnage=net_tonnage,
        engines=engines,
        propulsion=propulsion,
        speed=speed,
        dwt=deadweight,
        capacity=capacity,
    )
    processed_data.append(ship)

In [35]:
# One attribute-name -> value dict per ship, in the order the ships were processed.
output_data = list(map(vars, processed_data))

In [36]:
# Tabulate the per-ship dicts: one row per ship, one column per dataclass field.
sflcorp_df = pd.DataFrame(output_data)

In [37]:
# Show the assembled table; the recorded output below elides the middle rows with "...".
sflcorp_df

Unnamed: 0,IMO_number,call_sign,mmsi,flag,built_year,displacement,length,beam,depth,draft,gross_tonnage,net_tonnage,engines,propulsion,speed,dwt,capacity
0,9617947,,,Hong Kong,2012,Missing,229m,Missing,Missing,Missing,,,Missing,Missing,Missing,,82.000 DWT
1,9600839,,,Hong Kong,2010,Missing,190m,Missing,Missing,Missing,,,Missing,Missing,Missing,,57.000 DWT
2,9539834,,,Hong Kong,2011,Missing,190m,Missing,Missing,Missing,,,Missing,Missing,Missing,,57.000 DWT
3,9615676,,,Hong Kong,2011,Missing,190m,Missing,Missing,Missing,,,Missing,Missing,Missing,,57.000 DWT
4,9615561,,,Hong Kong,2012,Missing,190m,Missing,Missing,Missing,,,Missing,Missing,Missing,,57.000 DWT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,9667162,,,Liberia,2014,Missing,Missing,Missing,Missing,Missing,,,Missing,Missing,Missing,,15.400 TEU
68,9667150,,,Liberia,2014,Missing,Missing,Missing,Missing,Missing,,,Missing,Missing,Missing,,15.400 TEU
69,9227340,,,Liberia,2002,Missing,282m,Missing,Missing,Missing,,,Missing,Missing,Missing,,4.100 TEU
70,9309150,,,Liberia,2005,Missing,176m,Missing,Missing,Missing,,,Missing,Missing,Missing,,1.700 TEU
