In [1]:
import json
import re
import pandas as pd

from dataclasses import dataclass

# Load the data 

In [31]:
# JSON text is UTF-8 by specification; pin the encoding so that decoding does
# not silently depend on the operating system's locale default.
with open('../data/raw/ship_particulars/sflcorp_fleet.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [33]:
# Count the top-level entries of the loaded JSON object.
len(data)

72

# Define a class for the ship particulars

In [23]:
@dataclass
class ShipTechnicalSpecs:
    """Flat record holding the technical particulars of a single vessel.

    Every field is annotated as ``str``. ``process_ship_data`` populates the
    record with raw source text, with the placeholder ``"Missing"`` when a key
    is absent from the input, or with ``"N/A"`` when ``extract_field`` finds no
    match. No unit normalisation or numeric conversion happens here.
    """

    # --- Identification ---
    IMO_number: str
    call_sign: str
    mmsi: str
    flag: str
    built_year: str
    # --- Size and dimensions (free text; formats differ between sources) ---
    displacement: str
    length: str
    beam: str
    depth: str
    draft: str
    # --- Tonnage figures extracted by regex (thousands separators removed) ---
    gross_tonnage: str
    net_tonnage: str
    # --- Machinery and performance ---
    engines: str
    propulsion: str
    speed: str
    # --- Carrying capacity ---
    dwt: str
    capacity: str

In [14]:
# Numeric core shared by the tonnage patterns: either a comma-grouped number
# ("59,925") or a plain run of digits ("9932"), optionally followed by decimals.
# The plain-digit alternative matters: with only the grouped form, "9932 GT"
# would be captured as "932" because the leading group is capped at 3 digits.
_TONNAGE_NUMBER = r"((?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?)"

# Compiled once at definition time instead of being rebuilt on every call.
_FIELD_PATTERNS = {
    name: re.compile(regex, re.IGNORECASE)
    for name, regex in {
        "IMO": r"IMO number:\s*(\d+)",
        "Call Sign": r"Call\s?sign[:\s]*([A-Za-z0-9]+)",
        "MMSI": r"MMSI number[:\s]*([A-Za-z0-9]+)",
        "GT": _TONNAGE_NUMBER + r"[\s\u00A0]*GT",
        "NT": _TONNAGE_NUMBER + r"[\s\u00A0]*NT",
        "DWT": _TONNAGE_NUMBER + r"[\s\u00A0]*DWT",
    }.items()
}


def extract_field(value, field_name):
    """Pull a single identifier or tonnage figure out of a free-text value.

    Parameters
    ----------
    value : str
        Free text to search, e.g. ``"59,925 GT"`` or ``"MMSI number: 319866000"``.
        Any non-string value (such as ``None``) is treated as "no match".
    field_name : str
        One of ``"IMO"``, ``"Call Sign"``, ``"MMSI"``, ``"GT"``, ``"NT"``, ``"DWT"``.

    Returns
    -------
    str
        The captured text with commas removed, or ``"N/A"`` when the field is
        not found in ``value``.

    Raises
    ------
    ValueError
        If ``field_name`` is not a known field. Previously an unknown name fell
        back to an empty pattern, which always matched and then failed with an
        unrelated ``IndexError`` on the missing capture group.
    """
    try:
        pattern = _FIELD_PATTERNS[field_name]
    except KeyError:
        known = ", ".join(sorted(_FIELD_PATTERNS))
        raise ValueError(
            f"Unknown field_name {field_name!r}; expected one of: {known}"
        ) from None

    # Source JSON may carry null / non-text values; there is nothing to search.
    if not isinstance(value, str):
        return "N/A"

    match = pattern.search(value)
    if match is None:
        return "N/A"
    # Strip thousands separators so the result is a plain numeric string.
    return match.group(1).replace(",", "")

In [34]:
def _first_present(details, *keys):
    """Return the value of the first of ``keys`` found in ``details``, else "Missing"."""
    for key in keys:
        if key in details:
            return details[key]
    return "Missing"


def process_ship_data(ship_data):
    """Convert raw per-ship particulars into ``ShipTechnicalSpecs`` records.

    Parameters
    ----------
    ship_data : dict
        Mapping whose keys are used as the IMO number and whose values are
        dicts of raw particulars. The keys read from each value are
        ``"Identification"``, ``"Flag"``, ``"Built year"``, ``"Displacement"``,
        ``"Length"``, ``"Beam"`` / ``"Breadth"``, ``"Depth"``,
        ``"Draft"`` / ``"Draught"``, ``"Tonnage"``, ``"Installed power"``,
        ``"Propulsion"``, ``"Speed"`` and ``"Capacity"``.

    Returns
    -------
    list
        One ``ShipTechnicalSpecs`` per entry, in the iteration order of
        ``ship_data``. Absent keys become ``"Missing"``; MMSI, call sign and
        the tonnage figures are whatever ``extract_field`` returns.
    """
    processed_data = []

    for imo_number, ship_details in ship_data.items():
        # MMSI and call sign are embedded in one free-text identification field.
        identification = ship_details.get("Identification", "Missing")
        # GT / NT / DWT are embedded in one free-text tonnage field.
        tonnage = ship_details.get("Tonnage", "Missing")

        processed_data.append(
            ShipTechnicalSpecs(
                IMO_number=imo_number,
                mmsi=extract_field(identification, "MMSI"),
                call_sign=extract_field(identification, "Call Sign"),
                flag=ship_details.get("Flag", "Missing"),
                built_year=ship_details.get("Built year", "Missing"),
                displacement=ship_details.get("Displacement", "Missing"),
                length=ship_details.get("Length", "Missing"),
                # "Breadth" and "Draught" used to be read and then discarded,
                # so ships documented under those spellings lost the value.
                # They now act as fallbacks for "Beam" and "Draft".
                beam=_first_present(ship_details, "Beam", "Breadth"),
                depth=ship_details.get("Depth", "Missing"),
                draft=_first_present(ship_details, "Draft", "Draught"),
                gross_tonnage=extract_field(tonnage, "GT"),
                net_tonnage=extract_field(tonnage, "NT"),
                dwt=extract_field(tonnage, "DWT"),
                engines=ship_details.get("Installed power", "Missing"),
                propulsion=ship_details.get("Propulsion", "Missing"),
                speed=ship_details.get("Speed", "Missing"),
                capacity=ship_details.get("Capacity", "Missing"),
            )
        )
    return processed_data

In [None]:
file_names = ["wikipedia_ship_data_v2", "sflcorp_fleet", "pleiades_fleet", "kaggle_ship_data", "cmb-tech-fleet"]
dataframes = []  # one DataFrame per entry of file_names, in the same order

# NOTE: `data`, `processed_data` and `output_data` are overwritten on every
# iteration, so after the loop they describe the LAST file in `file_names`.
for name in file_names:
    print(name)
    # Pin UTF-8 so decoding does not depend on the OS locale default.
    with open(f'../data/raw/ship_particulars/{name}.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    # The handle is only needed for parsing; process once it has been closed.
    processed_data = process_ship_data(data)
    output_data = [ship.__dict__ for ship in processed_data]
    dataframes.append(pd.DataFrame(output_data))

In [35]:
# Derive the SFL Corp records explicitly. `processed_data` is whatever the file
# loop above handled last, so building the SFL frame from it relied on leftover
# kernel state and does not survive Restart & Run All.
with open('../data/raw/ship_particulars/sflcorp_fleet.json', 'r', encoding='utf-8') as f:
    sflcorp_raw = json.load(f)
output_data = [ship.__dict__ for ship in process_ship_data(sflcorp_raw)]

In [36]:
# Tabulate the per-ship dicts: one row per record, one column per field.
sflcorp_df = pd.DataFrame(data=output_data)

In [37]:
# Display the assembled SFL Corp frame (rendered via the cell's last expression).
sflcorp_df

Unnamed: 0,IMO_number,call_sign,mmsi,flag,built_year,displacement,length,beam,depth,draft,gross_tonnage,net_tonnage,engines,propulsion,speed,dwt,capacity
0,9617947,,,Hong Kong,2012,Missing,229m,Missing,Missing,Missing,,,Missing,Missing,Missing,,82.000 DWT
1,9600839,,,Hong Kong,2010,Missing,190m,Missing,Missing,Missing,,,Missing,Missing,Missing,,57.000 DWT
2,9539834,,,Hong Kong,2011,Missing,190m,Missing,Missing,Missing,,,Missing,Missing,Missing,,57.000 DWT
3,9615676,,,Hong Kong,2011,Missing,190m,Missing,Missing,Missing,,,Missing,Missing,Missing,,57.000 DWT
4,9615561,,,Hong Kong,2012,Missing,190m,Missing,Missing,Missing,,,Missing,Missing,Missing,,57.000 DWT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,9667162,,,Liberia,2014,Missing,Missing,Missing,Missing,Missing,,,Missing,Missing,Missing,,15.400 TEU
68,9667150,,,Liberia,2014,Missing,Missing,Missing,Missing,Missing,,,Missing,Missing,Missing,,15.400 TEU
69,9227340,,,Liberia,2002,Missing,282m,Missing,Missing,Missing,,,Missing,Missing,Missing,,4.100 TEU
70,9309150,,,Liberia,2005,Missing,176m,Missing,Missing,Missing,,,Missing,Missing,Missing,,1.700 TEU


In [38]:
# Build `wikipedia_df` explicitly here: no other visible cell assigns it, so
# displaying it relied on leftover kernel state. The input is the Wikipedia
# export named in the file list — presumably the source of the frame shown
# below; TODO confirm.
with open('../data/raw/ship_particulars/wikipedia_ship_data_v2.json', 'r', encoding='utf-8') as f:
    wikipedia_raw = json.load(f)
wikipedia_df = pd.DataFrame([ship.__dict__ for ship in process_ship_data(wikipedia_raw)])
wikipedia_df

Unnamed: 0,IMO_number,call_sign,mmsi,flag,built_year,displacement,length,beam,depth,draft,gross_tonnage,net_tonnage,engines,propulsion,speed,dwt,capacity
0,1007213,ZCIS,319866000,Missing,Missing,"8,850 t",126.20 m (414 ft 1 in),21.00 m (68 ft 11 in),Missing,5.76 m (18 ft 11 in),9932,2979,"8 diesel engines\ntotal 19,200 hp (14,300 kW)",2 propellers,19 knots (35 km/h; 22 mph) max,,26
1,6602898,CQSC,255717000,Missing,Missing,Missing,135 metres (443 ft),15.8 metres (52 ft),Missing,Missing,,,Missing,Missing,18 knots,,535 (normal)\n621 (maximum)
2,7037806,,,Missing,Missing,Missing,123.40 m (404 ft 10 in),19.20 m (63 ft 0 in),Missing,Missing,9149,,8 × NOHAB SF 112 diesels[2],2 × controllable pitch propellers\n1 × bow thr...,21 kn (38.89 km/h; 24.17 mph),,1165 passengers\n240 cars
3,7207451,FNKC,247322000,Missing,Missing,Missing,114.59 m (375 ft 11 in),18.62 m (61 ft 1 in),Missing,Missing,,,Two Pielstick 16PC2-2V-400 diesel engines,Missing,21.8 knots (40.4 km/h),,"1,000 passengers, 48 berths, 160 cars. 35 rail..."
4,7350090,5BWZ3,210832000,Missing,Missing,Missing,Missing,Missing,Missing,Missing,4774,,MAK 9M453AK diesel engine,Missing,Missing,,"1,200 passengers, 404 cars (120 lorries or 700..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,9056583,,246213000,Missing,Missing,Missing,166.77 m (547 ft 2 in),23.4 m (76 ft 9 in),Missing,5.8 m (19 ft 0 in),17464,,4 x Sulzer,Missing,22 knots (41 km/h; 25 mph),,114 berths
468,9208629,ZNNK8,235249000,Missing,Missing,"25,113 long tons (25,516 t)",215.44 m (706 ft 10 in),31.85 m (104 ft 6 in),Missing,Missing,59925,,Missing,"4 × Wärtsilä 9L46C\n37,800 kW (50,700 hp) (com...",22 knots (41 km/h; 25 mph)[1],8850,"1,360 passengers\n530 cabins (incl. 6 suites, ..."
469,9283186,,,Missing,Missing,Missing,272 m (892.4 ft)[1],40 m (131.2 ft)[1],24.2 m (79.4 ft)[1],Missing,69132,,HSD Engine Co. Ltd. 8RTA96C-B[6],Missing,23 knots[7],,Missing
470,9283239,,,Missing,Missing,Missing,272 m (892.4 ft)[1],40 m (131.2 ft)[1],24.2 m (79.4 ft)[1],Missing,69132,,HSD Engine Co. Ltd. 8RTA96C-B[6],Missing,23 knots[7],,Missing
