In [1]:
import pandas as pd
import numpy as np
import warnings
import sys
#warnings.filterwarnings('ignore')

In [5]:
# Load the seilas-2022 CSV export into a DataFrame.
# The file uses ';' between fields, ',' as the decimal mark and is
# encoded as ISO-8859-1, so all three must be passed explicitly.
filename = '../data/external/seilas-2022.csv'
df = pd.read_csv(
    filename,
    sep=';',
    decimal=',',
    encoding='ISO-8859-1',
)

In [6]:
# Print the frame summary: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160564 entries, 0 to 160563
Data columns (total 43 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   seilas_id                      160564 non-null  int64  
 1   skips_id                       160564 non-null  int64  
 2   imo_nummer                     159684 non-null  float64
 3   mmsi_nummer                    159675 non-null  float64
 4   kallesignal                    160366 non-null  object 
 5   fartoynavn                     160564 non-null  object 
 6   byggeaar                       160368 non-null  float64
 7   bruttotonnasje_bt              160554 non-null  float64
 8   doedvekttonn_dwt               147475 non-null  float64
 9   lengde                         159856 non-null  float64
 10  bredde                         159854 non-null  float64
 11  hoeyde                         23323 non-null   float64
 12  hoeyde_aktuell                

In [7]:
# Preview the first rows of the raw frame (head() defaults to 5 rows).
df.head()

Unnamed: 0,seilas_id,skips_id,imo_nummer,mmsi_nummer,kallesignal,fartoynavn,byggeaar,bruttotonnasje_bt,doedvekttonn_dwt,lengde,...,ankomsthavn_kode,ankomsthavn_navn,ankomsttidspunkt,land_ankomst,landkode_ankomst_totegn,fylkesnavn_ankomst,fylkesnr_ankomst,kommunenavn_ankomst,kommunenr_ankomst,lokasjonstype_ankomst
0,1910084,311277,9349863.0,259222000.0,LNWC,COLOR MAGIC,2007.0,75156.0,6133.0,223.899994,...,DEKEL,Kiel,2022-06-23 10:00:00+02:00,TYSKLAND,DE,,,,,Havn
1,1712083,230941,8826541.0,273437080.0,UENY,GALATIS,1990.0,889.0,414.0,54.82,...,XZBSE,Barentshavet,2022-01-09 14:00:00+01:00,INTERNASJONALT FARVANN,XZ,,,,,Sted i sjøen
2,1910154,354157,9586617.0,219348000.0,OYPJ2,BERGENSFJORD,2014.0,32491.0,3900.0,170.0,...,DKHIR,Hirtshals,2022-06-22 19:00:00+02:00,DANMARK,DK,,,,,Havn
3,1910576,205648,8318063.0,257080050.0,LFYQ,BARENTS OCEAN,1984.0,1525.0,1837.0,76.449997,...,NOLAO,Langøya,2022-06-23 16:00:00+02:00,NORGE,NO,Vestfold og Telemark,38.0,Holmestrand,3802.0,Havn
4,1910191,354332,9592575.0,244810617.0,PCYE,REGGEBORG,2014.0,14224.0,23249.0,169.75,...,NOHRY,Herøya,2022-06-27 20:10:00+02:00,NORGE,NO,Vestfold og Telemark,38.0,Porsgrunn,3806.0,Havn


In [14]:
# some dataset characteristics
# Note: nunique() leaves missing values out of the count by default.
columns = ['mmsi_nummer', 'ankomsthavn_navn', 'avgangshavn_navn', 'skipstype', 'skipsgruppe']
for column in columns:
    print(f'{df[column].nunique()} unique {column}')

# print list of ship types and groups
# unique() keeps NaN as a value, unlike nunique() above. Dropping missing
# values first keeps the listings consistent with the printed counts and
# stops sorted() from raising TypeError when a float NaN is compared with
# the string labels.
print('\n')
print(f'Ship types: \n {sorted(df.skipstype.dropna().unique())}')
print('\n')
print(f'Ship categories: \n {sorted(df.skipsgruppe.dropna().unique())}')

4142 unique mmsi_nummer
1595 unique ankomsthavn_navn
1584 unique avgangshavn_navn
97 unique skipstype
6 unique skipsgruppe


Ship types: 
 ['Accommodation Platform, semi submersible', 'Accommodation Ship', 'Aggregates Carrier', 'Anchor Handling Tug Supply', 'Anchor Handling Vessel', 'Asphalt/Bitumen Tanker', 'Barge', 'Bulk Carrier', 'Bulk Carrier, Self-discharging', 'Bunkering Tanker', 'Buoy & Lighthouse Tender', 'Buoy Tender', 'CO2 Tanker', 'Cable Layer', 'Cement Carrier', 'Chemical Tanker', 'Chemical/Products Tanker', 'Combination Gas Tanker (LNG/LPG)', 'Container Ship (Fully Cellular)', 'Crane Vessel', 'Crew Boat', 'Crew/Supply Vessel', 'Crude Oil Tanker', 'Crude/Oil Products Tanker', 'Deck Cargo Ship', 'Diving Support Vessel', 'Drilling Rig, jack up', 'Drilling Rig, semi Submersible', 'Drilling Ship', 'FPSO, Oil', 'FSO, Oil', 'Fish Carrier', 'Fish Factory Ship', 'Fish Farm Support Vessel', 'Fishery Patrol Vessel', 'Fishery Research Vessel', 'Fishery Support Vessel', 'Fishing Vessel

In [16]:
# Keep only the rows whose ship group is 'Last' (= cargo), then print
# the summary of the resulting subset.
df_cargo = df.loc[df['skipsgruppe'].eq('Last')]
df_cargo.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84156 entries, 3 to 160562
Data columns (total 43 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   seilas_id                      84156 non-null  int64  
 1   skips_id                       84156 non-null  int64  
 2   imo_nummer                     83988 non-null  float64
 3   mmsi_nummer                    83855 non-null  float64
 4   kallesignal                    84012 non-null  object 
 5   fartoynavn                     84156 non-null  object 
 6   byggeaar                       84099 non-null  float64
 7   bruttotonnasje_bt              84148 non-null  float64
 8   doedvekttonn_dwt               82098 non-null  float64
 9   lengde                         83894 non-null  float64
 10  bredde                         83892 non-null  float64
 11  hoeyde                         9220 non-null   float64
 12  hoeyde_aktuell                 78656 non-null  flo

In [21]:
# Show the cargo rows that have no value in the mmsi_nummer column.
df_cargo.loc[df_cargo['mmsi_nummer'].isna()]

Unnamed: 0,seilas_id,skips_id,imo_nummer,mmsi_nummer,kallesignal,fartoynavn,byggeaar,bruttotonnasje_bt,doedvekttonn_dwt,lengde,...,ankomsthavn_kode,ankomsthavn_navn,ankomsttidspunkt,land_ankomst,landkode_ankomst_totegn,fylkesnavn_ankomst,fylkesnr_ankomst,kommunenavn_ankomst,kommunenr_ankomst,lokasjonstype_ankomst
452,1915397,332355,9361744.0,,PGDL,SOLENTBANK,2007.0,2998.0,4950.0,89.949997,...,NOSLV,Sløvåg Industriområde,2022-07-04 16:00:00+02:00,NORGE,NO,Vestland,46.0,Gulen,4635.0,Havn
2260,1956479,200418,8214358.0,,V2HG3,ST. PAULI,1983.0,3075.0,3219.0,92.410004,...,NOSPG,Sarpsborg,2022-09-19 21:00:00+02:00,NORGE,NO,Viken,30.0,Sarpsborg,3003.0,Havn
2446,1959828,200418,8214358.0,,V2HG3,ST. PAULI,1983.0,3075.0,3219.0,92.410004,...,DEWIS,Wismar,2022-09-24 12:00:00+02:00,TYSKLAND,DE,,,,,Havn
3249,1961549,200418,8214358.0,,V2HG3,ST. PAULI,1983.0,3075.0,3219.0,92.410004,...,NOSPG,Sarpsborg,2022-09-26 10:00:00+02:00,NORGE,NO,Viken,30.0,Sarpsborg,3003.0,Havn
3553,1961550,200418,8214358.0,,V2HG3,ST. PAULI,1983.0,3075.0,3219.0,92.410004,...,SEVAG,Varberg,2022-09-28 06:00:00+02:00,SVERIGE,SE,,,,,Havn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160044,1914882,352035,9431006.0,,LAMV8,WILSON NEWCASTLE,2011.0,6118.0,8326.0,123.040001,...,NOIKA,Vikan - Bodø,2022-07-04 23:55:00+02:00,NORGE,NO,Nordland,18.0,Bodø,1804.0,Havn
160131,1969306,352035,9431006.0,,LAMV8,WILSON NEWCASTLE,2011.0,6118.0,8326.0,123.040001,...,NOHOA,Holla,2022-10-16 10:35:00+02:00,NORGE,NO,Trøndelag,50.0,Heim,5055.0,Havn
160193,1829551,352035,9431006.0,,LAMV8,WILSON NEWCASTLE,2011.0,6118.0,8326.0,123.040001,...,NOHOA,Holla,2022-01-23 22:00:00+01:00,NORGE,NO,Trøndelag,50.0,Heim,5055.0,Havn
160267,1835755,352035,9431006.0,,LAMV8,WILSON NEWCASTLE,2011.0,6118.0,8326.0,123.040001,...,NOLEP,Leirpollen,2022-01-31 09:00:00+01:00,NORGE,NO,Troms og Finnmark,54.0,Deatnu Tana,5441.0,Havn
