In [1]:
import pandas as pd
import numpy as np
import warnings
import sys
#warnings.filterwarnings('ignore')

In [2]:
# Load the semicolon-separated CSV into a DataFrame.
# The file uses ',' as the decimal mark and is read as ISO-8859-1 text.
filename = '../data/external/seilas2019.csv'
df = pd.read_csv(
    filename,
    sep=';',
    decimal=',',
    encoding='ISO-8859-1',
)

In [3]:
# Print the column names, non-null counts and dtypes of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156709 entries, 0 to 156708
Data columns (total 43 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   seilas_id                      156709 non-null  int64  
 1   skips_id                       156709 non-null  int64  
 2   imo_nummer                     156020 non-null  float64
 3   mmsi_nummer                    155412 non-null  float64
 4   kallesignal                    156332 non-null  object 
 5   fartoynavn                     156709 non-null  object 
 6   byggeaar                       156553 non-null  float64
 7   bruttotonnasje_bt              156708 non-null  float64
 8   doedvekttonn_dwt               148086 non-null  float64
 9   lengde                         156709 non-null  float64
 10  bredde                         156646 non-null  float64
 11  hoeyde                         21979 non-null   float64
 12  hoeyde_aktuell                

In [4]:
# Preview the first rows of the raw frame
df.head()

Unnamed: 0,seilas_id,skips_id,imo_nummer,mmsi_nummer,kallesignal,fartoynavn,byggeaar,bruttotonnasje_bt,doedvekttonn_dwt,lengde,...,ankomsthavn_kode,ankomsthavn_navn,ankomsttidspunkt,land_ankomst,landkode_ankomst_totegn,fylkesnavn_ankomst,fylkesnr_ankomst,kommunenavn_ankomst,kommunenr_ankomst,lokasjonstype_ankomst
0,1306838,285944,9213973.0,257327000.0,LMWR,MALENE S,2000.0,2190.0,2500.0,68.300003,...,XZNAO,Nord-Atlanteren,2019-01-21 17:19:00+01,INTERNASJONALT FARVANN,XZ,,,,,Sted i sjøen
1,1306813,352829,9625023.0,257957000.0,LDWW,SKANDI MAROY,2012.0,3588.0,3594.0,82.199997,...,XZZNO,Ekofisk,2019-01-21 13:00:00+01,INTERNASJONALT FARVANN,XZ,,,,,Offshoreinstallasjon
2,1307030,282727,6721151.0,375219000.0,J8XH6,LOLA 1,1967.0,624.0,838.0,54.970001,...,NOLAO,Langøya,2019-01-23 22:00:00+01,NORGE,NO,Vestfold og Telemark,38.0,Tønsberg,3803.0,Havn
3,1306904,245973,8915524.0,257312000.0,LADP8,SILVER BIRD,1990.0,3625.0,3546.0,92.900002,...,NOTOS,Tromsø,2019-01-22 04:00:00+01,NORGE,NO,Troms og Finnmark,54.0,Tromsø,5401.0,Havn
4,1306954,360227,9750701.0,477150100.0,VRRA7,VLADIMIR RUSANOV,2018.0,128806.0,96844.0,299.0,...,NOHVG,Honningsvåg,2019-01-24 11:00:00+01,NORGE,NO,Troms og Finnmark,54.0,Nordkapp,5435.0,Havn


In [5]:
# Dataset overview: number of distinct values in a handful of columns
columns = ['mmsi_nummer', 'ankomsthavn_navn', 'avgangshavn_navn', 'skipstype', 'skipsgruppe']
for column in columns:
    n_unique = df[column].nunique()  # nunique() leaves missing values out of the count
    print(f'{n_unique} unique {column}')

# Sorted list of the distinct ship types
print('\n')
ship_types = sorted(df['skipstype'].unique())
print(f'Ship types: \n {ship_types}')

# Sorted list of the distinct ship groups
print('\n')
ship_groups = sorted(df['skipsgruppe'].unique())
print(f'Ship categories: \n {ship_groups}')

4373 unique mmsi_nummer
1627 unique ankomsthavn_navn
1647 unique avgangshavn_navn
94 unique skipstype
6 unique skipsgruppe


Ship types: 
 ['Anchor Handling Tug Supply', 'Anchor Handling Vessel', 'Asphalt/Bitumen Tanker', 'Barge', 'Bulk Carrier', 'Bulk Carrier, Self-discharging', 'Bulk/Caustic Soda Carrier (CABU)', 'Bunkering Tanker', 'Buoy & Lighthouse Tender', 'Buoy Tender', 'CO2 Tanker', 'Cable Layer', 'Cable Repair Ship', 'Cement Carrier', 'Chemical Tanker', 'Chemical/Products Tanker', 'Combination Gas Tanker (LNG/LPG)', 'Container Ship (Fully Cellular)', 'Crane Vessel', 'Crew Boat', 'Crew/Supply Vessel', 'Crude Oil Tanker', 'Crude/Oil Products Tanker', 'Deck Cargo Ship', 'Diving Support Vessel', 'Dredger (unspecified)', 'Drilling Rig, jack up', 'Drilling Rig, semi Submersible', 'Drilling Ship', 'FPSO, Oil', 'Fish Carrier', 'Fish Factory Ship', 'Fish Farm Support Vessel', 'Fishery Research Vessel', 'Fishery Support Vessel', 'Fishing Vessel', 'General Cargo Ship', 'General Cargo Shi

In [6]:
# filter data to only contain ships of category 'Last' (= cargo)
# Take an explicit copy: df_cargo then is an independent frame, so assigning
# to it in later cells does not trigger pandas' SettingWithCopyWarning.
is_cargo = df['skipsgruppe'] == 'Last'
df_cargo = df.loc[is_cargo].copy()
df_cargo.info()

<class 'pandas.core.frame.DataFrame'>
Index: 85756 entries, 1 to 156708
Data columns (total 43 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   seilas_id                      85756 non-null  int64  
 1   skips_id                       85756 non-null  int64  
 2   imo_nummer                     85668 non-null  float64
 3   mmsi_nummer                    85468 non-null  float64
 4   kallesignal                    85711 non-null  object 
 5   fartoynavn                     85756 non-null  object 
 6   byggeaar                       85681 non-null  float64
 7   bruttotonnasje_bt              85756 non-null  float64
 8   doedvekttonn_dwt               84988 non-null  float64
 9   lengde                         85756 non-null  float64
 10  bredde                         85705 non-null  float64
 11  hoeyde                         9886 non-null   float64
 12  hoeyde_aktuell                 43609 non-null  flo

In [7]:
# Display the rows of df_cargo where mmsi_nummer is missing
missing_mmsi = df_cargo['mmsi_nummer'].isna()
df_cargo[missing_mmsi]

Unnamed: 0,seilas_id,skips_id,imo_nummer,mmsi_nummer,kallesignal,fartoynavn,byggeaar,bruttotonnasje_bt,doedvekttonn_dwt,lengde,...,ankomsthavn_kode,ankomsthavn_navn,ankomsttidspunkt,land_ankomst,landkode_ankomst_totegn,fylkesnavn_ankomst,fylkesnr_ankomst,kommunenavn_ankomst,kommunenr_ankomst,lokasjonstype_ankomst
787,1349382,200418,8214358.0,,V2HG3,ST. PAULI,1983.0,3075.0,3219.0,92.410004,...,DEWIS,Wismar,2019-04-25 07:00:00+02,TYSKLAND,DE,,,,,Havn
1053,1349381,200418,8214358.0,,V2HG3,ST. PAULI,1983.0,3075.0,3219.0,92.410004,...,NOSPG,Sarpsborg,2019-04-23 07:00:00+02,NORGE,NO,Viken,30.0,Sarpsborg,3003.0,Havn
1154,1353475,351671,9507374.0,,LXWE,WILSON ALICANTE,2010.0,2451.0,3600.0,88.239998,...,NLRTM,Rotterdam,2019-05-10 19:00:00+02,NEDERLAND,NL,,,,,Havn
1320,1352084,351671,9507374.0,,LXWE,WILSON ALICANTE,2010.0,2451.0,3600.0,88.239998,...,NOMQN,Mo i Rana,2019-04-30 14:49:00+02,NORGE,NO,Nordland,18.0,Rana,1833.0,Havn
1720,1354383,200418,8214358.0,,V2HG3,ST. PAULI,1983.0,3075.0,3219.0,92.410004,...,SEVAG,Varberg,2019-05-06 09:30:00+02,SVERIGE,SE,,,,,Havn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155612,1339878,277096,9174751.0,,,CITY OF AMSTERDAM,1999.0,9950.0,2779.0,99.919998,...,NOZTR,Trondheim Reden,2019-03-31 08:12:00+02,NORGE,NO,Trøndelag,50.0,Trondheim,5001.0,Ankerplass
155708,1339029,277096,9174751.0,,,CITY OF AMSTERDAM,1999.0,9950.0,2779.0,99.919998,...,NOTRD,Trondheim,2019-03-31 16:36:00+02,NORGE,NO,Trøndelag,50.0,Trondheim,5001.0,Havn
155773,1336817,277096,9174751.0,,,CITY OF AMSTERDAM,1999.0,9950.0,2779.0,99.919998,...,NOBGO,Bergen,2019-03-27 08:54:00+01,NORGE,NO,Vestland,46.0,Bergen,4601.0,Havn
155849,1340274,277096,9174751.0,,,CITY OF AMSTERDAM,1999.0,9950.0,2779.0,99.919998,...,BEZEE,Zeebrugge,2019-04-06 12:00:00+02,BELGIA,BE,,,,,Havn
