# Combining AIS data

AIS (Automatic Identification System) data records ship locations; we have acquired two years of this data (2019-2020) from Spire Inc.

The two years of data are currently stored on the GOST team's S3 bucket, but we will look at other options for storing the data, probably on DDH.

In [44]:
import sys, os, json, time
import rasterio, boto3, pygeohash, pyarrow

import geopandas as gpd
import pandas as pd
import pyarrow.feather as feather

def tPrint(s):
    """Print *s* prefixed with the current wall-clock time (HH:MM:SS) and a tab."""
    stamp = time.strftime("%H:%M:%S")
    print(stamp, s, sep="\t")

In [32]:
# Locations of the input data, the outputs and the lookup tables
aws_bucket = "wbgdecinternal-ntl"
path = "AIS"
out_folder = "/home/wb411133/data/Global/"
docs_folder = "../Documentation"

# Ship type lookup loaded from JSON
# (see https://faq.spire.com/determining-ais-ship-type)
ship_types_file = os.path.join(docs_folder, 'ship_types.json')
with open(ship_types_file, 'r', encoding='utf-8') as ship_file:
    ship_types = json.load(ship_file)

# Ship status lookup loaded from JSON. The file handle gets its own name so
# that `ship_status` only ever refers to the parsed lookup, never to the
# (closed) file object.
ship_status_file = os.path.join(docs_folder, 'ship_statuses.json')
with open(ship_status_file, 'r', encoding='utf-8') as status_file:
    ship_status = json.load(status_file)

{'0': 'Under way using its engine',
 '1': 'Anchored',
 '2': 'Not under command',
 '3': 'Has restricted maneuverability',
 '4': 'Ship draught is limiting its movement',
 '5': 'Moored (tied to another object to limit free movement)',
 '6': 'Aground',
 '7': 'Engaged in fishing',
 '8': 'Under way sailing',
 '9': '(Number reserved for modifying reported status of ships carrying dangerous goods/harmful substances/marine pollutants)',
 '10': '(Number reserved for modifying reported status of ships carrying dangerous goods/harmful substances/marine pollutants)',
 '11': 'Power-driven vessel towing astern',
 '12': 'Power-driven vessel pushing ahead/towing alongside',
 '13': '(Reserved for future use)',
 '14': 'Any of the following are active: AIS-SART (Search and Rescue Transmitter), AIS-MOB (Man Overboard), AIS-EPIRB (Emergency Position Indicating Radio Beacon)',
 '15': 'Undefined (default)'}

In [3]:
# List all the AIS files on the S3 bucket.
#
# list_objects_v2 returns the listing in pages, so the full listing has to be
# assembled page by page. The boto3 paginator follows the continuation tokens
# itself, and lets real errors (credentials, networking, ...) propagate
# instead of being swallowed and retried from the first page.
client = boto3.client('s3')
paginator = client.get_paginator('list_objects_v2')

final = []      # one dict per S3 object (with 'Key', 'Size', ...), across all pages
loop_cnt = 0    # number of pages fetched so far
for ais_file_list in paginator.paginate(Bucket=aws_bucket, Prefix=path):
    loop_cnt += 1
    print(loop_cnt)
    # 'Contents' is missing from a page when no key matches the prefix
    final.extend(ais_file_list.get('Contents', []))

In [None]:
# Loop through all the input files, read them and combine them into `final_ais`.
# Only the columns listed here are kept, to shrink the data size.
keep_columns = ['timestamp','mmsi','status','ship_and_cargo_type','latitude','longitude']

# Collect the per-file frames and concatenate once at the end: growing a
# DataFrame inside the loop re-copies everything read so far on every
# iteration, and DataFrame.append no longer exists in pandas >= 2.0.
ais_frames = []
for loop_cnt, ais_file_info in enumerate(final, start=1):
    tPrint(f'{loop_cnt} of {len(final)}: {ais_file_info["Key"]}')
    # Build the S3 URL explicitly; os.path.join is meant for local paths and
    # would insert backslashes on Windows.
    curD = pd.read_csv(f"s3://{aws_bucket}/{ais_file_info['Key']}", usecols=keep_columns)
    # usecols keeps the file's column order; enforce the order listed above
    curD = curD.loc[:, keep_columns]
    ais_frames.append(curD)

if ais_frames:
    final_ais = pd.concat(ais_frames)
else:
    # No input files were listed: leave an empty frame with the expected columns
    final_ais = pd.DataFrame(columns=keep_columns)
        
    

12:15:07	1 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000000.csv


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


12:15:29	2 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000001.csv
12:15:50	3 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000002.csv
12:16:12	4 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000003.csv
12:16:35	5 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000004.csv
12:16:58	6 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000005.csv
12:17:21	7 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000006.csv
12:17:44	8 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000007.csv
12:18:06	9 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000008.csv
12:18:27	10 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000009.csv
12:18:50	11 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000010.csv
12:19:14	12 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000011.csv
12:19:36	13 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_00000

12:57:24	99 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000098.csv
12:57:53	100 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000099.csv
12:58:23	101 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000100.csv
12:58:51	102 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000101.csv
12:59:21	103 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000102.csv
12:59:50	104 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000103.csv
13:00:20	105 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000104.csv
13:00:50	106 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000105.csv
13:01:21	107 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000106.csv
13:01:51	108 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000107.csv
13:02:21	109 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000108.csv
13:02:51	110 of 5000: AIS/WorldBank_SAIS_globalAOI_2019

13:55:40	195 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000194.csv
13:56:33	196 of 5000: AIS/WorldBank_SAIS_globalAOI_20190101_20201231_000000000195.csv


In [None]:
# Convert time to just the YYYY-MM-DD (the first 10 characters of the string).
# The vectorised .str accessor avoids a Python-level call per row and leaves
# missing values as NaN instead of raising.
final_ais['timestamp'] = final_ais['timestamp'].str[:10]

# The combined frame carries the repeated per-file row numbers as its index;
# replace it with a plain 0..n-1 index so it is not written out as an extra column.
final_ais.reset_index(drop=True, inplace=True)
feather.write_feather(final_ais, os.path.join(out_folder, "AIS_Combined_2019_2020.feather"))

# DEBUGGING

In [35]:
# The geohash bit is cool but didn't shrink size at all
# curD['geohash'] = curD.apply(lambda x: pygeohash.encode(x['latitude'], x['longitude']), axis=1)

In [39]:

# Debug: write the reduced-column version of the most recently read file
subset_cols = ['timestamp','mmsi','status','ship_and_cargo_type','latitude','longitude']
subset_frame = curD.loc[:, subset_cols]
subset_path = os.path.join(out_folder, ais_file_info['Key'].replace(".csv", ".feather"))
feather.write_feather(subset_frame, subset_path)

In [33]:
# Debug: write every column currently held in curD for the most recently read file
full_path = os.path.join(out_folder, ais_file_info['Key'].replace(".csv", "_full.feather"))
feather.write_feather(curD, full_path)

In [37]:
# Debug: write only the timestamp and geohash columns.
# NOTE(review): needs a 'geohash' column on curD; the only line in this file
# that creates one is commented out in an earlier cell -- confirm before running.
small_frame = curD.loc[:, ['timestamp','geohash']]
small_path = os.path.join(out_folder, ais_file_info['Key'].replace(".csv", "_small.feather"))
feather.write_feather(small_frame, small_path)

In [38]:
# Show the column names of the most recently read file (curD)
curD.columns

Index(['created_at', 'timestamp', 'mmsi', 'msg_type', 'latitude', 'longitude',
       'speed', 'course', 'heading', 'rot', 'imo', 'name', 'call_sign', 'flag',
       'draught', 'ship_and_cargo_type', 'length', 'width', 'eta',
       'destination', 'status', 'maneuver', 'accuracy', 'collection_type',
       'to_bow', 'to_stern', 'to_port', 'to_starboard', 'geohash'],
      dtype='object')