## Imports

In [2]:
import time

import numpy as np
from dotenv import load_dotenv

import logger_setup

import pandas as pd
from influxdb_client import InfluxDBClient
# Interesting vessel:
# 215131000

import os
from influxdb_client import Point
from influxdb_client.client.write_api import SYNCHRONOUS

from csv_reader import ais_csv_to_df

## Functions

In [3]:
def create_point(row: pd.Series, measurement_name: str,
                 mmsi_fieldname="MMSI", vessel_name_fieldname="VesselName",
                 latitude_fieldname="LAT", longitude_fieldname="LON", time_fieldname="BaseDateTime"
                 ):
    """Build an InfluxDB ``Point`` from a single row of vessel data.

    Args:
        row: Row that holds the values; it is indexed with the column names
            given by the ``*_fieldname`` arguments.
        measurement_name: Measurement name passed to ``Point``.
        mmsi_fieldname: Column whose value is stored as the ``mmsi`` tag.
        vessel_name_fieldname: Column whose value is stored as the
            ``vessel_name`` tag.
        latitude_fieldname: Column whose value is stored as the ``lat`` field.
        longitude_fieldname: Column whose value is stored as the ``lon`` field.
        time_fieldname: Column whose value is used as the point timestamp.

    Returns:
        The populated ``Point``.
    """
    return (
        Point(measurement_name=measurement_name)
        .tag("mmsi", row[mmsi_fieldname])
        .tag("vessel_name", row[vessel_name_fieldname])
        .field("lat", row[latitude_fieldname])
        .field("lon", row[longitude_fieldname])
        .time(row[time_fieldname])
    )


def upload_df_to_influx_in_batches(df: pd.DataFrame, influx_client: InfluxDBClient, bucket_name: str,
                                   organization_id: str,
                                   batch_size: int = 100000,
                                   data_frame_tag_columns=["MMSI", "VesselName", "CallSign", "VesselType", "Status",
                                                           "Length", "Width", "Cargo", "TransceiverClass"],
                                   measurement_name: str = "vessels_ais_31_12",
                                   timestamp_column: str = "BaseDateTime"):
    """Write ``df`` to InfluxDB in consecutive slices of at most ``batch_size`` rows.

    Args:
        df: Data to upload.
        influx_client: Client used to obtain a synchronous write API.
        bucket_name: Destination bucket.
        organization_id: Organization passed to every write call.
        batch_size: Maximum number of rows sent in one write call.
        data_frame_tag_columns: Columns written as tags. The default list is
            only read, never modified.
        measurement_name: Measurement the rows are written to.
        timestamp_column: Column used as the point timestamp.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    logger.debug(f"Uploading to influxdb. Batch size: {batch_size}.")

    rows = df.shape[0]
    # Ceiling division: an exact multiple of batch_size must not produce an
    # extra division, and an empty frame produces none at all.
    divisions = -(-rows // batch_size)

    # The context manager closes the write API even when a write fails.
    with influx_client.write_api(write_options=SYNCHRONOUS) as write_api:
        for i, first_row in enumerate(range(0, rows, batch_size)):
            # Positional slicing replaces np.array_split, which is deprecated
            # for DataFrame input and emits a FutureWarning.
            batch = df.iloc[first_row:first_row + batch_size]
            logger.debug(f"Uploading division {i}/{divisions - 1}. Shape: {batch.shape}. Processing...")
            write_api.write(bucket=bucket_name, org=organization_id,
                            record=batch,
                            data_frame_measurement_name=measurement_name,
                            data_frame_tag_columns=data_frame_tag_columns,
                            data_frame_timestamp_column=timestamp_column,
                            )

### Setup

In [21]:
logger = logger_setup.setup_logging()
load_dotenv()
# Settings for a local InfluxDB instance, kept for reference:
# token = os.environ.get("API_INFLUX_KEY_10")
# org = os.environ.get("INFLUX_ORG_ID_10")
# url = "http://localhost:" + os.environ.get("INFLUX_PORT", "55001")
token = os.environ.get("API_INFLUX_KEY_CLOUD")
org = os.environ.get("INFLUX_CLOUD_ORG")
url = os.environ.get("INFLUX_CLOUD_HOST")

# Never log the token itself: cell output is saved with the notebook, so the
# credential would be shared with everyone who receives the file.
logger.debug(f"Token: {'<set>' if token else '<missing>'}")
logger.debug(f"Organization id: {org}")
logger.info(f"Database endpoint: {url}")

2024-09-03 21:14:00,369 - DEBUG: Token: <redacted>
2024-09-03 21:14:00,370 - DEBUG: Organization id: Geoinformatyka
2024-09-03 21:14:00,370 - INFO: Database endpoint: https://us-east-1-1.aws.cloud2.influxdata.com


### Setting spatial index

InfluxDB uses S2 cells for spatial indexing.

In a Flux query, the `geo.shapeData()` function computes the S2 cell for each point.

`s2_cell_id` has to be **saved as a tag** for other functions (such as `geo.filterRows()`) to work. 

In [23]:
import datetime

# Client and query API used by the indexing loop in this cell.
client = InfluxDBClient(url=url, token=token, org=org, timeout=60000)  # Set timeout to 60 seconds
query_api = client.query_api()


def generate_time_ranges_for_day(date: datetime.date, interval_minutes: float):
    """Yield consecutive ``(start, stop)`` timestamp pairs that cover one day.

    The day is treated as the half-open range from midnight of ``date`` up to
    midnight of the following day, so that a query with an exclusive stop
    bound covers the whole day. The final pair is shortened when the interval
    does not divide the day evenly.

    Args:
        date: Day to split.
        interval_minutes: Length of each range in minutes; must be positive.

    Yields:
        Tuples of two ISO-8601 strings, each suffixed with ``'Z'``.

    Raises:
        ValueError: If ``interval_minutes`` is not positive (raised when
            iteration starts); a non-positive step would never terminate.
    """
    if interval_minutes <= 0:
        raise ValueError(f"interval_minutes must be positive, got {interval_minutes}")

    step = datetime.timedelta(minutes=interval_minutes)
    current_start = datetime.datetime.combine(date, datetime.time.min)
    # Next midnight rather than 23:59:59.999999, so the last range has no gap.
    end_of_day = current_start + datetime.timedelta(days=1)
    while current_start < end_of_day:
        current_end = min(current_start + step, end_of_day)
        yield current_start.isoformat() + 'Z', current_end.isoformat() + 'Z'
        current_start = current_end


# Day to index and the width, in minutes, of each query window.
day = datetime.date(2020, 12, 31)
interval = 30

# NOTE(review): the author flagged these bucket names as unverified
# ("Are these correct?") - confirm before running.
raw_data_bucket = "temp_bucket_2"
# indexed_data_bucket = "shapedData_bucket2"
# indexed_data_bucket = "shapedData_bucket3"
indexed_data_bucket = "aisdata_s2indexed_lvl20"

lat_field_name = "LAT"
lon_field_name = "LON"
level = 20

try:
    for start, end in generate_time_ranges_for_day(day, interval):
        print(f"From: {start}, to: {end}", end=" ")
        # The field filter uses the same names as geo.shapeData so the two
        # cannot drift apart when the field names are changed.
        flux_query = f"""
import "experimental/geo"

from(bucket: "{raw_data_bucket}")
    |> range(start: {start}, stop: {end})
    |> filter(fn: (r) => r._measurement == "vessels_ais_31_12")
    |> filter(fn: (r) => r._field == "{lat_field_name}" or r._field == "{lon_field_name}")
    |> geo.shapeData(latField: "{lat_field_name}", lonField: "{lon_field_name}", level: {level})
    |> to
        (bucket: "{indexed_data_bucket}", tagColumns: ["s2_cell_id", "MMSI"], fieldFn: (r) => ({{"lat": r.lat, "lon": r.lon}}))
    """

        # Execute the query
        result = query_api.query(flux_query)
        print(f"Level {level}, Finished.")
finally:
    # Release the connection opened at the top of this cell, as the other
    # cells of this notebook do.
    client.close()
    


From: 2020-12-31T00:00:00Z, to: 2020-12-31T00:30:00Z Level 20, Finished.
From: 2020-12-31T00:30:00Z, to: 2020-12-31T01:00:00Z Level 20, Finished.
From: 2020-12-31T01:00:00Z, to: 2020-12-31T01:30:00Z Level 20, Finished.
From: 2020-12-31T01:30:00Z, to: 2020-12-31T02:00:00Z Level 20, Finished.
From: 2020-12-31T02:00:00Z, to: 2020-12-31T02:30:00Z Level 20, Finished.
From: 2020-12-31T02:30:00Z, to: 2020-12-31T03:00:00Z Level 20, Finished.
From: 2020-12-31T03:00:00Z, to: 2020-12-31T03:30:00Z Level 20, Finished.
From: 2020-12-31T03:30:00Z, to: 2020-12-31T04:00:00Z Level 20, Finished.
From: 2020-12-31T04:00:00Z, to: 2020-12-31T04:30:00Z Level 20, Finished.
From: 2020-12-31T04:30:00Z, to: 2020-12-31T05:00:00Z Level 20, Finished.
From: 2020-12-31T05:00:00Z, to: 2020-12-31T05:30:00Z Level 20, Finished.
From: 2020-12-31T05:30:00Z, to: 2020-12-31T06:00:00Z Level 20, Finished.
From: 2020-12-31T06:00:00Z, to: 2020-12-31T06:30:00Z Level 20, Finished.
From: 2020-12-31T06:30:00Z, to: 2020-12-31T07:00:00

### Spatial query and spatiotemporal query

Strictly speaking, this is a spatiotemporal query, because InfluxDB requires a time range to be specified for every query. This still takes the full time range into consideration.

In [37]:
class Bbox:
    """Bounding box described by four extents plus an identifier."""

    def __init__(self, min_lon, min_lat, max_lon, max_lat, id):
        # Keep every argument unchanged under the attribute of the same name.
        self.min_lon, self.min_lat = min_lon, min_lat
        self.max_lon, self.max_lat = max_lon, max_lat
        self.id = id

    def get_coords(self):
        """Return a new list ``[min_lon, min_lat, max_lon, max_lat]``."""
        order = ("min_lon", "min_lat", "max_lon", "max_lat")
        return [getattr(self, attribute) for attribute in order]

    def get_id(self):
        """Return the identifier given at construction."""
        return self.id
    
# Regions of interest; arguments are (min_lon, min_lat, max_lon, max_lat, id).
# Each box gets its own id so that map popups can tell the boxes apart.
bounding_boxes = [
    Bbox(-76, 34, -75.9, 34.3, 0),
    Bbox(-123.247925, 48.136125, -122.739476, 48.362910, 1),
]

In [66]:
client = InfluxDBClient(url=url, token=token, org=org, timeout=60000)  # Set timeout to 60 seconds
query_api = client.query_api()

# Buckets and time window used in earlier experiments, superseded by the
# assignments below:
# bucket = "shapedData_bucket2"
# bucket = "temp"
# bucket = "aisdata_s2indexed_lvl24"
# start_date = "2020-12-31T00:00:00Z"
# stop_date = "2020-12-31T00:00:59Z"

bucket = "aisdata"
start_date = "2024-09-01T00:00:00Z"
stop_date = "2024-09-01T00:00:59Z"

min_lon, min_lat, max_lon, max_lat = bounding_boxes[0].get_coords()
level = 10
strict = "true"
MMSI = "636017540"
# `region` is defined in the Flux script but only used by the filterRows
# step, which is currently commented out inside the query.
query = f"""
import "experimental/geo"

region = {{
    minLat: {min_lat},
    maxLat: {max_lat},
    minLon: {min_lon},
    maxLon: {max_lon},
}}

from(bucket: "{bucket}")
    |> range(start: {start_date}, stop: {stop_date})
    |> filter(fn: (r) => r._measurement == "vessels_ais_31_12")
    |> filter(fn: (r) => r._field == "LAT" or r._field == "LON")
    |> filter(fn: (r) => r.MMSI == "{MMSI}")
    |> geo.shapeData(latField: "LAT", lonField: "LON", level: {level})
    // |> geo.filterRows(region: region, level: {level}, strict: {strict})
"""
# //   |> geo.shapeData(latField: "LAT", lonField: "LON", level: 24)
# //   |> drop(columns: ["_measurement", "Status", "TransceiverClass", "VesselName", "VesselType", "CallSign"])
# //   |> geo.filterRows(region: region, level: 24, strict: true)
try:
    # Time only the round trip to the database, not the string formatting.
    start_time = time.time()
    tables = query_api.query(query)
    end_time = time.time()

    record_count = sum(len(table.records) for table in tables)
    logger.info(f"Query took {end_time - start_time} seconds, no. of results: {record_count}")
finally:
    # Close the connection even when the query fails.
    logger.info("Closing database connection...")
    client.close()

2024-09-03 21:41:32,785 - INFO: Query took 0.6288387775421143 seconds, no. of results: 1
2024-09-03 21:41:32,787 - INFO: Closing database connection...


In [67]:
# Print each record of each table held in `tables`, in table order.
for record in (entry for table in tables for entry in table.records):
    print(record)

FluxRecord() table: 0, {'result': '_result', 'table': 0, 'MMSI': '636017540', 'Status': '0.0', 'TransceiverClass': 'A', '_measurement': 'vessels_ais_31_12', '_start': datetime.datetime(2024, 9, 1, 0, 0, tzinfo=tzutc()), '_stop': datetime.datetime(2024, 9, 1, 0, 0, 59, tzinfo=tzutc()), '_time': datetime.datetime(2024, 9, 1, 0, 0, 1, tzinfo=tzutc()), 'lat': 34.25975, 'lon': -75.98961, 's2_cell_id': '89a7b1'}


In [29]:
import folium
def get_results_folium(influx_tables=None, bounding_boxes=None):
    """Draw query results and bounding boxes on a folium map.

    Args:
        influx_tables: Iterable of tables; each table is iterated to obtain
            records that are indexed with ``"MMSI"``, ``"_time"``, ``"lat"``
            and ``"lon"``. ``None`` (the default) draws no markers.
        bounding_boxes: Iterable of objects providing ``get_coords()``
            (returning ``min_lon, min_lat, max_lon, max_lat``) and
            ``get_id()``. ``None`` (the default) draws no rectangles.

    Returns:
        The ``folium.Map`` with one marker per record and one rectangle per
        bounding box.
    """
    # Initial map centre as (lat, lon).
    map_center = (42, -95)

    map_attributions = ('&copy; <a href="http://www.openstreetmap.org/copyright">OpenStreetMap</a> '
                        'contributors, &copy; <a href="http://cartodb.com/attributions">CartoDB</a>')

    m = folium.Map(location=map_center,
                   attr=map_attributions,
                   zoom_start=5,
                   control_scale=True,
                   height=800,
                   width=1400)

    # `or ()` turns the None defaults into "nothing to draw" without relying
    # on shared mutable default arguments.
    for table in influx_tables or ():
        for record in table:
            popup = folium.Popup(f"mmsi: {record['MMSI']}, time: {record['_time']}")
            folium.Marker(location=[record["lat"], record["lon"]],
                          popup=popup,
                          icon=folium.Icon(color='blue', icon='ship', prefix='fa')).add_to(m)

    for bbox in bounding_boxes or ():
        min_lon, min_lat, max_lon, max_lat = bbox.get_coords()

        # folium expects corners as (lat, lon), the reverse of get_coords().
        folium.Rectangle(
            bounds=[(min_lat, min_lon), (max_lat, max_lon)],
            color="blue",  # You can change the color as needed
            fill=True,
            popup=bbox.get_id(),
            fill_opacity=0.2
        ).add_to(m)

    return m
    
# Needs `tables` from the spatial query cell to exist in the kernel (running
# this cell first raises NameError). Only the first bounding box is drawn.
m: folium.Map = get_results_folium(tables, bounding_boxes[0:1])
m

NameError: name 'tables' is not defined

### Time query

### Uploading data - slow

In [11]:
# Uploading data one by one - slow

# safety check - don't run unless you really want to
# NOTE(review): exit() stops execution here; in a notebook it may also shut
# the kernel down - consider replacing it with an explicit boolean guard.
exit()

client: InfluxDBClient = InfluxDBClient(url=url, token=token, org=org)
bucket = "temp_bucket_2"

try:
    df = ais_csv_to_df("data/AIS_2020_12_31.csv")
    # Raw string: "\ " in a normal literal is an invalid escape sequence and
    # triggers a warning on recent Python versions; the value is unchanged.
    df["VesselName"] = df["VesselName"].str.replace(" ", r"\ ")
    print("Creating points...")
    df["Points"] = df.apply(create_point, axis=1, args=("vessels_ais_31_12",))
    write_api = client.write_api(write_options=SYNCHRONOUS)
    start_time = time.time()
    print("Uploading points...")
    for i, point in enumerate(df["Points"]):
        if i % 1000 == 0 and i != 0:
            # Read the clock once so both figures describe the same instant.
            elapsed = time.time() - start_time
            print(
                f"Point {i}: {point}. Time elapsed: {elapsed}. Average time per point: {elapsed / i}")
        write_api.write(bucket=bucket, org=org, record=point)
finally:
    # Close the connection even if loading or writing fails part-way.
    logger.info("Closing database connection...")
    client.close()

2024-07-08 08:30:26,167 - DEBUG: Loading data...


KeyboardInterrupt: 

### Uploading data - fast

In [28]:
# Uploading data to the InfluxDB database in batches - fast

# safety check - don't run unless you really want to
# exit()

client: InfluxDBClient = InfluxDBClient(url=url, token=token, org=org)
bucket = "aisdata"

try:
    df = ais_csv_to_df("data/AIS_2020_12_31_first_2_sec.csv")
    # df = df[["MMSI", "VesselName", "LAT", "LON", "BaseDateTime"]]
    # Both columns are uploaded as tags; spaces are replaced with underscores.
    df["VesselName"] = df["VesselName"].str.replace(" ", "_")
    df["CallSign"] = df["CallSign"].str.replace(" ", "_")
    logger.debug(f"Dataframe shape: {df.shape}")

    logger.debug("Beware! Executing The Command!")
    start_time = time.time()
    upload_df_to_influx_in_batches(df, client, bucket, org, 200000)
    end_time = time.time()
    logger.info(f"Upload time: {end_time - start_time}")
finally:
    # Close the connection even if loading or uploading fails.
    logger.info("Closing database connection...")
    client.close()

2024-09-03 21:18:19,362 - DEBUG: Loading data...
2024-09-03 21:18:19,370 - DEBUG: Dataframe shape: (302, 17)
2024-09-03 21:18:19,371 - DEBUG: Beware! Executing The Command!
2024-09-03 21:18:19,371 - DEBUG: Uploading to influxdb. Batch size: 200000.
  return bound(*args, **kwds)
2024-09-03 21:18:19,373 - DEBUG: Uploading division 0/0. Shape: (302, 17). Processing...
2024-09-03 21:18:20,462 - INFO: Upload time: 1.090437650680542
2024-09-03 21:18:20,463 - INFO: Closing database connection...


## How to connect to the database

In [10]:
# Minimal connection example: leaving the `with` block closes the client.
client: InfluxDBClient
with InfluxDBClient(url=url, token=token, org=org) as client:
    bucket = "temp_bucket_2"

    logger.info("Closing database connection...")

2024-07-08 08:29:04,515 - INFO: Closing database connection...
