In [1]:
# import all packages

import pandas as pd
import numpy as np
import joblib
from  feature_engineering import get_feature_df, create_feature_bins, create_network_graph
from data_preprocessing import get_time_frame_data, select_columns, split_network_train_set, remove_sinlge_occurences, keep_classes, get_subset_exclude_ids, get_subset_include_ids
from constants import START_DATE, END_DATE, CUTOFF_NEGATIVE_DURATION_PERCENTAGE, CLASSES_TO_USE
import datetime
import networkx as nx
import operator
import time
import os
from xgboost import XGBClassifier
from dateutil.relativedelta import relativedelta


## Train-test gap (static and resampled)

In [2]:
#   EXPERIMENT Static Gap
def static_gap(df_portcalls, results_directory):
    """Run the "static gap" train-test experiment and write every artefact to disk.

    For each of 30 monthly test windows (the first one starting at 2018-04-01)
    a set of test vessels is drawn and encoded over the 12 months ending at the
    test window. For each gap size ``x`` in 0..19 months:

    * training vessels are sampled in the month ending ``x`` months before the
      training cut-off and encoded over the 12 months before that point;
    * network vessels are sampled in the month ending ``2 * x + 1`` months
      before the training cut-off and encoded the same way;
    * the port network is built from the network batch, features are derived
      for the training and test batches, and one binary (class vs. 'Other')
      XGBoost model per entry of ``CLASSES_TO_USE`` is fitted and evaluated.

    Output layout below ``results_directory``::

        experiment_static_gap/run_<r>/experiment_log.txt
        experiment_static_gap/run_<r>/train_batch_<x+1>/imo_{train,test}.npy
        experiment_static_gap/run_<r>/train_batch_<x+1>/class_<name>/{y_train.npy, model.json}
        experiment_static_gap/run_<r>/train_batch_<x+1>/class_<name>/test_12/*.npy

    Args:
        df_portcalls: port-call table providing at least the columns "ATA_LT",
            "ATD_LT", "Port Name", "IMO Number" and
            "(ATA) Ship Type Description".
        results_directory: output root. It must end with a path separator,
            because the experiment folder name is appended by plain string
            concatenation. Missing directories are created.

    Returns:
        None. All results are written to disk.
    """
    current_results_directory = results_directory + "experiment_static_gap/"
    os.makedirs(current_results_directory, exist_ok=True)

    # Retrieve the desired timeframe
    df = get_time_frame_data(df_portcalls, START_DATE, END_DATE, "ATA_LT")

    #   only keep relevant columns and set the id column
    relevant_columns = {
        "ATA_LT": "datetime64[ns]",
        "ATD_LT": "datetime64[ns]",
        "Port Name": "string",
        "IMO Number": "string",
        "(ATA) Ship Type Description": "string",
    }
    new_column_names = ["Arrival Time", "Departure Time", "Port Name", "IMO number", "Ship Type"]
    id_column = "IMO number"

    df = select_columns(df, relevant_columns=relevant_columns, new_column_names=new_column_names)

    #   only keep the classes we want to classify
    df = keep_classes(df, classes_to_keep=CLASSES_TO_USE, target_column='Ship Type', id_column=id_column)

    #   remove vessels that occur only once
    df_classify = remove_sinlge_occurences(df, column_name=id_column)

    batch_size = relativedelta(months=1)
    encoding_size = relativedelta(months=12)

    # Data for November/December 2019 is missing: a window that would end on
    # 1 Dec 2019 or 1 Jan 2020 is replaced by the window ending on 1 Nov 2019.
    data_gap_window_ends = (datetime.datetime(2019, 12, 1), datetime.datetime(2020, 1, 1))
    data_gap_fallback_end = datetime.datetime(2019, 11, 1)

    def build_window(log, role, window_end, test_imos):
        """Sample `role` vessels in the month before `window_end` (never test
        vessels) and return their port calls over the preceding 12 months."""
        # Every window end is midnight on the 1st of a month, so subtracting one
        # month here equals the original `end_date_train - ((1 + x) * batch_size)`.
        sample_start = window_end - batch_size
        sample = get_subset_exclude_ids(df=df_classify, exclude_ids=test_imos, id_column=id_column,
                                        time_column="Arrival Time",
                                        start_date=sample_start, end_date=window_end)
        sampled_imos = pd.unique(sample[id_column])
        log.write(f"start date {role} sample: {sample_start} and end date {role} sample: {window_end}\n")
        log.write(f'there are {len(sampled_imos)} imos sampled for {role} \n')

        history_start = window_end - encoding_size
        batch = get_subset_include_ids(df=df_classify, include_ids=sampled_imos, id_column=id_column,
                                       time_column="Arrival Time",
                                       start_date=history_start, end_date=window_end)
        batch_imos = pd.unique(batch[id_column])
        log.write(f"start date {role}: {history_start} and end date {role}: {window_end}\n")
        log.write(f'there are {len(batch_imos)} imos in {role} and {len(batch)} portcalls\n')

        # Leakage check, logged twice with the same wording: first for the
        # sampled vessels, then for the vessels present in the encoded batch.
        for imos in (sampled_imos, batch_imos):
            overlap = np.intersect1d(imos, test_imos)
            log.write(f'there are {len(overlap)} imos overlapping between {role} and test\n')
        log.write('-----\n')
        return batch

    def binary_labels(features, target):
        """Encode the 'Target' column as 1 for `target` and 0 for any other class."""
        return (features['Target'].to_numpy() == target).astype(np.int64)

    def fit_and_evaluate(class_directory, X_train, y_train, X_test, y_test):
        """Fit one binary classifier, store it and store its test predictions."""
        # NOTE(review): 'gpu_hist' needs a GPU-enabled XGBoost build and is
        # deprecated in XGBoost >= 2.0 (device='cuda', tree_method='hist').
        bst = XGBClassifier(n_estimators=100, max_depth=3, objective='binary:logistic', tree_method='gpu_hist')
        np.save(f'{class_directory}y_train.npy', y_train)

        bst.fit(X_train, y_train)
        bst.save_model(f'{class_directory}model.json')

        test_directory = f'{class_directory}test_12/'
        os.makedirs(test_directory, exist_ok=True)

        # store class probabilities, hard predictions and the positive-class score
        y_pred_prob = np.array(bst.predict_proba(X_test))
        y_pred = np.argmax(y_pred_prob, axis=1)
        y_pred_prob_roc = y_pred_prob[:, 1]
        np.save(f'{test_directory}y_pred_prob.npy', y_pred_prob)
        np.save(f'{test_directory}y_pred.npy', y_pred)
        np.save(f'{test_directory}y_pred_prob_roc.npy', y_pred_prob_roc)
        np.save(f'{test_directory}y_test.npy', y_test)

    for run in range(30):
        start_network = time.time()

        iteration_directory = f'{current_results_directory}run_{run+1}/'
        os.makedirs(iteration_directory, exist_ok=True)

        # `with` guarantees the log is flushed and closed even when a run fails.
        with open(iteration_directory + 'experiment_log.txt', 'a') as f:
            end_date_train = datetime.datetime(2018, 4, 1) + (run * batch_size)
            # The test window starts one microsecond after the training cut-off.
            start_date_test = datetime.datetime(2018, 4, 1, microsecond=1) + (run * batch_size)

            # Test windows from November 2019 onwards are pushed three months
            # forward, which skips the months affected by the data loss.
            if start_date_test >= datetime.datetime(2019, 11, 1, microsecond=1):
                end_date_train = end_date_train + relativedelta(months=3)
                start_date_test = start_date_test + relativedelta(months=3)

            # Clip the last test window to the end of the available data.
            if start_date_test + batch_size <= END_DATE:
                end_date_test = start_date_test + batch_size
            else:
                end_date_test = END_DATE

            test_sample = get_time_frame_data(df_classify, start_date_test, end_date_test, "Arrival Time")
            test_sample, _ = split_network_train_set(df=test_sample, id_column=id_column,
                                                     label_column="Ship Type", network_size=0.1)
            test_imos = pd.unique(test_sample[id_column])

            f.write(f"start date test sample: {start_date_test} and end date test sample: {end_date_test}\n")
            f.write(f'Amount of imos in test sample {len(test_imos)}\n')

            test_history_start = end_date_test - encoding_size
            test_batch_12 = get_subset_include_ids(df=df_classify, include_ids=test_imos, id_column=id_column,
                                                   time_column="Arrival Time",
                                                   start_date=test_history_start, end_date=end_date_test)
            f.write(f"start date test 12: {test_history_start} and end date test 12: {end_date_test}\n")
            imos_12 = pd.unique(test_batch_12[id_column])
            f.write(f'in test there are {len(imos_12)} imos and {len(test_batch_12)} portcalls\n')

            for x in range(20):
                gap_size = x * batch_size
                f.write(f'train batch {x+1}:\n')
                f.write(f'gap size : {x} months\n')

                # training window: ends `x` months before the training cut-off
                train_end = end_date_train - gap_size
                if train_end in data_gap_window_ends:
                    f.write("ACCOUNTING FOR THE DATA LOSS IN NOVEMBER/DECEMBER 2019 \n ")
                    train_end = data_gap_fallback_end
                train_batch = build_window(f, 'train', train_end, test_imos)

                # network window: ends `2 * x + 1` months before the training cut-off
                network_end = end_date_train - (2 * gap_size) - batch_size
                if network_end in data_gap_window_ends:
                    network_end = data_gap_fallback_end
                network_batch = build_window(f, 'network', network_end, test_imos)

                G, travel_times, port_stay_times, _, _ = create_network_graph(network_batch, id_column=id_column)
                feature_bins = create_feature_bins(G, travel_times=travel_times, port_stay_times=port_stay_times)

                df_features_train = get_feature_df(train_batch, G, feature_bins=feature_bins,
                                                   id_column=id_column, target_column='Ship Type')
                df_features_test_12 = get_feature_df(test_batch_12, G, feature_bins=feature_bins,
                                                     id_column=id_column, target_column='Ship Type')

                fold_directory = f'{iteration_directory}train_batch_{x+1}/'
                os.makedirs(fold_directory, exist_ok=True)

                np.save(f'{fold_directory}imo_test.npy', df_features_test_12['IMO'].to_numpy())
                np.save(f'{fold_directory}imo_train.npy', df_features_train['IMO'].to_numpy())

                # The feature matrices do not depend on the class being modelled,
                # so they are built once per training batch.
                X_train = df_features_train.drop(['Target', 'IMO'], axis=1).to_numpy()
                X_test_12 = df_features_test_12.drop(['Target', 'IMO'], axis=1).to_numpy()

                for target in CLASSES_TO_USE:
                    name = target.replace('/', '-')
                    class_directory = f'{fold_directory}class_{name}/'
                    os.makedirs(class_directory, exist_ok=True)

                    # one-vs-rest labels: 1 = `target`, 0 = every other class
                    y_train = binary_labels(df_features_train, target)
                    y_test_12 = binary_labels(df_features_test_12, target)

                    fit_and_evaluate(class_directory, X_train, y_train, X_test_12, y_test_12)

            dur_network = time.time() - start_network
            f.write(f"In total from network creation until features took: {dur_network}, which is {dur_network/60} minutes\n")
            

In [3]:
#   EXPERIMENT Resampled Gap
def resampled_gap(df_portcalls, results_directory):
    """Run the "resampled gap" train-test experiment and write every artefact to disk.

    For each of 30 monthly test windows (the first one starting at 2018-04-01)
    a set of test vessels is drawn and encoded over the 12 months ending at the
    test window. For each gap size ``x`` in 0..39 months, training vessels are
    sampled in the month ending ``x`` months before the training cut-off and
    encoded over the 12 months before that point. The port network is rebuilt
    from that same training batch, features are derived for the training and
    test batches, and one binary (class vs. 'Other') XGBoost model per entry of
    ``CLASSES_TO_USE`` is fitted and evaluated.

    Output layout below ``results_directory``::

        experiment_resampled_gap/run_<r>/experiment_log.txt
        experiment_resampled_gap/run_<r>/train_batch_<x+1>/imo_{train,test}.npy
        experiment_resampled_gap/run_<r>/train_batch_<x+1>/class_<name>/{y_train.npy, model.json}
        experiment_resampled_gap/run_<r>/train_batch_<x+1>/class_<name>/test_12/*.npy

    Args:
        df_portcalls: port-call table providing at least the columns "ATA_LT",
            "ATD_LT", "Port Name", "IMO Number" and
            "(ATA) Ship Type Description".
        results_directory: output root. It must end with a path separator,
            because the experiment folder name is appended by plain string
            concatenation. Missing directories are created.

    Returns:
        None. All results are written to disk.
    """
    current_results_directory = results_directory + "experiment_resampled_gap/"
    os.makedirs(current_results_directory, exist_ok=True)

    # Retrieve the desired timeframe
    df = get_time_frame_data(df_portcalls, START_DATE, END_DATE, "ATA_LT")

    #   only keep relevant columns and set the id column
    relevant_columns = {
        "ATA_LT": "datetime64[ns]",
        "ATD_LT": "datetime64[ns]",
        "Port Name": "string",
        "IMO Number": "string",
        "(ATA) Ship Type Description": "string",
    }
    new_column_names = ["Arrival Time", "Departure Time", "Port Name", "IMO number", "Ship Type"]
    id_column = "IMO number"

    df = select_columns(df, relevant_columns=relevant_columns, new_column_names=new_column_names)

    #   only keep the classes we want to classify
    df = keep_classes(df, classes_to_keep=CLASSES_TO_USE, target_column='Ship Type', id_column=id_column)

    #   remove vessels that occur only once
    df_classify = remove_sinlge_occurences(df, column_name=id_column)

    batch_size = relativedelta(months=1)
    encoding_size = relativedelta(months=12)

    # Data for November/December 2019 is missing: a window that would end on
    # 1 Dec 2019 or 1 Jan 2020 is replaced by the window ending on 1 Nov 2019.
    data_gap_window_ends = (datetime.datetime(2019, 12, 1), datetime.datetime(2020, 1, 1))
    data_gap_fallback_end = datetime.datetime(2019, 11, 1)

    def build_train_batch(log, window_end, test_imos):
        """Sample training vessels in the month before `window_end` (never test
        vessels) and return their port calls over the preceding 12 months."""
        # Every window end is midnight on the 1st of a month, so subtracting one
        # month here equals the original `end_date_train - ((1 + x) * batch_size)`.
        sample_start = window_end - batch_size
        sample = get_subset_exclude_ids(df=df_classify, exclude_ids=test_imos, id_column=id_column,
                                        time_column="Arrival Time",
                                        start_date=sample_start, end_date=window_end)
        sampled_imos = pd.unique(sample[id_column])
        log.write(f"start date train sample: {sample_start} and end date train sample: {window_end}\n")
        log.write(f'there are {len(sampled_imos)} imos sampled for train \n')

        history_start = window_end - encoding_size
        batch = get_subset_include_ids(df=df_classify, include_ids=sampled_imos, id_column=id_column,
                                       time_column="Arrival Time",
                                       start_date=history_start, end_date=window_end)
        batch_imos = pd.unique(batch[id_column])
        # The end date logged here is the real window end; the data-loss branch
        # used to print the start date in both positions.
        log.write(f"start date train: {history_start} and end date train: {window_end}\n")
        log.write(f'there are {len(batch_imos)} imos in train and {len(batch)} portcalls\n')

        # Leakage check, logged twice with the same wording: first for the
        # sampled vessels, then for the vessels present in the encoded batch.
        for imos in (sampled_imos, batch_imos):
            overlap = np.intersect1d(imos, test_imos)
            log.write(f'there are {len(overlap)} imos overlapping between train and test\n')
        log.write('-----\n')
        return batch

    def binary_labels(features, target):
        """Encode the 'Target' column as 1 for `target` and 0 for any other class."""
        return (features['Target'].to_numpy() == target).astype(np.int64)

    def fit_and_evaluate(class_directory, X_train, y_train, X_test, y_test):
        """Fit one binary classifier, store it and store its test predictions."""
        # NOTE(review): 'gpu_hist' needs a GPU-enabled XGBoost build and is
        # deprecated in XGBoost >= 2.0 (device='cuda', tree_method='hist').
        bst = XGBClassifier(n_estimators=100, max_depth=3, objective='binary:logistic', tree_method='gpu_hist')
        np.save(f'{class_directory}y_train.npy', y_train)

        bst.fit(X_train, y_train)
        bst.save_model(f'{class_directory}model.json')

        test_directory = f'{class_directory}test_12/'
        os.makedirs(test_directory, exist_ok=True)

        # store class probabilities, hard predictions and the positive-class score
        y_pred_prob = np.array(bst.predict_proba(X_test))
        y_pred = np.argmax(y_pred_prob, axis=1)
        y_pred_prob_roc = y_pred_prob[:, 1]
        np.save(f'{test_directory}y_pred_prob.npy', y_pred_prob)
        np.save(f'{test_directory}y_pred.npy', y_pred)
        np.save(f'{test_directory}y_pred_prob_roc.npy', y_pred_prob_roc)
        np.save(f'{test_directory}y_test.npy', y_test)

    for run in range(30):
        start_network = time.time()

        iteration_directory = f'{current_results_directory}run_{run+1}/'
        os.makedirs(iteration_directory, exist_ok=True)

        # `with` guarantees the log is flushed and closed even when a run fails.
        with open(iteration_directory + 'experiment_log.txt', 'a') as f:
            end_date_train = datetime.datetime(2018, 4, 1) + (run * batch_size)
            # The test window starts one microsecond after the training cut-off.
            start_date_test = datetime.datetime(2018, 4, 1, microsecond=1) + (run * batch_size)

            # Test windows from November 2019 onwards are pushed three months
            # forward, which skips the months affected by the data loss.
            if start_date_test >= datetime.datetime(2019, 11, 1, microsecond=1):
                end_date_train = end_date_train + relativedelta(months=3)
                start_date_test = start_date_test + relativedelta(months=3)

            # Clip the last test window to the end of the available data.
            if start_date_test + batch_size <= END_DATE:
                end_date_test = start_date_test + batch_size
            else:
                end_date_test = END_DATE

            test_sample = get_time_frame_data(df_classify, start_date_test, end_date_test, "Arrival Time")
            test_sample, _ = split_network_train_set(df=test_sample, id_column=id_column,
                                                     label_column="Ship Type", network_size=0.1)
            test_imos = pd.unique(test_sample[id_column])

            f.write(f"start date test sample: {start_date_test} and end date test sample: {end_date_test}\n")
            f.write(f'Amount of imos in test sample {len(test_imos)}\n')

            test_history_start = end_date_test - encoding_size
            test_batch_12 = get_subset_include_ids(df=df_classify, include_ids=test_imos, id_column=id_column,
                                                   time_column="Arrival Time",
                                                   start_date=test_history_start, end_date=end_date_test)
            f.write(f"start date test 12: {test_history_start} and end date test 12: {end_date_test}\n")
            imos_12 = pd.unique(test_batch_12[id_column])
            f.write(f'in test 12 there are {len(imos_12)} imos and {len(test_batch_12)} portcalls\n')

            for x in range(40):
                f.write(f'train batch {x+1}:\n')

                # training window: ends `x` months before the training cut-off
                train_end = end_date_train - (x * batch_size)
                if train_end in data_gap_window_ends:
                    f.write("ACCOUNTING FOR THE DATA LOSS IN NOVEMBER/DECEMBER 2019 \n ")
                    train_end = data_gap_fallback_end
                train_batch = build_train_batch(f, train_end, test_imos)

                # the network is rebuilt from the (resampled) training batch itself
                G, travel_times, port_stay_times, _, _ = create_network_graph(train_batch, id_column=id_column)
                feature_bins = create_feature_bins(G, travel_times=travel_times, port_stay_times=port_stay_times)

                df_features_train = get_feature_df(train_batch, G, feature_bins=feature_bins,
                                                   id_column=id_column, target_column='Ship Type')
                df_features_test_12 = get_feature_df(test_batch_12, G, feature_bins=feature_bins,
                                                     id_column=id_column, target_column='Ship Type')

                fold_directory = f'{iteration_directory}train_batch_{x+1}/'
                os.makedirs(fold_directory, exist_ok=True)

                np.save(f'{fold_directory}imo_test.npy', df_features_test_12['IMO'].to_numpy())
                np.save(f'{fold_directory}imo_train.npy', df_features_train['IMO'].to_numpy())

                # The feature matrices do not depend on the class being modelled,
                # so they are built once per training batch.
                X_train = df_features_train.drop(['Target', 'IMO'], axis=1).to_numpy()
                X_test_12 = df_features_test_12.drop(['Target', 'IMO'], axis=1).to_numpy()

                for target in CLASSES_TO_USE:
                    name = target.replace('/', '-')
                    class_directory = f'{fold_directory}class_{name}/'
                    os.makedirs(class_directory, exist_ok=True)

                    # one-vs-rest labels: 1 = `target`, 0 = every other class
                    y_train = binary_labels(df_features_train, target)
                    y_test_12 = binary_labels(df_features_test_12, target)

                    fit_and_evaluate(class_directory, X_train, y_train, X_test_12, y_test_12)

            dur_network = time.time() - start_network
            f.write(f"In total from network creation until features took: {dur_network}, which is {dur_network/60} minutes\n")


## Expanding window (static and resampled)

In [4]:
#   EXPERIMENT Static expanding window

def static_expanding(df_portcalls, results_directory):
    """Run the "static expanding window" experiment and write every artefact to disk.

    For each of 30 runs the train/test boundary starts at 2018-04-01 and moves
    forward one month per run (runs whose test start would fall on or after
    2019-11-01 are shifted a further three months). Per run:

    * test ships are drawn from the one-month test window through
      ``split_network_train_set`` (only its first return value is used), and
      four test batches with 12, 6, 3 and 1 months of port calls ending at the
      test end date are built for those ships;
    * for each of 40 train batches ``x`` the train ships are sampled from the
      ``x + 1`` months before the train end date (test ships excluded), while
      the network is built from ships sampled in the single month ending
      ``x`` months before the train end date, using 12 months of their history;
    * one one-vs-rest ``XGBClassifier`` per class in ``CLASSES_TO_USE`` is
      fitted and evaluated on all four test batches.

    Outputs land under ``<results_directory>experiment_static_expanding/`` as
    ``run_<r>/train_batch_<x>/class_<name>/test_<months>/*.npy`` plus the
    fitted ``model.json`` and a per-run ``experiment_log.txt``.

    Args:
        df_portcalls: raw port-call frame; must contain the columns listed in
            ``relevant_columns`` below.
        results_directory: output root as a string ending in a path separator
            (paths are built by plain string concatenation).

    Returns:
        None. All results are written to disk.
    """
    # History lengths (in months) of the test batches, in the order they are logged.
    test_windows = (12, 6, 3, 1)

    def _ensure_dir(path):
        """Create ``path`` (including missing parents) if needed and return it."""
        os.makedirs(path, exist_ok=True)
        return path

    def _binary_xy(features, target):
        """Return (X, y) for the one-vs-rest task: y is 1 where Target == target, else 0."""
        X = features.drop(['Target', 'IMO'], axis=1).to_numpy()
        y = features['Target'].eq(target).to_numpy(dtype=bool).astype(np.intp)
        return X, y

    def _save_predictions(model, X_test, y_test, out_dir):
        """Predict on ``X_test`` and store probabilities, hard labels and truth in ``out_dir``."""
        proba = np.array(model.predict_proba(X_test))
        np.save(f'{out_dir}y_pred_prob.npy', proba)
        np.save(f'{out_dir}y_pred.npy', np.argmax(proba, axis=1))
        # column 1 is the probability of the positive (target) class
        np.save(f'{out_dir}y_pred_prob_roc.npy', np.array(proba[:, 1]))
        np.save(f'{out_dir}y_test.npy', y_test)

    current_results_directory = _ensure_dir(results_directory + "experiment_static_expanding/")

    # Retrieve the desired timeframe
    df = get_time_frame_data(df_portcalls, START_DATE, END_DATE, "ATA_LT")

    #   only keep relevant columns and set the id column
    relevant_columns = {"ATA_LT": "datetime64[ns]",
                        "ATD_LT": "datetime64[ns]",
                        "Port Name": "string",
                        "IMO Number": "string",
                        "(ATA) Ship Type Description": "string"
                        }
    new_column_names = ["Arrival Time", "Departure Time", "Port Name", "IMO number", "Ship Type"]
    id_column = "IMO number"

    df = select_columns(df, relevant_columns=relevant_columns, new_column_names=new_column_names)

    #   only keep the classes we want to classify
    df = keep_classes(df, classes_to_keep=CLASSES_TO_USE, target_column='Ship Type', id_column='IMO number')

    #   remove single occurrences from network set
    df_classify = remove_sinlge_occurences(df, column_name="IMO number")

    batch_size = relativedelta(months=1)
    encoding_size = relativedelta(months=12)

    # Network windows ending on these dates are replaced by the window ending
    # 2020-02-01 (data scarcity in those two months).
    scarce_window_ends = (datetime.datetime(2019, 12, 1), datetime.datetime(2020, 1, 1))
    scarce_network_end = datetime.datetime(2020, 2, 1)

    for run in range(30):
        start_network = time.time()

        iteration_directory = _ensure_dir(current_results_directory + f'/run_{run+1}/')
        current_run_log = iteration_directory + 'experiment_log.txt'

        # The context manager guarantees the log is flushed and closed even
        # when a step of the run raises.
        with open(current_run_log, 'a') as f:
            month_shift = run * relativedelta(months=1)
            end_date_train = datetime.datetime(2018, 4, 1) + month_shift
            start_date_test = datetime.datetime(2018, 4, 1, microsecond=1) + month_shift

            # account for data scarcity in two months
            if start_date_test >= datetime.datetime(2019, 11, 1, microsecond=1):
                end_date_train = end_date_train + relativedelta(months=3)
                start_date_test = start_date_test + relativedelta(months=3)

            # clip the test window to the end of the available data
            if start_date_test + batch_size <= END_DATE:
                end_date_test = start_date_test + batch_size
            else:
                end_date_test = END_DATE

            test_sample = get_time_frame_data(df_classify, start_date_test, end_date_test, "Arrival Time")
            test_sample, _ = split_network_train_set(df=test_sample, id_column="IMO number", label_column="Ship Type", network_size=0.1)
            test_imos = pd.unique(test_sample["IMO number"])

            f.write(f"start date test sample: {start_date_test} and end date test sample: {end_date_test}\n")
            f.write(f'Amount of imos in test sample {len(test_imos)}\n')

            # One test batch per history length, all ending at end_date_test.
            test_batches = {}
            for months in test_windows:
                history_start = end_date_test - relativedelta(months=months)
                test_batches[months] = get_subset_include_ids(df=df_classify, include_ids=test_imos, id_column=id_column, time_column="Arrival Time",
                                                              start_date=history_start, end_date=end_date_test)
                f.write(f"start date test {months}: {history_start} and end date test {months}: {end_date_test}\n")

            for months in test_windows:
                batch = test_batches[months]
                f.write(f'in test {months} there are {len(pd.unique(batch["IMO number"]))} imos and {len(batch)} portcalls\n')

            for x in range(40):
                f.write(f'train batch {x+1}:\n')

                window_end = end_date_train - (x * batch_size)
                sample_start = end_date_train - ((1+x) * batch_size)
                encoding_start = window_end - encoding_size

                train_sample = get_subset_exclude_ids(df=df_classify, exclude_ids=test_imos, id_column=id_column, time_column="Arrival Time",
                                                      start_date=sample_start, end_date=end_date_train)
                train_imos = pd.unique(train_sample["IMO number"])
                f.write(f"start date train sample: {sample_start} and end date train sample: {end_date_train}\n")
                f.write(f'there are {len(train_imos)} imos sampled for train \n')

                train_batch = get_subset_include_ids(df=df_classify, include_ids=train_imos, id_column=id_column, time_column="Arrival Time",
                                                     start_date=encoding_start, end_date=end_date_train)
                train_imos_temp = pd.unique(train_batch["IMO number"])
                f.write(f"start date train: {encoding_start} and end date train: {end_date_train}\n")
                f.write(f'there are {len(train_imos_temp)} imos in train and {len(train_batch)} portcalls\n')

                # Logged twice on purpose: first for the sampled ids, then for
                # the ids actually present in the assembled batch.
                for candidate in (train_imos, train_imos_temp):
                    f.write(f'there are {len(np.intersect1d(candidate, test_imos))} imos overlapping between train and test\n')
                f.write('\n')

                # account for data scarcity in two months
                if window_end in scarce_window_ends:
                    network_end = scarce_network_end
                    network_sample_start = network_end - batch_size
                else:
                    network_end = window_end
                    network_sample_start = sample_start

                network_sample = get_subset_exclude_ids(df=df_classify, exclude_ids=test_imos, id_column=id_column, time_column="Arrival Time",
                                                        start_date=network_sample_start, end_date=network_end)
                network_imos = pd.unique(network_sample["IMO number"])
                f.write(f"start date network sample: {network_sample_start} and end network sample: {network_end}\n")
                f.write(f'there are {len(network_imos)} imos sampled for network \n')

                network_batch = get_subset_include_ids(df=df_classify, include_ids=network_imos, id_column=id_column, time_column="Arrival Time",
                                                       start_date=network_end - encoding_size, end_date=network_end)
                network_imos_temp = pd.unique(network_batch["IMO number"])

                for other_name, other_imos in (('test', test_imos), ('train', train_imos)):
                    for candidate in (network_imos, network_imos_temp):
                        f.write(f'there are {len(np.intersect1d(candidate, other_imos))} imos overlapping between network and {other_name}\n')

                G, travel_times, port_stay_times, _df_edges, _processing_info = create_network_graph(network_batch, id_column="IMO number")
                feature_bins = create_feature_bins(G, travel_times=travel_times, port_stay_times=port_stay_times)

                features_train = get_feature_df(train_batch, G, feature_bins=feature_bins, id_column=id_column, target_column='Ship Type')
                features_test = {
                    months: get_feature_df(test_batches[months], G, feature_bins=feature_bins, id_column=id_column, target_column='Ship Type')
                    for months in test_windows
                }

                fold_directory = _ensure_dir(iteration_directory + f'/train_batch_{x+1}/')

                for months in test_windows:
                    np.save(f'{fold_directory}imo_test_{months}.npy', features_test[months]['IMO'].to_numpy())
                np.save(f'{fold_directory}imo_train.npy', features_train['IMO'].to_numpy())

                for target in CLASSES_TO_USE:
                    name = target.replace('/', '-')
                    class_directory = _ensure_dir(fold_directory + f'/class_{name}/')

                    # every class other than `target` collapses into label 0
                    X_train, y_train = _binary_xy(features_train, target)
                    test_sets = {months: _binary_xy(features_test[months], target) for months in test_windows}

                    bst = XGBClassifier(n_estimators=100, max_depth=3, objective='binary:logistic', tree_method='gpu_hist')
                    np.save(f'{class_directory}y_train.npy', y_train)

                    # fit model
                    bst.fit(X_train, y_train)
                    bst.save_model(f'{class_directory}model.json')

                    for months in test_windows:
                        X_test, y_test = test_sets[months]
                        test_directory = _ensure_dir(class_directory + f'/test_{months}/')
                        _save_predictions(bst, X_test, y_test, test_directory)

                f.write('---------------------------------------\n')

            dur_network = time.time() - start_network
            f.write(f"In total from network creation until features took: {dur_network}, which is {dur_network/60} minutes\n")

In [5]:
#   EXPERIMENT Resampled expanding window
def resampled_expanding(df_portcalls, results_directory):
    """Run the "resampled expanding window" experiment and write every artefact to disk.

    For each of 30 runs the train/test boundary starts at 2018-04-01 and moves
    forward one month per run (runs whose test start would fall on or after
    2019-11-01 are shifted a further three months). Per run:

    * test ships are drawn from the one-month test window through
      ``split_network_train_set`` (only its first return value is used), and
      four test batches with 12, 6, 3 and 1 months of port calls ending at the
      test end date are built for those ships;
    * for each of 40 train batches ``x`` the train ships are sampled from the
      ``x + 1`` months before the train end date (test ships excluded), and
      the network graph is rebuilt from that same train batch;
    * one one-vs-rest ``XGBClassifier`` per class in ``CLASSES_TO_USE`` is
      fitted and evaluated on all four test batches.

    Outputs land under ``<results_directory>experiment_resampled_expanding/``
    as ``run_<r>/train_batch_<x>/class_<name>/test_<months>/*.npy`` plus the
    fitted ``model.json`` and a per-run ``experiment_log.txt``.

    Args:
        df_portcalls: raw port-call frame; must contain the columns listed in
            ``relevant_columns`` below.
        results_directory: output root as a string ending in a path separator
            (paths are built by plain string concatenation).

    Returns:
        None. All results are written to disk.
    """
    # History lengths (in months) of the test batches, in the order they are logged.
    test_windows = (12, 6, 3, 1)

    def _ensure_dir(path):
        """Create ``path`` (including missing parents) if needed and return it."""
        os.makedirs(path, exist_ok=True)
        return path

    def _binary_xy(features, target):
        """Return (X, y) for the one-vs-rest task: y is 1 where Target == target, else 0."""
        X = features.drop(['Target', 'IMO'], axis=1).to_numpy()
        y = features['Target'].eq(target).to_numpy(dtype=bool).astype(np.intp)
        return X, y

    def _save_predictions(model, X_test, y_test, out_dir):
        """Predict on ``X_test`` and store probabilities, hard labels and truth in ``out_dir``."""
        proba = np.array(model.predict_proba(X_test))
        np.save(f'{out_dir}y_pred_prob.npy', proba)
        np.save(f'{out_dir}y_pred.npy', np.argmax(proba, axis=1))
        # column 1 is the probability of the positive (target) class
        np.save(f'{out_dir}y_pred_prob_roc.npy', np.array(proba[:, 1]))
        np.save(f'{out_dir}y_test.npy', y_test)

    current_results_directory = _ensure_dir(results_directory + "experiment_resampled_expanding/")

    # Retrieve the desired timeframe
    df = get_time_frame_data(df_portcalls, START_DATE, END_DATE, "ATA_LT")

    #   only keep relevant columns and set the id column
    relevant_columns = {"ATA_LT": "datetime64[ns]",
                        "ATD_LT": "datetime64[ns]",
                        "Port Name": "string",
                        "IMO Number": "string",
                        "(ATA) Ship Type Description": "string"
                        }
    new_column_names = ["Arrival Time", "Departure Time", "Port Name", "IMO number", "Ship Type"]
    id_column = "IMO number"

    df = select_columns(df, relevant_columns=relevant_columns, new_column_names=new_column_names)

    #   only keep the classes we want to classify
    df = keep_classes(df, classes_to_keep=CLASSES_TO_USE, target_column='Ship Type', id_column='IMO number')

    #   remove single occurrences from network set
    df_classify = remove_sinlge_occurences(df, column_name="IMO number")

    batch_size = relativedelta(months=1)
    encoding_size = relativedelta(months=12)

    for run in range(30):
        start_network = time.time()

        iteration_directory = _ensure_dir(current_results_directory + f'/run_{run+1}/')
        current_run_log = iteration_directory + 'experiment_log.txt'

        # The context manager guarantees the log is flushed and closed even
        # when a step of the run raises.
        with open(current_run_log, 'a') as f:
            month_shift = run * relativedelta(months=1)
            end_date_train = datetime.datetime(2018, 4, 1) + month_shift
            start_date_test = datetime.datetime(2018, 4, 1, microsecond=1) + month_shift

            # account for data scarcity in two months
            if start_date_test >= datetime.datetime(2019, 11, 1, microsecond=1):
                end_date_train = end_date_train + relativedelta(months=3)
                start_date_test = start_date_test + relativedelta(months=3)

            # clip the test window to the end of the available data
            if start_date_test + batch_size <= END_DATE:
                end_date_test = start_date_test + batch_size
            else:
                end_date_test = END_DATE

            test_sample = get_time_frame_data(df_classify, start_date_test, end_date_test, "Arrival Time")
            test_sample, _ = split_network_train_set(df=test_sample, id_column="IMO number", label_column="Ship Type", network_size=0.1)
            test_imos = pd.unique(test_sample["IMO number"])

            f.write(f"start date test sample: {start_date_test} and end date test sample: {end_date_test}\n")
            f.write(f'Amount of imos in test sample {len(test_imos)}\n')

            # One test batch per history length, all ending at end_date_test.
            test_batches = {}
            for months in test_windows:
                history_start = end_date_test - relativedelta(months=months)
                test_batches[months] = get_subset_include_ids(df=df_classify, include_ids=test_imos, id_column=id_column, time_column="Arrival Time",
                                                              start_date=history_start, end_date=end_date_test)
                f.write(f"start date test {months}: {history_start} and end date test {months}: {end_date_test}\n")

            for months in test_windows:
                batch = test_batches[months]
                f.write(f'in test {months} there are {len(pd.unique(batch["IMO number"]))} imos and {len(batch)} portcalls\n')

            for x in range(40):
                f.write(f'train batch {x+1}:\n')

                sample_start = end_date_train - ((1+x) * batch_size)
                encoding_start = end_date_train - (x * batch_size) - encoding_size

                train_sample = get_subset_exclude_ids(df=df_classify, exclude_ids=test_imos, id_column=id_column, time_column="Arrival Time",
                                                      start_date=sample_start, end_date=end_date_train)
                train_imos = pd.unique(train_sample["IMO number"])
                f.write(f"start date train sample: {sample_start} and end date train sample: {end_date_train}\n")
                f.write(f'there are {len(train_imos)} imos sampled for train \n')

                train_batch = get_subset_include_ids(df=df_classify, include_ids=train_imos, id_column=id_column, time_column="Arrival Time",
                                                     start_date=encoding_start, end_date=end_date_train)
                train_imos_temp = pd.unique(train_batch["IMO number"])
                f.write(f"start date train: {encoding_start} and end date train: {end_date_train}\n")
                f.write(f'there are {len(train_imos_temp)} imos in train and {len(train_batch)} portcalls\n')

                # Logged twice on purpose: first for the sampled ids, then for
                # the ids actually present in the assembled batch.
                for candidate in (train_imos, train_imos_temp):
                    f.write(f'there are {len(np.intersect1d(candidate, test_imos))} imos overlapping between train and test\n')

                # The network is rebuilt from the current train batch itself.
                G, travel_times, port_stay_times, _df_edges, _processing_info = create_network_graph(train_batch, id_column="IMO number")
                feature_bins = create_feature_bins(G, travel_times=travel_times, port_stay_times=port_stay_times)

                features_train = get_feature_df(train_batch, G, feature_bins=feature_bins, id_column=id_column, target_column='Ship Type')
                features_test = {
                    months: get_feature_df(test_batches[months], G, feature_bins=feature_bins, id_column=id_column, target_column='Ship Type')
                    for months in test_windows
                }

                fold_directory = _ensure_dir(iteration_directory + f'/train_batch_{x+1}/')

                for months in test_windows:
                    np.save(f'{fold_directory}imo_test_{months}.npy', features_test[months]['IMO'].to_numpy())
                np.save(f'{fold_directory}imo_train.npy', features_train['IMO'].to_numpy())

                for target in CLASSES_TO_USE:
                    name = target.replace('/', '-')
                    class_directory = _ensure_dir(fold_directory + f'/class_{name}/')

                    # every class other than `target` collapses into label 0
                    X_train, y_train = _binary_xy(features_train, target)
                    test_sets = {months: _binary_xy(features_test[months], target) for months in test_windows}

                    bst = XGBClassifier(n_estimators=100, max_depth=3, objective='binary:logistic', tree_method='gpu_hist')
                    np.save(f'{class_directory}y_train.npy', y_train)

                    # fit model
                    bst.fit(X_train, y_train)
                    bst.save_model(f'{class_directory}model.json')

                    for months in test_windows:
                        X_test, y_test = test_sets[months]
                        test_directory = _ensure_dir(class_directory + f'/test_{months}/')
                        _save_predictions(bst, X_test, y_test, test_directory)

                f.write('-------------\n')

            dur_network = time.time() - start_network
            f.write(f"In total from network creation until features took: {dur_network}, which is {dur_network/60} minutes\n")
            

## Sliding window (static and resampled)

In [2]:
#   EXPERIMENT Static sliding window

def static_sliding(df_portcalls, results_directory):
    """Run the static sliding-window experiment and store its results on disk.

    The network graph and its feature bins are built once, from the first
    training batch, and are reused to encode every later train and test batch.
    After the first batch the train and test windows slide forward one month
    at a time for 40 further batches. For every batch, one binary
    (class vs. 'Other') XGBoost classifier is trained per entry of
    CLASSES_TO_USE; the model, the labels and the predictions are written
    below ``results_directory + "experiment_static_sliding/"``.

    Args:
        df_portcalls: Port-call table handed to ``get_time_frame_data``. The
            columns listed in ``relevant_columns`` below are selected from it.
        results_directory: Base output path. It is concatenated as-is with the
            experiment folder name, so it has to end with a path separator.

    Returns:
        None. All results are written to disk.
    """
    # create necessary files for logging
    current_results_directory = results_directory + "experiment_static_sliding/"
    # makedirs(exist_ok=True) also creates missing parents and is race-free
    os.makedirs(current_results_directory, exist_ok=True)

    # Retrieve the desired timeframe
    df = get_time_frame_data(df_portcalls, START_DATE, END_DATE, "ATA_LT")

    #   only keep relevant columns and set the id column
    relevant_columns = {"ATA_LT": "datetime64[ns]",
                        "ATD_LT": "datetime64[ns]",
                        "Port Name": "string",
                        "IMO Number": "string",
                        "(ATA) Ship Type Description": "string"
                        }
    new_column_names = ["Arrival Time", "Departure Time", "Port Name", "IMO number", "Ship Type"]

    id_column = "IMO number"

    df = select_columns(df, relevant_columns=relevant_columns, new_column_names=new_column_names)

    #   only keep the classes we want to classify
    df = keep_classes(df, classes_to_keep=CLASSES_TO_USE, target_column='Ship Type', id_column='IMO number')

    #   remove single occurences from network set
    df_classify = remove_sinlge_occurences(df, column_name="IMO number")

    batch_size = relativedelta(months=1)        # width of the window ids are sampled from
    encoding_size = relativedelta(months=12)    # width of the history used to encode a sampled id
    n_sliding_batches = 40                      # batches created after the first one

    start_network = time.time()

    current_run_log = current_results_directory + 'experiment_log.txt'

    df_train_batches = []
    df_test_batches = []

    end_date_train = datetime.datetime(2017, 6, 1)
    end_date_test = datetime.datetime(2017, 7, 1, microsecond=1)

    # The context manager guarantees the log is flushed and closed even when
    # one of the preprocessing / feature steps raises.
    with open(current_run_log, 'a') as f:
        f.write('train batch 1:\n')

        test_sample = get_time_frame_data(df_classify, end_date_test - batch_size, end_date_test, "Arrival Time")

        test_sample, _ = split_network_train_set(df=test_sample, id_column="IMO number", label_column="Ship Type", network_size=0.1)

        test_imos = pd.unique(test_sample["IMO number"])

        f.write(f"start date test sample: {end_date_test - batch_size} and end date test sample: {end_date_test}\n")
        f.write(f'Amount of imos in test sample {len(test_imos)}\n')

        test_batch = get_subset_include_ids(df=df_classify, include_ids=test_imos, id_column=id_column, time_column="Arrival Time",
                                            start_date=end_date_test - encoding_size, end_date=end_date_test)
        f.write(f"start date test: {end_date_test - encoding_size} and end date test: {end_date_test}\n")
        test_imos = pd.unique(test_batch["IMO number"])

        train_sample = get_subset_exclude_ids(df=df_classify, exclude_ids=test_imos, id_column=id_column, time_column="Arrival Time",
                                              start_date=end_date_train - batch_size, end_date=end_date_train)

        train_imos = pd.unique(train_sample["IMO number"])
        f.write(f"start date train sample: {end_date_train - batch_size} and end date train sample: {end_date_train}\n")
        f.write(f'there are {len(train_imos)} imos sampled for train \n')

        train_batch = get_subset_include_ids(df=df_classify, include_ids=train_imos, id_column=id_column, time_column="Arrival Time",
                                             start_date=end_date_train - encoding_size, end_date=end_date_train)

        train_imos = pd.unique(train_batch["IMO number"])
        # ids that make up the (static) network; excluded from every later test sample
        network_imos = pd.unique(train_batch["IMO number"])
        f.write(f"start date train: {end_date_train - encoding_size} and end date train: {end_date_train}\n")
        f.write(f'there are {len(train_imos)} imos in train and {len(train_batch)} portcalls\n')

        overlap = np.intersect1d(train_imos, test_imos)
        f.write(f'there are {len(overlap)} imos overlapping between train and test\n')
        f.write('-----\n')

        # The graph and the bins are created once and stay fixed for all batches.
        G, travel_times, port_stay_times, _, _ = create_network_graph(train_batch, id_column="IMO number")
        feature_bins = create_feature_bins(G, travel_times=travel_times, port_stay_times=port_stay_times)

        df_train_batches.append(
            get_feature_df(train_batch, G, feature_bins=feature_bins, id_column=id_column, target_column='Ship Type'))
        df_test_batches.append(
            get_feature_df(test_batch, G, feature_bins=feature_bins, id_column=id_column, target_column='Ship Type'))

        # The first batch was logged as number 1 above, so the sliding batches
        # are numbered from 2; this keeps the log in step with the
        # train_batch_<n> result directories created further down.
        for batch_number in range(2, n_sliding_batches + 2):
            f.write(f'train batch {batch_number}:\n')
            end_date_test += batch_size
            end_date_train += batch_size
            # NOTE(review): both windows jump three extra months once the test
            # window ends at 2019-12-01 — presumably a gap in the data; confirm.
            if end_date_test == datetime.datetime(2019, 12, 1, microsecond=1):
                end_date_train = end_date_train + relativedelta(months=3)
                end_date_test = end_date_test + relativedelta(months=3)

            if end_date_test >= END_DATE:
                end_date_test = END_DATE

            test_sample = get_subset_exclude_ids(df=df_classify, exclude_ids=network_imos, id_column=id_column, time_column="Arrival Time",
                                                 start_date=end_date_test - batch_size, end_date=end_date_test)
            test_sample, _ = split_network_train_set(df=test_sample, id_column="IMO number", label_column="Ship Type", network_size=0.3)
            test_imos = pd.unique(test_sample["IMO number"])
            f.write(f"start date test sample: {end_date_test - batch_size} and end date test sample: {end_date_test}\n")
            f.write(f'there are {len(test_imos)} imos sampled for test \n')

            test_batch = get_subset_include_ids(df=df_classify, include_ids=test_imos, id_column=id_column, time_column="Arrival Time",
                                                start_date=end_date_test - encoding_size, end_date=end_date_test)

            test_imos_temp = pd.unique(test_batch["IMO number"])
            f.write(f"start date test: {end_date_test - encoding_size} and end date test: {end_date_test}\n")
            f.write(f'there are {len(test_imos_temp)} imos in test and {len(test_batch)} portcalls\n')

            train_sample = get_subset_exclude_ids(df=df_classify, exclude_ids=test_imos, id_column=id_column, time_column="Arrival Time",
                                                  start_date=end_date_train - batch_size, end_date=end_date_train)

            train_imos = pd.unique(train_sample["IMO number"])
            f.write(f"start date train sample: {end_date_train - batch_size} and end date train sample: {end_date_train}\n")
            f.write(f'there are {len(train_imos)} imos sampled for train \n')

            train_batch = get_subset_include_ids(df=df_classify, include_ids=train_imos, id_column=id_column, time_column="Arrival Time",
                                                 start_date=end_date_train - encoding_size, end_date=end_date_train)

            train_imos_temp = pd.unique(train_batch["IMO number"])
            f.write(f"start date train: {end_date_train - encoding_size} and end date train: {end_date_train}\n")
            f.write(f'there are {len(train_imos_temp)} imos in train and {len(train_batch)} portcalls\n')

            # first line: overlap of the sampled ids; second line: overlap of
            # the ids present in the 12-month batches
            overlap = np.intersect1d(train_imos, test_imos)
            f.write(f'there are {len(overlap)} imos overlapping between train and test\n')
            overlap = np.intersect1d(train_imos_temp, test_imos_temp)
            f.write(f'there are {len(overlap)} imos overlapping between train and test\n')
            overlap_network = np.intersect1d(network_imos, test_imos_temp)
            f.write(f'there are {len(overlap_network)} imos overlapping between network and test \n')
            f.write('-----\n')

            df_train_batches.append(
                get_feature_df(train_batch, G, feature_bins=feature_bins, id_column=id_column, target_column='Ship Type'))
            df_test_batches.append(
                get_feature_df(test_batch, G, feature_bins=feature_bins, id_column=id_column, target_column='Ship Type'))

        end_network = time.time()
        dur_network = end_network - start_network
        f.write(f"In total from network creation until features took: {dur_network}, which is {dur_network/60} minutes\n")

    # Do the training and reporting
    for i, (train_features, test_features) in enumerate(zip(df_train_batches, df_test_batches)):
        fold_directory = current_results_directory + f'/train_batch_{i+1}/'
        os.makedirs(fold_directory, exist_ok=True)

        np.save(f'{fold_directory}imo_test.npy', test_features['IMO'].to_numpy())
        np.save(f'{fold_directory}imo_train.npy', train_features['IMO'].to_numpy())

        # The feature matrices do not depend on the class being classified,
        # so they are built once per batch instead of once per class.
        X_train = train_features.drop(['Target', 'IMO'], axis=1).to_numpy()
        X_test = test_features.drop(['Target', 'IMO'], axis=1).to_numpy()

        for target in CLASSES_TO_USE:
            name = target.replace('/', '-')
            class_directory = fold_directory + f'/class_{name}/'
            os.makedirs(class_directory, exist_ok=True)

            #   one-vs-rest labels: 1 for the current class, 0 for every other class
            y_train = (train_features['Target'] == target).to_numpy().astype(np.int64)
            y_test = (test_features['Target'] == target).to_numpy().astype(np.int64)

            # NOTE(review): tree_method='gpu_hist' requires a GPU build and is
            # deprecated in recent XGBoost releases — confirm the installed version.
            bst = XGBClassifier(n_estimators=100, max_depth=3, objective='binary:logistic', tree_method='gpu_hist')
            np.save(f'{class_directory}y_train.npy', y_train)

            # fit model
            bst.fit(X_train, y_train)
            bst.save_model(f'{class_directory}model.json')

            # both make predicition in probabilities and normal binary
            y_pred_prob = np.array(bst.predict_proba(X_test))
            y_pred_prob_roc = y_pred_prob[:, 1]     # probability of the positive class
            y_pred = np.argmax(y_pred_prob, axis=1)
            np.save(f'{class_directory}y_pred_prob.npy', y_pred_prob)
            np.save(f'{class_directory}y_pred.npy', y_pred)
            np.save(f'{class_directory}y_pred_prob_roc.npy', y_pred_prob_roc)
            np.save(f'{class_directory}y_test.npy', y_test)


In [4]:
#   EXPERIMENT Resampled sliding window
def resampled_sliding(df_portcalls, results_directory):
    """Run the resampled sliding-window experiment and store its results on disk.

    The train and test windows slide forward one month at a time: one first
    batch followed by 40 further batches. Unlike a static design, the network
    graph and its feature bins are rebuilt from the training batch of every
    window, and both the train and the test batch of that window are encoded
    with the freshly built graph. For every batch, one binary
    (class vs. 'Other') XGBoost classifier is trained per entry of
    CLASSES_TO_USE; the model, the labels and the predictions are written
    below ``results_directory + "experiment_resampled_sliding/"``.

    Args:
        df_portcalls: Port-call table handed to ``get_time_frame_data``. The
            columns listed in ``relevant_columns`` below are selected from it.
        results_directory: Base output path. It is concatenated as-is with the
            experiment folder name, so it has to end with a path separator.

    Returns:
        None. All results are written to disk.
    """
    # create necessary files for logging
    current_results_directory = results_directory + "experiment_resampled_sliding/"
    # makedirs(exist_ok=True) also creates missing parents and is race-free
    os.makedirs(current_results_directory, exist_ok=True)

    # Retrieve the desired timeframe
    df = get_time_frame_data(df_portcalls, START_DATE, END_DATE, "ATA_LT")

    #   only keep relevant columns and set the id column
    relevant_columns = {"ATA_LT": "datetime64[ns]",
                        "ATD_LT": "datetime64[ns]",
                        "Port Name": "string",
                        "IMO Number": "string",
                        "(ATA) Ship Type Description": "string"
                        }
    new_column_names = ["Arrival Time", "Departure Time", "Port Name", "IMO number", "Ship Type"]

    id_column = "IMO number"

    df = select_columns(df, relevant_columns=relevant_columns, new_column_names=new_column_names)

    #   only keep the classes we want to classify
    df = keep_classes(df, classes_to_keep=CLASSES_TO_USE, target_column='Ship Type', id_column='IMO number')

    #   remove single occurences from network set
    df_classify = remove_sinlge_occurences(df, column_name="IMO number")

    batch_size = relativedelta(months=1)        # width of the window ids are sampled from
    encoding_size = relativedelta(months=12)    # width of the history used to encode a sampled id
    n_sliding_batches = 40                      # batches created after the first one

    start_network = time.time()

    current_run_log = current_results_directory + 'experiment_log.txt'

    df_train_batches = []
    df_test_batches = []

    end_date_train = datetime.datetime(2017, 6, 1)
    end_date_test = datetime.datetime(2017, 7, 1, microsecond=1)

    # The context manager guarantees the log is flushed and closed even when
    # one of the preprocessing / feature steps raises.
    with open(current_run_log, 'a') as f:
        f.write('train batch 1:\n')

        test_sample = get_time_frame_data(df_classify, end_date_test - batch_size, end_date_test, "Arrival Time")

        test_sample, _ = split_network_train_set(df=test_sample, id_column="IMO number", label_column="Ship Type", network_size=0.1)

        test_imos = pd.unique(test_sample["IMO number"])

        f.write(f'Amount of imos in test sample {len(test_imos)}\n')

        test_batch = get_subset_include_ids(df=df_classify, include_ids=test_imos, id_column=id_column, time_column="Arrival Time",
                                            start_date=end_date_test - encoding_size, end_date=end_date_test)
        f.write(f"start date test: {end_date_test - encoding_size} and end date test: {end_date_test}\n")
        test_imos = pd.unique(test_batch["IMO number"])

        train_sample = get_subset_exclude_ids(df=df_classify, exclude_ids=test_imos, id_column=id_column, time_column="Arrival Time",
                                              start_date=end_date_train - batch_size, end_date=end_date_train)

        train_imos = pd.unique(train_sample["IMO number"])
        f.write(f"start date train sample: {end_date_train - batch_size} and end date train sample: {end_date_train}\n")
        f.write(f'there are {len(train_imos)} imos sampled for train \n')

        train_batch = get_subset_include_ids(df=df_classify, include_ids=train_imos, id_column=id_column, time_column="Arrival Time",
                                             start_date=end_date_train - encoding_size, end_date=end_date_train)

        train_imos = pd.unique(train_batch["IMO number"])
        network_imos = pd.unique(train_batch["IMO number"])
        f.write(f"start date train: {end_date_train - encoding_size} and end date train: {end_date_train}\n")
        f.write(f'there are {len(train_imos)} imos in train and {len(train_batch)} portcalls\n')

        overlap = np.intersect1d(train_imos, test_imos)
        f.write(f'there are {len(overlap)} imos overlapping between train and test\n')
        f.write('-----\n')

        G, travel_times, port_stay_times, _, _ = create_network_graph(train_batch, id_column="IMO number")
        feature_bins = create_feature_bins(G, travel_times=travel_times, port_stay_times=port_stay_times)

        df_train_batches.append(
            get_feature_df(train_batch, G, feature_bins=feature_bins, id_column=id_column, target_column='Ship Type'))
        df_test_batches.append(
            get_feature_df(test_batch, G, feature_bins=feature_bins, id_column=id_column, target_column='Ship Type'))

        # exclude these imos for sampling test, to be consistent with static design
        # (deliberately fixed to the first network; network_imos itself is
        # refreshed inside the loop)
        imos_to_exclude = network_imos

        # The first batch was logged as number 1 above, so the sliding batches
        # are numbered from 2; this keeps the log in step with the
        # train_batch_<n> result directories created further down.
        for batch_number in range(2, n_sliding_batches + 2):
            f.write(f'train batch {batch_number}:\n')
            end_date_test += batch_size
            end_date_train += batch_size
            # NOTE(review): both windows jump three extra months once the test
            # window ends at 2019-12-01 — presumably a gap in the data; confirm.
            if end_date_test == datetime.datetime(2019, 12, 1, microsecond=1):
                end_date_train = end_date_train + relativedelta(months=3)
                end_date_test = end_date_test + relativedelta(months=3)

            if end_date_test >= END_DATE:
                end_date_test = END_DATE

            test_sample = get_subset_exclude_ids(df=df_classify, exclude_ids=imos_to_exclude, id_column=id_column, time_column="Arrival Time",
                                                 start_date=end_date_test - batch_size, end_date=end_date_test)
            test_sample, _ = split_network_train_set(df=test_sample, id_column="IMO number", label_column="Ship Type", network_size=0.3)
            test_imos = pd.unique(test_sample["IMO number"])

            f.write(f'there are {len(test_imos)} imos sampled for test \n')

            test_batch = get_subset_include_ids(df=df_classify, include_ids=test_imos, id_column=id_column, time_column="Arrival Time",
                                                start_date=end_date_test - encoding_size, end_date=end_date_test)

            test_imos_temp = pd.unique(test_batch["IMO number"])
            f.write(f"start date test: {end_date_test - encoding_size} and end date test: {end_date_test}\n")
            f.write(f'there are {len(test_imos_temp)} imos in test and {len(test_batch)} portcalls\n')

            train_sample = get_subset_exclude_ids(df=df_classify, exclude_ids=test_imos, id_column=id_column, time_column="Arrival Time",
                                                  start_date=end_date_train - batch_size, end_date=end_date_train)

            train_imos = pd.unique(train_sample["IMO number"])
            f.write(f"start date train sample: {end_date_train - batch_size} and end date train sample: {end_date_train}\n")
            f.write(f'there are {len(train_imos)} imos sampled for train \n')

            train_batch = get_subset_include_ids(df=df_classify, include_ids=train_imos, id_column=id_column, time_column="Arrival Time",
                                                 start_date=end_date_train - encoding_size, end_date=end_date_train)

            train_imos = pd.unique(train_batch["IMO number"])
            network_imos = pd.unique(train_batch["IMO number"])
            f.write(f"start date train: {end_date_train - encoding_size} and end date train: {end_date_train}\n")
            f.write(f'there are {len(train_imos)} imos in train and {len(train_batch)} portcalls\n')

            # The graph and the bins are rebuilt from the current training batch.
            G, travel_times, port_stay_times, _, _ = create_network_graph(train_batch, id_column="IMO number")
            feature_bins = create_feature_bins(G, travel_times=travel_times, port_stay_times=port_stay_times)

            df_train_batches.append(
                get_feature_df(train_batch, G, feature_bins=feature_bins, id_column=id_column, target_column='Ship Type'))
            df_test_batches.append(
                get_feature_df(test_batch, G, feature_bins=feature_bins, id_column=id_column, target_column='Ship Type'))

            overlap = np.intersect1d(train_imos, test_imos_temp)
            f.write(f'there are {len(overlap)} imos overlapping between train and test\n')
            overlap_network = np.intersect1d(network_imos, test_imos_temp)
            f.write(f'there are {len(overlap_network)} imos overlapping between network and test \n')
            f.write('-----\n')

        end_network = time.time()
        dur_network = end_network - start_network
        f.write(f"In total from network creation until features took: {dur_network}, which is {dur_network/60} minutes\n")

    # Do the training and reporting
    for i, (train_features, test_features) in enumerate(zip(df_train_batches, df_test_batches)):
        fold_directory = current_results_directory + f'/train_batch_{i+1}/'
        os.makedirs(fold_directory, exist_ok=True)

        np.save(f'{fold_directory}imo_test.npy', test_features['IMO'].to_numpy())
        np.save(f'{fold_directory}imo_train.npy', train_features['IMO'].to_numpy())

        # The feature matrices do not depend on the class being classified,
        # so they are built once per batch instead of once per class.
        X_train = train_features.drop(['Target', 'IMO'], axis=1).to_numpy()
        X_test = test_features.drop(['Target', 'IMO'], axis=1).to_numpy()

        for target in CLASSES_TO_USE:
            name = target.replace('/', '-')
            class_directory = fold_directory + f'/class_{name}/'
            os.makedirs(class_directory, exist_ok=True)

            #   one-vs-rest labels: 1 for the current class, 0 for every other class
            y_train = (train_features['Target'] == target).to_numpy().astype(np.int64)
            y_test = (test_features['Target'] == target).to_numpy().astype(np.int64)

            # NOTE(review): tree_method='gpu_hist' requires a GPU build and is
            # deprecated in recent XGBoost releases — confirm the installed version.
            bst = XGBClassifier(n_estimators=100, max_depth=3, objective='binary:logistic', tree_method='gpu_hist')
            np.save(f'{class_directory}y_train.npy', y_train)

            # fit model
            bst.fit(X_train, y_train)
            bst.save_model(f'{class_directory}model.json')

            # both make predicition in probabilities and normal binary
            y_pred_prob = np.array(bst.predict_proba(X_test))
            y_pred_prob_roc = y_pred_prob[:, 1]     # probability of the positive class
            y_pred = np.argmax(y_pred_prob, axis=1)
            np.save(f'{class_directory}y_pred_prob.npy', y_pred_prob)
            np.save(f'{class_directory}y_pred.npy', y_pred)
            np.save(f'{class_directory}y_pred_prob_roc.npy', y_pred_prob_roc)
            np.save(f'{class_directory}y_test.npy', y_test)

## -------------