In [76]:
# Feature Analysis flow
from sklearn import svm
from sklearn.feature_selection import RFE
X = [[0, 0], [2, 2]]
y = [0.5, 2.5]
regr = svm.SVR()
selector = RFE(regr, n_features_to_select=5, step=1)
selector = selector.fit(X, y)
selector.support_
selector.ranking_

array([1, 1])

In [77]:
# load data
import numpy as np
import pandas as pd
df = pd.read_excel('OuterHarbour-Tiel2.xlsx')
# df = pd.read_excel('OuterHarbour-Tiel2.xlsx')

In [78]:
# All nodes on the route
locs = df[["locationType", "cityName", "countryName"]].to_numpy().astype('str')
nodes = np.unique(locs, axis=0)
# create columns with default value for now
for node in nodes:
    df[f'{node[0]}_{node[1]}_{node[2]}_visited'] = False
    df[f'{node[0]}_{node[1]}_{node[2]}_planned'] = False

In [79]:
df = df[df["ordSeaFreightMainTransportInvolvement"] == "J"]
df["ordJFHArrangeLoadingPickup"] = df["ordJFHArrangeLoadingPickup"].fillna("")
df["ordJFHArrangeDelivery"] = df["ordJFHArrangeDelivery"].fillna("")
df["ordReqDeliveryDate"] = df["ordReqDeliveryDate"].fillna("")

In [80]:
from math import sqrt, pow

def node_name(node):
    # node = node.iloc[0]
    return f'{node["locationType"]}_{node["cityName"]}_{node["countryName"]}'

def abs_distance(o, lat, long):
    # o = origin_node.iloc[0]
    diff_lat = abs(o["latitude"] - lat) * 111.139
    diff_long =  abs(o["longitude"] - long) * 111.139
    return sqrt(pow(diff_lat, 2) + pow(diff_long, 2))

def dep_late(port, node_created):
    if node_created > port["actualDepDate"]:
        return "yes" if port["actualDepDate"] > port["estimatedDepDate"] else "no"
    return "unknown"
    

def get_created_date(rowNo, destination, ordCreate, reqPortL, reqPortD, origDest, locType, readyDate, actArrDate):
    if rowNo == 1:
        return ordCreate
    elif rowNo == destination["rowNo"]:
        return ordCreate
    elif origDest == "O" and locType.strip() == "P":
        return ordCreate if reqPortL else readyDate
    elif origDest == "D" and locType.strip() == "P":
        return ordCreate if reqPortD else readyDate
    return actArrDate

def get_port(port, node_created):
    return node_name(port) if node_created > port["created"] else "unknown"

df["origin"] = [df.loc[(df["shipmentID"] == shipmentID) & (df["rowNo"] == 1)].iloc[0] for shipmentID in df["shipmentID"]]
df["originCountry"] = [o["countryName"] for o in df["origin"]]
df["destination"] = [df.loc[(df["shipmentID"] == shipmentID) & (df["rowNo"] == df[df["shipmentID"] == shipmentID]["rowNo"].max())].iloc[0] for shipmentID in df["shipmentID"]]
df["destinationCountry"] = [d["countryName"] for d in df["destination"]]
df["created"] = [get_created_date(*values) for values in df[["rowNo", "destination", "ordaCreateDate", "prtReqPOL", "prtReqPOD", "originDestination", "locationType", "readyDate", "actualArrDate"]].values]
df["created"] = df["created"].astype(object)
df["readyDate"] = [o["readyDate"] for o in df["origin"]]
df["consolidation"] = (df["ordtType"].str.strip() == "LF") | (df["ordtType"].str.strip() == "LFL")
df["deconsolidation"] = df["ordtType"].str.strip() == "LFL"
df["serviceType"] = [f"{'door' if p == 'Y' else 'port'}-to-{'door' if d == 'Y' else 'port'}" for p, d in df[["ordJFHArrangeLoadingPickup", "ordJFHArrangeDelivery"]].values]
df["pastTimeToOrigin"] = [orig["actualDepDate"] - arr_date for orig, arr_date in df[["origin", "actualArrDate"]].values]
df["pastAvgAbsSpeed"] = [past_time / abs_distance(orig, lat, long) for (lat, long, orig, past_time) in df[["latitude", "longitude", "origin", "pastTimeToOrigin"]].values]
df["counterStops"] = df["rowNo"] - 1
df["departureMonthOrigin"] = [o["actualDepDate"].month if (i > 1 and o["actualDepDate"] > created) else 0 for (o, created, i) in df[["origin", "created", "rowNo"]].values]
df["departureDayOrigin"] = [o["actualDepDate"].isoweekday() if (i > 1 and o["actualDepDate"] > created) else 0 for (o, created, i) in df[["origin", "created", "rowNo"]].values]
df["numberOfTEU"] = df["ordExpectedNBOfContainers"]
df["totalWeight"] = df["ordGrossWeightKGM"]
df["ETA"] = df["ordReqDeliveryDate"]
df["ATA"] = [df.loc[(df["shipmentID"] == shipmentID) & (df["rowNo"] == df[df["shipmentID"] == shipmentID]["rowNo"].max())].iloc[0]["actualArrDate"] for shipmentID in df["shipmentID"]]
df["currentMonth"] = [c.month for c in df["created"].values]
df["portOfLoading"] = [get_port(df.loc[(df["shipmentID"] == shipmentID) & (df["locationType"].str.strip() == "P") & (df["originDestination"].str.strip() == "O")].iloc[0], created) for (shipmentID, created) in df[["shipmentID", "created"]].values]
df["portOfDischarge"] = [get_port(df.loc[(df["shipmentID"] == shipmentID) & (df["locationType"].str.strip() == "P") & (df["originDestination"].str.strip() == "D")].iloc[0], created) for (shipmentID, created) in df[["shipmentID", "created"]].values]
df["portDestCongestion"] = 0 # external source needed
df["portOfLoadingLate"] = [dep_late(df.loc[(df["shipmentID"] == shipmentID) & (df["locationType"].str.strip() == "P") & (df["originDestination"].str.strip() == "O")].iloc[0], created) for (shipmentID, created) in df[["shipmentID", "created"]].values]
df["origin"] = [node_name(o) for o in df["origin"]]
df["destination"] = [node_name(d) for d in df["destination"]]
# set visited path and future path
# ss = np.unique(df["shipmentID"])
# paths = [df[df["shipmentID"] == s][["locationType", "cityName", "countryName"]].apply(lambda x : node_name(x), axis=1) for s in ss]
# for path, shipment in zip(paths, ss):
#     for i in range(1, len(path) + 1):
#         df.loc[(df["shipmentID"] == shipment) & (df["rowNo"] == i), [f"{n}_visited" for n in path[:i]]] = True
#         df.loc[(df["shipmentID"] == shipment) & (df["rowNo"] == i), [f"{n}_planned" for n in path[i-1:]]] = True

In [81]:

# df["portOfLoading"] = [get_port(df.loc[(df["shipmentID"] == shipmentID) & (df["locationType"].str.strip() == "P") & (df["originDestination"].str.strip() == "O")].iloc[0], created) for (shipmentID, created) in df[["shipmentID", "created"]].values]
# df["portOfLoadingLate"] = [dep_late(df.loc[(df["shipmentID"] == shipmentID) & (df["locationType"].str.strip() == "P") & (df["originDestination"].str.strip() == "O")].iloc[0], created) for (shipmentID, created) in df[["shipmentID", "created"]].values]

# np.unique(df["portOfLoadingLate"])

In [82]:

df = df.drop(columns=["prtReqPOL", "prtReqPOD", "ordReqShipDate", "ordaCreateDate", "ordReqDeliveryDate", "latitude" , "longitude", "actualArrDate", "ordExpectedNBOfC ontainers", "ordGrossWeightKGM", "rowNo", "shipmentID", "originDestination", "locationType", "cityName", "countryName", "ordtType", "ordJFHArrangeLoadingPickup", "ordSeaFreightMainTransportInvolvement", "ordJFHArrangeDelivery"])
df.to_excel("output.xlsx")

KeyError: "['ordExpectedNBOfC ontainers'] not found in axis"