In [None]:
import pyforest
import itertools
import warnings
warnings.filterwarnings("ignore")
from flaml import AutoML

In [None]:
# Source data for detention
data = sqlContext.sql(""" select * from dnd.cftex_v2 """)

# Row filters are evaluated by Spark so that only the rows used downstream are
# collected to the driver by toPandas():
#   * DRY containers of size 20 or 40,
#   * positive detention days and positive standard free time,
#   * consignee code different from the literal string 'NULL'
#     (presumably a placeholder for a missing code - TODO confirm).
row_filter = """
    CONTAINER_TYPE_CD = 'DRY'
    and CONTAINER_SIZE_CD in ('20', '40')
    and CDET_Days > 0
    and Std_FreeTimeGranted > 0
    and CONSIGNEE_CUSTOMER_CD != 'NULL'
"""

main_data = (
    data.where(row_filter)
    .toPandas()
    # Rows with a null in ANY column (including the ones dropped below) are discarded.
    .dropna()
    # Import country = first two characters of the city code.
    .assign(Import_Country=lambda frame: frame['DIPLA_CITY_CD'].str[:2])
    .drop(columns=['LOPFI_CITY_CD', 'FreightType_Cd', 'GateIN', 'DischargeDate', 'BOOKED_FFE'])
    .assign(CONTAINER_SIZE_CD=lambda frame: pd.to_numeric(frame['CONTAINER_SIZE_CD']))
)

# Year-month bucket of the price calculation date, e.g. '2024-05'.
# pd.to_datetime is a no-op for a datetime64 column and keeps `.dt` usable if the
# column is collected as plain date objects instead.
main_data1 = main_data.assign(
    PCD_Month=pd.to_datetime(main_data['PRICE_CALC_DT']).dt.strftime('%Y-%m')
)

# Mapping region code
# ORC files carry their own schema, so no reader options are passed. Only the two
# columns needed for the mapping are projected and de-duplicated before collecting.
region_data = (
    spark.read.format("orc")
    .load("/mnt/ipw_ontology_prod/v1/")
    .select('COUNTRY_CODE', 'REGION_CODE')
    .distinct()
    .toPandas()
)

region_data = (
    region_data[region_data['COUNTRY_CODE'] != 'Unknown']
    .drop_duplicates()
    .rename(columns={'COUNTRY_CODE': 'Import_Country', 'REGION_CODE': 'region_cd'})
)

main_data1 = main_data1.merge(region_data, on='Import_Country', how='left')

# Filtering consignees of EUR only
main_data1 = main_data1[main_data1['region_cd'] == 'EUR']
main_data1.head()

<IPython.core.display.Javascript object>

Unnamed: 0,OPERATOR_DESC,PRICE_CALC_DT,DIPLA_CITY_CD,Import_Country,Spot_YN,CONSIGNEE_CUSTOMER_CD,SHIPMENT_NO,Equipment_No,CONTAINER_TYPE_CD,CONTAINER_SIZE_CD,Std_FreeTimeGranted,Total_FreeTimeGranted_Days,FTEx_Taken,ActualTurnTime_Days,CDET_Days,Delay,CommoditySubType_Dsc,PCD_Month,region_cd
6,Maersk,2024-05-31,GBFXS,GB,N,13000132306,239840399,HASU1531800,DRY,20,7,7,0,3,3,0,"Ores, slag and ash",2024-05,EUR
7,Maersk,2021-10-06,NLROT,NL,N,12248702540,240277959,MSKU8294000,DRY,40,10,10,0,2,2,0,Plastic and rubber,2021-10,EUR
10,Maersk,2024-06-10,ESVCI,ES,N,12700059245,240442363,MRKU5566041,DRY,40,14,14,0,7,7,0,Textiles and apparel,2024-06,EUR
15,Maersk,2024-06-27,GBLGP,GB,Y,130LDN20289,241723501,SUDU7440504,DRY,20,7,7,0,5,5,0,Dairy products,2024-06,EUR
19,Maersk,2024-07-25,BEANT,BE,N,11301018612,242456168,SUDU7758753,DRY,20,15,15,0,4,4,0,Chemicals,2024-07,EUR


In [None]:
# Creation of combinations
unique_dipla = sorted(main_data1['DIPLA_CITY_CD'].unique())
unique_spot_yn = list(main_data1['Spot_YN'].unique())
unique_container_size = list(main_data1['CONTAINER_SIZE_CD'].unique())

combined_list = [unique_dipla, unique_spot_yn, unique_container_size]
combinations = list(itertools.product(*combined_list))

# Tunables for the per-combination models.
PREDICTION_COLUMNS = ['DIPLA_CITY_CD', 'Spot_YN', 'CONTAINER_SIZE', 'Std_FreeTimeGranted', 'PREDICTED_TT']
GROUP_KEYS = ['DIPLA_CITY_CD', 'Spot_YN', 'CONTAINER_SIZE_CD']
VALUE_COLUMNS = ['Std_FreeTimeGranted', 'CDET_Days']
IQR_FENCE = 1.2            # rows outside [Q1 - 1.2*IQR, Q3 + 1.2*IQR] are treated as outliers
MIN_TRAINING_POINTS = 3    # fewer aggregated points than this -> no model for the combination
SMALL_SAMPLE_LIMIT = 5     # below this, the number of CV folds is set to the number of points
AUTOML_SEED = 42           # fixed seed for FLAML's hyper-parameter search
sft_list = list(range(1, 51))  # free-time values (days) for which a prediction is produced


def _within_iqr_fence(frame, column, fence=IQR_FENCE):
    """Return the rows of `frame` whose `column` lies inside the IQR fence (bounds inclusive).

    An empty frame yields NaN quartiles, every comparison is False, and an empty
    frame is returned.
    """
    q1 = frame[column].quantile(0.25)
    q3 = frame[column].quantile(0.75)
    spread = q3 - q1
    return frame[frame[column].between(q1 - fence * spread, q3 + fence * spread)]


# Split the data once per observed (city, spot, size) combination instead of
# re-scanning the whole frame with three boolean masks for every combination.
observations = {
    key: frame[VALUE_COLUMNS]
    for key, frame in main_data1[GROUP_KEYS + VALUE_COLUMNS].groupby(GROUP_KEYS, sort=False)
}
no_rows = main_data1[VALUE_COLUMNS].iloc[0:0]

prediction_frames = []
empty_comb = []

# Model training
for combination in combinations:
  print(f'combination: {combination}')
  data1 = observations.get(combination, no_rows)

  # Outliers removal on the target, then one averaged target value per free-time value
  data1 = _within_iqr_fence(data1, 'CDET_Days')
  data1 = data1.groupby('Std_FreeTimeGranted', as_index = False).mean()

  # Outliers removal on the feature
  data1 = _within_iqr_fence(data1, 'Std_FreeTimeGranted')

  print(f'DF Length after removing Outliers: {len(data1)}')

  if len(data1) < MIN_TRAINING_POINTS:
    empty_comb.append(combination)
    continue

  x = data1[['Std_FreeTimeGranted']].to_numpy()
  y = data1['CDET_Days'].to_numpy()

  automl_settings = {
    "time_budget": 60,  # in seconds
    "task" : 'regression',
    "estimator_list": ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth'],
    "metric" : 'mae',
    "log_file_name" : 'AutoML.log',
    "model_history": False,
    "seed": AUTOML_SEED,
  }
  if len(data1) < SMALL_SAMPLE_LIMIT:
    # With 3 or 4 points, use as many folds as there are points.
    automl_settings["n_splits"] = len(data1)

  automl = AutoML()
  automl.fit(x, y, **automl_settings)

  # One batched call instead of one predict() call per free-time value.
  raw_predictions = automl.predict(np.asarray(sft_list).reshape(-1, 1))
  if raw_predictions is None:
    # NOTE(review): FLAML appears to return None when no estimator was trained
    # within the time budget - such combinations are recorded as having no model.
    empty_comb.append(combination)
    continue

  prediction_frames.append(pd.DataFrame({
    'DIPLA_CITY_CD': combination[0],
    'Spot_YN': combination[1],
    'CONTAINER_SIZE': combination[2],
    'Std_FreeTimeGranted': sft_list,
    'PREDICTED_TT': [round(value) for value in raw_predictions],
  }, columns = PREDICTION_COLUMNS))

# Concatenate once; growing a DataFrame inside the loop is quadratic.
if prediction_frames:
  prediction_df = pd.concat(prediction_frames, axis=0)
else:
  prediction_df = pd.DataFrame(columns = PREDICTION_COLUMNS)

prediction_df.head()

In [None]:
ship_free3_df = main_data1.copy()

# Extraction of risk factor
equipment_keys = ['PCD_Month', 'CONSIGNEE_CUSTOMER_CD', 'DIPLA_CITY_CD', 'Spot_YN', 'CONTAINER_SIZE_CD']

# Equipment count per month / consignee / city / spot flag / container size.
# Only the 'Delay' column is counted; the other columns are not aggregated.
ship_df1 = (
    ship_free3_df.groupby(equipment_keys)['Delay']
    .count()
    .reset_index()
    .rename(columns={'Delay': 'Total_Equipments'})
)

# Same count restricted to rows with a positive delay.
delayed_df = ship_free3_df[ship_free3_df['Delay'] > 0]
delayed_equipment_df = (
    delayed_df.groupby(equipment_keys)['Delay']
    .count()
    .reset_index()
    .rename(columns={'Delay': 'Delayed_Equipments'})
)

delayed_equipment_df.head()

Unnamed: 0,PCD_Month,CONSIGNEE_CUSTOMER_CD,DIPLA_CITY_CD,Spot_YN,CONTAINER_SIZE_CD,Delayed_Equipments
0,2021-02,12200262531,NLROT,N,40,3
1,2021-03,12700233327,ESMAR,Y,20,3
2,2021-03,13001830606,GBFXS,N,40,2
3,2021-03,13700017081,LVRIX,Y,40,1
4,2021-03,43600253038,DZORN,N,40,9


In [None]:
# Delayed percentage: share of delayed equipment per group, in percent (2 decimals).
# The left join uses the columns the two frames have in common, so groups without
# any delayed equipment are kept with a missing Delayed_Equipments value.
consignee_delay_df = ship_df1.merge(delayed_equipment_df, how='left')
delayed_share = consignee_delay_df['Delayed_Equipments'] / consignee_delay_df['Total_Equipments']
consignee_delay_df['Delayed_Percentage'] = (delayed_share * 100).round(2)

# Shipments delayed per consignee: missing counts/percentages become 0,
# columns are reordered and rows sorted by city and consignee.
report_columns = [
    'PCD_Month', 'DIPLA_CITY_CD', 'Spot_YN', 'CONTAINER_SIZE_CD', 'CONSIGNEE_CUSTOMER_CD',
    'Total_Equipments', 'Delayed_Equipments', 'Delayed_Percentage',
]
shipment_delayed_df = (
    consignee_delay_df.fillna(0)[report_columns]
    .sort_values(by=['DIPLA_CITY_CD', 'CONSIGNEE_CUSTOMER_CD'])
    .reset_index(drop=True)
)
shipment_delayed_df.head()

<IPython.core.display.Javascript object>

Unnamed: 0,PCD_Month,DIPLA_CITY_CD,Spot_YN,CONTAINER_SIZE_CD,CONSIGNEE_CUSTOMER_CD,Total_Equipments,Delayed_Equipments,Delayed_Percentage
0,2024-02,ALDUR,Y,40,10000079758,57,19.0,33.33
1,2022-07,ALDUR,N,20,102-4576,1,0.0,0.0
2,2023-03,ALDUR,Y,20,102-4576,1,0.0,0.0
3,2023-07,ALDUR,Y,20,102-4576,1,0.0,0.0
4,2023-08,ALDUR,Y,20,102-4576,1,0.0,0.0


In [None]:
consumption_keys = ['PCD_Month', 'CONSIGNEE_CUSTOMER_CD', 'DIPLA_CITY_CD', 'Spot_YN', 'CONTAINER_SIZE_CD']
day_columns = ['Std_FreeTimeGranted', 'FTEx_Taken', 'ActualTurnTime_Days', 'Delay']

# Sum only the day-count columns. Summing every column would also try to add up
# the string and datetime columns of main_data1.
ship_df1 = main_data1.groupby(consumption_keys)[day_columns].sum().reset_index()

#Consumed days percentage
ship_df1[['FTEx_Taken']] = ship_df1[['FTEx_Taken']].astype(int)
ship_df1['SFT_Consumption%'] = (ship_df1['ActualTurnTime_Days'] / ship_df1['Std_FreeTimeGranted']) * 100

# Days used beyond the standard free time, floored at 0 ...
ship_df1['ActualTurnTime_Days-SFT'] = (
    ship_df1['ActualTurnTime_Days'] - ship_df1['Std_FreeTimeGranted']
).clip(lower=0)

# ... and forced to 0 when no free-time extension was taken.
ship_df1.loc[ship_df1['FTEx_Taken'] == 0, 'ActualTurnTime_Days-SFT'] = 0
# With an extension in place, standard free time consumption is capped at 100 %.
ship_df1.loc[(ship_df1['FTEx_Taken'] > 0) & (ship_df1['SFT_Consumption%'] > 100), 'SFT_Consumption%'] = 100

# 0/0 (no extension taken) gives NaN here and is replaced by 0 at the end.
ship_df1['FTEx_Consumption%'] = (ship_df1['ActualTurnTime_Days-SFT'] / ship_df1['FTEx_Taken']) * 100

# Where the extension itself is overrun: the standard free time consumption is
# recomputed from the turn time net of the extension days, and the extension
# consumption is capped at 100 %. The mask is taken once, before either update.
extension_overrun = ship_df1['FTEx_Consumption%'] > 100
ship_df1.loc[extension_overrun, 'SFT_Consumption%'] = (
    (ship_df1['ActualTurnTime_Days'] - ship_df1['FTEx_Taken']) / ship_df1['Std_FreeTimeGranted']
) * 100
ship_df1.loc[extension_overrun, 'FTEx_Consumption%'] = 100

ship_df1[['FTEx_Consumption%', 'SFT_Consumption%']] = ship_df1[['FTEx_Consumption%', 'SFT_Consumption%']].fillna(0)

ship_df1.tail()

Unnamed: 0,PCD_Month,CONSIGNEE_CUSTOMER_CD,DIPLA_CITY_CD,Spot_YN,CONTAINER_SIZE_CD,Std_FreeTimeGranted,FTEx_Taken,ActualTurnTime_Days,Delay,SFT_Consumption%,ActualTurnTime_Days-SFT,FTEx_Consumption%
770774,2024-09,11100180381,ESBCN,N,40,42,0,14,0,33.333333,0,0.0
770775,2024-09,12400181243,PTLEX,N,20,21,0,3,0,14.285714,0,0.0
770776,2024-09,130LPL17011,GBFXS,N,40,36,0,24,0,66.666667,0,0.0
770777,2024-09,22900389936,MACAS,N,20,63,0,12,0,19.047619,0,0.0
770778,2024-09,413031075,ILASD,N,40,56,0,6,0,10.714286,0,0.0


In [None]:
# Join 1: attach the consumption percentages (ship_df1) to the delay percentages
# (shipment_delayed_df) on the month / city / spot flag / size / consignee key.
consignee_keys = ['PCD_Month', 'DIPLA_CITY_CD', 'Spot_YN', 'CONTAINER_SIZE_CD', 'CONSIGNEE_CUSTOMER_CD']
cng1 = shipment_delayed_df.merge(ship_df1, how='left', on=consignee_keys)

# Consignee factor = sum of the three percentages, expressed as a fraction.
percentage_total = cng1['Delayed_Percentage'] + cng1['SFT_Consumption%'] + cng1['FTEx_Consumption%']
check1 = cng1.assign(Cons_Factor=percentage_total / 100)[
    ['PCD_Month', 'CONSIGNEE_CUSTOMER_CD', 'DIPLA_CITY_CD', 'Spot_YN', 'CONTAINER_SIZE_CD', 'Cons_Factor']
]
check1.head()

Unnamed: 0,PCD_Month,CONSIGNEE_CUSTOMER_CD,DIPLA_CITY_CD,Spot_YN,CONTAINER_SIZE_CD,Cons_Factor
0,2024-02,10000079758,ALDUR,Y,40,1.245581
1,2022-07,102-4576,ALDUR,N,20,0.222222
2,2023-03,102-4576,ALDUR,Y,20,0.875
3,2023-07,102-4576,ALDUR,Y,20,0.375
4,2023-08,102-4576,ALDUR,Y,20,0.5


In [None]:
prediction_df = prediction_df.rename(columns = {'PREDICTED_TT':'model_pred', 'CONTAINER_SIZE':'CONTAINER_SIZE_CD'})

# Every (month, consignee) factor of a combination is paired with each of that
# combination's model predictions.
final1 = prediction_df.merge(check1, on =['DIPLA_CITY_CD', 'Spot_YN', 'CONTAINER_SIZE_CD'], how='left')

# Consignee-based estimate = free time scaled by the consignee factor; the final
# turn-time prediction is the rounded average of that estimate and the model output.
final1['cons_pred'] = (final1['Std_FreeTimeGranted'] * final1['Cons_Factor']).astype(float).round()
final1['TT_pred'] = ((final1['cons_pred'] + final1['model_pred']) / 2).astype(float).round()

output_keys = ['PCD_Month', 'CONSIGNEE_CUSTOMER_CD', 'DIPLA_CITY_CD', 'Spot_YN', 'CONTAINER_SIZE_CD', 'Std_FreeTimeGranted']
final1 = (
    final1[output_keys + ['TT_pred']]
    # Keep only the last two characters of 'YYYY-MM' (the month), so the same
    # calendar month of different years is averaged together below. `.str` leaves
    # a missing month (prediction row without a matching factor) as NaN instead of
    # raising; such rows are then excluded by groupby's default NaN-key handling.
    .assign(PCD_Month=lambda frame: frame['PCD_Month'].str[-2:])
    .groupby(output_keys, as_index = False)
    .mean()
)
final1.head()

Unnamed: 0,PCD_Month,CONSIGNEE_CUSTOMER_CD,DIPLA_CITY_CD,Spot_YN,CONTAINER_SIZE_CD,Std_FreeTimeGranted,TT_pred
0,1,9900000056,DKAAR,Y,20,1,1.0
1,1,9900000056,DKAAR,Y,20,2,1.0
2,1,9900000056,DKAAR,Y,20,3,2.0
3,1,9900000056,DKAAR,Y,20,4,2.0
4,1,9900000056,DKAAR,Y,20,5,2.0


In [None]:
# Convert the pandas result to a Spark DataFrame for persistence. The guard makes
# the cell safe to re-run: once final1 is already a Spark DataFrame it is left
# untouched instead of being passed to createDataFrame again.
if isinstance(final1, pd.DataFrame):
    final1 = spark.createDataFrame(final1)

In [None]:
# Persist the predictions, replacing any existing contents of the target table.
final1.write.saveAsTable("dnd.pred_eur_prod_v2", mode="overwrite")