In [None]:
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from pyspark.sql.functions import rank
from pyspark.sql.functions import sum
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import concat, col, lit

import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
%matplotlib inline

import datetime
from datetime import datetime, timedelta
from dateutil.rrule import rrule, MONTHLY
from dateutil.relativedelta import relativedelta

import statsmodels
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from sklearn.linear_model import LinearRegression
import statistics

import warnings
warnings.filterwarnings('ignore')

### Extraction of consignee discounts based on the consignee's FTEx consumption

In [None]:
# Load the shipment-level table into pandas and drop every row that has a missing value.
# `spark` is used here (instead of the legacy `sqlContext`) to match every other Spark call in this notebook.
data = spark.sql(""" select * from dnd.cftex_v2 """)
main_data = data.toPandas().dropna()

# Keep dry 20/40 containers that have positive CDET days, positive standard free time
# and a consignee code that is not the literal string 'NULL'.
is_dry_20_or_40 = (main_data['CONTAINER_TYPE_CD'] == 'DRY') & main_data['CONTAINER_SIZE_CD'].isin(['20', '40'])
main_data = main_data[is_dry_20_or_40]
is_usable_row = (
    (main_data['CDET_Days'] > 0)
    & (main_data['Std_FreeTimeGranted'] > 0)
    & (main_data['CONSIGNEE_CUSTOMER_CD'] != 'NULL')
)
# .copy() so the column assignments below write to an independent frame, not to a view of the filtered data.
main_data = main_data[is_usable_row].copy()

# The first two characters of the city code are used as the import country code.
main_data['Import_Country'] = main_data['DIPLA_CITY_CD'].str[:2]

main_data = main_data.drop(columns=['LOPFI_CITY_CD', 'FreightType_Cd', 'GateIN', 'DischargeDate', 'BOOKED_FFE'])
main_data['CONTAINER_SIZE_CD'] = pd.to_numeric(main_data['CONTAINER_SIZE_CD'])

main_data1 = main_data.copy()
# 'YYYY-MM' label of the price calculation date.
main_data1['PCD_Month'] = main_data1['PRICE_CALC_DT'].dt.strftime('%Y-%m')

# Country -> region lookup. The ORC files carry their own schema, so no CSV-style
# header/inferschema options are passed to the reader.
region_data = spark.read.format("orc").load("/mnt/ipw_ontology_prod/v1/")
region_data = region_data.toPandas()
region_data = region_data[region_data['COUNTRY_CODE'] != 'Unknown'][['COUNTRY_CODE', 'REGION_CODE']].drop_duplicates()
region_data = region_data.rename(columns={'COUNTRY_CODE': 'Import_Country', 'REGION_CODE': 'region_cd'})

# Attach the region and keep only shipments whose import country maps to region 'EUR'.
main_data1 = main_data1.merge(region_data, on=['Import_Country'], how='left')
main_data1 = main_data1[main_data1['region_cd'] == 'EUR']

main_data1.tail()

Unnamed: 0,OPERATOR_DESC,PRICE_CALC_DT,DIPLA_CITY_CD,Import_Country,Spot_YN,CONSIGNEE_CUSTOMER_CD,SHIPMENT_NO,Equipment_No,CONTAINER_TYPE_CD,CONTAINER_SIZE_CD,Std_FreeTimeGranted,Total_FreeTimeGranted_Days,FTEx_Taken,ActualTurnTime_Days,CDET_Days,Delay,CommoditySubType_Dsc,PCD_Month,region_cd
18842653,Maersk,2021-10-26,MACAS,MA,N,22900194690,DIT013417,TCKU7643036,DRY,40,14,14,0,5,5,0,Plastic and rubber,2021-10,EUR
18842656,Maersk,2023-10-30,GBFXS,GB,N,130LPL18065,IK0103492,TRHU6828476,DRY,40,19,19,0,22,22,3,Furniture,2023-10,EUR
18842658,Maersk,2023-01-20,DEBRV,DE,N,1480240,KNCANF251,MRKU3305054,DRY,40,9,9,0,7,7,0,Appliances and kitchenware,2023-01,EUR
18842659,Maersk,2022-09-04,MAPTM,MA,N,22900199668,R05161979,MRSU6556694,DRY,40,21,21,0,10,10,0,Plastic and rubber,2022-09,EUR
18842668,Maersk,2021-10-05,PTLIS,PT,N,111VEN003,204180220,UESU4585210,DRY,40,21,21,0,9,9,0,Appliances and kitchenware,2021-10,EUR


In [None]:
# Aggregate shipments per calendar month, consignee, port and container size, then derive
# the consumed share of standard free time (SFT) and of the FTEx days taken.
ship_df1 = main_data1.copy()

# 'YYYY-MM' -> 'MM': the same calendar month of different years falls into one group.
ship_df1['PCD_Month'] = ship_df1['PCD_Month'].str[-2:]

# Sum only the day-count measures. Naming them explicitly makes the result independent of how
# the installed pandas version treats text and datetime columns in groupby().sum().
group_keys = ['PCD_Month', 'CONSIGNEE_CUSTOMER_CD', 'DIPLA_CITY_CD', 'CONTAINER_SIZE_CD']
day_count_columns = ['Std_FreeTimeGranted', 'Total_FreeTimeGranted_Days', 'FTEx_Taken',
                     'ActualTurnTime_Days', 'CDET_Days', 'Delay']
ship_df1 = ship_df1.groupby(group_keys)[day_count_columns].sum().reset_index()

ship_df1['FTEx_Taken'] = ship_df1['FTEx_Taken'].astype(int)
ship_df1['SFT_Consumption%'] = (ship_df1['ActualTurnTime_Days'] / ship_df1['Std_FreeTimeGranted']) * 100

# Days of turn time beyond the standard free time, floored at zero.
ship_df1['ActualTurnTime_Days-SFT'] = ship_df1['ActualTurnTime_Days'] - ship_df1['Std_FreeTimeGranted']
ship_df1.loc[ship_df1['ActualTurnTime_Days-SFT'] < 0, 'ActualTurnTime_Days-SFT'] = 0

# Groups without any FTEx days get no excess; groups with FTEx days have SFT consumption capped at 100.
ship_df1.loc[ship_df1['FTEx_Taken'] == 0, 'ActualTurnTime_Days-SFT'] = 0
ship_df1.loc[(ship_df1['FTEx_Taken'] > 0) & (ship_df1['SFT_Consumption%'] > 100), 'SFT_Consumption%'] = 100

# 0/0 for groups without FTEx yields NaN here; it is replaced by 0 further down.
ship_df1['FTEx_Consumption%'] = (ship_df1['ActualTurnTime_Days-SFT'] / ship_df1['FTEx_Taken']) * 100

# Where the excess is larger than the FTEx days taken, SFT consumption is recomputed from the
# turn time net of the FTEx days. This must run before the cap on the next line, because the
# cap destroys the "> 100" condition that selects these rows.
ship_df1.loc[ship_df1['FTEx_Consumption%'] > 100, 'SFT_Consumption%'] = ((ship_df1['ActualTurnTime_Days'] - ship_df1['FTEx_Taken']) / ship_df1['Std_FreeTimeGranted']) * 100
ship_df1.loc[ship_df1['FTEx_Consumption%'] > 100, 'FTEx_Consumption%'] = 100

ship_df1[['FTEx_Consumption%', 'SFT_Consumption%']] = ship_df1[['FTEx_Consumption%', 'SFT_Consumption%']].fillna(0)

ship_df1['FTEx_Consumption%'] = ship_df1['FTEx_Consumption%'].round()
ship_df1.head()

Unnamed: 0,PCD_Month,CONSIGNEE_CUSTOMER_CD,DIPLA_CITY_CD,CONTAINER_SIZE_CD,Std_FreeTimeGranted,Total_FreeTimeGranted_Days,FTEx_Taken,ActualTurnTime_Days,CDET_Days,Delay,SFT_Consumption%,ActualTurnTime_Days-SFT,FTEx_Consumption%
0,1,9900000053,DKTHO,20,8,8,0,6,6,0,75.0,0,0.0
1,1,9900000056,DKAAR,20,4,4,0,1,1,0,25.0,0,0.0
2,1,9900050220,DKTHO,20,4,4,0,19,19,15,475.0,0,0.0
3,1,9900050220,DKTHO,40,8,8,0,147,147,142,1837.5,0,0.0
4,1,10000004537,DKAAR,40,5,5,0,3,3,0,60.0,0,0.0


In [None]:
# Keep only the calendar months from the current month up to (not including) the month
# that lies 90 days ahead.
today = datetime.today()
start_date = today.month
end_date = (today + timedelta(days=90)).month

pcd_month = ship_df1['PCD_Month'].astype(int)
if start_date <= end_date:
    in_window = (pcd_month >= start_date) & (pcd_month < end_date)
else:
    # The window crosses the year end (e.g. start=11, end=2): a plain range test would match
    # nothing, so accept the tail of this year plus the head of the next one.
    in_window = (pcd_month >= start_date) | (pcd_month < end_date)

# .copy() so the assignment below writes to an independent frame rather than to a slice.
ship_df1 = ship_df1[in_window].copy()
# Month is stored as the string form of the integer, i.e. without zero padding ('9', not '09').
ship_df1['PCD_Month'] = ship_df1['PCD_Month'].astype(int).astype(str)

ship_df1.head()

Unnamed: 0,PCD_Month,CONSIGNEE_CUSTOMER_CD,DIPLA_CITY_CD,CONTAINER_SIZE_CD,Std_FreeTimeGranted,Total_FreeTimeGranted_Days,FTEx_Taken,ActualTurnTime_Days,CDET_Days,Delay,SFT_Consumption%,ActualTurnTime_Days-SFT,FTEx_Consumption%
389496,9,9900050440,DKTHO,20,8,8,0,6,6,0,75.0,0,0.0
389497,9,9900050460,DKTHO,20,4,4,0,3,3,0,75.0,0,0.0
389498,9,10000004537,DKKAL,20,25,25,0,12,12,1,48.0,0,0.0
389499,9,10000004537,LVRIX,40,7,7,0,3,3,0,42.857143,0,0.0
389500,9,10000004537,NOOSL,20,5,5,0,5,5,0,100.0,0,0.0


In [None]:
# Drop the intermediate day counts; only the consumption percentages are needed from here on.
ship_df1 = ship_df1.drop(
    columns=['Std_FreeTimeGranted', 'Total_FreeTimeGranted_Days', 'ActualTurnTime_Days',
             'CDET_Days', 'Delay', 'ActualTurnTime_Days-SFT']
)

# The percentage that drives the discount: FTEx consumption when FTEx days were taken,
# otherwise the consumption of the standard free time.
discount_driver = ship_df1['FTEx_Consumption%'].where(ship_df1['FTEx_Taken'] != 0, ship_df1['SFT_Consumption%'])

# Banding (upper bounds inclusive): 0-10 -> 5, 10-25 -> 3, 25-50 -> 2, 50-75 -> 1, above 75 -> 0.
# Negative or missing percentages fall in no band and stay NaN.
ship_df1['FTEx_discount'] = np.select(
    [
        discount_driver < 0,
        discount_driver <= 10,
        discount_driver <= 25,
        discount_driver <= 50,
        discount_driver <= 75,
        discount_driver > 75,
    ],
    [np.nan, 5, 3, 2, 1, 0],
    default=np.nan,
)

ship_df1.tail()

Unnamed: 0,PCD_Month,CONSIGNEE_CUSTOMER_CD,DIPLA_CITY_CD,CONTAINER_SIZE_CD,FTEx_Taken,SFT_Consumption%,FTEx_Consumption%,FTEx_discount
532464,11,50000585257,GRSLK,40,0,100.0,0.0,0.0
532465,11,50000585257,NLROT,40,0,14.285714,0.0,3.0
532466,11,50000585257,TRMER,40,0,78.571429,0.0,0.0
532467,11,50400011434,GBLGP,40,0,200.0,0.0,0.0
532468,11,5043000523,ITC79,40,0,178.571429,0.0,0.0


### Extraction of consignee discounts based on commodity turn time (TT)

In [None]:
# Mean actual turn time per calendar month, port, commodity sub-type and container size.
turn_time_keys = ['PCD_Month', 'DIPLA_CITY_CD', 'CommoditySubType_Dsc', 'CONTAINER_SIZE_CD']
ship_df2 = (
    main_data1[['PCD_Month', 'DIPLA_CITY_CD', 'CommoditySubType_Dsc', 'CONTAINER_SIZE_CD', 'ActualTurnTime_Days']]
    # 'YYYY-MM' -> 'MM'
    .assign(PCD_Month=lambda frame: frame['PCD_Month'].str[-2:])
    .groupby(turn_time_keys, as_index=False)
    .mean()
)

ship_df2.head()

Unnamed: 0,PCD_Month,DIPLA_CITY_CD,CommoditySubType_Dsc,CONTAINER_SIZE_CD,ActualTurnTime_Days
0,1,ALDUR,Appliances and kitchenware,20,11.0
1,1,ALDUR,Appliances and kitchenware,40,10.140351
2,1,ALDUR,Chemicals,20,20.863636
3,1,ALDUR,Chemicals,40,6.181818
4,1,ALDUR,Dairy products,20,4.666667


In [None]:
# Keep only the calendar months from the current month up to (not including) the month
# that lies 90 days ahead.
today = datetime.today()
start_date = today.month
end_date = (today + timedelta(days=90)).month

pcd_month = ship_df2['PCD_Month'].astype(int)
if start_date <= end_date:
    in_window = (pcd_month >= start_date) & (pcd_month < end_date)
else:
    # The window crosses the year end (e.g. start=11, end=2): a plain range test would match
    # nothing, so accept the tail of this year plus the head of the next one.
    in_window = (pcd_month >= start_date) | (pcd_month < end_date)

# .copy() so the assignment below writes to an independent frame rather than to a slice.
ship_df2 = ship_df2[in_window].copy()
# Month is stored as the string form of the integer, i.e. without zero padding ('9', not '09').
ship_df2['PCD_Month'] = ship_df2['PCD_Month'].astype(int).astype(str)

ship_df2.head()

Unnamed: 0,PCD_Month,DIPLA_CITY_CD,CommoditySubType_Dsc,CONTAINER_SIZE_CD,ActualTurnTime_Days
42093,9,ALDUR,Appliances and kitchenware,20,7.0
42094,9,ALDUR,Appliances and kitchenware,40,11.307692
42095,9,ALDUR,Chemicals,20,9.407407
42096,9,ALDUR,Chemicals,40,4.714286
42097,9,ALDUR,Dairy products,20,7.0


In [None]:
# Rank every commodity's mean turn time against the other commodities of the same
# month, port and container size, expressed as a rounded percentile (0-100).
# A single grouped rank replaces the per-combination loop: it needs no DataFrame.append,
# does not write to slices, works on an empty input, and keeps ship_df2's rows and index.
percentile_keys = ['PCD_Month', 'DIPLA_CITY_CD', 'CONTAINER_SIZE_CD']
ship_df3 = ship_df2.copy()
ship_df3['TT_percentile'] = (
    ship_df3.groupby(percentile_keys)['ActualTurnTime_Days'].rank(pct=True) * 100
).round()

# Banding (upper bounds inclusive): 0-10 -> 5, 10-25 -> 4, 25-50 -> 2, 50-75 -> 1, above 75 -> 0.
# Negative or missing percentiles fall in no band and stay NaN.
tt_percentile = ship_df3['TT_percentile']
ship_df3['Commodity_discount'] = np.select(
    [
        tt_percentile < 0,
        tt_percentile <= 10,
        tt_percentile <= 25,
        tt_percentile <= 50,
        tt_percentile <= 75,
        tt_percentile > 75,
    ],
    [np.nan, 5, 4, 2, 1, 0],
    default=np.nan,
)

ship_df3.head()

9 ALDUR 20
9 ALDUR 40
9 BEANT 20
9 BEANT 40
9 BEZEE 20
9 BEZEE 40
9 BGBOJ 20
9 BGBOJ 40
9 BGVAR 20
9 BGVAR 40
9 CYLMS 20
9 CYLMS 40
9 DEBRV 20
9 DEBRV 40
9 DEHAM 20
9 DEHAM 40
9 DEWVN 20
9 DEWVN 40
9 DKAAR 20
9 DKAAR 40
9 DKFRC 20
9 DKFRC 40
9 DKKAL 20
9 DKKAL 40
9 DKTHO 20
9 DKTHO 40
9 DZAAE 20
9 DZAAE 40
9 DZALG 20
9 DZALG 40
9 DZBJA 20
9 DZBJA 40
9 DZORN 20
9 DZORN 40
9 DZSKI 20
9 DZSKI 40
9 EETLL 20
9 EETLL 40
9 EGAIS 20
9 EGAIS 40
9 EGALD 20
9 EGALD 40
9 EGALY 20
9 EGALY 40
9 EGDAM 20
9 EGDAM 40
9 EGPSD 20
9 EGPSD 40
9 ESAEI 20
9 ESAEI 40
9 ESALC 20
9 ESALC 40
9 ESALR 20
9 ESALR 40
9 ESBCN 20
9 ESBCN 40
9 ESBIO 20
9 ESBIO 40
9 ESCAT 20
9 ESCAT 40
9 ESGIJ 20
9 ESGIJ 40
9 ESLPA 20
9 ESLPA 40
9 ESMAR 20
9 ESMAR 40
9 ESMGP 20
9 ESMGP 40
9 ESTRF 20
9 ESTRF 40
9 ESVCI 20
9 ESVCI 40
9 FIHEL 20
9 FIHEL 40
9 FIKTK 20
9 FIKTK 40
9 FIOUL 20
9 FIOUL 40
9 FIRMA 20
9 FIRMA 40
9 FRBES 20
9 FRBES 40
9 FRDUK 20
9 FRDUK 40
9 FRFSM 20
9 FRFSM 40
9 FRLEH 20
9 FRLEH 40
9 FRMTI 20
9 FRMTI 40
9 GBBFS 20

Unnamed: 0,PCD_Month,DIPLA_CITY_CD,CommoditySubType_Dsc,CONTAINER_SIZE_CD,ActualTurnTime_Days,TT_percentile,Commodity_discount
42093,9,ALDUR,Appliances and kitchenware,20,7.0,40.0,2.0
42095,9,ALDUR,Chemicals,20,9.407407,62.0,1.0
42097,9,ALDUR,Dairy products,20,7.0,40.0,2.0
42098,9,ALDUR,Foodstuff,20,8.444444,57.0,1.0
42100,9,ALDUR,Furniture,20,6.0,29.0,2.0


In [None]:
# Pair every consignee row with all commodity rows that share its month, port and container size.
cngftex_comm = pd.merge(
    ship_df1,
    ship_df3,
    how='left',
    on=['PCD_Month', 'DIPLA_CITY_CD', 'CONTAINER_SIZE_CD'],
).rename(columns={'DIPLA_CITY_CD': 'Port'})
cngftex_comm.tail()

Unnamed: 0,PCD_Month,CONSIGNEE_CUSTOMER_CD,Port,CONTAINER_SIZE_CD,FTEx_Taken,SFT_Consumption%,FTEx_Consumption%,FTEx_discount,CommoditySubType_Dsc,ActualTurnTime_Days,TT_percentile,Commodity_discount
4559949,11,5043000523,ITC79,40,0,178.571429,0.0,0.0,Toys and games,18.733333,83.0,0.0
4559950,11,5043000523,ITC79,40,0,178.571429,0.0,0.0,"Umbrellas, sun umbrellas, walking-sticks, seat...",11.666667,67.0,1.0
4559951,11,5043000523,ITC79,40,0,178.571429,0.0,0.0,Unknown,7.0,12.0,4.0
4559952,11,5043000523,ITC79,40,0,178.571429,0.0,0.0,Vehicles,11.83871,71.0,1.0
4559953,11,5043000523,ITC79,40,0,178.571429,0.0,0.0,Wood,9.806452,42.0,2.0


In [None]:
# Start from a base charge of 75 and subtract the commodity and FTEx discounts.
cftex_cngftex_comm = cngftex_comm.copy()
cftex_cngftex_comm['Actual_%Charge'] = 75

cftex_cngftex_comm['new_Actual_%Charge'] = (
    cftex_cngftex_comm['Actual_%Charge']
    - cftex_cngftex_comm['Commodity_discount']
    - cftex_cngftex_comm['FTEx_discount']
)

# Keep the pricing keys plus the resulting charge; rows where any of them is missing
# (for example no commodity matched in the merge) are dropped.
cftex_cngftex_comm = cftex_cngftex_comm[
    ['PCD_Month', 'Port', 'CONSIGNEE_CUSTOMER_CD', 'CommoditySubType_Dsc', 'CONTAINER_SIZE_CD', 'new_Actual_%Charge']
].dropna()
cftex_cngftex_comm.tail()

Unnamed: 0,PCD_Month,Port,CONSIGNEE_CUSTOMER_CD,CommoditySubType_Dsc,CONTAINER_SIZE_CD,new_Actual_%Charge
4559949,11,ITC79,5043000523,Toys and games,40,75.0
4559950,11,ITC79,5043000523,"Umbrellas, sun umbrellas, walking-sticks, seat...",40,74.0
4559951,11,ITC79,5043000523,Unknown,40,71.0
4559952,11,ITC79,5043000523,Vehicles,40,74.0
4559953,11,ITC79,5043000523,Wood,40,73.0


In [None]:
# Distinct charge levels in ascending order (sanity check of the value range).
a = np.sort(cftex_cngftex_comm['new_Actual_%Charge'].unique())
a

array([65., 66., 67., 68., 69., 70., 71., 72., 73., 74., 75.])

In [None]:
# Order the pricing rows, renumber them from 0, derive the country code from the first two
# characters of the port code, and store the container size as an integer.
cftex_cngftex_comm = (
    cftex_cngftex_comm
    .sort_values(by=['PCD_Month', 'Port', 'CONSIGNEE_CUSTOMER_CD', 'CommoditySubType_Dsc',
                     'CONTAINER_SIZE_CD', 'new_Actual_%Charge'])
    .reset_index(drop=True)
    .assign(
        COUNTRY_CODE=lambda frame: frame['Port'].str[:2],
        CONTAINER_SIZE_CD=lambda frame: frame['CONTAINER_SIZE_CD'].astype(int),
    )
)
cftex_cngftex_comm.head()

Unnamed: 0,PCD_Month,Port,CONSIGNEE_CUSTOMER_CD,CommoditySubType_Dsc,CONTAINER_SIZE_CD,new_Actual_%Charge,COUNTRY_CODE
0,10,ALDUR,10500001403,Appliances and kitchenware,20,70.0,AL
1,10,ALDUR,10500001403,Appliances and kitchenware,40,74.0,AL
2,10,ALDUR,10500001403,Beverages,20,73.0,AL
3,10,ALDUR,10500001403,Beverages,40,69.0,AL
4,10,ALDUR,10500001403,Chemicals,20,72.0,AL


In [None]:
# Persist the per-consignee/commodity charge table, replacing any previous version of it.
cftex_cngftex_comm_spark = spark.createDataFrame(cftex_cngftex_comm)
(
    cftex_cngftex_comm_spark.write
    .mode("overwrite")
    .saveAsTable("dnd.cftex_pricing_v2_eur")
)

In [None]:
# Read the saved charge table back into pandas.
cftex_cngftex_comm = spark.sql("""select * from dnd.cftex_pricing_v2_eur""").toPandas()
cftex_cngftex_comm.tail()

Unnamed: 0,PCD_Month,Port,CONSIGNEE_CUSTOMER_CD,CommoditySubType_Dsc,CONTAINER_SIZE_CD,new_Actual_%Charge,COUNTRY_CODE
4559949,10,NLROT,1223813,Dairy products,40,70.0,NL
4559950,10,NLROT,1223813,Fertilizers,20,70.0,NL
4559951,10,NLROT,1223813,Fertilizers,40,75.0,NL
4559952,10,NLROT,1223813,Fish,20,73.0,NL
4559953,10,NLROT,1223813,Fish,40,71.0,NL


In [None]:
# Distinct charge levels, in order of first appearance; one slab grid is built per level.
charge = pd.unique(cftex_cngftex_comm['new_Actual_%Charge'])
charge

array([73., 74., 70., 71., 75., 72., 68., 69., 65., 66., 67.])

In [None]:
# Build, for every distinct base charge, a grid of 3 slabs x 10 arrival-day columns.
# Within one pair of arrival days each successive slab is `diff_slabs_percent` percent lower
# than the previous one (compounded); every two days closer to arrival the starting charge
# rises by `diff_days`. Values are rounded when written into the grid.
diff_days = 1
diff_slabs_percent = 2
slab_labels = ['1', '2', '3']
column_names = ['Arrival-{}'.format(day) for day in range(10, 0, -1)]

# The per-charge grids are collected in a list and concatenated once, instead of growing
# a DataFrame with the removed (and quadratic) DataFrame.append.
slab_tables = []
for start_perc in charge:
    charge_percentage = start_perc
    # Pre-sized object-dtype grid, so cells are filled in place rather than by enlargement.
    df = pd.DataFrame(index=slab_labels, columns=column_names, dtype=object)

    for i in range(10, 0, -2):
        slab_charge_percentage = charge_percentage
        for label in slab_labels:
            rounded_charge = round(slab_charge_percentage)
            # Arrival days are priced in pairs: day i and day i-1 share the same value.
            df.loc[label, 'Arrival-{}'.format(i)] = rounded_charge
            df.loc[label, 'Arrival-{}'.format(i - 1)] = rounded_charge
            slab_charge_percentage = slab_charge_percentage - (slab_charge_percentage * diff_slabs_percent) / 100
        charge_percentage += diff_days

    df['Slab'] = df.index
    df = df.reset_index(drop=True)
    df['new_Actual_%Charge'] = start_perc
    slab_tables.append(df)

if slab_tables:
    charge_cftex = pd.concat(slab_tables)
else:
    # No charge levels at all: keep an empty table with the expected columns.
    charge_cftex = pd.DataFrame(columns=column_names + ['Slab', 'new_Actual_%Charge'])

# Last day covered by each slab; kept as float, as a column created through .loc assignment would be.
slab_end_day = {'1': 5, '2': 10, '3': 14}
charge_cftex['Slab_End_Day'] = charge_cftex['Slab'].map(slab_end_day).astype(float)
charge_cftex.tail()

Unnamed: 0,Arrival-10,Arrival-9,Arrival-8,Arrival-7,Arrival-6,Arrival-5,Arrival-4,Arrival-3,Arrival-2,Arrival-1,Slab,new_Actual_%Charge,Slab_End_Day
1,65,65,66,66,67,67,68,68,69,69,2,66.0,10.0
2,63,63,64,64,65,65,66,66,67,67,3,66.0,14.0
0,67,67,68,68,69,69,70,70,71,71,1,67.0,5.0
1,66,66,67,67,68,68,69,69,70,70,2,67.0,10.0
2,64,64,65,65,66,66,67,67,68,68,3,67.0,14.0


In [None]:
# Distinct 'Arrival-1' charges in ascending order (sanity check of the value range).
a = np.sort(charge_cftex['Arrival-1'].unique())
a

array([66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
      dtype=object)

In [None]:
# Distinct 'Arrival-10' charges in ascending order (sanity check of the value range).
a = np.sort(charge_cftex['Arrival-10'].unique())
a

array([62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75],
      dtype=object)

In [None]:
# Expand every pricing row into its three slab rows by joining on the charge level,
# keep the output columns, cast the slab/arrival values to int and rename the key columns.
pricing_key_columns = ['PCD_Month', 'Port', 'CONSIGNEE_CUSTOMER_CD', 'CommoditySubType_Dsc', 'CONTAINER_SIZE_CD']
slab_value_columns = ['Slab', 'Slab_End_Day'] + ['Arrival-{}'.format(day) for day in range(10, 0, -1)]

main_cftex = pd.merge(cftex_cngftex_comm, charge_cftex, how='left', on=['new_Actual_%Charge'])
main_cftex = main_cftex[pricing_key_columns + slab_value_columns]
main_cftex = main_cftex.astype({column: int for column in slab_value_columns})
main_cftex = main_cftex.rename(columns={'PCD_Month': 'Pcd_Month', 'Port': 'Dipla_CD'})

In [None]:
# Row count after expanding each pricing row into its slab rows.
main_cftex.shape[0]

13679862

In [None]:
# Look up the commodity code for every commodity name.
commodity = spark.read.format("delta").load("/mnt/Gen2_prod_cleansed_gcss/mars_commodities/")
commodity.createOrReplaceTempView("commodity")

commodity = (
    spark.sql(""" select distinct commodity.code, name from commodity  order by 2""")
    .toPandas()
    .rename(columns={'name': 'CommoditySubType_Dsc', 'code': 'Commodity_CD'})
)
# Only codes that are exactly four characters long are kept.
has_four_char_code = commodity['Commodity_CD'].map(len) == 4
commodity = commodity[has_four_char_code]

main_cftex = pd.merge(main_cftex, commodity, how='left', on='CommoditySubType_Dsc')
main_cftex.tail()

Unnamed: 0,Pcd_Month,Dipla_CD,CONSIGNEE_CUSTOMER_CD,CommoditySubType_Dsc,CONTAINER_SIZE_CD,Slab,Slab_End_Day,Arrival-10,Arrival-9,Arrival-8,Arrival-7,Arrival-6,Arrival-5,Arrival-4,Arrival-3,Arrival-2,Arrival-1,Commodity_CD
13679857,10,NLROT,1223813,Fish,20,2,10,72,72,73,73,74,74,74,74,75,75,5
13679858,10,NLROT,1223813,Fish,20,3,14,70,70,71,71,72,72,73,73,74,74,5
13679859,10,NLROT,1223813,Fish,40,1,5,71,71,72,72,73,73,74,74,75,75,5
13679860,10,NLROT,1223813,Fish,40,2,10,70,70,71,71,72,72,73,73,74,74,5
13679861,10,NLROT,1223813,Fish,40,3,14,68,68,69,69,70,70,71,71,72,72,5


In [None]:
# Add the constant operator / container-type columns, fix the output column order,
# blank out missing values and apply the final column names.
output_columns = ['Operator', 'Pcd_Month', 'Dipla_CD', 'CONSIGNEE_CUSTOMER_CD', 'CommoditySubType_Dsc',
                  'Commodity_CD', 'CONTAINER_SIZE_CD', 'CONTAINER_Type', 'Slab', 'Slab_End_Day',
                  'Arrival-10', 'Arrival-9', 'Arrival-8', 'Arrival-7', 'Arrival-6',
                  'Arrival-5', 'Arrival-4', 'Arrival-3', 'Arrival-2', 'Arrival-1']
main_cftex = (
    main_cftex
    .assign(Operator='MAEU', CONTAINER_Type='Dry')
    .loc[:, output_columns]
    .fillna('')
    .rename(columns={
        'CONSIGNEE_CUSTOMER_CD': 'Consignee_CD',
        'CommoditySubType_Dsc': 'Commodity',
        'CONTAINER_SIZE_CD': 'Cont_Size',
        'CONTAINER_Type': 'Cont_Type',
    })
)
main_cftex.head()

Unnamed: 0,Operator,Pcd_Month,Dipla_CD,Consignee_CD,Commodity,Commodity_CD,Cont_Size,Cont_Type,Slab,Slab_End_Day,Arrival-10,Arrival-9,Arrival-8,Arrival-7,Arrival-6,Arrival-5,Arrival-4,Arrival-3,Arrival-2,Arrival-1
0,MAEU,9,NLROT,30700319595,"Hides, fur, leather and skin",21,20,Dry,1,5,73,73,74,74,75,75,76,76,77,77
1,MAEU,9,NLROT,30700319595,"Hides, fur, leather and skin",21,20,Dry,2,10,72,72,73,73,74,74,74,74,75,75
2,MAEU,9,NLROT,30700319595,"Hides, fur, leather and skin",21,20,Dry,3,14,70,70,71,71,72,72,73,73,74,74
3,MAEU,9,NLROT,30700319595,Household goods,22,20,Dry,1,5,73,73,74,74,75,75,76,76,77,77
4,MAEU,9,NLROT,30700319595,Household goods,22,20,Dry,2,10,72,72,73,73,74,74,74,74,75,75


In [None]:
# Row count of the final pricing table.
main_cftex.shape[0]

13679862

In [None]:
# Persist the final pricing table, replacing any previous version of it.
pricingdf = spark.createDataFrame(main_cftex)
(
    pricingdf.write
    .mode("overwrite")
    .saveAsTable("dnd.pricingdf_v2_eur")
)