In [95]:
import numpy as np
import pandas as pd

import calendar
import datetime as dt
from collections import Counter

In [193]:
deliveries = pd.read_csv(r'Einkaufslieferzeile.csv', on_bad_lines='skip', sep=';', low_memory=False,encoding='utf-8-sig')
deliveries.head()

Unnamed: 0,Attached to Line No_,Auto Charge Doc_ Line No_,Auto Charge Doc_ No_,Billing Entry No_,Billing Reference No_,Blanket Order Line No_,Blanket Order No_,Budgeted FA No_,Buy-from Vendor No_,Contract Line No_,...,Use Duplication List,Use Tax,VAT Base Amount,VAT Bus_ Posting Group,VAT Calculation Type,VAT Prod_ Posting Group,VAT _,VDP Code,Variant Code,timestamp
0,0,0,,0,,0,,,K00421,0,...,0,0,0,IL,0,MWST19,1900000000000000000000,,,000000005295C63A
1,0,0,,0,,0,,,K02028,0,...,0,0,0,IL,0,MWST19,1900000000000000000000,,,0000000052EC85AD
2,0,0,,0,,0,,,K04610,0,...,0,0,0,EU,1,MWST19,000000000000000000000,,,0000000052B4AE3C
3,0,0,,0,,0,,,K04607,0,...,0,0,0,IL,0,MWST19,1900000000000000000000,,,0000000052B4D12C
4,0,0,,0,,10000,RB19/0025,,K04137,0,...,0,0,0,IL,0,MWST19,1900000000000000000000,,,0000000052B4B8F5


In [97]:
deliveries.shape

(6012, 166)

In [194]:
# drop columns where all NaN values
deliveries=deliveries.dropna(axis=1, how='all')
deliveries.shape

(6012, 129)

In [195]:
# drop columns where all 0 values
deliveries=deliveries.loc[:, (deliveries != 0).any(axis=0)]
deliveries.shape

(6012, 103)

In [196]:
# drop columns where over 50% of values are null values
deliveries=deliveries.loc[:, deliveries.isnull().mean()<0.5]
deliveries.shape

(6012, 90)

In [197]:
# get list with column names
list(deliveries.columns)

['Blanket Order Line No_',
 'Buy-from Vendor No_',
 'Cross-Reference Type No_',
 'Demand Query Line No_',
 'Document No_',
 'Item Rcpt_ Entry No_',
 'Line No_',
 'No_',
 'Order Line No_',
 'Order No_',
 'Pay-to Vendor No_',
 'Pos_ No_',
 'Prod_ Order Line No_',
 'Routing Reference No_',
 'Allow Invoice Disc_',
 'Balance',
 'Bin Code',
 'Buy-from Address',
 'Buy-from City',
 'Buy-from Contact',
 'Buy-from Post Code',
 'Buy-from Vendor Name 2',
 'Buy-from Vendor Name',
 'Contract Amount',
 'Correction',
 'Cross-Reference Type',
 'Description 2',
 'Description',
 'Dimension Set ID',
 'Direct Unit Cost Price Factor',
 'Direct Unit Cost',
 'Expected Receipt Date',
 'Expected Receipt Time',
 'FA Posting Date',
 'Gen_ Bus_ Posting Group',
 'Gen_ Prod_ Posting Group',
 'Gross Weight',
 'Indirect Cost _',
 'Item Category Code',
 'Item Charge Base Amount',
 'Job Currency Factor',
 'Job Line Amount (LCY)',
 'Job Line Amount',
 'Job Line Disc_ Amount (LCY)',
 'Job Line Discount Amount',
 'Job Line

In [198]:
# drop columns that don't seem relevant to our challenge
to_drop=['Cross-Reference Type No_', 'Demand Query Line No_', 'Document No_', 'Item Rcpt_ Entry No_',
 'Line No_', 'Order Line No_', 'Pay-to Vendor No_', 'Pos_ No_', 'Prod_ Order Line No_', 'Routing Reference No_', 'Allow Invoice Disc_', 'Balance',
 'Bin Code', 'Contract Amount', 'Correction', 'Cross-Reference Type', 'Description 2', 'Dimension Set ID', 'Direct Unit Cost Price Factor',
 'Direct Unit Cost', 'Gen_ Bus_ Posting Group', 'Gen_ Prod_ Posting Group', 'Gross Weight', 'Indirect Cost _', 'Item Charge Base Amount',
 'Job Currency Factor', 'Job Line Amount (LCY)', 'Job Line Amount', 'Job Line Disc_ Amount (LCY)', 'Job Line Discount Amount', 'Job Line Discount _',
 'Job Total Price (LCY)', 'Job Unit Price (LCY)', 'Job Unit Price', 'Line Discount _', 'Net Weight', 'Overhead Rate', 'Pack Sample Quantity', 'Price Factor', 
 'Qty_ per Unit of Measure', 'Salvage Value', 'Shortcut Dimension 1 Code', 'Shortcut Dimension 2 Code','Qty_ Invoiced (Base)','Quantity (Base)',
 'Type', 'Unit Cost (LCY)', 'Unit Cost', 'Unit Price (LCY)', 'Unit Volume', 'Unit of Measure (Cross Ref_)', 'Unit of Measure Code',
 'Unit of Measure', 'Units per Parcel', 'VAT Base Amount', 'VAT Bus_ Posting Group', 'VAT Calculation Type', 'VAT Prod_ Posting Group','Blanket Order Line No_',
 'VAT _', 'Buy-from Vendor Name 2','Buy-from Address', 'timestamp','Item Category Code','Posting Group','Description','Buy-from Contact','Job Total Price','Quantity']
deliveries.drop(to_drop, inplace=True, axis=1)

In [103]:
deliveries.head()

Unnamed: 0,Buy-from Vendor No_,No_,Order No_,Buy-from City,Buy-from Post Code,Buy-from Vendor Name,Expected Receipt Date,Expected Receipt Time,FA Posting Date,Lead Time Calculation,...,Order Date,Order Quantity,Original Date,Planned Receipt Date,Posting Date,Promised Receipt Date,Qty_ Rcd_ Not Invoiced,Quantity Invoiced,Really Receipt DateTime,Requested Receipt Date
0,K00421,00451185,EB19/2893,Altena-Dahle,58754,Möhling GmbH & Co. KG,17.12.2019 00:00:00,01.01.1753 00:00:00,01.01.1753 00:00:00,,...,23.09.2019 00:00:00,0,17.12.2019 00:00:00,17.12.2019 00:00:00,30.12.2019 00:00:00,17.12.2019 00:00:00,0,2273000000000000000000000,16.12.2019 23:00:00,17.12.2019 00:00:00
1,K02028,17000040,EB19/3532,Mörfelden-Walldorf,64546,RS Components GmbH,17.12.2019 00:00:00,01.01.1753 00:00:00,01.01.1753 00:00:00,,...,15.11.2019 00:00:00,0,17.12.2019 00:00:00,17.12.2019 00:00:00,30.12.2019 00:00:00,17.12.2019 00:00:00,0,1500000000000000000000,01.01.1753 00:00:00,17.12.2019 00:00:00
2,K04610,KS00520714,EB19/3570,Carei Jud. Satu Mare,RO - 445100,Taygan Metal Press SRL,19.12.2019 00:00:00,01.01.1753 00:00:00,01.01.1753 00:00:00,,...,19.12.2019 00:00:00,0,19.12.2019 00:00:00,19.12.2019 00:00:00,30.12.2019 00:00:00,19.12.2019 00:00:00,0,200000000000000000000000,01.01.1753 00:00:00,19.12.2019 00:00:00
3,K04607,KS00525776-1,EB19/3737,Ebersbach,73061,Bodycote,06.01.2020 00:00:00,01.01.1753 00:00:00,01.01.1753 00:00:00,,...,03.12.2019 00:00:00,0,03.12.2019 00:00:00,06.01.2020 00:00:00,30.12.2019 00:00:00,06.01.2020 00:00:00,0,30000000000000000000000,01.01.1753 00:00:00,18.12.2019 00:00:00
4,K04137,MV00491619,EB19/2532,Velbert,42551,Heismann Drehtechnik,18.12.2019 00:00:00,01.01.1753 00:00:00,01.01.1753 00:00:00,8,...,03.07.2019 00:00:00,0,18.12.2019 00:00:00,18.12.2019 00:00:00,30.12.2019 00:00:00,18.12.2019 00:00:00,0,870400000000000000000000,01.01.1753 00:00:00,18.12.2019 00:00:00


In [199]:
dates=deliveries[[
 'Expected Receipt Date', 'Expected Receipt Time', 'FA Posting Date', 'Lead Time Calculation',  'Planned Receipt Date',
 'Order Date', 'Original Date', 'Posting Date', 'Promised Receipt Date', 'Really Receipt DateTime', 'Requested Receipt Date']]
dates.head(3)

Unnamed: 0,Expected Receipt Date,Expected Receipt Time,FA Posting Date,Lead Time Calculation,Planned Receipt Date,Order Date,Original Date,Posting Date,Promised Receipt Date,Really Receipt DateTime,Requested Receipt Date
0,17.12.2019 00:00:00,01.01.1753 00:00:00,01.01.1753 00:00:00,,17.12.2019 00:00:00,23.09.2019 00:00:00,17.12.2019 00:00:00,30.12.2019 00:00:00,17.12.2019 00:00:00,16.12.2019 23:00:00,17.12.2019 00:00:00
1,17.12.2019 00:00:00,01.01.1753 00:00:00,01.01.1753 00:00:00,,17.12.2019 00:00:00,15.11.2019 00:00:00,17.12.2019 00:00:00,30.12.2019 00:00:00,17.12.2019 00:00:00,01.01.1753 00:00:00,17.12.2019 00:00:00
2,19.12.2019 00:00:00,01.01.1753 00:00:00,01.01.1753 00:00:00,,19.12.2019 00:00:00,19.12.2019 00:00:00,19.12.2019 00:00:00,30.12.2019 00:00:00,19.12.2019 00:00:00,01.01.1753 00:00:00,19.12.2019 00:00:00


**Date columns to keep**

Order Date: date the order was created. 

Posting Date: date you want as the Posting Date on the Ledger Entry tables when the order is Shipped and Invoiced.

Promised Receipt Date: date your vendor promised they would deliver the goods. NAV does not change this date, so it allows you to set a marker to measure your vendor’s performance against what was promised.

Requested Receipt Date: date you wanted to receive the goods from the vendor. This date is not recalculated in NAV, so once it has been established, it gives you a marker to measure your vendor’s performance against the requested date.

Really Receipt DateTime

In [200]:
# searching for how Microsoft Nav calculates dates, we'll drop some columns indicating dates, taking into consideration the following:
# requested receipt date - lead time calculation = order date
# requested receipt date + inbound whse. handling time + safety lead time = expected receipt date
# lead time calculation: amount of time the product needs in order to be delivered to your company that the Purchasing Agent needs to be alerted
# Expected Receipt Date: date you currently expect your vendor to deliver the order
# Planned Receipt Date: date the order is expected to arrive at the warehouse location, uses the Requested Receipt Date
to_drop=['Expected Receipt Time','FA Posting Date','Lead Time Calculation','Planned Receipt Date','Original Date',]
deliveries.drop(to_drop, inplace=True, axis=1)
deliveries.head()


Unnamed: 0,Buy-from Vendor No_,No_,Order No_,Buy-from City,Buy-from Post Code,Buy-from Vendor Name,Expected Receipt Date,Location Code,Order Date,Order Quantity,Posting Date,Promised Receipt Date,Qty_ Rcd_ Not Invoiced,Quantity Invoiced,Really Receipt DateTime,Requested Receipt Date
0,K00421,00451185,EB19/2893,Altena-Dahle,58754,Möhling GmbH & Co. KG,17.12.2019 00:00:00,B-01,23.09.2019 00:00:00,0,30.12.2019 00:00:00,17.12.2019 00:00:00,0,2273000000000000000000000,16.12.2019 23:00:00,17.12.2019 00:00:00
1,K02028,17000040,EB19/3532,Mörfelden-Walldorf,64546,RS Components GmbH,17.12.2019 00:00:00,B-01,15.11.2019 00:00:00,0,30.12.2019 00:00:00,17.12.2019 00:00:00,0,1500000000000000000000,01.01.1753 00:00:00,17.12.2019 00:00:00
2,K04610,KS00520714,EB19/3570,Carei Jud. Satu Mare,RO - 445100,Taygan Metal Press SRL,19.12.2019 00:00:00,B-01,19.12.2019 00:00:00,0,30.12.2019 00:00:00,19.12.2019 00:00:00,0,200000000000000000000000,01.01.1753 00:00:00,19.12.2019 00:00:00
3,K04607,KS00525776-1,EB19/3737,Ebersbach,73061,Bodycote,06.01.2020 00:00:00,B-01,03.12.2019 00:00:00,0,30.12.2019 00:00:00,06.01.2020 00:00:00,0,30000000000000000000000,01.01.1753 00:00:00,18.12.2019 00:00:00
4,K04137,MV00491619,EB19/2532,Velbert,42551,Heismann Drehtechnik,18.12.2019 00:00:00,B-01,03.07.2019 00:00:00,0,30.12.2019 00:00:00,18.12.2019 00:00:00,0,870400000000000000000000,01.01.1753 00:00:00,18.12.2019 00:00:00


Qty. Rcd. Not Invoiced Field: specifies how many units, such as pieces, of the ordered item have been received but not yet invoiced.

Quantity (Base): Inventory is stored in the base unit of measure, and every time that you sell or purchase or use any other unit of measure, NAV needs to know the quantity expressed in base unit of measure, so it can update inventory correctly

Order Quantity: covers all gross requirements (forecast, sales orders, and also replenishment of the inventory level).

In [202]:
deliveries.columns = deliveries.columns.str.replace(" ", "_").str.lower()
deliveries=deliveries.rename(columns={'buy-from_vendor_no_': 'vendor_no',"no_": "delivery_no","order_no_": "order_no",'buy-from_city':'city',
'buy-from_post_code':'postcode','buy-from_vendor_name':'vendor_name','expected_receipt_date':'expected_date','order_quantity':'order_qty',
 'promised_receipt_date': 'promised_date', 'qty__rcd__not_invoiced':'qty_rcd_not_invoiced','quantity_invoiced':'qty_invoiced','really_receipt_datetime':'delivery_date',
 'requested_receipt_date':'requested_date'})
deliveries.vendor_name=deliveries.vendor_name.str.replace(",", " ") # commas would create additional columns when exporting to csv
list(deliveries.columns)

['vendor_no',
 'delivery_no',
 'order_no',
 'city',
 'postcode',
 'vendor_name',
 'expected_date',
 'location_code',
 'order_date',
 'order_qty',
 'posting_date',
 'promised_date',
 'qty_rcd_not_invoiced',
 'qty_invoiced',
 'delivery_date',
 'requested_date']

In [203]:
deliveries[['qty_invoiced','order_qty','qty_rcd_not_invoiced']] = deliveries[['qty_invoiced','order_qty','qty_rcd_not_invoiced']].applymap(lambda x: str(x)[:-21])
deliveries.head(3)

Unnamed: 0,vendor_no,delivery_no,order_no,city,postcode,vendor_name,expected_date,location_code,order_date,order_qty,posting_date,promised_date,qty_rcd_not_invoiced,qty_invoiced,delivery_date,requested_date
0,K00421,00451185,EB19/2893,Altena-Dahle,58754,Möhling GmbH & Co. KG,17.12.2019 00:00:00,B-01,23.09.2019 00:00:00,0,30.12.2019 00:00:00,17.12.2019 00:00:00,0,22730,16.12.2019 23:00:00,17.12.2019 00:00:00
1,K02028,17000040,EB19/3532,Mörfelden-Walldorf,64546,RS Components GmbH,17.12.2019 00:00:00,B-01,15.11.2019 00:00:00,0,30.12.2019 00:00:00,17.12.2019 00:00:00,0,15,01.01.1753 00:00:00,17.12.2019 00:00:00
2,K04610,KS00520714,EB19/3570,Carei Jud. Satu Mare,RO - 445100,Taygan Metal Press SRL,19.12.2019 00:00:00,B-01,19.12.2019 00:00:00,0,30.12.2019 00:00:00,19.12.2019 00:00:00,0,2000,01.01.1753 00:00:00,19.12.2019 00:00:00


In [204]:
deliveries[['expected_date','order_date','posting_date','promised_date','delivery_date','requested_date']] = deliveries[['expected_date','order_date','posting_date','promised_date','delivery_date','requested_date']].applymap(lambda x: str(x)[:-9])
deliveries.head(3)

Unnamed: 0,vendor_no,delivery_no,order_no,city,postcode,vendor_name,expected_date,location_code,order_date,order_qty,posting_date,promised_date,qty_rcd_not_invoiced,qty_invoiced,delivery_date,requested_date
0,K00421,00451185,EB19/2893,Altena-Dahle,58754,Möhling GmbH & Co. KG,17.12.2019,B-01,23.09.2019,0,30.12.2019,17.12.2019,0,22730,16.12.2019,17.12.2019
1,K02028,17000040,EB19/3532,Mörfelden-Walldorf,64546,RS Components GmbH,17.12.2019,B-01,15.11.2019,0,30.12.2019,17.12.2019,0,15,01.01.1753,17.12.2019
2,K04610,KS00520714,EB19/3570,Carei Jud. Satu Mare,RO - 445100,Taygan Metal Press SRL,19.12.2019,B-01,19.12.2019,0,30.12.2019,19.12.2019,0,2000,01.01.1753,19.12.2019


In [205]:
deliveries.shape

(6012, 16)

In [110]:
#deliveries = deliveries[(deliveries['expected_date']!= '01.01.1753')&(deliveries['expected_date']!= '09.09.2099')]
#deliveries=deliveries[~deliveries.promised_date.str.contains("2099")]
#deliveries=deliveries[~deliveries.delivery_date.str.contains("2099")]

In [206]:
# Some dates with a value of 01.01.1753 and 09.09.2099 mean the field is blank or there's been an error in the input.
# we will filter out these rows
deliveries=deliveries[deliveries.promised_date.str.contains("1753|2033|2099")==False]
deliveries=deliveries[deliveries.delivery_date.str.contains("1753|2033|2099")==False]
deliveries=deliveries[deliveries.expected_date.str.contains("1753|2033|2099")==False]
deliveries=deliveries[deliveries.order_date.str.contains("1753|2033|2099")==False]
deliveries=deliveries[deliveries.posting_date.str.contains("1753|2033|2099")==False]
deliveries=deliveries[deliveries.requested_date.str.contains("1753|2033|2099")==False]
deliveries.shape

(4152, 16)

In [207]:
deliveries = deliveries.astype({'order_date': 'datetime64','posting_date': 'datetime64','promised_date': 'datetime64',
'delivery_date': 'datetime64','requested_date': 'datetime64','expected_date':'datetime64','qty_invoiced': 'int64','order_qty': 'int64','qty_rcd_not_invoiced': 'int64'})
deliveries.dtypes

vendor_no                       object
delivery_no                     object
order_no                        object
city                            object
postcode                        object
vendor_name                     object
expected_date           datetime64[ns]
location_code                   object
order_date              datetime64[ns]
order_qty                        int64
posting_date            datetime64[ns]
promised_date           datetime64[ns]
qty_rcd_not_invoiced             int64
qty_invoiced                     int64
delivery_date           datetime64[ns]
requested_date          datetime64[ns]
dtype: object

In [208]:
# we will also drop rows that have a delivery, promised or order date later than 01.03.2022
# also, drop rows where order date is later than delivery, expected, requested or posting dates 
deliveries = deliveries[(deliveries['promised_date']< '2022-03-01')&(deliveries['delivery_date']< '2022-03-01')&(deliveries['order_date']< '2022-03-01')]
deliveries = deliveries[(deliveries['promised_date']> deliveries['order_date'])&(deliveries['delivery_date']> deliveries['order_date'])&(deliveries['expected_date']> deliveries['order_date'])&(deliveries['requested_date']> deliveries['order_date'])&(deliveries['posting_date']> deliveries['order_date'])]

deliveries.shape

(2334, 16)

In [114]:
deliveries.sort_values(by='order_date', ascending=False).head()

Unnamed: 0,vendor_no,delivery_no,order_no,city,postcode,vendor_name,expected_date,location_code,order_date,order_qty,posting_date,promised_date,qty_rcd_not_invoiced,qty_invoiced,delivery_date,requested_date
5934,K02028,ABB0869526,EB22/0294,Frankfurt,60327,RS Components GmbH,2022-01-28,A-01,2022-01-25,0,2022-01-27,2022-01-28,0,40,2022-01-26,2022-01-28
5923,K02028,ABB0869526,EB22/0294,Frankfurt,60327,RS Components GmbH,2022-01-28,A-01,2022-01-25,0,2022-01-27,2022-01-28,0,200,2022-01-26,2022-01-28
5860,K04659,HDE0001158,EB22/0023,Bamberg,96052,Robert Bosch GmbH,2022-01-21,A-01,2022-01-18,0,2022-01-21,2022-01-21,0,1500,2022-01-20,2022-01-21
5948,K00633,O3 1393582-4,EB22/0161,Teltow,14513,Teltower Diakon. Werkstätten,2022-01-31,A-01,2022-01-17,0,2022-01-31,2022-01-31,0,2000,2022-01-30,2022-01-31
5949,K00633,O3 1-1393583-5,EB22/0161,Teltow,14513,Teltower Diakon. Werkstätten,2022-01-31,A-01,2022-01-17,0,2022-01-31,2022-01-31,0,1000,2022-01-30,2022-01-31


In [209]:
deliveries.isna().sum()

vendor_no               0
delivery_no             0
order_no                0
city                    1
postcode                2
vendor_name             0
expected_date           0
location_code           0
order_date              0
order_qty               0
posting_date            0
promised_date           0
qty_rcd_not_invoiced    0
qty_invoiced            0
delivery_date           0
requested_date          0
dtype: int64

In [210]:
# drop rows with nan values
deliveries = deliveries.dropna()
deliveries.shape

(2332, 16)

In [211]:
# drop duplicates
deliveries.drop_duplicates(keep='first')
deliveries.shape

(2332, 16)

In [186]:
deliveries.head()

Unnamed: 0,vendor_no,delivery_no,order_no,city,country,postcode,vendor_name,expected_date,location_code,order_date,order_qty,posting_date,promised_date,qty_rcd_not_invoiced,qty_invoiced,delivery_date,requested_date
0,K00421,00451185,EB19/2893,Altena-Dahle,,58754,Möhling GmbH & Co. KG,2019-12-17,B-01,2019-09-23,0,2019-12-30,2019-12-17,0,22730,2019-12-16,2019-12-17
6,K00016,40063527,EB19/3307,Berlin,,13437,Höttges,2019-12-13,A-01,2019-10-29,0,2019-12-30,2019-12-13,0,3200,2019-12-18,2019-12-13
7,K00016,40063526,EB19/3307,Berlin,,13437,Höttges,2019-12-13,A-01,2019-10-29,0,2019-12-30,2019-12-13,0,3200,2019-12-16,2019-12-13
9,K00268,27602381,EB19/2763,Salzweg,,94121,Electrovac Hacht & Huber GmbH,2019-12-13,A-01,2019-08-22,0,2019-12-30,2019-12-13,0,1014,2019-12-19,2019-12-13
11,K03974,HES0076418,EB19/2849,Cisano Bergamosco,,IT-24034,M.S. Ambrogio S.p.A.,2020-10-01,A-01,2019-03-09,0,2019-12-30,2020-10-01,0,54780,2019-12-17,2020-10-01


In [212]:
# upload world cities database from: https://simplemaps.com/data/world-cities to insert country column
cities = pd.read_csv(r'worldcities.csv')
cities=cities[["city", "country"]]
cities.head()

Unnamed: 0,city,country
0,Tokyo,Japan
1,Jakarta,Indonesia
2,Delhi,India
3,Mumbai,India
4,Manila,Philippines


In [213]:
# filter only the cities that are also present in the deliveries dataframe
city_list=list(set(deliveries.city))
city_list
cities=cities[cities.city.isin(city_list)].reset_index(drop=True)
cities

Unnamed: 0,city,country
0,Tokyo,Japan
1,Berlin,Germany
2,Hamburg,Germany
3,Frankfurt,Germany
4,Düsseldorf,Germany
...,...,...
159,Wirges,Germany
160,Hirschau,Germany
161,Berlin,United States
162,Wilhelmsdorf,Germany


In [214]:
# check for possible duplication of city names in the world cities dataframe
duplicate = cities[cities.duplicated(subset="city", keep=False)].drop_duplicates(keep='first').reset_index(drop=True).sort_values(by='city') # False marks all duplicates as True
duplicate

# as we can see, some city names are present in more than one country. If we were to merge the two dataframs, we could have rows assigned to the wrong country
# But as we know that the dataset is from a German company, we could infer that most of these vendors are in Germany.
# In order to confirm this, we will match a list of German postal codes to the postal codes in the deliveries dataframe

Unnamed: 0,city,country
0,Berlin,Germany
9,Berlin,United States
13,Birkenfeld,Germany
16,Feldkirchen,Germany
11,Feldkirchen,Austria
3,Halle,Germany
8,Halle,Belgium
5,Hamburg,United States
1,Hamburg,Germany
18,Herne,United Kingdom


In [216]:
# upload German postcode list source: https://www.suche-postleitzahl.org/downloads
de_postcodes = pd.read_excel(r'zuordnung_plz_ort.xlsx')
de_postcodes=de_postcodes.rename(columns={'ort': 'city',"plz": "postcode"})

# we're only interested in the postcode and city columns. we add a country column with Germany as a value
de_postcodes=de_postcodes[["city","postcode"]]
de_postcodes['country']='Germany'
de_postcodes = pd.DataFrame(data=de_postcodes)
de_postcodes.postcode = de_postcodes.postcode.astype(str)
de_postcodes.head()


Unnamed: 0,city,postcode,country
0,Aach,78267,Germany
1,Aach,54298,Germany
2,Aachen,52062,Germany
3,Aachen,52064,Germany
4,Aachen,52066,Germany


In [247]:
# we merge deliveries and de_postcode dataframes
# drop duplicated column city_y and copy country column
joined_df=pd.merge(deliveries,de_postcodes, on="postcode", how="left").drop(columns="city_y").rename(columns={"city_x": "city", "country":"country2"})
joined_df.insert(4, 'country',joined_df["country2"])
joined_df=joined_df.drop(columns="country2")
joined_df.head()

Unnamed: 0,vendor_no,delivery_no,order_no,city,country,postcode,vendor_name,expected_date,location_code,order_date,order_qty,posting_date,promised_date,qty_rcd_not_invoiced,qty_invoiced,delivery_date,requested_date
0,K00421,00451185,EB19/2893,Altena-Dahle,,58754,Möhling GmbH & Co. KG,2019-12-17,B-01,2019-09-23,0,2019-12-30,2019-12-17,0,22730,2019-12-16,2019-12-17
1,K00016,40063527,EB19/3307,Berlin,Germany,13437,Höttges,2019-12-13,A-01,2019-10-29,0,2019-12-30,2019-12-13,0,3200,2019-12-18,2019-12-13
2,K00016,40063526,EB19/3307,Berlin,Germany,13437,Höttges,2019-12-13,A-01,2019-10-29,0,2019-12-30,2019-12-13,0,3200,2019-12-16,2019-12-13
3,K00268,27602381,EB19/2763,Salzweg,Germany,94121,Electrovac Hacht & Huber GmbH,2019-12-13,A-01,2019-08-22,0,2019-12-30,2019-12-13,0,1014,2019-12-19,2019-12-13
4,K03974,HES0076418,EB19/2849,Cisano Bergamosco,,IT-24034,M.S. Ambrogio S.p.A.,2020-10-01,A-01,2019-03-09,0,2019-12-30,2020-10-01,0,54780,2019-12-17,2020-10-01


In [246]:
# we're now merging going to try to fill up all the NaN values in country
joined_df=pd.merge(joined_df,cities, on="city", how="left") #.drop(columns="city_y").rename(columns={"city_x": "city", "country":"country2"})
joined_df.head()


Unnamed: 0,vendor_no,delivery_no,order_no,city,country_x,postcode,vendor_name,expected_date,location_code,order_date,order_qty,posting_date,promised_date,qty_rcd_not_invoiced,qty_invoiced,delivery_date,requested_date,country_y
0,K00421,451185,EB19/2893,Altena-Dahle,,58754,Möhling GmbH & Co. KG,2019-12-17,B-01,2019-09-23,0,2019-12-30,2019-12-17,0,22730,2019-12-16,2019-12-17,
1,K00016,40063527,EB19/3307,Berlin,Germany,13437,Höttges,2019-12-13,A-01,2019-10-29,0,2019-12-30,2019-12-13,0,3200,2019-12-18,2019-12-13,Germany
2,K00016,40063527,EB19/3307,Berlin,Germany,13437,Höttges,2019-12-13,A-01,2019-10-29,0,2019-12-30,2019-12-13,0,3200,2019-12-18,2019-12-13,United States
3,K00016,40063527,EB19/3307,Berlin,Germany,13437,Höttges,2019-12-13,A-01,2019-10-29,0,2019-12-30,2019-12-13,0,3200,2019-12-18,2019-12-13,United States
4,K00016,40063527,EB19/3307,Berlin,Germany,13437,Höttges,2019-12-13,A-01,2019-10-29,0,2019-12-30,2019-12-13,0,3200,2019-12-18,2019-12-13,United States


In [251]:
# let's verify the rows with NaN values in the country column and try to fill them up
is_NaN = joined_df.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = joined_df[row_has_NaN]
rows_with_NaN=pd.merge(rows_with_NaN,cities, on="city", how="left").drop(columns="country_x").rename(columns={"country_y": "country2"})
rows_with_NaN.insert(4, 'country',rows_with_NaN["country2"])
rows_with_NaN=rows_with_NaN.drop(columns="country2")
rows_with_NaN.isnull().sum()

vendor_no                 0
delivery_no               0
order_no                  0
city                      0
country                 193
postcode                  0
vendor_name               0
expected_date             0
location_code             0
order_date                0
order_qty                 0
posting_date              0
promised_date             0
qty_rcd_not_invoiced      0
qty_invoiced              0
delivery_date             0
requested_date            0
dtype: int64

In [135]:
sorted(Counter(deliveries.city).items())

[('Adelsheim', 1),
 ('Altena-Dahle', 3),
 ('Amberg', 2),
 ('Anzing', 1),
 ('Bad Berleburg', 3),
 ('Bad Driburg', 46),
 ('Bad Malente', 2),
 ('Bad Rappenau', 1),
 ('Bad Soden-Salmünster', 10),
 ('Balve', 3),
 ('Bamberg', 180),
 ('Bellignat', 10),
 ('Bensheim', 194),
 ('Berlin', 296),
 ('Beuren', 2),
 ('Bietigheim-Bissingen', 1),
 ('Birkenfeld', 2),
 ('Birkenwerder', 1),
 ('Blaichach', 15),
 ('Blankenfelde-Mahlow', 1),
 ('Blankenhain', 1),
 ('Blomberg', 7),
 ('Bonn', 7),
 ('Braunschweig', 4),
 ('Bretten', 206),
 ('Bretzfeld', 4),
 ('Brieselang', 3),
 ('Bubsheim', 1),
 ('Bydgoszcz', 10),
 ('Bünde', 1),
 ('Carei Jud. Satu Mare', 3),
 ('Cham', 16),
 ('Chemnitz', 2),
 ('Cisano Bergamosco', 1),
 ('Dahlewitz', 1),
 ('Deizisau', 1),
 ('Dieburg', 1),
 ('Dinslaken', 4),
 ('Ditzingen', 2),
 ('Donauwörth', 2),
 ('Dornhan', 1),
 ('Dortmund', 1),
 ('Düsseldorf', 7),
 ('Ebersbach', 23),
 ('Egesheim', 19),
 ('Ehningen', 1),
 ('El Son', 1),
 ('Engstingen', 2),
 ('Eningen', 5),
 ('Ennepetal', 1),
 ('Epfe

In [133]:
deliveries2=Left_join = pd.merge(deliveries, 
                     cities, 
                     on ='city', 
                     how ='left')
deliveries2

Unnamed: 0,vendor_no,delivery_no,order_no,city,country_x,postcode,vendor_name,expected_date,location_code,order_date,order_qty,posting_date,promised_date,qty_rcd_not_invoiced,qty_invoiced,delivery_date,requested_date,country_y
0,K00421,00451185,EB19/2893,Altena-Dahle,,58754,Möhling GmbH & Co. KG,2019-12-17,B-01,2019-09-23,0,2019-12-30,2019-12-17,0,22730,2019-12-16,2019-12-17,
1,K00016,40063527,EB19/3307,Berlin,,13437,Höttges,2019-12-13,A-01,2019-10-29,0,2019-12-30,2019-12-13,0,3200,2019-12-18,2019-12-13,Germany
2,K00016,40063527,EB19/3307,Berlin,,13437,Höttges,2019-12-13,A-01,2019-10-29,0,2019-12-30,2019-12-13,0,3200,2019-12-18,2019-12-13,United States
3,K00016,40063527,EB19/3307,Berlin,,13437,Höttges,2019-12-13,A-01,2019-10-29,0,2019-12-30,2019-12-13,0,3200,2019-12-18,2019-12-13,United States
4,K00016,40063527,EB19/3307,Berlin,,13437,Höttges,2019-12-13,A-01,2019-10-29,0,2019-12-30,2019-12-13,0,3200,2019-12-18,2019-12-13,United States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3575,K00910,17200137,EB21/3506,Eningen,,72800,Adelhelm LubriCoat GmbH,2022-01-02,A-01,2021-11-30,0,2022-01-02,2022-01-02,0,19160,2022-01-31,2022-01-28,
3576,K04659,HDE0001158,EB22/0023,Bamberg,,96052,Robert Bosch GmbH,2022-01-28,A-01,2021-12-17,0,2022-01-02,2022-01-28,0,1500,2022-01-31,2022-01-28,Germany
3577,K04128,HD00100047,EB21/3732,Neuenbürg,,75305,Albert Weber GmbH,2022-01-28,B-01,2021-04-15,0,2022-01-31,2022-01-28,0,146,2022-01-02,2021-05-08,Germany
3578,K00130,01061589,EB21/3626,Reichshof - Eckenhagen,,51574,Elektrisola GmbH & Co. KG,2022-01-28,A-01,2021-10-12,0,2022-02-02,2022-01-28,30,0,2022-01-02,2022-01-28,


In [122]:
# export as csv file
deliveries.to_csv(r'deliveries_clean.csv', index=False);

In [123]:
# export to Excel file
read_file = pd.read_csv (r'deliveries_clean.csv')
read_file.to_excel (r'deliveries.xlsx', index = None, header=True)
