In [213]:
import numpy as np
import pandas as pd

import calendar
import datetime as dt
from collections import Counter

In [214]:
# Load the raw purchase-line export. The file is semicolon-delimited;
# rows that cannot be parsed are skipped rather than raising.
purchases = pd.read_csv(
    r'Einkaufszeile.csv',
    sep=';',
    on_bad_lines='skip',
    low_memory=False,
)
purchases.head()

Unnamed: 0,Attached to Line No_,Auto Charge Doc_ Line No_,Auto Charge Doc_ No_,Billing Entry No_,Billing Reference No_,Blanket Order Line No_,Blanket Order No_,Budgeted FA No_,Buy-from Vendor No_,Contract Line No_,...,VAT Base Amount,VAT Bus_ Posting Group,VAT Calculation Type,VAT Difference,VAT Identifier,VAT Prod_ Posting Group,VAT _,VDP Code,Variant Code,timestamp
0,0,0,,0,,0,,,K04148,0,...,395000000000000000000000,IL,0,0,14.0,MWST19,1900000000000000000000,,,0000000052ADF4C4
1,0,0,,0,,0,,,K04138,0,...,565061000000000000000000,EU,1,0,14.0,MWST19,000000000000000000000,,,00000000572F7357
2,0,0,,0,,0,,,K03105,0,...,6240000000000000000000,IL,0,0,14.0,MWST19,1900000000000000000000,,,00000000572F78DF
3,0,0,,0,,0,,,K02739,0,...,749660000000000000000000,IL,0,0,14.0,MWST19,1900000000000000000000,,,0000000055C7E662
4,0,0,,0,,0,,,K02739,0,...,356773000000000000000000,IL,0,0,14.0,MWST19,1900000000000000000000,,,0000000055C7E663


In [215]:
# dimensions of the freshly loaded frame as (rows, columns)
purchases.shape

(4416, 249)

In [216]:
# remove every column that contains nothing but NaN
purchases = purchases.dropna(how='all', axis='columns')
purchases.shape

(4416, 197)

In [217]:
# keep a column only if at least one of its entries differs from 0
# (NaN compares as "not equal to 0", so a NaN entry also keeps the column)
has_nonzero = purchases.ne(0).any()
purchases = purchases.loc[:, has_nonzero]
purchases.shape

(4416, 153)

In [218]:
# keep only columns in which fewer than half of the values are missing,
# i.e. drop every column whose share of nulls is 50% or more
null_share = purchases.isna().mean()
purchases = purchases.loc[:, null_share.lt(0.5)]
purchases.shape

(4416, 141)

In [219]:
# show the names of the columns that survived the filters above
purchases.columns.tolist()

['Blanket Order Line No_',
 'Buy-from Vendor No_',
 'Cross-Reference Type No_',
 'Demand Query Line No_',
 'Document No_',
 'Line No_',
 'No_',
 'Pay-to Vendor No_',
 'Pos_ No_',
 'Prod_ Order Line No_',
 'Routing Reference No_',
 'A_ Rcd_ Not Inv_ Ex_ VAT (LCY)',
 'Allow Invoice Disc_',
 'Allow Item Charge Assignment',
 'Amount Including VAT',
 'Amount',
 'Amt_ Rcd_ Not Invoiced (LCY)',
 'Amt_ Rcd_ Not Invoiced',
 'Bin Code',
 'Blanket Order valid from',
 'Blanket Order valid to',
 'Completely Received',
 'Contract Amount',
 'Cross-Reference Type',
 'Description 2',
 'Description',
 'Dimension Set ID',
 'Direct Unit Cost Price Factor',
 'Direct Unit Cost',
 'Document Type',
 'Expected Receipt Date',
 'Expected Receipt Time',
 'FA Posting Date',
 'Fixed Quantity',
 'Gen_ Bus_ Posting Group',
 'Gen_ Prod_ Posting Group',
 'Gross Weight',
 'Indirect Cost _',
 'Inv_ Disc_ Amount to Invoice',
 'Inv_ Discount Amount',
 'Invoice Quantity (Item Charge)',
 'Item Category Code',
 'Job Currency 

In [220]:
# keep only the columns that are relevant to our challenge
#
# An explicit keep-list is used instead of a long deny-list: the filters above
# remove columns depending on the data, so a deny-list raises KeyError as soon
# as one of its names has already been removed, and silently keeps any new
# column it does not mention. Selecting by name fails loudly if one of the
# required columns is missing and always yields the same ten columns.
keep_cols = [
    'Buy-from Vendor No_',
    'No_',
    'Amount',
    'Location Code',
    'Order Date',
    'Quantity Invoiced',
    'Quantity Received',
    'Quantity',
    'Really Receipt DateTime',
    'Requested Receipt Date',
]
# names of the discarded columns, kept available for inspection
to_drop = [col for col in purchases.columns if col not in keep_cols]
# .copy() gives an independent frame so later column assignments do not
# operate on a view of the wide frame
purchases = purchases[keep_cols].copy()

In [221]:
# preview the first rows of the frame after the column selection
purchases.head()

Unnamed: 0,Buy-from Vendor No_,No_,Amount,Location Code,Order Date,Quantity Invoiced,Quantity Received,Quantity,Really Receipt DateTime,Requested Receipt Date
0,K04148,HD00034020,395000000000000000000000,B-01,25.04.2018 00:00:00,25000000000000000000000,25000000000000000000000,25000000000000000000000,01.01.1753 00:00:00,11.01.2019 00:00:00
1,K04138,WA00497656,565061000000000000000000,B-01,21.08.2018 00:00:00,121800000000000000000000,121800000000000000000000,121800000000000000000000,01.01.1753 00:00:00,28.02.2019 00:00:00
2,K03105,MV00300385,6240000000000000000000,B-01,19.10.2018 00:00:00,100000000000000000000000,100000000000000000000000,100000000000000000000000,01.01.1753 00:00:00,29.11.2018 00:00:00
3,K02739,17000162,749660000000000000000000,B-01,05.06.2019 00:00:00,200000000000000000000000,200000000000000000000000,200000000000000000000000,01.01.1753 00:00:00,29.11.2019 00:00:00
4,K02739,17000162,356773000000000000000000,B-01,05.06.2019 00:00:00,90400000000000000000000,90400000000000000000000,90400000000000000000000,01.01.1753 00:00:00,10.01.2020 00:00:00


In [222]:
# names of the remaining columns
purchases.columns.tolist()

['Buy-from Vendor No_',
 'No_',
 'Amount',
 'Location Code',
 'Order Date',
 'Quantity Invoiced',
 'Quantity Received',
 'Quantity',
 'Really Receipt DateTime',
 'Requested Receipt Date']

In [223]:
# make feature names consistent: lower-case with underscores instead of spaces,
# then replace the verbose export names with short ones
purchases.columns = purchases.columns.str.lower().str.replace(" ", "_")
short_names = {
    'buy-from_vendor_no_': 'vendor_no',
    "no_": "purchase_no",
    'quantity_invoiced': 'qty_invoiced',
    'quantity_received': 'qty_received',
    'quantity': 'qty',
    'really_receipt_datetime': 'receipt_date',
    'requested_receipt_date': 'requested_date',
}
purchases = purchases.rename(columns=short_names)
purchases.columns.tolist()

['vendor_no',
 'purchase_no',
 'amount',
 'location_code',
 'order_date',
 'qty_invoiced',
 'qty_received',
 'qty',
 'receipt_date',
 'requested_date']

In [224]:
# drop unnecessary trailing characters to enable typecasting
# Each entry gives how many characters are cut from the end of the value's
# string form: 9 for the date columns (the " 00:00:00" time suffix), 18 for
# the amount and 21 for the quantity columns.
# NOTE(review): the 18/21 cuts assume a fixed-width padding in the export - confirm against the raw file.
trailing_chars = {
    'order_date': 9,
    'receipt_date': 9,
    'requested_date': 9,
    'amount': 18,
    'qty_invoiced': 21,
    'qty_received': 21,
    'qty': 21,
}
for column, n_chars in trailing_chars.items():
    purchases[column] = purchases[column].map(lambda raw, n=n_chars: str(raw)[:-n])

purchases.head(3)

Unnamed: 0,vendor_no,purchase_no,amount,location_code,order_date,qty_invoiced,qty_received,qty,receipt_date,requested_date
0,K04148,HD00034020,395000,B-01,25.04.2018,250,250,250,01.01.1753,11.01.2019
1,K04138,WA00497656,565061,B-01,21.08.2018,1218,1218,1218,01.01.1753,28.02.2019
2,K03105,MV00300385,6240,B-01,19.10.2018,1000,1000,1000,01.01.1753,29.11.2018


In [225]:
# inspect the dtype of every column
purchases.dtypes

vendor_no         object
purchase_no       object
amount            object
location_code     object
order_date        object
qty_invoiced      object
qty_received      object
qty               object
receipt_date      object
requested_date    object
dtype: object

In [226]:
# current dimensions as (rows, columns)
purchases.shape

(4416, 10)

In [227]:
# A date of 01.01.1753 means the field is blank or there has been an error in the input.
# Keep only the rows in which the order, receipt and requested dates are all
# different from that placeholder (in the recorded run: 3,549 of 4,416 rows remain).
placeholder_date = '01.01.1753'
date_columns = ['order_date', 'receipt_date', 'requested_date']
has_real_dates = purchases[date_columns].ne(placeholder_date).all(axis=1)
purchases = purchases[has_real_dates]
purchases.shape

(3549, 10)

In [228]:
# replace the decimal comma with a dot so the column can be typecast to float
# regex=False makes this a literal replacement regardless of the pandas
# version's default; the former .str.lower() call was dropped because
# lower-casing has no effect on numeric text.
purchases.amount = purchases.amount.str.replace(",", ".", regex=False)
purchases.head()

Unnamed: 0,vendor_no,purchase_no,amount,location_code,order_date,qty_invoiced,qty_received,qty,receipt_date,requested_date
18,K04149,MS00203015,9460.13,B-01,02.07.2020,1250,1250,1250,22.07.2021,10.09.2020
19,K04149,MS00203015,9460.13,B-01,24.09.2019,1250,1250,1250,22.07.2021,15.03.2021
20,K04149,DA00114007,8247.23,B-01,06.08.2021,2415,2415,2415,22.07.2021,06.08.2021
28,K04797,RA00000360,83.5,A-01,08.10.2019,5000,5000,5000,14.01.2020,28.10.2019
29,K04797,RA00000370,51.75,A-01,08.10.2019,1500,1500,1500,14.01.2020,28.10.2019


In [230]:
# typecast
# The date strings are day-first (DD.MM.YYYY). They are parsed with an explicit
# format because a generic cast guesses month-first whenever the day is <= 12
# (e.g. '02.07.2020' would become 7 February) and day-first otherwise, which
# silently mixes both interpretations within the same column.
date_cols = ['order_date', 'receipt_date', 'requested_date']
purchases = purchases.assign(
    **{col: pd.to_datetime(purchases[col], format='%d.%m.%Y') for col in date_cols}
).astype({
    'amount': 'float64',
    'qty_invoiced': 'int64',
    'qty_received': 'int64',
    'qty': 'int64',
})
purchases.dtypes

vendor_no                 object
purchase_no               object
amount                   float64
location_code             object
order_date        datetime64[ns]
qty_invoiced               int64
qty_received               int64
qty                        int64
receipt_date      datetime64[ns]
requested_date    datetime64[ns]
dtype: object

In [231]:
# dimensions as (rows, columns) after typecasting
purchases.shape

(3549, 10)

In [232]:
# check for and drop duplicate rows
# drop_duplicates returns a new frame instead of modifying the original,
# so the result has to be assigned back for the duplicates to be removed
purchases = purchases.drop_duplicates(keep='first')
purchases.shape

(3549, 10)

In [233]:
# count the missing values in each column
purchases.isnull().sum()

vendor_no         0
purchase_no       0
amount            0
location_code     0
order_date        0
qty_invoiced      0
qty_received      0
qty               0
receipt_date      0
requested_date    0
dtype: int64

In [234]:
# write the cleaned frame to a csv file, leaving out the index column
purchases.to_csv(
    r'purchases_clean.csv',
    index=False,
);