In [1]:
import numpy as np
import pandas as pd

import calendar
import datetime as dt

The 4 tables are an export from the ERP system (Microsoft NAV), i.e. some columns may not be actively used. The purchase orders placed with the suppliers are documented in the purchasing rows and the individual deliveries from the suppliers to the purchase orders from the purchasing rows are documented in the purchasing delivery rows. In addition, the suppliers and the articles are included as a separate table.


The use case is about supplier evaluation (suppliers = creditors). What we are mainly interested in is which suppliers are particularly punctual or unpunctual. If special features stand out, this is of course interesting.


The visualisations/evaluations can be done in a tool of your choice.

Evaluation of delivery includes assessment of the supplier's ability to meet the requirements for on-time deliveries and ordered quantities and a buyer should have an appropriate system in place for effective assessment. 

In [2]:
artikel = pd.read_csv(r'Artikel.csv', on_bad_lines='skip', sep=';', low_memory=False)
artikel.head(3)

Unnamed: 0,timestamp,No_,No_ 2,Description,Search Description,Description 2,Base Unit of Measure,Price Unit Conversion,Type,Inventory Posting Group,...,Single-Level Cap_ Ovhd Cost,Single-Level Mfg_ Ovhd Cost,Overhead Rate,Rolled-up Subcontracted Cost,Rolled-up Mfg_ Ovhd Cost,Rolled-up Cap_ Overhead Cost,Order Tracking Policy,Critical,Common Item No_,db_id
0,00000000562CA905,210254,,"BOLZEN, OBERFLAECHE",C23303A 154C143 *Z170-03,,ST,0,0,--,...,0,0,0,0,0,0,0,0,,1
1,00000000562CA965,214302,,"SCHUTZKAPPE,OBFL.Cu Sn",C23303A 21C 79 *Z170,Lager: 37405596,ST,0,0,--,...,0,0,0,0,0,0,0,0,,2
2,00000000562CA97D,216436,,"SCHRAUBE, ROH",C20303A 64C128 *Z190,Lager: 29403581,ST,0,0,--,...,0,0,0,0,0,0,0,0,,3


In [3]:
artikel.dtypes

timestamp                        object
No_                              object
No_ 2                            object
Description                      object
Search Description               object
                                 ...   
Rolled-up Cap_ Overhead Cost     object
Order Tracking Policy             int64
Critical                          int64
Common Item No_                 float64
db_id                             int64
Length: 174, dtype: object

In [4]:
# convert hex to datetime and calculate date
data=artikel['timestamp']
new_timestamp=[]

def hex_to_int(string):
    string = ''.join(reversed(string.split()))
    return int(string,16)

for (string) in data:
    secs = hex_to_int(string)
    date = dt.datetime.fromtimestamp(secs).strftime("%Y-%m-%d %I:%M:%S")
    new_timestamp.append(date)
print(new_timestamp[0:5])

['2015-10-25 11:03:49', '2015-10-25 11:05:25', '2015-10-25 11:05:49', '2015-10-25 11:17:34', '2015-10-25 12:10:53']


In [5]:
artikel.insert(loc=0, column='new_timestamp', value=new_timestamp) # insert new_timestamp at beginning of dataframe
artikel.head()

Unnamed: 0,new_timestamp,timestamp,No_,No_ 2,Description,Search Description,Description 2,Base Unit of Measure,Price Unit Conversion,Type,...,Single-Level Cap_ Ovhd Cost,Single-Level Mfg_ Ovhd Cost,Overhead Rate,Rolled-up Subcontracted Cost,Rolled-up Mfg_ Ovhd Cost,Rolled-up Cap_ Overhead Cost,Order Tracking Policy,Critical,Common Item No_,db_id
0,2015-10-25 11:03:49,00000000562CA905,210254,,"BOLZEN, OBERFLAECHE",C23303A 154C143 *Z170-03,,ST,0,0,...,0,0,0,0,0,0,0,0,,1
1,2015-10-25 11:05:25,00000000562CA965,214302,,"SCHUTZKAPPE,OBFL.Cu Sn",C23303A 21C 79 *Z170,Lager: 37405596,ST,0,0,...,0,0,0,0,0,0,0,0,,2
2,2015-10-25 11:05:49,00000000562CA97D,216436,,"SCHRAUBE, ROH",C20303A 64C128 *Z190,Lager: 29403581,ST,0,0,...,0,0,0,0,0,0,0,0,,3
3,2015-10-25 11:17:34,00000000562CAC3E,237512,,"HALTERING, OBFL.",C23303A 21C 80 *Z170,VF-EBK ZU C23303-A21-C80,ST,0,0,...,0,0,0,0,0,0,0,0,,4
4,2015-10-25 12:10:53,00000000562CB8BD,392891,,Polblechstr. C10 (entfet) n=20,C23303A 40C 10 *Z172 01,entfetten,ST,0,0,...,0,0,0,0,0,0,0,0,,5


In [6]:
artikel.shape

(29616, 175)

In [7]:
artikel.info()
# too many columns, let's start by dropping columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29616 entries, 0 to 29615
Columns: 175 entries, new_timestamp to db_id
dtypes: float64(28), int64(39), object(108)
memory usage: 39.5+ MB


In [8]:
# drop columns where all NaN values
artikel=artikel.dropna(axis=1, how='any')
artikel.shape

(29616, 112)

In [9]:
# drop columns where all 0 values
artikel=artikel.loc[:, (artikel != 0).any(axis=0)]
artikel.shape

(29616, 96)

In [10]:
# drop columns where over 50% of values are null values
artikel=artikel.loc[:, artikel.isnull().mean()<0.5]
artikel.shape

(29616, 96)

In [11]:
# get list with column names
list(artikel.columns)

['new_timestamp',
 'timestamp',
 'No_',
 'Allow Invoice Disc_',
 'Unit Price',
 'Price_Profit Calculation',
 'Profit _',
 'Costing Method',
 'Unit Cost',
 'Standard Cost',
 'Last Direct Cost',
 'Indirect Cost _',
 'Cost is Adjusted',
 'Allow Online Adjustment',
 'Reorder Point',
 'Maximum Inventory',
 'Reorder Quantity',
 'Unit List Price',
 'Duty Due _',
 'Gross Weight',
 'Net Weight',
 'Units per Parcel',
 'Unit Volume',
 'Duty Unit Conversion',
 'Budget Quantity',
 'Budgeted Amount',
 'Budget Profit',
 'Blocked',
 'Last DateTime Modified',
 'Last Date Modified',
 'Last Time Modified',
 'Picture',
 'Automatic Ext_ Texts',
 'Reserve',
 'Low-Level Code',
 'Lot Size',
 'Last Unit Cost Calc_ Date',
 'Rolled-up Material Cost',
 'Rolled-up Capacity Cost',
 'Scrap _',
 'Inventory Value Zero',
 'Minimum Order Quantity',
 'Maximum Order Quantity',
 'Safety Stock Quantity',
 'Order Multiple',
 'Replenishment System',
 'Rounding Precision',
 'Reordering Policy',
 'Include Inventory',
 'Manufact

In [12]:
# drop columns that don't seem relevant to our challenge
to_drop=['timestamp', 'Allow Invoice Disc_', 'Unit Price', 'Price_Profit Calculation',
 'Profit _', 'Costing Method', 'Unit Cost', 'Standard Cost', 'Last Direct Cost', 'Indirect Cost _', 'Cost is Adjusted', 'Allow Online Adjustment', 'Reorder Point',
 'Maximum Inventory', 'Unit List Price', 'Duty Due _', 'Gross Weight', 'Net Weight', 'Units per Parcel', 'Unit Volume', 'Duty Unit Conversion',
 'Budget Quantity', 'Budgeted Amount', 'Budget Profit', 'Blocked', 'Picture', 'Automatic Ext_ Texts', 'Reserve',
 'Stockout Warning', 'Low-Level Code', 'Lot Size', 'Last Unit Cost Calc_ Date', 'Rolled-up Material Cost',
 'Rolled-up Capacity Cost', 'Scrap _', 'Inventory Value Zero', 'Minimum Order Quantity', 'Maximum Order Quantity', 'Safety Stock Quantity', 'Order Multiple', 
 'Replenishment System', 'Rounding Precision', 'Reordering Policy', 'Include Inventory', 'Manufacturing Policy',
  'Dampener Quantity', 'Overflow Level', 'Last Counting Period Update', 
  'Use Cross-Docking', 'Next Counting Start Date', 'Next Counting End Date', 'Id', 'Unit of Measure Id', 'Tax Group Id', 'Purchase Blocked',  'Sales Overhead _', 'Status for New Lot No_', 'Lot Status by neg_ControlOrder', 'Scope', 'Rolled-up Calc_ Material Cost', 'Rolled-up Calc_ Capacity Cost', 
 'Rolled-up Calc_ Subcontr_ Cost', 'Rolled-up Calc_ Mfg_ Ovhd Cost', 'Rolled-up Calc_ Cap_ Ovhd Cost', 'Rolled-up Calc_ Ovhd Cost', 'Single-Level Calc_ Mat_ Cost', 
 'Single-Level Calc_ Cap_ Cost', 'Single-Level Calc_ Sub_ Cost', 'Single-Level Calc_ Mfg_ Ovhd', 'Single-Level Calc_ Cap_ Ovhd', 'Single-Level Calc_ Ovhd', 
 'Rolled-up Calc_ Mfg_ Ovhd C_2', 'Rolled-up Calc_ Cap_ Ovhd C_2', 'Single-Level Calc_ Mfg_ Ovhd 2', 'Single-Level Calc_ Cap_ Ovhd 2', 'Purch Price (Newest) Net', 
 'Calculated Cost', 'Release Status', 'Single-Level Material Cost', 'Single-Level Capacity Cost', 
 'Single-Level Subcontrd_ Cost', 'Single-Level Cap_ Ovhd Cost', 'Single-Level Mfg_ Ovhd Cost', 'Overhead Rate', 'Rolled-up Subcontracted Cost', 
 'Rolled-up Mfg_ Ovhd Cost', 'Rolled-up Cap_ Overhead Cost', 'Order Tracking Policy', 'Critical']
artikel.drop(to_drop, inplace=True, axis=1)
artikel.head(10)

Unnamed: 0,new_timestamp,No_,Reorder Quantity,Last DateTime Modified,Last Date Modified,Last Time Modified,db_id
0,2015-10-25 11:03:49,00210254,100000000000000000000,29.11.2020 14:23:14,29.11.2020 00:00:00,01.01.1754 15:23:14,1
1,2015-10-25 11:05:25,00214302,100000000000000000000,29.11.2020 14:23:15,29.11.2020 00:00:00,01.01.1754 15:23:15,2
2,2015-10-25 11:05:49,00216436,000000000000000000000,29.11.2020 14:23:15,29.11.2020 00:00:00,01.01.1754 15:23:15,3
3,2015-10-25 11:17:34,00237512,000000000000000000000,29.11.2020 14:23:19,29.11.2020 00:00:00,01.01.1754 15:23:19,4
4,2015-10-25 12:10:53,00392891,100000000000000000000,29.11.2020 14:23:39,29.11.2020 00:00:00,01.01.1754 15:23:39,5
5,2015-10-25 12:35:01,00476960,100000000000000000000,29.11.2020 14:23:48,29.11.2020 00:00:00,01.01.1754 15:23:48,6
6,2015-10-25 12:51:05,00498355,100000000000000000000,29.11.2020 14:23:54,29.11.2020 00:00:00,01.01.1754 15:23:54,7
7,2015-10-25 01:29:03,0-0740976-2,900000000000000000000,29.11.2020 14:24:07,29.11.2020 00:00:00,01.01.1754 15:24:07,8
8,2015-10-25 01:29:09,0-0740977-1,1000000000000000000000,29.11.2020 14:24:07,29.11.2020 00:00:00,01.01.1754 15:24:07,9
9,2015-10-25 02:47:10,05920036,100000000000000000000,29.11.2020 14:24:38,29.11.2020 00:00:00,01.01.1754 15:24:38,10


In [13]:
artikel.tail()

Unnamed: 0,new_timestamp,No_,Reorder Quantity,Last DateTime Modified,Last Date Modified,Last Time Modified,db_id
29611,2018-03-08 05:47:07,17000234,20000000000000000000000,07.02.2022 09:00:40,07.02.2022 00:00:00,01.01.1754 10:00:40,29612
29612,2018-03-08 05:47:48,WA00736108,20000000000000000000000,10.01.2022 08:15:49,10.01.2022 00:00:00,01.01.1754 09:15:49,29613
29613,2018-03-08 05:48:28,WA00736109,20000000000000000000000,10.01.2022 08:16:52,10.01.2022 00:00:00,01.01.1754 09:16:52,29614
29614,2018-03-08 05:50:22,NW00000500,1440000000000000000000000,29.06.2021 08:55:03,29.06.2021 00:00:00,01.01.1754 10:55:03,29615
29615,2018-03-08 05:53:02,KP0064B313,100000000000000000000000,07.02.2022 09:02:41,07.02.2022 00:00:00,01.01.1754 10:02:41,29616


In [14]:
# drop more features that don't seem relevant to our challenge
to_drop=['new_timestamp', 'Last DateTime Modified', 'Last Time Modified','db_id']
artikel.drop(to_drop, inplace=True, axis=1)
artikel.head()

Unnamed: 0,No_,Reorder Quantity,Last Date Modified
0,210254,100000000000000000000,29.11.2020 00:00:00
1,214302,100000000000000000000,29.11.2020 00:00:00
2,216436,000000000000000000000,29.11.2020 00:00:00
3,237512,000000000000000000000,29.11.2020 00:00:00
4,392891,100000000000000000000,29.11.2020 00:00:00


In [15]:
artikel.columns = artikel.columns.str.replace(" ", "_").str.lower()
artikel=artikel.rename(columns={"no_": "artikel_no"})
artikel.reorder_quantity = artikel.reorder_quantity.map(lambda x: str(x)[:-21])
list(artikel.columns)

['artikel_no', 'reorder_quantity', 'last_date_modified']

In [16]:
artikel = artikel.astype({'last_date_modified': 'datetime64', 'reorder_quantity': 'int64',})
artikel.dtypes

artikel_no                    object
reorder_quantity               int64
last_date_modified    datetime64[ns]
dtype: object

In [17]:
artikel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29616 entries, 0 to 29615
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   artikel_no          29616 non-null  object        
 1   reorder_quantity    29616 non-null  int64         
 2   last_date_modified  29616 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 694.2+ KB


In [18]:
artikel.head()

Unnamed: 0,artikel_no,reorder_quantity,last_date_modified
0,210254,1,2020-11-29
1,214302,1,2020-11-29
2,216436,0,2020-11-29
3,237512,0,2020-11-29
4,392891,1,2020-11-29


In [19]:
artikel.isna().sum()

artikel_no            0
reorder_quantity      0
last_date_modified    0
dtype: int64

In [20]:
artikel.shape

(29616, 3)

In [21]:
# drop duplicates
artikel.drop_duplicates(keep='first')
artikel.shape

(29616, 3)

In [22]:
# export as csv file
artikel.to_csv(r'artikel_clean.csv', index=False);