In [1]:
import numpy as np
import pandas as pd

import calendar
import datetime as dt
from collections import Counter

In [2]:
lieferanten = pd.read_csv(r'Lieferanten.csv', on_bad_lines='skip', sep=';')
lieferanten.head()

Unnamed: 0,Creditor No_,Datev Account No_,Fax No_,Group Vendor No_,ILN No_,No_ Series,No_,Our Account No_,Pay-to Vendor No_,Phone No_,...,Territory Code,VAT Bus_ Posting Group,Vendor Posting Group,akq Allow Payment 3rd parties,akq DTAZV Charges Rule,akq DTAZV Currency Instruction,akq Direction Code,akq Payment Type,akq Single Payment,timestamp
0,,,,,,,K00001,D12638,,,...,,IL,IC3,0,0,0,,,0,00000000557853B0
1,,,,,,,K00002,18295,K03838,033764 2557 00,...,,IL,IL,0,0,0,,,0,00000000594C7FBD
2,,,+49 30 814547-890,,,,K00004,d12638,,+49 30 814547-100,...,,IL,IC4,0,0,0,,,0,00000000556C6548
3,,,089 359 6183,,,,K00006,20621,,089 359 0031/32,...,,IL,IL,0,0,0,,,0,0000000053C3DDF4
4,,,0041-61785-5188,,,,K00010,,,0041-61785-5271,...,,IL,AL,0,0,0,,,0,0000000053C3DDF9


In [3]:
lieferanten.shape

(2411, 90)

In [4]:
# drop columns where all NaN values
lieferanten=lieferanten.dropna(axis=1, how='all')
lieferanten.shape

(2411, 71)

In [5]:
# drop columns where all 0 values
lieferanten=lieferanten.loc[:, (lieferanten != 0).any(axis=0)]
lieferanten.shape

(2411, 56)

In [6]:
# drop columns where over 50% of values are null values
lieferanten=lieferanten.loc[:, lieferanten.isnull().mean()<0.5]
lieferanten.shape

(2411, 39)

In [7]:
# get list with column names
list(lieferanten.columns)

['Fax No_',
 'No_ Series',
 'No_',
 'Our Account No_',
 'Phone No_',
 'akq No_ of Entries Paym_ Adv_',
 'Address',
 'Base Calendar Code',
 'Blocked',
 'Budgeted Amount',
 'City',
 'Currency Id',
 'Datev Export Date',
 'E-Mail',
 'Gen_ Bus_ Posting Group',
 'Home Page',
 'Id',
 'Image',
 'Invoice Disc_ Code',
 'Last Date Modified',
 'Last Modified Date Time',
 'Location Code',
 'Name',
 'Partner Type',
 'Payment Method Code',
 'Payment Method Id',
 'Payment Terms Code',
 'Payment Terms Id',
 'Post Code',
 'Preferred Bank Account Code',
 'Prepayment _',
 'Privacy Blocked',
 'Release Until',
 'Search Name',
 'Shipment Method Code',
 'VAT Bus_ Posting Group',
 'Vendor Posting Group',
 'akq Single Payment',
 'timestamp']

In [8]:
# drop columns that don't seem relevant to our challenge
to_drop=['Fax No_', 'No_ Series', 'Our Account No_', 'Phone No_', 'akq No_ of Entries Paym_ Adv_',
 'Address', 'Base Calendar Code', 'Blocked', 'Budgeted Amount', 'Currency Id', 'Datev Export Date',
 'E-Mail', 'Gen_ Bus_ Posting Group', 'Home Page', 'Image', 'Invoice Disc_ Code', 'Name', 'Partner Type',
 'Payment Method Code', 'Payment Method Id', 'Payment Terms Code', 'Payment Terms Id', 
 'Preferred Bank Account Code', 'Prepayment _', 'Privacy Blocked', 'Release Until', 'VAT Bus_ Posting Group','akq Single Payment', 'timestamp',
 'Last Modified Date Time', 'Id','Shipment Method Code','Vendor Posting Group']
lieferanten.drop(to_drop, inplace=True, axis=1)

In [9]:
lieferanten.head()

Unnamed: 0,No_,City,Last Date Modified,Location Code,Post Code,Search Name
0,K00001,Berlin,17.09.2020 00:00:00,,12487,DIMIDIA INDUSTRIE HOLDING GMBH
1,K00002,Mittenwalde,20.10.2021 00:00:00,A-01,15749,ALPHA/ LTE
2,K00004,Berlin,11.09.2020 00:00:00,,12487,DIMIDIA IMMOBILIEN GMBH
3,K00006,München,21.02.2020 00:00:00,A-01,80807,STEFAN MAIER GMBH
4,K00010,Breitenbach,21.02.2020 00:00:00,A-01,CH-4226,#VON ROLL ISOLA TROISDORF


In [10]:
lieferanten.columns = lieferanten.columns.str.replace(" ", "_").str.lower()
lieferanten=lieferanten.rename(columns={"no_": "lieferanten_no"})
list(lieferanten.columns)

['lieferanten_no',
 'city',
 'last_date_modified',
 'location_code',
 'post_code',
 'search_name']

In [11]:
lieferanten.last_date_modified=pd.to_datetime(lieferanten.last_date_modified).dt.date
lieferanten.last_date_modified=pd.to_datetime(lieferanten.last_date_modified)
lieferanten.dtypes

lieferanten_no                object
city                          object
last_date_modified    datetime64[ns]
location_code                 object
post_code                     object
search_name                   object
dtype: object

In [12]:
lieferanten.head()

Unnamed: 0,lieferanten_no,city,last_date_modified,location_code,post_code,search_name
0,K00001,Berlin,2020-09-17,,12487,DIMIDIA INDUSTRIE HOLDING GMBH
1,K00002,Mittenwalde,2021-10-20,A-01,15749,ALPHA/ LTE
2,K00004,Berlin,2020-11-09,,12487,DIMIDIA IMMOBILIEN GMBH
3,K00006,München,2020-02-21,A-01,80807,STEFAN MAIER GMBH
4,K00010,Breitenbach,2020-02-21,A-01,CH-4226,#VON ROLL ISOLA TROISDORF


In [13]:
lieferanten.isna().sum()

lieferanten_no          0
city                   39
last_date_modified      0
location_code         569
post_code              40
search_name             0
dtype: int64

In [14]:
# drop rows with nan values
lieferanten = lieferanten.dropna()
lieferanten.shape

(1838, 6)

In [15]:
# drop duplicates
lieferanten.drop_duplicates(keep='first')
lieferanten.shape

(1838, 6)

In [16]:
# export as csv file
lieferanten.to_csv(r'lieferanten_clean.csv', index=False);