In [2]:
import numpy as np
import pandas as pd

import calendar
import datetime as dt
from collections import Counter

In [52]:
# Load the raw vendor ("Lieferanten") export; the file is semicolon-separated.
# NOTE: on_bad_lines='skip' silently discards rows that cannot be parsed.
lieferanten = pd.read_csv(
    r'Lieferanten.csv',
    sep=';',
    on_bad_lines='skip',
)
lieferanten.head()

Unnamed: 0,Creditor No_,Datev Account No_,Fax No_,Group Vendor No_,ILN No_,No_ Series,No_,Our Account No_,Pay-to Vendor No_,Phone No_,...,Territory Code,VAT Bus_ Posting Group,Vendor Posting Group,akq Allow Payment 3rd parties,akq DTAZV Charges Rule,akq DTAZV Currency Instruction,akq Direction Code,akq Payment Type,akq Single Payment,timestamp
0,,,,,,,K00001,D12638,,,...,,IL,IC3,0,0,0,,,0,00000000557853B0
1,,,,,,,K00002,18295,K03838,033764 2557 00,...,,IL,IL,0,0,0,,,0,00000000594C7FBD
2,,,+49 30 814547-890,,,,K00004,d12638,,+49 30 814547-100,...,,IL,IC4,0,0,0,,,0,00000000556C6548
3,,,089 359 6183,,,,K00006,20621,,089 359 0031/32,...,,IL,IL,0,0,0,,,0,0000000053C3DDF4
4,,,0041-61785-5188,,,,K00010,,,0041-61785-5271,...,,IL,AL,0,0,0,,,0,0000000053C3DDF9


In [53]:
# Show the (rows, columns) dimensions of the freshly loaded table.
lieferanten.shape

(2411, 90)

In [54]:
# Remove every column that contains nothing but missing values,
# then report the remaining (rows, columns).
lieferanten = lieferanten.dropna(how='all', axis='columns')
lieferanten.shape

(2411, 71)

In [55]:
# Keep only the columns holding at least one value different from 0,
# i.e. discard columns that consist entirely of zeros.
lieferanten = lieferanten.loc[:, lieferanten.ne(0).any(axis='index')]
lieferanten.shape

(2411, 56)

In [56]:
# Keep only columns whose share of missing values is strictly below 50 %
# (a column that is exactly half empty is dropped as well).
lieferanten = lieferanten.loc[:, lieferanten.isna().mean().lt(0.5)]
lieferanten.shape

(2411, 39)

In [57]:
# Show the names of the columns that survived the filters above.
lieferanten.columns.tolist()

['Fax No_',
 'No_ Series',
 'No_',
 'Our Account No_',
 'Phone No_',
 'akq No_ of Entries Paym_ Adv_',
 'Address',
 'Base Calendar Code',
 'Blocked',
 'Budgeted Amount',
 'City',
 'Currency Id',
 'Datev Export Date',
 'E-Mail',
 'Gen_ Bus_ Posting Group',
 'Home Page',
 'Id',
 'Image',
 'Invoice Disc_ Code',
 'Last Date Modified',
 'Last Modified Date Time',
 'Location Code',
 'Name',
 'Partner Type',
 'Payment Method Code',
 'Payment Method Id',
 'Payment Terms Code',
 'Payment Terms Id',
 'Post Code',
 'Preferred Bank Account Code',
 'Prepayment _',
 'Privacy Blocked',
 'Release Until',
 'Search Name',
 'Shipment Method Code',
 'VAT Bus_ Posting Group',
 'Vendor Posting Group',
 'akq Single Payment',
 'timestamp']

In [58]:
# Drop the columns that don't seem relevant to our challenge.
# The result is assigned back instead of using inplace=True: `lieferanten` was
# produced by .loc selections, and mutating such a frame in place can raise
# SettingWithCopyWarning and makes the cell's effect harder to follow.
to_drop = [
    'Fax No_', 'No_ Series', 'Our Account No_', 'Phone No_', 'akq No_ of Entries Paym_ Adv_', 'Last Date Modified',
    'Address', 'Base Calendar Code', 'Blocked', 'Budgeted Amount', 'Currency Id', 'Datev Export Date',
    'E-Mail', 'Gen_ Bus_ Posting Group', 'Home Page', 'Image', 'Invoice Disc_ Code', 'Name', 'Partner Type',
    'Payment Method Code', 'Payment Method Id', 'Payment Terms Code', 'Payment Terms Id',
    'Preferred Bank Account Code', 'Prepayment _', 'Privacy Blocked', 'Release Until', 'VAT Bus_ Posting Group', 'akq Single Payment', 'timestamp',
    'Last Modified Date Time', 'Id', 'Shipment Method Code', 'Vendor Posting Group',
]
lieferanten = lieferanten.drop(columns=to_drop)

In [59]:
# Preview the first rows of the reduced table.
lieferanten.head()

Unnamed: 0,No_,City,Location Code,Post Code,Search Name
0,K00001,Berlin,,12487,DIMIDIA INDUSTRIE HOLDING GMBH
1,K00002,Mittenwalde,A-01,15749,ALPHA/ LTE
2,K00004,Berlin,,12487,DIMIDIA IMMOBILIEN GMBH
3,K00006,München,A-01,80807,STEFAN MAIER GMBH
4,K00010,Breitenbach,A-01,CH-4226,#VON ROLL ISOLA TROISDORF


In [60]:
# Normalise the headers to lower-case snake_case, then give the vendor number
# and post code columns clearer names; finish by displaying the new headers.
lieferanten.columns = lieferanten.columns.str.lower().str.replace(" ", "_")
lieferanten = lieferanten.rename(
    columns={
        "no_": "vendor_no",
        "post_code": "postcode",
    }
)
lieferanten.columns.tolist()

['vendor_no', 'city', 'location_code', 'postcode', 'search_name']

In [61]:
# Show the first 20 rows when ordered ascending by search_name.
lieferanten.sort_values(by='search_name', ascending=True).head(20)

Unnamed: 0,vendor_no,city,location_code,postcode,search_name
1279,K03602,Hannover,,30401,# HANOMAG HÄRTECENTER GMBH
1883,K04410,Köln,A-01,50672,#ABCFINANCE GMBH F. K.KAUFFMANN
1921,K04455,Weissach,A-01,71287,#ABE - DRUCKÜBERTRÄGER#
1502,K03931,Dietzenbach,,63128,#ACAL GMBH#
1435,K03841,Gröbenzell,,82194,#ACAL GMBH#
952,K03164,Freiburg,A-01,79111,#ADVANTEK -FALSCHER KRED.-
380,K01368,Bochum,A-01,44807,#AIR PRODUCTS GMBH#
1900,K04433,Wriezen,A-01,16269,#ALBA SÜDOST-BRANDENBURG GMBH#
1431,K03835,Stockach,,78333,#ALKA GMBH RETZA#
1135,K03430,Pforzheim,A-01,75181,#AMI DODUCO EU


In [62]:
# Some labels in the search_name column carry unwanted characters ('#', ' ...'
# and double quotes). They may be leading, trailing or embedded, so they are
# removed with literal (non-regex) replace() calls rather than strip().
# Removing a marker can leave stray whitespace behind (e.g. "# NAME" becomes
# " NAME"), so the surrounding whitespace is trimmed afterwards.
lieferanten["search_name"] = (
    lieferanten["search_name"]
    .str.replace("#", "", regex=False)
    .str.replace(" ...", "", regex=False)
    .str.replace('"', "", regex=False)
    .str.strip()
)


In [84]:
# Count the missing values in each column.
lieferanten.isna().sum()

vendor_no        0
city             0
location_code    0
postcode         0
search_name      0
dtype: int64

In [85]:
# Discard every row that has a missing value in any column,
# then report the remaining (rows, columns).
lieferanten = lieferanten.dropna(axis='index', how='any')
lieferanten.shape

(1838, 5)

In [86]:
# Remove fully identical rows, keeping the first occurrence of each.
# drop_duplicates() returns a new DataFrame rather than modifying the caller,
# so the result has to be assigned back for the de-duplication to take effect.
lieferanten = lieferanten.drop_duplicates(keep='first')
lieferanten.shape

(1838, 5)

In [87]:
# Show the first rows when ordered descending by city.
lieferanten.sort_values(by='city', ascending=False).head()

Unnamed: 0,vendor_no,city,location_code,postcode,search_name
2208,K04760,Zürich,B-01,CH-8052,JOHNSON MATTHEY
847,K02965,Zülpich,A-01,53909,MARSTON-BENTLEY-DOMSEL GMBH
1663,K04134,Zörbig,B-01,06780,FLP MICROFINISHING GMBH
1014,K03263,Zwönitz,A-01,08297,IMPREGLON / BAUM ZWÖNITZ GMBH
1096,K03378,Zwingenberg,B-01,64673,RESINEX GERMANY GMBH


In [88]:
# Write the cleaned vendor table to disk as CSV, leaving out the row index.
lieferanten.to_csv(path_or_buf=r'providers_clean.csv', index=False);

In [6]:
# Convert the comma-separated performance table into an Excel workbook.
# The header row is written; the row index is not.
# NOTE: on_bad_lines='skip' silently discards rows that cannot be parsed.
read_file = pd.read_csv(
    r'performance_table.csv',
    sep=',',
    on_bad_lines='skip',
)
read_file.to_excel(r'performance_table.xlsx', header=True, index=False)

In [3]:
# Convert the semicolon-separated cities file into an Excel workbook.
# The header row is written; the row index is not.
# NOTE: on_bad_lines='skip' silently discards rows that cannot be parsed.
read_file = pd.read_csv(
    r'cities.csv',
    sep=';',
    on_bad_lines='skip',
)
read_file.to_excel(r'cities.xlsx', header=True, index=False)