## Preliminary data cleaning

In [1]:
import numpy as np
import pandas as pd
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

Load the geocoded battery data from the pickle file `../data/battery_with_geo.pkl` into a pandas data frame.

In [2]:
# Load the geocoded battery-order data from the pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source.
raw_df = pd.read_pickle("../data/battery_with_geo.pkl")
# Keep the untouched frame as `raw_df` (the shape comparison near the end of the
# notebook reads it) and do all cleaning on an independent copy.
df = raw_df.copy()

In [None]:
# Preview the first rows of the loaded frame.
df.head() # redacted

In [None]:
# (rows, columns) of the loaded frame.
df.shape # redacted

In [None]:
# List all column labels.
df.columns # redacted

In [None]:
# Show column dtypes and non-null counts.
# `DataFrame.info()` prints its report itself and returns None, so it must not be
# wrapped in `print()` (that would emit a stray "None" line after the report).
df.info() # redacted

## Dealing with missing values

In [None]:
# Number of missing values per column.
df.isna().sum() # redacted

In [None]:
# Number of distinct values per column.
df.nunique() # redacted

In [None]:
# Plot the missingno matrix to see where values are missing across rows and columns.
msno.matrix(df) # redacted

We see a pattern:
- **IF** `disponiert_am` is missing **THEN** `abholdatum` is also missing
- both columns have *almost* the same missing values
  - *in a few cases* `abholdatum` has a missing value where `disponiert_am` has a value

In [9]:
# Percentage of missing values per column, largest first (top 15 columns).
(df.isna().mean() * 100).sort_values(ascending=False).head(15)

long                           0.029636
lat                            0.029636
transporteur                   0.000000
volle_Addresse                 0.000000
angeforderter_behältertyp      0.000000
angemeldete_containeranzahl    0.000000
nettogewicht_in_kg             0.000000
bruttogewicht_in_kg            0.000000
zurückgemeldet_am              0.000000
abholjahr                      0.000000
abholdatum                     0.000000
auftrag_bestätigt_am           0.000000
auftragsdatum                  0.000000
auftragsstatus                 0.000000
auftragstyp                    0.000000
dtype: float64

### Imputing constant values for the missing values
We do this only for non-date columns; date columns are handled in the section "Data Types and Transforming Data".

In [None]:
# Impute one constant per column group:
#   string columns      -> "unknown"
#   categorical columns -> "other"
#   numeric columns     -> 0
# NOTE(review): "stasse" looks like a misspelling of "strasse" - confirm it matches the actual column label.
for columns, fill_value in (
    (["name_2", "stasse", "name_1"], "unknown"),
    (["typ", "gelieferter_behältertyp"], "other"),
    (["kreisgemeindeschlüssel", "länderschlüssel", "region"], 0),
):
    df[columns] = df[columns].fillna(fill_value)

## Dealing with duplicates

In [None]:
# Count fully duplicated rows: True = duplicate of an earlier row, False = first occurrence.
df.duplicated().value_counts() # redacted

In [None]:
# Remove exact duplicate rows and renumber the index from 0 (the old index is discarded).
df = df.drop_duplicates().reset_index(drop=True)
df.head() # redacted

In [None]:
# Re-check the number of missing values per column after imputing and de-duplicating.
df.isna().sum() # redacted

In [None]:
# Total count of missing cells across the whole frame.
print(df.isna().to_numpy().sum()) # redacted

In [None]:
# Frequency of each name_2 value, followed by the number of missing entries.
print(df["name_2"].value_counts())
print("count of missing values in name_2: ", df["name_2"].isna().sum()) # redacted

In [None]:
# Overview of the delivered container types, followed by the number of missing entries.
print(df["gelieferter_behältertyp"].value_counts())
print("count of missing values in gelieferter_behältertyp: ", df["gelieferter_behältertyp"].isna().sum()) # redacted

In [None]:
# Boolean mask: True where gelieferter_behältertyp is missing.
df['gelieferter_behältertyp'].isnull() # redacted

As we can see Pandas recognized the "0s" as a missing value. 

## Data Types and Transforming Data

In the following we want to get the data types into the right shape. The dtype `object`, for example, usually means that the corresponding column holds strings or a mix of several Python types.

In [None]:
# Show the dtype of every column.
df.dtypes # redacted

In [None]:
# Show only the columns that already have a numeric dtype.
df.select_dtypes('number') # redacted

In [None]:
# Cast kreisgemeindeschlüssel to integer.
df['kreisgemeindeschlüssel'] = df['kreisgemeindeschlüssel'].astype(int)

In [None]:
# Swap the decimal comma for a decimal point so the values can be cast to float.
# The vectorised `.str` accessor replaces the row-wise `apply` and passes missing
# values through as NaN instead of raising AttributeError on them.
df['nettogewicht_in_kg'] = df['nettogewicht_in_kg'].str.replace(',', '.', regex=False)

In [None]:
# Parse the net weight as float, round it to a whole number, and store it as integer.
df['nettogewicht_in_kg'] = df['nettogewicht_in_kg'].astype(float).round().astype(int)

In [None]:
# Python type of the entry labelled 0 in auftragsdatum (before date parsing).
type(df['auftragsdatum'][0])

str

In [None]:
# Python type of the entry labelled 0 in auftrag_bestätigt_am (before date parsing).
type(df['auftrag_bestätigt_am'][0])

str

As you can see, our date entry is just a string. We can easily convert it to a datetime, as we already learned. Note that you could also do this at the very beginning, when reading a CSV file, with the `parse_dates=['date']` parameter of `pd.read_csv`.

In [None]:
# Convert "auftragsdatum" to datetime; the strings are parsed as
# day.month.two-digit-year (format %d.%m.%y).
df['auftragsdatum'] = pd.to_datetime(df['auftragsdatum'], format='%d.%m.%y')

In [None]:
# Python type of the entry labelled 0 in auftragsdatum (after date parsing).
type(df['auftragsdatum'][0])

pandas._libs.tslibs.timestamps.Timestamp

In [None]:
# Collect all date columns side by side to inspect their raw values.
df_datum = df.loc[:, [
    'auftragsdatum',
    'auftrag_bestätigt_am',
    'disponiert_am',
    'abholdatum',
    'zurückgemeldet_am',
]]
df_datum.head()

Unnamed: 0,auftragsdatum,auftrag_bestätigt_am,disponiert_am,abholdatum,zurückgemeldet_am
0,2020-01-06,-,,,
1,2020-01-06,-,06.01.20,20.01.20,21.01.20
2,2020-01-06,-,21.01.20,09.01.20,22.01.20
3,2020-01-06,-,,,
4,2020-01-06,-,08.01.20,13.01.20,13.01.20


In [None]:
# Treat the '-' placeholder as missing, fill every missing entry with the
# sentinel string "01.01.99", then parse the column as day.month.two-digit-year.
df['auftrag_bestätigt_am'] = pd.to_datetime(
    df['auftrag_bestätigt_am'].replace('-', np.nan).fillna("01.01.99"),
    format='%d.%m.%y',
)

In [None]:
# Fill missing entries with the sentinel string "01.01.99", then parse the
# column as day.month.two-digit-year.
df['disponiert_am'] = pd.to_datetime(
    df['disponiert_am'].fillna("01.01.99"),
    format='%d.%m.%y',
)

In [None]:
# Fill missing entries with the sentinel string "01.01.99", then parse the
# column as day.month.two-digit-year.
df['abholdatum'] = pd.to_datetime(
    df['abholdatum'].fillna("01.01.99"),
    format='%d.%m.%y',
)

In [None]:
# Fill missing entries with the sentinel string "01.01.99", then parse the
# column as day.month.two-digit-year.
df['zurückgemeldet_am'] = pd.to_datetime(
    df['zurückgemeldet_am'].fillna("01.01.99"),
    format='%d.%m.%y',
)

In [None]:
# Rebuild the date overview to check the parsed values.
df_datum = df.loc[:, [
    'auftragsdatum',
    'auftrag_bestätigt_am',
    'disponiert_am',
    'abholdatum',
    'zurückgemeldet_am',
]]
df_datum.head()

Unnamed: 0,auftragsdatum,auftrag_bestätigt_am,disponiert_am,abholdatum,zurückgemeldet_am
0,2020-01-06,1999-01-01,1999-01-01,1999-01-01,1999-01-01
1,2020-01-06,1999-01-01,2020-01-06,2020-01-20,2020-01-21
2,2020-01-06,1999-01-01,2020-01-21,2020-01-09,2020-01-22
3,2020-01-06,1999-01-01,1999-01-01,1999-01-01,1999-01-01
4,2020-01-06,1999-01-01,2020-01-08,2020-01-13,2020-01-13


In [None]:
# Number of rows per distinct auftrag_bestätigt_am date.
df.groupby(['auftrag_bestätigt_am']).size() # redacted

In [None]:
# Count missing entries in herkömliche_übergabestelle.
print(df['herkömliche_übergabestelle'].isna().sum())

1


In [None]:
# Python type of the entry labelled 0 in herkömliche_übergabestelle.
type(df['herkömliche_übergabestelle'][0])

str

In [None]:
# Replace the marker strings by booleans: 'x' -> True, '-' -> False.
# Mapping only 'x' would turn every other marker into NaN, which the subsequent
# fillna(True) would then flip to True; mapping '-' explicitly keeps those rows
# False, consistent with the qualifizierte_* flag columns.
df['herkömliche_übergabestelle'] = df['herkömliche_übergabestelle'].map({'x': True, '-': False})
type(df['herkömliche_übergabestelle'][0])

bool

In [None]:
# Treat missing flags as True.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the in-place form on `df[col]` is chained assignment, which pandas with
# Copy-on-Write does not propagate back to `df`. The cast keeps the column a real
# bool dtype rather than object.
df['herkömliche_übergabestelle'] = df['herkömliche_übergabestelle'].fillna(True).astype(bool)

In [None]:
# Confirm that herkömliche_übergabestelle has no missing entries left.
print(df['herkömliche_übergabestelle'].isna().sum())

0


In [None]:
# Replace the marker strings by booleans: 'x' -> True, '-' -> False.
# Any value that is not a key of the mapping becomes NaN.
df['qualifizierte_annahmestelle'] = df['qualifizierte_annahmestelle'].map({'x':True, '-': False}) 
df['qualifizierte_annahmestelle'] # redacted

In [None]:
# Replace the marker strings by booleans: 'x' -> True, '-' -> False.
# Any value that is not a key of the mapping becomes NaN.
df['qualifizierte_sammelstelle'] = df['qualifizierte_sammelstelle'].map({'x':True, '-': False}) 
df['qualifizierte_sammelstelle'] # redacted

In [None]:
# Report how many distinct container types were requested and how many were delivered.
print("The count of distinct categories in angeforderter_behältertyp is: ", df["angeforderter_behältertyp"].nunique())
print("The count of distinct categories in gelieferter_behältertyp is: ", df["gelieferter_behältertyp"].nunique())

The count of distinct categories in angeforderter_behältertyp is:  20
The count of distinct categories in gelieferter_behältertyp is:  18


In [None]:
# Frequency of each value in angeforderter_behältertyp.
print(df["angeforderter_behältertyp"].value_counts())
# redacted

In [None]:
# Frequency of each value in gelieferter_behältertyp.
print(df["gelieferter_behältertyp"].value_counts())
# redacted

In [None]:
# Number of rows per region value.
df.region.value_counts() # redacted

In [None]:
# All distinct region values in ascending order.
sorted_regions = np.sort(df.region.unique())
sorted_regions

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37.])

In [None]:
# Remove the columns that are not carried into the cleaned data set.
df = df.drop(columns=[
    'los_1p7',
    'auftragsjahr',
    'auftragsmonat',
    'länderschlüssel',
    'region',
    'disponiert_am',
    'kategorie',
])

In [None]:
# Show the dtype of every remaining column after the conversions and column drops.
df.dtypes # redacted

## Dealing with Outliers
In the following we want to detect and handle outliers. 

An outlier is an exceptionally high or low value. Based on this definition, a first idea to detect outliers would be to simply cut down the highest and lowest points of the dataset.

To create a better understanding, let's look only at the numerical values.

In [None]:
# Select the weight and container-count columns for the outlier inspection;
# angeforderter_behältertyp is included alongside them as a label column.
df_numeric = df.loc[:, [
    'bruttogewicht_in_kg',
    'nettogewicht_in_kg',
    'angemeldete_containeranzahl',
    'angeforderter_behältertyp',
]]


In [None]:
# Preview the first rows of the selection.
df_numeric.head()

Unnamed: 0,bruttogewicht_in_kg,nettogewicht_in_kg,angemeldete_containeranzahl,angeforderter_behältertyp
0,0,0,1,Fass (60 Ltr.)
1,431,415,15,Kiste
2,306,297,3,Fass (60 Ltr.)
3,0,0,8,Kiste
4,200,194,8,Kiste


In [None]:
# Summary statistics of the numeric columns, transposed so each column is one row.
df_numeric.describe().T # redacted

What we can observe from reading this data:
- `bruttogewicht_in_kg`: very high (# redacted) 'bruttogewicht' 
- `nettogewicht_in_kg`: we can see few outliers that 1) have negative weight and 2) very high 'nettogewicht' (# redacted)
- `angemeldete_containeranzahl`: very high 'containeranzahl' (# redacted) (outlier)

Dropping the outliers.

In [None]:
# Outlier filter: remove rows whose bruttogewicht_in_kg exceeds 30000.
df = df.drop(index=df.index[df.bruttogewicht_in_kg > 30000])

In [None]:
# Outlier filter: remove rows whose nettogewicht_in_kg exceeds 25000.
df = df.drop(index=df.index[df.nettogewicht_in_kg > 25000])

In [None]:
# Remove rows with a negative nettogewicht_in_kg.
df = df.drop(index=df.index[df.nettogewicht_in_kg < 0.0])

In [None]:
# Disabled filter (kept for reference, not executed): would drop rows with
# bruttogewicht_in_kg > 15000 and exactly 10 registered containers.
#df = df.drop(df[(df.bruttogewicht_in_kg > 15000) & (df.angemeldete_containeranzahl == 10)].index)

In [None]:
# Outlier filter: remove rows with more than 300 in angemeldete_containeranzahl.
df = df.drop(index=df.index[df.angemeldete_containeranzahl > 300])

In [None]:
# Remove rows where angemeldete_containeranzahl is 0.
df = df.drop(index=df.index[df.angemeldete_containeranzahl == 0])

In [None]:
# Remove rows whose auftragsstatus is 'Storniert'.
df = df.drop(index=df.index[df.auftragsstatus == 'Storniert'])

In [None]:
# Remove rows whose auftragsstatus is 'Fehlfahrt'.
df = df.drop(index=df.index[df.auftragsstatus == 'Fehlfahrt'])

In [None]:
# Remove rows whose nettogewicht_in_kg is exactly 1.
# NOTE(review): the reason for excluding this specific value is not stated - confirm.
df = df.drop(index=df.index[df.nettogewicht_in_kg == 1])

In [None]:
# Remove rows whose bruttogewicht_in_kg is exactly 3.
# NOTE(review): the reason for excluding this specific value is not stated - confirm.
df = df.drop(index=df.index[df.bruttogewicht_in_kg == 3])

In [None]:
# Show rows where gross and net weight are identical.
df.query("bruttogewicht_in_kg == nettogewicht_in_kg") # redacted

In [None]:
# Number of rows per bundesland value.
df.bundesland.value_counts() # redacted

In [None]:
# Where bundesland is the placeholder '?' and plz is 27639, set it to 'Niedersachsen'.
df['bundesland'] = df['bundesland'].mask(
    (df['plz'] == 27639) & (df['bundesland'] == '?'),
    'Niedersachsen',
)

In [None]:
# Where bundesland is the placeholder '?' and plz is 57234, set it to 'Nordrhein-Westfalen'.
df['bundesland'] = df['bundesland'].mask(
    (df['plz'] == 57234) & (df['bundesland'] == '?'),
    'Nordrhein-Westfalen',
)

In [None]:
# Where bundesland is the placeholder '?' and plz is 33333, set it to 'Nordrhein-Westfalen'.
df['bundesland'] = df['bundesland'].mask(
    (df['plz'] == 33333) & (df['bundesland'] == '?'),
    'Nordrhein-Westfalen',
)

In [None]:
# Compare the shape of the raw frame with the shape after cleaning.
# NOTE(review): `raw_df` must already exist in the kernel when this cell runs -
# confirm that the loading cell assigns it.
# NOTE(review): the last two lines print the same `df.shape` under different labels.
print("Shape raw data: ", raw_df.shape)
print("Shape cleaned outliers in the numerical values: ", df.shape)
print("Shape cleaned data without missing values and outliers in weights: ", df.shape)
# redacted

In [None]:
# Report the weight range that remains after cleaning: net first, then gross.
print("min netto: ", df["nettogewicht_in_kg"].min())
print("max netto: ", df["nettogewicht_in_kg"].max())
print()
print("min brutto: ", df["bruttogewicht_in_kg"].min())
print("max brutto: ", df["bruttogewicht_in_kg"].max())
# redacted

In [None]:
# Column dtypes and non-null counts of the cleaned frame.
df.info()
# redacted

In [None]:
# Remaining missing values per column in the cleaned frame.
df.isna().sum() # redacted

In [None]:
# Show the rows where volle_Addresse is missing.
df[df.volle_Addresse.isna()] # redacted

Two addresses are incomplete, so they have no `lat`/`long` coordinates either.

In [None]:
# Persist the cleaned frame. Pickle preserves the full state of a pandas
# DataFrame, including dtypes.
df.to_pickle("../data/battery_cleaned_with_geo.pkl")