## Imports

In [1]:
import pandas as pd
import os
import numpy as np

# Report the library versions so the results can be tied to a known environment.
for lib in (np, pd):
    print(lib.__version__)

1.15.2
0.23.4


In [2]:
# Read the German credit CSV into a pandas DataFrame.
# '?' is registered as an additional missing-value marker on top of the
# markers pandas already recognises by default.
# The path is a raw string: in a regular literal the Windows backslashes
# (\S, \Y, \F, \g) are invalid escape sequences, which newer Python versions
# warn about. The resulting path text is unchanged.
# NOTE(review): header=None makes pandas treat the file's header line as data
# (it appears as row 0 in the preview output of this cell). It is kept because
# the later cells address columns by integer label -- confirm before changing.
DATA_PATH = r'H:\SELF\Yashu\Files\german-credit-risk\german_credit_data.csv'
df = pd.read_csv(
    DATA_PATH,
    header=None,
    na_values='?',
)
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
1,0.0,67,male,2,own,,little,1169,6,radio/TV
2,1.0,22,female,2,own,little,moderate,5951,48,radio/TV
3,2.0,49,male,1,own,little,,2096,12,education
4,3.0,45,male,2,free,little,little,7882,42,furniture/equipment


In [3]:
# Count the missing (NaN) entries in every column of df.
df.isna().sum()

0      1
1      0
2      0
3      0
4      0
5    183
6    394
7      0
8      0
9      0
dtype: int64

In [4]:
# Inspect each column's dtype to help decide which imputation strategy fits that column.
df.dtypes

0    float64
1     object
2     object
3     object
4     object
5     object
6     object
7     object
8     object
9     object
dtype: object

## Deleting Rows that have missing values

In [5]:
# Build a new frame without any row that contains a missing value;
# df itself is left untouched. Then confirm no NaN remains.
df_ = df.dropna()
df_.isna().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64

## Imputing with the mode value (for categorical data)

In [7]:
# Mode imputation, demonstrated on the column labelled 0; the same statement
# can be repeated for any other column.
# The filled column is assigned back to df[0] instead of calling
# fillna(inplace=True) on the df[0] selection: an in-place call on a selected
# column is chained assignment, which recent pandas versions warn about and
# which no longer updates df once Copy-on-Write is enabled.
df[0] = df[0].fillna(df[0].mode()[0])
df.isnull().sum()

0      0
1      0
2      0
3      0
4      0
5    183
6    394
7      0
8      0
9      0
dtype: int64

## Imputing by adding one more category

In [8]:
# Distinct values that occur in the column labelled 3.
df.loc[:, 3].unique()

array(['Job', '2', '1', '3', '0'], dtype=object)

In [9]:
# Filling gaps with a dedicated placeholder category ('UNK') lets a model learn
# how "missing" itself relates to the other features. With one-hot encoding this
# adds one extra column; with label encoding it adds one extra integer code.
# The result is assigned back to df[3] instead of calling fillna(inplace=True)
# on the df[3] selection, because that chained in-place call warns on recent
# pandas versions and does not update df under Copy-on-Write.
# NOTE(review): the missing-value counts shown earlier list no NaN in column 3,
# so this step appears to leave the data unchanged; columns 5 and 6 are the
# ones reporting missing values -- confirm which column was intended.
df[3] = df[3].fillna('UNK')
df.isnull().sum()

0      0
1      0
2      0
3      0
4      0
5    183
6    394
7      0
8      0
9      0
dtype: int64

In [10]:
# Show the rows whose column 3 holds the 'UNK' placeholder.
df.loc[df[3].eq('UNK')]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9


## Imputing by back filling

In [12]:
# Back fill: each missing entry in column 1 takes the next valid value below it.
# Series.bfill() replaces fillna(method='bfill'), whose `method` argument is
# deprecated in newer pandas. The result is assigned back to df[1] because an
# in-place call on the df[1] selection is chained assignment, which does not
# update df under Copy-on-Write.
df[1] = df[1].bfill()
df.isnull().sum()

0      0
1      0
2      0
3      0
4      0
5    183
6    394
7      0
8      0
9      0
dtype: int64

## Imputing by forward filling

In [13]:
# Forward fill: each missing entry in column 4 takes the last valid value above it.
# Series.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in newer pandas. The result is assigned back to df[4] because an
# in-place call on the df[4] selection is chained assignment, which does not
# update df under Copy-on-Write.
df[4] = df[4].ffill()
df.isnull().sum()

0      0
1      0
2      0
3      0
4      0
5    183
6    394
7      0
8      0
9      0
dtype: int64