# Missing value treatment

In [1]:
import pandas as pd

In [2]:
# Numeric Series with a single missing entry; the None is stored as NaN,
# which is why the displayed dtype is float64 rather than an integer type.
data = pd.Series(data=[1, 2, None, 4, 5, 6])
data

0    1.0
1    2.0
2    NaN
3    4.0
4    5.0
5    6.0
dtype: float64

In [3]:
# Positional lookup of the third element, i.e. the entry that was created from None.
data.iloc[2]

nan

In [4]:
# pd.isnull on a scalar: position 1 holds a real number, so the check is False.
pd.isnull(data.iloc[1])

False

In [5]:
# Same scalar check on position 2, the missing entry, so the check is True.
pd.isnull(data.iloc[2])

True

In [6]:
# Applied to the whole Series, pd.isnull returns an element-wise boolean mask.
pd.isnull(data)

0    False
1    False
2     True
3    False
4    False
5    False
dtype: bool

In [7]:
# Method form of the same check; produces the same mask as pd.isnull(data).
data.isna()

0    False
1    False
2     True
3    False
4    False
5    False
dtype: bool

In [8]:
# Summing the boolean mask counts the True values, i.e. the number of missing entries.
data.isna().sum()

1

In [9]:
# Percentage of missing values: the mean of the boolean mask is the missing
# fraction (missing count / total count), scaled here to percent.
data.isna().mean() * 100

16.666666666666664

In [10]:
# Toy frame with one gap per column: a missing city label (row 1) and a
# missing sales figure (row 5).
df = pd.DataFrame(
    dict(
        city=['a', None, 'c', 'd', 'e', 'f'],
        sales=[1, 2, 3, 4, 5, None],
    )
)
df

Unnamed: 0,city,sales
0,a,1.0
1,,2.0
2,c,3.0
3,d,4.0
4,e,5.0
5,f,


In [11]:
# Missing-value mask for a single column (a Series of booleans).
df['city'].isna()

0    False
1     True
2    False
3    False
4    False
5    False
Name: city, dtype: bool

In [12]:
# Missing-value mask for the whole frame: one boolean per cell, same shape as df.
df.isna()

Unnamed: 0,city,sales
0,False,False
1,True,False
2,False,False
3,False,False
4,False,False
5,False,True


In [13]:
# Column-wise sum of the mask gives the number of missing values per column.
df.isna().sum()

city     1
sales    1
dtype: int64

In [15]:
# Percentage of missing values per column: the column-wise mean of the
# boolean mask is the missing fraction, scaled here to percent.
df.isna().mean() * 100

city     16.666667
sales    16.666667
dtype: float64

## Dropping missing values

In [16]:
# Drop every row that contains at least one missing value (rows 1 and 5 here).
# The result is only displayed, not assigned, so df itself keeps all rows.
df.dropna()

Unnamed: 0,city,sales
0,a,1.0
2,c,3.0
3,d,4.0
4,e,5.0


## Filling missing values
- Fill using constant (zero, average, mode)
- Fill using forward or backward fill

In [17]:
# Sample data for the fill examples: one missing 'sales' value (PUN) and one
# missing 'category' value (COC). pandas is already imported in the first
# cell, so it is not re-imported here.
df = pd.DataFrame({
    'city': ['BNG', 'MUM', 'CHE', 'COC', 'DEL', 'PUN', 'COI', 'HYD'],
    'sales': [1, 2, 3, 4, 5, None, 6, 8],
    'category': ['A', 'B', 'A', None, 'C', 'B', 'A', 'C'],
})
df

Unnamed: 0,city,sales,category
0,BNG,1.0,A
1,MUM,2.0,B
2,CHE,3.0,A
3,COC,4.0,
4,DEL,5.0,C
5,PUN,,B
6,COI,6.0,A
7,HYD,8.0,C


In [18]:
# Constant fill: replace the missing sales value with 0 and keep the result
# in a new column so the original 'sales' column stays available for comparison.
df['sales_fill_zero'] = df['sales'].fillna(0)
df

Unnamed: 0,city,sales,category,sales_fill_zero
0,BNG,1.0,A,1.0
1,MUM,2.0,B,2.0
2,CHE,3.0,A,3.0
3,COC,4.0,,4.0
4,DEL,5.0,C,5.0
5,PUN,,B,0.0
6,COI,6.0,A,6.0
7,HYD,8.0,C,8.0


In [19]:
# Mean fill: the average is computed over the non-missing sales values
# (29 / 7 = 4.142857 here) and used as the replacement for the gap.
avg_sales = df['sales'].mean()
df['sales_fill_avg'] = df['sales'].fillna(avg_sales)
df

Unnamed: 0,city,sales,category,sales_fill_zero,sales_fill_avg
0,BNG,1.0,A,1.0,1.0
1,MUM,2.0,B,2.0,2.0
2,CHE,3.0,A,3.0,3.0
3,COC,4.0,,4.0,4.0
4,DEL,5.0,C,5.0,5.0
5,PUN,,B,0.0,4.142857
6,COI,6.0,A,6.0,6.0
7,HYD,8.0,C,8.0,8.0


In [20]:
# Forward fill: each missing sales value takes the last valid value above it
# (PUN gets DEL's 5.0). Series.ffill() is used instead of
# fillna(method='ffill') because the `method` keyword of fillna is deprecated
# since pandas 2.1.
df['sales_ffill'] = df['sales'].ffill()
df

Unnamed: 0,city,sales,category,sales_fill_zero,sales_fill_avg,sales_ffill
0,BNG,1.0,A,1.0,1.0,1.0
1,MUM,2.0,B,2.0,2.0,2.0
2,CHE,3.0,A,3.0,3.0,3.0
3,COC,4.0,,4.0,4.0,4.0
4,DEL,5.0,C,5.0,5.0,5.0
5,PUN,,B,0.0,4.142857,5.0
6,COI,6.0,A,6.0,6.0,6.0
7,HYD,8.0,C,8.0,8.0,8.0


In [21]:
# Backward fill: each missing sales value takes the next valid value below it
# (PUN gets COI's 6.0). Series.bfill() is used instead of
# fillna(method='bfill') because the `method` keyword of fillna is deprecated
# since pandas 2.1.
df['sales_bfill'] = df['sales'].bfill()
df

Unnamed: 0,city,sales,category,sales_fill_zero,sales_fill_avg,sales_ffill,sales_bfill
0,BNG,1.0,A,1.0,1.0,1.0,1.0
1,MUM,2.0,B,2.0,2.0,2.0,2.0
2,CHE,3.0,A,3.0,3.0,3.0,3.0
3,COC,4.0,,4.0,4.0,4.0,4.0
4,DEL,5.0,C,5.0,5.0,5.0,5.0
5,PUN,,B,0.0,4.142857,5.0,6.0
6,COI,6.0,A,6.0,6.0,6.0,6.0
7,HYD,8.0,C,8.0,8.0,8.0,8.0


In [22]:
# Inspect the categorical column; row 3 (COC) is the missing entry.
df['category']

0       A
1       B
2       A
3    None
4       C
5       B
6       A
7       C
Name: category, dtype: object

In [23]:
# mode() returns a Series (several values can tie for most frequent);
# [0] picks its first entry, which is 'A' for this data.
df['category'].mode()[0]

'A'

In [24]:
# Mode fill for a categorical column: replace the missing category with the
# most frequent value and store the result in a new column.
df['category_fill'] = df['category'].fillna(df['category'].mode()[0])
df

Unnamed: 0,city,sales,category,sales_fill_zero,sales_fill_avg,sales_ffill,sales_bfill,category_fill
0,BNG,1.0,A,1.0,1.0,1.0,1.0,A
1,MUM,2.0,B,2.0,2.0,2.0,2.0,B
2,CHE,3.0,A,3.0,3.0,3.0,3.0,A
3,COC,4.0,,4.0,4.0,4.0,4.0,A
4,DEL,5.0,C,5.0,5.0,5.0,5.0,C
5,PUN,,B,0.0,4.142857,5.0,6.0,B
6,COI,6.0,A,6.0,6.0,6.0,6.0,A
7,HYD,8.0,C,8.0,8.0,8.0,8.0,C
