NaN Values in Mathematical Operations


In [None]:
import numpy as np
import pandas as pd
# Two numeric columns, each with exactly one missing entry (row 2 in A, row 1 in B).
data = {
    'A': [10, 10, np.nan, 10, 10],
    'B': [10, np.nan, 10, 10, 10],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,A,B
0,10.0,10.0
1,10.0,
2,,10.0
3,10.0,10.0
4,10.0,10.0


In [None]:
# Series.sum() ignores NaN entries by default, so the missing cells
# simply do not contribute to either total.
column_a_total = df['A'].sum()
column_b_total = df['B'].sum()
print("\nsum of column A(default behavior):", column_a_total)
print("\nsum of column B(default behavior):", column_b_total)


sum of column A(default behavior): 40.0

sum of column B(default behavior): 40.0


In [None]:
# skipna=True (the default) drops NaN before summing.
print("Sum of column A(ignoring nan):",df['A'].sum(skipna=True))
# skipna=False keeps NaN in the calculation, so the result itself becomes NaN.
print("Sum of column A(including nan):",df['A'].sum(skipna=False))

Sum of column A(ignoring nan:) 40.0
Sum of column A(ignoring nan): nan


Filling in Missing Data


In [None]:
# Replace every NaN with the constant 0; fillna returns a new frame, df is not modified.
df_filled = df.fillna(value=0)
print(df_filled)

      A     B
0  10.0  10.0
1  10.0   0.0
2   0.0  10.0
3  10.0  10.0
4  10.0  10.0


In [None]:
# Compute each column's mean, then use it to fill the gaps of that same column.
column_means = df.mean()
r = df.fillna(value=column_means)
print("\n DataFrame after filling NaN with column mean:")
print(r)


 DataFrame after filling NaN with column mean:
      A     B
0  10.0  10.0
1  10.0  10.0
2  10.0  10.0
3  10.0  10.0
4  10.0  10.0


In [None]:
# Load the Bengaluru housing CSV (replaces the small demo frame held in df)
# and preview the first rows as plain text.
df = pd.read_csv(filepath_or_buffer=r"/content/Bengaluru_House_Data.csv")
print(df.head(n=5))

              area_type   availability                  location       size  \
0  Super built-up  Area         19-Dec  Electronic City Phase II      2 BHK   
1            Plot  Area  Ready To Move          Chikka Tirupathi  4 Bedroom   
2        Built-up  Area  Ready To Move               Uttarahalli      3 BHK   
3  Super built-up  Area  Ready To Move        Lingadheeranahalli      3 BHK   
4  Super built-up  Area  Ready To Move                  Kothanur      2 BHK   

   society total_sqft  bath  balcony   price  
0  Coomee        1056   2.0      1.0   39.07  
1  Theanmp       2600   5.0      3.0  120.00  
2      NaN       1440   2.0      3.0   62.00  
3  Soiewre       1521   3.0      1.0   95.00  
4      NaN       1200   2.0      1.0   51.00  


In [None]:
# A dict passed to fillna maps column name -> replacement value;
# only the "balcony" column is touched here.
df1 = df.fillna(value={"balcony": 1})
# Count of NaN values left in "balcony" after the fill.
df1["balcony"].isna().sum()

0

In [None]:
# Fill several columns at once with per-column replacement values.
# Keys must match the column names exactly: a key that matches no column
# (e.g. one with a stray leading space) is silently ignored by fillna.
df=df.fillna({"balcony":1,"price":100,"society":"APPLE"})
# Count of NaN values left in "balcony" after the fill.
df["balcony"].isnull().sum()

0

In [None]:
#handling numeric attributes
for i in ['balcony']:
  df[i].fillna(df[i].mean(),inplace=True)
df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[i].fillna(df[i].mean(),inplace=True)


Unnamed: 0,0
area_type,0
availability,0
location,1
size,16
society,5502
total_sqft,0
bath,73
balcony,0
price,0


In [None]:
# Fill categorical gaps with the most frequent value of the column.
for i in ['society']:
  # mode() returns a Series (there may be ties); passing that Series to fillna
  # would align on the index and fill only the first row label(s), so pick
  # the first mode as a scalar instead.
  modes=df[i].mode()
  if not modes.empty:  # an all-NaN column has no mode; leave it untouched
    # Assign back rather than using inplace=True on the intermediate df[i].
    df[i]=df[i].fillna(modes.iloc[0])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[i].fillna(df[i].mode(),inplace=True)


Forward and Backward filling of missing values

In [None]:
import numpy as np
import pandas as pd
# Rebuild the small demo frame (df was overwritten by the housing data above):
# one NaN in each column.
data = {
    'A': [10, 10, np.nan, 10, 10],
    'B': [10, np.nan, 10, 10, 10],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,A,B
0,10.0,10.0
1,10.0,
2,,10.0
3,10.0,10.0
4,10.0,10.0


In [None]:
#Forward Fill(propagates last valid value forward)
df_forward_filled=df.fillna(method='ffill')
print("\nDataFrame after forward fill:")
print(df_forward_filled)


DataFrame after forward fill:
      A     B
0  10.0  10.0
1  10.0  10.0
2  10.0  10.0
3  10.0  10.0
4  10.0  10.0


  df_forward_filled=df.fillna(method='ffill')


In [None]:
#Forward Fill(propagates last valid value forward)
df_forward_filled=df.fillna(method='ffill',axis=0)
print("\nDataFrame after forward fill:")
print(df_forward_filled)


DataFrame after forward fill:
      A     B
0  10.0  10.0
1  10.0  10.0
2  10.0  10.0
3  10.0  10.0
4  10.0  10.0


  df_forward_filled=df.fillna(method='ffill',axis=0)


In [None]:
#backward Fill(propagates last valid value backward)
df_forward_filled=df.fillna(method='bfill',axis=0)
print("\nDataFrame after forward fill:")
print(df_forward_filled)


DataFrame after forward fill:
      A     B
0  10.0  10.0
1  10.0  10.0
2  10.0  10.0
3  10.0  10.0
4  10.0  10.0


  df_forward_filled=df.fillna(method='bfill',axis=0)


In [None]:
#backward Fill(propagates last valid value backward)
df_forward_filled=df.fillna(method='bfill',axis=1)
print("\nDataFrame after forward fill:")
print(df_forward_filled)


DataFrame after forward fill:
      A     B
0  10.0  10.0
1  10.0   NaN
2  10.0  10.0
3  10.0  10.0
4  10.0  10.0


  df_forward_filled=df.fillna(method='bfill',axis=1)


In [None]:
#Forward Fill(propagates last valid value forward)
df_forward_filled=df.fillna(method='ffill',axis=1)
print("\nDataFrame after forward fill:")
print(df_forward_filled)


DataFrame after forward fill:
      A     B
0  10.0  10.0
1  10.0  10.0
2   NaN  10.0
3  10.0  10.0
4  10.0  10.0


  df_forward_filled=df.fillna(method='ffill',axis=1)


Filling with Index values

In [None]:
import numpy as np
import pandas as pd
# Fresh copy of the demo frame for the index-fill example: one NaN per column.
data = {
    'A': [10, 10, np.nan, 10, 10],
    'B': [10, np.nan, 10, 10, 10],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,A,B
0,10.0,10.0
1,10.0,
2,,10.0
3,10.0,10.0
4,10.0,10.0


In [None]:
# A Series whose values are the row labels of df and which is indexed by those
# same labels, so it lines up row-for-row with df.
row_labels = df.index
index_series = pd.Series(data=row_labels, index=row_labels)
print("\nIndex Series:")
print(index_series)


Index Series:
0    0
1    1
2    2
3    3
4    4
dtype: int64


In [None]:
# Replace each NaN with the label of the row it sits in. fillna aligns
# index_series with the column by index, so row k receives the value k.
for col in df.columns:
  # Assign back rather than using inplace=True on the intermediate df[col],
  # which pandas warns will stop updating df in 3.0.
  df[col]=df[col].fillna(index_series)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(index_series,inplace=True)


Unnamed: 0,A,B
0,10.0,10.0
1,10.0,1.0
2,2.0,10.0
3,10.0,10.0
4,10.0,10.0


  Interpolation


In [None]:
import pandas as pd
# Reload the raw housing data so the interpolation example starts from
# unfilled values, then show the first rows.
df = pd.read_csv(filepath_or_buffer=r"/content/Bengaluru_House_Data.csv")
df.head(n=5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [None]:
import pandas as pd
# Start again from the raw file.
df = pd.read_csv(filepath_or_buffer=r"/content/Bengaluru_House_Data.csv")
# Linear interpolation fills each gap in "bath" from the neighbouring valid
# values in row order.
bath_interpolated = df['bath'].interpolate(method='linear')
df['bath'] = bath_interpolated
# Remaining NaN count per column.
df.isna().sum()

Unnamed: 0,0
area_type,0
availability,0
location,1
size,16
society,5502
total_sqft,0
bath,0
balcony,609
price,0


Imputation with KNNImputer from the scikit-learn package

In [None]:
from sklearn.impute import KNNImputer #Importing Imputer

# NOTE(review): dff looks like a backup of df taken before imputation - confirm.
# A plain `dff=df` only binds a second name to the same object, so the column
# assignments done during imputation would change dff as well; copy() keeps
# the snapshot independent.
dff=df.copy()
impute =KNNImputer() #Initializing the Imputer

In [None]:
# Impute all numeric columns together. KNNImputer measures the distance between
# rows on the features they both have; when it is fitted on one column at a
# time, a row with that value missing has no defined distance to any other row
# and the imputer falls back to the plain column mean.
numeric_cols=df.select_dtypes(include='number').columns
if len(numeric_cols):  # nothing to impute when the frame has no numeric columns
  df[numeric_cols]=impute.fit_transform(df[numeric_cols])
# Remaining NaN count per column.
df.isnull().sum()

Unnamed: 0,0
area_type,0
availability,0
location,1
size,16
society,5502
total_sqft,0
bath,0
balcony,0
price,0
