# Data Cleaning
## 1)Handling Missing Data

In [1]:
import pandas as pd
import numpy as np
data=pd.Series(['jhon','susan',np.nan,'joe'])
data

0     jhon
1    susan
2      NaN
3      joe
dtype: object

In [21]:
data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [22]:
data[1]=None
data.isnull()

0    False
1     True
2     True
3    False
dtype: bool

### NA handling methods:
dropna: Filter axis labels based on whether values for each label have missing data, with varying thresholds for how
        much missing data to tolerate.
        
fillna: Fill in missing data with some value or using an interpolation method such as 'ffill' or 'bfill'.

isnull: Return boolean values indicating which values are missing/NA.

notnull:Negation of isnull.

In [23]:
data1=pd.Series([2,np.nan,5,5.6,np.nan,23.2,np.nan])
data1.dropna()

0     2.0
2     5.0
3     5.6
5    23.2
dtype: float64

In [38]:
data2=pd.DataFrame([[1.3,4,5.3],[np.nan,np.nan,np.nan],[np.nan,6,2],[1,np.nan,np.nan]])
data2

Unnamed: 0,0,1,2
0,1.3,4.0,5.3
1,,,
2,,6.0,2.0
3,1.0,,


In [39]:
data2.dropna() #Give all those rows having no NAN value

Unnamed: 0,0,1,2
0,1.3,4.0,5.3


In [40]:
data2.dropna(how='all') #Drop only those rows having all NAN values

Unnamed: 0,0,1,2
0,1.3,4.0,5.3
2,,6.0,2.0
3,1.0,,


In [52]:
df=pd.DataFrame(np.random.randn(8,3))
df

Unnamed: 0,0,1,2
0,1.024603,0.206797,1.501836
1,0.844204,0.214263,1.457071
2,1.663984,0.801298,-2.038489
3,1.420273,-0.375727,-1.945578
4,0.332926,-3.010518,-0.443151
5,1.096811,-0.142605,-0.265595
6,-1.846544,-0.167272,2.732585
7,0.383162,-1.048945,1.755999


In [53]:
df.iloc[:3,1]=np.nan

In [54]:
df.iloc[5:,2]=np.nan

In [55]:
df

Unnamed: 0,0,1,2
0,1.024603,,1.501836
1,0.844204,,1.457071
2,1.663984,,-2.038489
3,1.420273,-0.375727,-1.945578
4,0.332926,-3.010518,-0.443151
5,1.096811,-0.142605,
6,-1.846544,-0.167272,
7,0.383162,-1.048945,


In [56]:
df.dropna()

Unnamed: 0,0,1,2
3,1.420273,-0.375727,-1.945578
4,0.332926,-3.010518,-0.443151


In [57]:
df.fillna('Not available')

Unnamed: 0,0,1,2
0,1.024603,Not available,1.50184
1,0.844204,Not available,1.45707
2,1.663984,Not available,-2.03849
3,1.420273,-0.375727,-1.94558
4,0.332926,-3.01052,-0.443151
5,1.096811,-0.142605,Not available
6,-1.846544,-0.167272,Not available
7,0.383162,-1.04895,Not available


In [69]:
df.fillna('not available')

Unnamed: 0,0,1,2
0,1.024603,not available,1.50184
1,0.844204,not available,1.45707
2,1.663984,not available,-2.03849
3,1.420273,-0.375727,-1.94558
4,0.332926,-3.01052,-0.443151
5,1.096811,-0.142605,not available
6,-1.846544,-0.167272,not available
7,0.383162,-1.04895,not available



### 2)Data Transformation 

In [96]:
df=pd.DataFrame({'a':['one','two','three']*2,'b': [1, 1, 2, 3, 3, 2]})
df

Unnamed: 0,a,b
0,one,1
1,two,1
2,three,2
3,one,3
4,two,3
5,three,2


In [97]:
df.duplicated() #The method duplicated will check each row at a time and return a bool series
                #that whether the row is duplicated

0    False
1    False
2    False
3    False
4    False
5     True
dtype: bool

In [98]:
df.drop_duplicates() #It will return a data frame without duplicates

Unnamed: 0,a,b
0,one,1
1,two,1
2,three,2
3,one,3
4,two,3


In [102]:
df['c']=range(6)
df

Unnamed: 0,a,b,c
0,one,1,0
1,two,1,1
2,three,2,2
3,one,3,3
4,two,3,4
5,three,2,5


In [104]:
df.drop_duplicates(['a'])  #Here we are filtering out duplicates on the base of a column

Unnamed: 0,a,b,c
0,one,1,0
1,two,1,1
2,three,2,2


In [32]:
data=pd.DataFrame({'Items':['broast','fish','rice','shrimps','steaks','broast','steaks'],
                  'Weight':['2kg','3kg','7kg','1kg','3kg','4kg','9kg']})

In [33]:
a=data['Items']
a

0     broast
1       fish
2       rice
3    shrimps
4     steaks
5     broast
6     steaks
Name: Items, dtype: object

In [38]:
ctg={'broast':'Chicken','fish':'seafood','shrimps':'seafood','steaks':'chicken','rice':'crop'}

In [39]:
data['Categories']=a.map(ctg)
data

Unnamed: 0,Items,Weight,Categories
0,broast,2kg,Chicken
1,fish,3kg,seafood
2,rice,7kg,crop
3,shrimps,1kg,seafood
4,steaks,3kg,chicken
5,broast,4kg,Chicken
6,steaks,9kg,chicken
