# Data Cleaning and Preparation

## Missing Data Management

NaN: easy to spot, 'valor sentinela'

In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

In [2]:
fruits = Series(['apple', 'banana', np.nan, 'avocado', None])
fruits

0      apple
1     banana
2        NaN
3    avocado
4       None
dtype: object

In [3]:
type(np.nan)

float

In [4]:
fruits.isnull()

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [5]:
fruits.notnull()

0     True
1     True
2    False
3     True
4    False
dtype: bool

In [6]:
# filtering using null
fruits[fruits.notnull()]

0      apple
1     banana
3    avocado
dtype: object

In [7]:
# equivalent but with dropna
fruits.dropna()

0      apple
1     banana
3    avocado
dtype: object

In [8]:
fruits.fillna('cherry')

0      apple
1     banana
2     cherry
3    avocado
4     cherry
dtype: object

## DataFrame filtering

In [11]:
from numpy import nan as na
df1 = DataFrame([[1.,6.5,3],[1.,na, na],[na,na,na],[na,6.5,3.]])
df1

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
df1.dropna() # how='any' as default

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [13]:
df1.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [15]:
df1[4]=na
df1

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [16]:
df1.dropna(how='all', axis=1)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
df2 = DataFrame(np.random.randn(7,5))
df2.iloc[:4, 2] = na
df2.iloc[:2, 1:4] = na
df2

Unnamed: 0,0,1,2,3,4
0,-1.195519,,,,0.006985
1,-1.837837,,,,-0.143199
2,-1.598931,-0.982112,,-0.340653,-0.862136
3,0.703809,-0.588666,,0.61227,1.135771
4,-0.151463,0.640051,-0.554583,-1.024543,0.160361
5,-0.641106,-0.526564,0.137289,0.081043,0.005912
6,0.109256,0.924931,-0.223851,-0.399029,-1.209414


In [18]:
# drop rows with less than 3 not-nan
df2.dropna(thresh=3)

Unnamed: 0,0,1,2,3,4
2,-1.598931,-0.982112,,-0.340653,-0.862136
3,0.703809,-0.588666,,0.61227,1.135771
4,-0.151463,0.640051,-0.554583,-1.024543,0.160361
5,-0.641106,-0.526564,0.137289,0.081043,0.005912
6,0.109256,0.924931,-0.223851,-0.399029,-1.209414


In [19]:
df2.fillna(0)

Unnamed: 0,0,1,2,3,4
0,-1.195519,0.0,0.0,0.0,0.006985
1,-1.837837,0.0,0.0,0.0,-0.143199
2,-1.598931,-0.982112,0.0,-0.340653,-0.862136
3,0.703809,-0.588666,0.0,0.61227,1.135771
4,-0.151463,0.640051,-0.554583,-1.024543,0.160361
5,-0.641106,-0.526564,0.137289,0.081043,0.005912
6,0.109256,0.924931,-0.223851,-0.399029,-1.209414


In [20]:
d = {1:0.5, 2:0, 3:9.5} # fill values by columns
df2.fillna(d)

Unnamed: 0,0,1,2,3,4
0,-1.195519,0.5,0.0,9.5,0.006985
1,-1.837837,0.5,0.0,9.5,-0.143199
2,-1.598931,-0.982112,0.0,-0.340653,-0.862136
3,0.703809,-0.588666,0.0,0.61227,1.135771
4,-0.151463,0.640051,-0.554583,-1.024543,0.160361
5,-0.641106,-0.526564,0.137289,0.081043,0.005912
6,0.109256,0.924931,-0.223851,-0.399029,-1.209414


In [22]:
df2 = DataFrame(np.random.randn(7,5))
df2.iloc[1:5, 2] = na
df2.iloc[1:3, 1:4] = na
df2.iloc[4:, 3] = na
df2

Unnamed: 0,0,1,2,3,4
0,0.545183,-0.439927,1.949989,-1.115514,-0.778602
1,0.481379,,,,0.516328
2,-0.528012,,,,-1.195508
3,-0.912439,0.789747,,0.820471,1.244431
4,0.152874,1.71771,,,-0.880683
5,0.885392,-1.376826,0.120086,,-0.544493
6,0.258043,1.500652,-0.826251,,-0.665061


In [24]:
df2.fillna(method='ffill') # grabs the previous col value and use it below

Unnamed: 0,0,1,2,3,4
0,0.545183,-0.439927,1.949989,-1.115514,-0.778602
1,0.481379,-0.439927,1.949989,-1.115514,0.516328
2,-0.528012,-0.439927,1.949989,-1.115514,-1.195508
3,-0.912439,0.789747,1.949989,0.820471,1.244431
4,0.152874,1.71771,1.949989,0.820471,-0.880683
5,0.885392,-1.376826,0.120086,0.820471,-0.544493
6,0.258043,1.500652,-0.826251,0.820471,-0.665061


In [25]:
df2.fillna(method='ffill', limit=1) # grabs the previous col value and use it below just once

Unnamed: 0,0,1,2,3,4
0,0.545183,-0.439927,1.949989,-1.115514,-0.778602
1,0.481379,-0.439927,1.949989,-1.115514,0.516328
2,-0.528012,,,,-1.195508
3,-0.912439,0.789747,,0.820471,1.244431
4,0.152874,1.71771,,0.820471,-0.880683
5,0.885392,-1.376826,0.120086,,-0.544493
6,0.258043,1.500652,-0.826251,,-0.665061


In [26]:
df2.fillna(method='ffill', limit=1).fillna(df2.mean()) # grabs the previous col value and use it below just once, 
# then fills with mean

Unnamed: 0,0,1,2,3,4
0,0.545183,-0.439927,1.949989,-1.115514,-0.778602
1,0.481379,-0.439927,1.949989,-1.115514,0.516328
2,-0.528012,0.438271,0.414608,-0.147521,-1.195508
3,-0.912439,0.789747,0.414608,0.820471,1.244431
4,0.152874,1.71771,0.414608,0.820471,-0.880683
5,0.885392,-1.376826,0.120086,-0.147521,-0.544493
6,0.258043,1.500652,-0.826251,-0.147521,-0.665061


In [27]:
df2.fillna(method='bfill', limit=1) # grabs the below in col and uses it upwards just once

Unnamed: 0,0,1,2,3,4
0,0.545183,-0.439927,1.949989,-1.115514,-0.778602
1,0.481379,,,,0.516328
2,-0.528012,0.789747,,0.820471,-1.195508
3,-0.912439,0.789747,,0.820471,1.244431
4,0.152874,1.71771,0.120086,,-0.880683
5,0.885392,-1.376826,0.120086,,-0.544493
6,0.258043,1.500652,-0.826251,,-0.665061


In [30]:
df2.fillna(method='ffill', limit=1).fillna({1:df2.mean(), 2:df2.std(), 3:df2.median()})

Unnamed: 0,0,1,2,3,4
0,0.545183,-0.439927,1.949989,-1.115514,-0.778602
1,0.481379,-0.439927,1.949989,-1.115514,0.516328
2,-0.528012,0.414608,1.411359,0.120086,-1.195508
3,-0.912439,0.789747,1.368948,0.820471,1.244431
4,0.152874,1.71771,0.876258,0.820471,-0.880683
5,0.885392,-1.376826,0.120086,,-0.544493
6,0.258043,1.500652,-0.826251,,-0.665061


## Removal of repeated data

In [31]:
df3 = DataFrame({'k1':["uno", "dos"]*3 + ["dos"], "k2":[1,1,2,3,3,4,4]})
df3

Unnamed: 0,k1,k2
0,uno,1
1,dos,1
2,uno,2
3,dos,3
4,uno,3
5,dos,4
6,dos,4


In [32]:
df3.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [34]:
df3.drop_duplicates()

Unnamed: 0,k1,k2
0,uno,1
1,dos,1
2,uno,2
3,dos,3
4,uno,3
5,dos,4


In [35]:
df3['lugar'] = range(1,8)
df3

Unnamed: 0,k1,k2,lugar
0,uno,1,1
1,dos,1,2
2,uno,2,3
3,dos,3,4
4,uno,3,5
5,dos,4,6
6,dos,4,7


In [36]:
df3.duplicated(['k1', 'k2']) # use only these columns in duplicated check

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [37]:
df3.drop_duplicates(["k1", "k2"], keep='first')

Unnamed: 0,k1,k2,lugar
0,uno,1,1
1,dos,1,2
2,uno,2,3
3,dos,3,4
4,uno,3,5
5,dos,4,6


## Transform data with mappings

In [41]:
df4 = DataFrame({'platillo':['tocino', 'cochinita', 'tocino', 'costillas', 'vacio', 'ribEye',
                            'arrachera', 'vacio', 'platano'], 
                'peso':[40,30,120,60,75,80,30,50,60]})
df4

Unnamed: 0,platillo,peso
0,tocino,40
1,cochinita,30
2,tocino,120
3,costillas,60
4,vacio,75
5,ribEye,80
6,arrachera,30
7,vacio,50
8,platano,60


In [42]:
animal = {'tocino': 'cerdo', 'cochinita':'cerdo', 'costillas':'cerdo', 'vacio':'res', 'ribeye':'res',
         'arrachera': 'res', 'platano':None }
df4['platillo'].str.lower()

0       tocino
1    cochinita
2       tocino
3    costillas
4        vacio
5       ribeye
6    arrachera
7        vacio
8      platano
Name: platillo, dtype: object

In [43]:
nc = df4['platillo'].str.lower()
nc

0       tocino
1    cochinita
2       tocino
3    costillas
4        vacio
5       ribeye
6    arrachera
7        vacio
8      platano
Name: platillo, dtype: object

In [44]:
nc.map(animal)

0    cerdo
1    cerdo
2    cerdo
3    cerdo
4      res
5      res
6      res
7      res
8     None
Name: platillo, dtype: object

In [46]:
df4['animal'] = df4['platillo'].str.lower().map(animal)
df4

Unnamed: 0,platillo,peso,animal
0,tocino,40,cerdo
1,cochinita,30,cerdo
2,tocino,120,cerdo
3,costillas,60,cerdo
4,vacio,75,res
5,ribEye,80,res
6,arrachera,30,res
7,vacio,50,res
8,platano,60,


In [47]:
s = "el HOMbre    en la    Luna"
def fix(s):
    return " ".join(s.split()).lower()
fix(s)

'el hombre en la luna'

In [48]:
df4['animal'] = df4['platillo'].map(lambda x: animal[fix(x)])
df4

Unnamed: 0,platillo,peso,animal
0,tocino,40,cerdo
1,cochinita,30,cerdo
2,tocino,120,cerdo
3,costillas,60,cerdo
4,vacio,75,res
5,ribEye,80,res
6,arrachera,30,res
7,vacio,50,res
8,platano,60,
