## Handling Missing Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# In float64 data, pandas marks a missing entry with the floating-point value NaN (Not a Number)
float_data = pd.Series(
    [1.2, -3.5, np.nan, 0],
)
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [3]:
# isna() returns a Boolean Series of the same length, with True at every position that holds a missing value
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
# Python's built-in None is treated as missing too, alongside np.nan
string_data = pd.Series(
    ["aardvark", np.nan, None, "avocado"],
)
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [5]:
# Both the np.nan entry and the None entry are reported as missing
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [6]:
# With an explicit float64 dtype, the None entry is stored as NaN
float_data = pd.Series([1, 2, None], dtype="float64")
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [7]:
# Only the last entry (given as None when the Series was built) is flagged as missing
float_data.isna()

0    False
1    False
2     True
dtype: bool

In [8]:
# dropna filters axis labels based on whether their values are missing (with optional thresholds
# for how much missing data to tolerate); on a Series it simply returns the non-null entries
string_data.dropna()

0    aardvark
3     avocado
dtype: object

In [9]:
# fillna returns a copy in which missing values are replaced by the given value (0 here).
# NOTE(review): filling by method through fillna(method="ffill"/"bfill") is deprecated in recent pandas;
# prefer the dedicated .ffill()/.bfill() methods — confirm against the installed version
float_data.fillna(0)

0    1.0
1    2.0
2    0.0
dtype: float64

In [10]:
# notna() is the negation of isna(): True for non-NA values and False for NA values
string_data.notna()

0     True
1    False
2    False
3     True
dtype: bool

### Filtering Out Missing Data

In [11]:
# Small float Series with two gaps, used to demonstrate dropna on a Series
data = pd.Series(
    [1, np.nan, 3.5, np.nan, 7],
)
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [12]:
# On a Series, dropna returns only the non-null values and keeps their original index labels
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [13]:
# 4x3 frame mixing a complete row, partially missing rows and one all-NaN row
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.0],
    ]
)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [14]:
# With no arguments, DataFrame.dropna drops every row that contains at least one missing value
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [15]:
# how="all" drops only the rows in which every value is NA (row 2 here)
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


Keep in mind that these functions return new objects by default and do not modify the contents of the original object; assign the result to a name if you want to keep it.

In [16]:
# Add a new column labelled 4 that is entirely NaN (this modifies data in place)
data[4]=np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [17]:
# axis="columns" applies the same rule column-wise: only the all-NaN column 4 is dropped
data.dropna(axis="columns",how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [18]:
# Draw from a seeded generator so the frame is identical on every Restart & Run All.
# The displayed values below were produced by an unseeded run; re-execute to refresh them.
rng = np.random.default_rng(seed=12345)
df = pd.DataFrame(rng.standard_normal((7, 3)))

In [19]:
# Positionally blank out the first four rows of column 1 and the first two rows of column 2 (in place)
df.iloc[:4,1]=np.nan
df.iloc[:2,2]=np.nan

In [21]:
# Rows 0-1 now hold one non-null value, rows 2-3 hold two, and rows 4-6 are complete
df

Unnamed: 0,0,1,2
0,-1.055032,,
1,0.172601,,
2,-1.25166,,-2.288262
3,-0.328289,,1.345413
4,1.017318,-0.032061,0.265405
5,1.391746,-0.799101,-2.143424
6,1.509865,-0.129817,0.308813


In [22]:
# Only rows 4-6 contain no missing values, so they are the only rows kept
df.dropna()

Unnamed: 0,0,1,2
4,1.017318,-0.032061,0.265405
5,1.391746,-0.799101,-2.143424
6,1.509865,-0.129817,0.308813


In [23]:
# thresh is the minimum number of non-NA values a row must contain to be kept.
# With thresh=2, rows 0 and 1 (one non-NA value each) are dropped; rows with two or more survive.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-1.25166,,-2.288262
3,-0.328289,,1.345413
4,1.017318,-0.032061,0.265405
5,1.391746,-0.799101,-2.143424
6,1.509865,-0.129817,0.308813


### Filling In Missing Data

In [25]:
# Calling fillna with a constant replaces every missing value with that constant
df.fillna(0)

Unnamed: 0,0,1,2
0,-1.055032,0.0,0.0
1,0.172601,0.0,0.0
2,-1.25166,0.0,-2.288262
3,-0.328289,0.0,1.345413
4,1.017318,-0.032061,0.265405
5,1.391746,-0.799101,-2.143424
6,1.509865,-0.129817,0.308813


In [26]:
# Passing a dict maps column label -> fill value, so each column can get its own:
# missing values in column 1 become 0.5, those in column 2 become 0
df.fillna({1:0.5,2:0})

Unnamed: 0,0,1,2
0,-1.055032,0.5,0.0
1,0.172601,0.5,0.0
2,-1.25166,0.5,-2.288262
3,-0.328289,0.5,1.345413
4,1.017318,-0.032061,0.265405
5,1.391746,-0.799101,-2.143424
6,1.509865,-0.129817,0.308813


In [54]:
# Draw from a seeded generator so the frame is reproducible on a fresh kernel.
# NOTE(review): this cell shows In [54] while the ffill cells after it show In [29]/In [32];
# re-run the notebook top to bottom so the forward-fill outputs match this frame.
rng = np.random.default_rng(seed=12345)
df = pd.DataFrame(rng.standard_normal((6, 3)))
# Keep only the first two rows of column 1 and the first four rows of column 2 populated
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.226164,0.57551,-0.142742
1,0.587662,0.55356,-0.922122
2,-1.727034,,-0.972534
3,0.472291,,0.055167
4,-0.052783,,
5,-0.943489,,


In [29]:
# ffill propagates the last valid observation in each column forward into the missing rows that follow it
df.ffill()

Unnamed: 0,0,1,2
0,1.563725,-0.977701,0.852114
1,0.654132,-0.125356,0.302954
2,-0.73457,-0.125356,-0.587995
3,-0.480485,-0.125356,-0.75072
4,-0.812021,-0.125356,-0.75072
5,-1.085047,-0.125356,-0.75072


In [32]:
# limit caps how many consecutive missing values are filled per gap (forward or backward filling);
# with limit=2 only the first two NaN rows in each run are filled
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,1.563725,-0.977701,0.852114
1,0.654132,-0.125356,0.302954
2,-0.73457,-0.125356,-0.587995
3,-0.480485,-0.125356,-0.75072
4,-0.812021,,-0.75072
5,-1.085047,,-0.75072


In [33]:
# Fill the gaps with the mean of the observed values (mean() skips NaN by default)
data = pd.Series([1.0, np.nan, 3.5, np.nan, 7])
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## Data Transformation

### Removing Duplicates

In [34]:
# Seven rows; the last row repeats the (k1, k2) pair of the row before it
data = pd.DataFrame(
    {
        "k1": ["one", "two"] * 3 + ["two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [35]:
# duplicated() returns a Boolean Series flagging each row whose column values exactly equal those of an earlier row

data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [36]:
# drop_duplicates returns a DataFrame keeping only the rows for which duplicated() is False
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [38]:
# Add an extra column v1 holding 0..6 (this modifies data in place)
data["v1"]=range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [39]:
# subset restricts the duplicate check to the listed columns: only the first row for each distinct k1 is kept
data.drop_duplicates(subset=["k1"])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [41]:
# duplicated and drop_duplicates keep the first observed value combination by default.
# Passing keep="last" keeps the last occurrence instead (row 6 rather than row 5 here)
data.drop_duplicates(["k1","k2"],keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [42]:
# De-duplicate on k2 alone: the first row for each distinct k2 value is kept
data.drop_duplicates(subset=["k2"])

Unnamed: 0,k1,k2,v1
0,one,1,0
2,one,2,2
3,two,3,3
5,two,4,5


### Transforming Data Using a Function or Mapping

In [64]:
# Meats and their weights; the food column is what gets mapped to an animal further down
data = pd.DataFrame(
    {
        "food": [
            "bacon", "pulled pork", "bacon",
            "pastrami", "corned beef", "bacon",
            "pastrami", "honey ham", "nova lox",
        ],
        "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [66]:
# Lookup table from each kind of meat to the animal it comes from
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon",
}

In [67]:
# Series.map with a dict looks up each food value in meat_to_animal; the result is stored as a new animal column
data["animal"]=data["food"].map(meat_to_animal)

In [68]:
# The frame now carries the animal column next to food and ounces
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [71]:
def get_animal(x):
    """Return the animal that the meat name ``x`` maps to in ``meat_to_animal``.

    Raises KeyError when ``x`` is not a key of ``meat_to_animal``.
    """
    animal = meat_to_animal[x]
    return animal

# map also accepts a function, which is applied to each food value in turn
data["food"].map(get_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [72]:
# Series in which -999 and -1000 stand in for readings to be replaced
data = pd.Series(
    [1.0, -999.0, 2.0, -999.0, -1000.0, 3.0],
)
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [73]:
# The -999 values might be sentinel values for missing data. replace swaps them for NA values
# that pandas understands and returns a new Series (data itself is left unchanged)
data.replace(-999,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [74]:
# To replace multiple values at once, pass a list of the values followed by the single substitute value
data.replace([-999,-1000],np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [75]:
# To use a different replacement for each value, pass a list of substitutes in matching order:
# -999 becomes NaN and -1000 becomes 0
data.replace([-999,-1000],[np.nan,0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [76]:
# The argument can also be a dictionary of {value to replace: substitute}
data.replace({-999:np.nan,-1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64