7.1 Handling Missing Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
float_data=pd.Series([1.2,-3.5,np.nan,0])

In [3]:
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [4]:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [8]:
string_data=pd.Series([
    "aardvark",np.nan,None,"avocado"
])

In [9]:
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [10]:
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [11]:
float_data=pd.Series([1,2,None],dtype="float64")

In [12]:
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [13]:
float_data.isna()

0    False
1    False
2     True
dtype: bool

## Table 7-1. NA handling object methods

| Method   | Description                                                                 | Example (Python) | Output |
|----------|-----------------------------------------------------------------------------|------------------|--------|
| `dropna` | Filter axis labels based on whether values for each label have missing data, with varying thresholds for how much missing data to tolerate | `pd.Series([1, np.nan, 3]).dropna()` | `[1.0, 3.0]` |
| `fillna` | Fill in missing data with some value or using an interpolation method such as `"ffill"` or `"bfill"` | `pd.Series([1, np.nan, 3]).fillna(0)` | `[1.0, 0.0, 3.0]` |
| `isna`   | Return Boolean values indicating which values are missing/NA                | `pd.Series([1, np.nan, 3]).isna()` | `[False, True, False]` |
| `notna`  | Negation of `isna`, returns True for non-NA values and False for NA values  | `pd.Series([1, np.nan, 3]).notna()` | `[True, False, True]` |

Filtering Out Missing Data

In [16]:
data=pd.Series(
    [
        1,np.nan,3.5,np.nan,7
    ]
)

In [17]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [15]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [19]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],                   
[np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])

In [20]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [21]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


Passing how="all" will drop only rows that are all NA:

In [22]:
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [23]:
data[4]=np.nan

In [24]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


To drop columns in the same way, pass axis="columns":

In [28]:
data.dropna(axis="columns",how="all") # use rows instead of columns 

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [38]:
df=pd.DataFrame(np.random.standard_normal((7,3)))

In [39]:
df

Unnamed: 0,0,1,2
0,-0.098272,1.669512,-0.608732
1,-1.164949,-1.051988,-0.274587
2,0.42007,-0.158215,1.346672
3,-0.059219,-0.524891,0.157957
4,0.65967,1.407219,-1.223984
5,0.92606,0.130834,-2.236518
6,-2.23619,0.363486,-0.26226


In [40]:
df.iloc[:4:1]=np.nan

In [41]:
df.iloc[:2,2]=np.nan

In [42]:
df

Unnamed: 0,0,1,2
0,,,
1,,,
2,,,
3,,,
4,0.65967,1.407219,-1.223984
5,0.92606,0.130834,-2.236518
6,-2.23619,0.363486,-0.26226


In [43]:
df.dropna()

Unnamed: 0,0,1,2
4,0.65967,1.407219,-1.223984
5,0.92606,0.130834,-2.236518
6,-2.23619,0.363486,-0.26226


thresh=2 â†’ Sets a threshold: a row must have at least 2 non-NaN values to be kept.

If a row has fewer than 2 non-null entries, it will be dropped.

In [44]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
4,0.65967,1.407219,-1.223984
5,0.92606,0.130834,-2.236518
6,-2.23619,0.363486,-0.26226


# Filling In Missing Data

In [45]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.65967,1.407219,-1.223984
5,0.92606,0.130834,-2.236518
6,-2.23619,0.363486,-0.26226


In [46]:
df.fillna(
    {1:0.5,2:0}
)

Unnamed: 0,0,1,2
0,,0.5,0.0
1,,0.5,0.0
2,,0.5,0.0
3,,0.5,0.0
4,0.65967,1.407219,-1.223984
5,0.92606,0.130834,-2.236518
6,-2.23619,0.363486,-0.26226


In [47]:
df=pd.DataFrame(np.random.standard_normal((6,3)))

In [48]:
df

Unnamed: 0,0,1,2
0,-0.031964,-0.217009,-1.805714
1,-0.244118,0.908295,-0.287153
2,-0.841317,-1.633387,-0.005172
3,2.65735,-0.215324,-0.725196
4,-0.164561,2.039083,0.053468
5,-0.875722,0.726936,0.728331


In [49]:
df.iloc[2:,1]=np.nan

In [51]:
df.iloc[4:,2]=np.nan

In [52]:
df

Unnamed: 0,0,1,2
0,-0.031964,-0.217009,-1.805714
1,-0.244118,0.908295,-0.287153
2,-0.841317,,-0.005172
3,2.65735,,-0.725196
4,-0.164561,,
5,-0.875722,,


In [53]:
df.fillna(method="ffill")

  df.fillna(method="ffill")


Unnamed: 0,0,1,2
0,-0.031964,-0.217009,-1.805714
1,-0.244118,0.908295,-0.287153
2,-0.841317,0.908295,-0.005172
3,2.65735,0.908295,-0.725196
4,-0.164561,0.908295,-0.725196
5,-0.875722,0.908295,-0.725196


In [54]:
df.fillna(method="ffill",limit=2)

  df.fillna(method="ffill",limit=2)


Unnamed: 0,0,1,2
0,-0.031964,-0.217009,-1.805714
1,-0.244118,0.908295,-0.287153
2,-0.841317,0.908295,-0.005172
3,2.65735,0.908295,-0.725196
4,-0.164561,,-0.725196
5,-0.875722,,-0.725196


In [56]:
data=pd.Series(
    [1.,np.nan,3.5,np.nan,7]
)

In [57]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [58]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## Table 7-2. `fillna` function arguments

| Argument | Description                                                                 | Example (Python) | Output |
|----------|-----------------------------------------------------------------------------|------------------|--------|
| `value`  | Scalar value or dictionary-like object to use to fill missing values        | `pd.Series([1, np.nan, 3]).fillna(value=0)` | `[1.0, 0.0, 3.0]` |
| `method` | Interpolation method: `"bfill"` (backward fill) or `"ffill"` (forward fill); default is `None` | `pd.Series([1, np.nan, np.nan, 4]).fillna(method="ffill")` | `[1.0, 1.0, 1.0, 4.0]` |
| `axis`   | Axis to fill on (`"index"` or `"columns"`); default is `"index"`            | `df.fillna(method="ffill", axis="columns")` | Missing values filled column-wise |
| `limit`  | For forward/backward filling, maximum number of consecutive periods to fill | `pd.Series([1, np.nan, np.nan, 4]).fillna(method="ffill", limit=1)` | `[1.0, 1.0, NaN, 4.0]` |

# 7.2 Data Transformation

Removing Duplicates

In [60]:
data=pd.DataFrame({
    "k1":["one","two"]*3+["two"],
    "k2":[1,1,2,3,3,4,4]
})

In [61]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [62]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [63]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [64]:
data["v1"]=range(7)

In [65]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [69]:
data.drop_duplicates(subset=["k1"])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [75]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [76]:
data.drop_duplicates(["k1", "k2"],keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


Transforming Data Using a Function or Mapping

In [77]:
data = pd.DataFrame({"food": ["bacon", "pulled pork", "bacon",
                             "pastrami", "corned beef", "bacon",
                             "pastrami", "honey ham", "nova lox"],
                      "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [78]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [79]:
meat_to_animal = {
  "bacon": "pig",
  "pulled pork": "pig",
  "pastrami": "cow",
  "corned beef": "cow",
  "honey ham": "pig",
  "nova lox": "salmon"
}

In [80]:
data["animal"]=data["food"].map(meat_to_animal)

In [81]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [82]:
def get_animal(x):
    return meat_to_animal[x]

In [83]:
data["food"].map(get_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

Replacing Values

In [84]:
data=pd.Series([1.,-999.,2.,-999.,-1000.,3.])

In [85]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [86]:
data.replace(-999,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [87]:
data.replace([-999,-1000],np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [88]:
data.replace({-999:np.nan,-1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

Renaming Axis Indexes

In [89]:
data=pd.DataFrame(
    np.arange(12).reshape((3,4)),
    index=["Ohio","Colorado","New York"],
    columns=["one","two","three","four"]
)

In [90]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [91]:
def transform(x):
    return x[:4].upper()

In [92]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [93]:
data.index=data.index.map(transform)

In [94]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [95]:
data.rename(index=str.title,columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [96]:
data.rename(
    index={"OHIO":"INDIANA"},
    columns={"three":"peekaboo"}
)

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


# Discretization and Binning

In [97]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [98]:
bins=[18, 25, 35, 60, 100]

In [99]:
age_catgeories=pd.cut(ages,bins)

In [100]:
age_catgeories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [101]:
age_catgeories.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [102]:
age_catgeories.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [103]:
age_catgeories.categories[0]

Interval(18, 25, closed='right')

In [104]:
pd.value_counts(age_catgeories)

  pd.value_counts(age_catgeories)


(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64

In [105]:
pd.cut(ages,bins,right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [106]:
group_names=["Youth","YoungAdult","MiddleAged","Senior"]

In [111]:
pd.cut(ages,bins,labels=group_names)
data1=pd.cut(ages,bins,labels=group_names)

In [112]:
data1.value_counts()

Youth         5
YoungAdult    3
MiddleAged    3
Senior        1
Name: count, dtype: int64

In [116]:
data=np.random.uniform(size=20)

In [117]:
data

array([0.345369  , 0.44076752, 0.02452659, 0.86867963, 0.29155676,
       0.01467201, 0.00276601, 0.11381718, 0.98834332, 0.77033176,
       0.40374968, 0.00951945, 0.98298375, 0.16202593, 0.30089192,
       0.06507449, 0.97836562, 0.45462466, 0.45526557, 0.54778066])

In [125]:
pd.cut(data,4,precision=2)

[(-1.59, -0.072], (-0.072, 1.44], (-0.072, 1.44], (-1.59, -0.072], (-1.59, -0.072], ..., (-0.072, 1.44], (-1.59, -0.072], (-0.072, 1.44], (-0.072, 1.44], (-3.11, -1.59]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.11, -1.59] < (-1.59, -0.072] < (-0.072, 1.44] < (1.44, 2.96]]

In [126]:
data=np.random.standard_normal(1000)

In [127]:
data

array([ 1.21179486e+00,  1.23130391e+00, -4.12725737e-01, -1.21563407e+00,
        2.13150853e-01, -2.42371643e+00,  5.83258639e-01,  1.62047102e-01,
       -3.34608821e-01, -1.51087867e+00,  1.61253374e+00, -1.13376848e-01,
        3.96463765e-01, -2.04280954e-01,  1.01513898e+00, -3.83200797e-01,
       -1.38711621e+00,  1.06515310e-01,  1.85918035e+00,  3.92594078e-01,
       -1.25487522e+00,  7.22395469e-01,  7.75130404e-01,  3.09137371e-01,
       -1.01692421e+00,  2.45501653e-01,  1.13363774e+00,  3.34522474e-01,
       -9.73312102e-01, -5.03135469e-01, -2.23106903e-01,  5.77254719e-01,
        1.00269811e+00,  1.22605625e+00, -1.42660698e+00, -2.00784850e+00,
        2.68391729e+00, -4.89530548e-01, -7.12006008e-01, -3.38498562e-01,
       -9.98298473e-01, -2.23460661e-01, -1.10868369e+00, -6.36480948e-02,
       -8.50943594e-01, -1.46102238e-01, -1.95643255e+00,  2.50202205e-01,
        4.03074395e-01, -1.87853034e-01, -9.51393001e-01,  1.74936291e-01,
       -2.53270861e+00,  

In [128]:
quartiles=pd.qcut(data,4,precision=2)

In [129]:
quartiles

[(0.7, 3.41], (0.7, 3.41], (-0.66, 0.032], (-3.11, -0.66], (0.032, 0.7], ..., (0.032, 0.7], (-0.66, 0.032], (-3.11, -0.66], (-0.66, 0.032], (-0.66, 0.032]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.11, -0.66] < (-0.66, 0.032] < (0.032, 0.7] < (0.7, 3.41]]

In [130]:
pd.value_counts(quartiles)

  pd.value_counts(quartiles)


(-3.11, -0.66]    250
(-0.66, 0.032]    250
(0.032, 0.7]      250
(0.7, 3.41]       250
Name: count, dtype: int64

In [131]:
pd.qcut(data,[0,0.1,0.5,0.9,1]).value_counts()

(-3.101, -1.244]    100
(-1.244, 0.0316]    400
(0.0316, 1.224]     400
(1.224, 3.41]       100
Name: count, dtype: int64


# Detecting and Filtering Outliers

In [151]:
data=pd.DataFrame(np.random.standard_normal((1000,4)))

In [152]:
data

Unnamed: 0,0,1,2,3
0,0.810900,-0.916399,0.436168,0.549994
1,0.242958,-0.160791,1.287486,-1.335386
2,-0.922054,1.150017,-0.115648,0.391079
3,-0.769177,0.928576,1.214777,0.756846
4,-0.803651,-1.332821,0.309456,-0.073828
...,...,...,...,...
995,-0.181836,-0.848761,1.718168,0.350029
996,1.353697,-1.372409,-1.443486,-0.879906
997,0.040515,0.719458,0.675076,-0.251764
998,1.640560,-0.391408,1.698854,0.355733


In [153]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.026554,-0.016677,0.005972,-0.017697
std,0.994713,0.948939,0.971055,1.026676
min,-2.896423,-2.791413,-3.399079,-3.074274
25%,-0.631211,-0.641661,-0.60027,-0.724481
50%,-0.001835,-0.034751,0.045702,-0.027514
75%,0.722397,0.650594,0.647939,0.689509
max,3.437981,3.231371,3.049216,3.609261


In [154]:
col=data[2]

In [155]:
col

0      0.436168
1      1.287486
2     -0.115648
3      1.214777
4      0.309456
         ...   
995    1.718168
996   -1.443486
997    0.675076
998    1.698854
999   -0.330527
Name: 2, Length: 1000, dtype: float64

In [156]:
col[col.abs()>3]

52    -3.126179
65     3.049216
632   -3.399079
Name: 2, dtype: float64

In [157]:
data[(data.abs()>3).any(axis="columns")]

Unnamed: 0,0,1,2,3
36,0.081042,-1.81272,-1.130081,3.234768
52,-1.885076,1.022188,-3.126179,-0.312911
65,1.190262,-0.958871,3.049216,-0.705813
174,-0.925241,-0.221987,0.639135,-3.074274
213,-0.146655,3.231371,-1.912784,-1.084959
247,3.11458,1.15675,0.334257,-0.386857
467,0.627577,-1.369638,0.415352,3.609261
632,-0.218445,-0.22632,-3.399079,-1.198592
649,3.437981,-0.801845,0.136945,-0.928386
976,-0.125032,-1.590673,1.289214,3.102487


In [159]:
np.sign(data)*3

Unnamed: 0,0,1,2,3
0,3.0,-3.0,3.0,3.0
1,3.0,-3.0,3.0,-3.0
2,-3.0,3.0,-3.0,3.0
3,-3.0,3.0,3.0,3.0
4,-3.0,-3.0,3.0,-3.0
...,...,...,...,...
995,-3.0,-3.0,3.0,3.0
996,3.0,-3.0,-3.0,-3.0
997,3.0,3.0,3.0,-3.0
998,3.0,-3.0,3.0,3.0


In [160]:
data[data.abs()>3]=np.sign(data)*3

In [161]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.026001,-0.016908,0.006448,-0.018569
std,0.992954,0.948175,0.969182,1.023454
min,-2.896423,-2.791413,-3.0,-3.0
25%,-0.631211,-0.641661,-0.60027,-0.724481
50%,-0.001835,-0.034751,0.045702,-0.027514
75%,0.722397,0.650594,0.647939,0.689509
max,3.0,3.0,3.0,3.0


In [162]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,-1.0,1.0,1.0
1,1.0,-1.0,1.0,-1.0
2,-1.0,1.0,-1.0,1.0
3,-1.0,1.0,1.0,1.0
4,-1.0,-1.0,1.0,-1.0
