In [1]:
import pandas as pd
import numpy as np

In [2]:
# A small float Series with one missing entry (NaN) at position 2; the
# integer 5 is upcast to 5.0 because the Series is float64.
float_data = pd.Series(data=[1.2, -3.4, np.nan, 5])
float_data

0    1.2
1   -3.4
2    NaN
3    5.0
dtype: float64

In [3]:
# Boolean mask of the same length as the Series: True where the value is NaN.
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
# Invert the isna() mask to keep only the entries that are present.
float_data.loc[~float_data.isna()]

0    1.2
1   -3.4
3    5.0
dtype: float64

In [7]:
# Same filter as the inverted isna() mask, written with notna() directly.
float_data.loc[float_data.notna()]

0    1.2
1   -3.4
3    5.0
dtype: float64

In [10]:
# Build a 4x3 frame from a list of rows: every inner list is one row, and
# the column labels are supplied separately.
data = pd.DataFrame(
    [
        [1, 5.6, 3],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 3.5, 3.0],
    ],
    columns=list("abc"),
)

In [11]:
# Display the frame built from the row lists; NaN entries render as blanks.
data

Unnamed: 0,a,b,c
0,1.0,5.6,3.0
1,1.0,,
2,,,
3,,3.5,3.0


In [12]:
# The same 4x3 frame as `data`, this time built column by column: each
# keyword names a column and its list holds that column's values.
data1 = pd.DataFrame(
    dict(
        a=[1, 1, np.nan, np.nan],
        b=[5.6, np.nan, np.nan, 3.5],
        c=[3, np.nan, np.nan, 3],
    )
)

In [13]:
# Display the column-wise frame; it shows the same values as `data`.
data1

Unnamed: 0,a,b,c
0,1.0,5.6,3.0
1,1.0,,
2,,,
3,,3.5,3.0


In [18]:
# dropna() removes every row holding at least one NaN. how='any' is the
# default, so the explicit and implicit calls print the same frame.
print(data1.dropna(), data1.dropna(how='any'), sep='\n')

     a    b    c
0  1.0  5.6  3.0
     a    b    c
0  1.0  5.6  3.0


In [19]:
# With how="all" a row is dropped only when every value in it is NaN, so
# only the fully empty row (index 2) is removed.
data1.dropna(axis="index", how="all")

Unnamed: 0,a,b,c
0,1.0,5.6,3.0
1,1.0,,
3,,3.5,3.0


In [27]:
# axis="columns" (same as axis=1) makes dropna() examine columns; the
# default is axis=0 / "index", which examines rows. The documented values
# are 0/"index" and 1/"columns" -- a bool such as True only works because
# it compares equal to 1, so the explicit label is used instead.
# No column of `data` is entirely NaN, so how="all" drops nothing here.
data.dropna(axis="columns", how="all")

Unnamed: 0,a,b,c
0,1.0,5.6,3.0
1,1.0,,
2,,,
3,,3.5,3.0


In [30]:
# 7x3 frame of standard-normal draws. A seeded Generator replaces the
# unseeded global np.random call so the values (and every output derived
# from them) are identical on each Restart & Run All.
df = pd.DataFrame(np.random.default_rng(12345).standard_normal((7, 3)))
df

Unnamed: 0,0,1,2
0,-0.432698,1.502212,-1.197421
1,-3.099905,-0.310957,-0.561378
2,0.227361,-1.797763,-2.161525
3,-1.648474,1.997603,-1.750501
4,-0.241063,-0.763577,1.111396
5,-1.139065,-0.608434,-1.529794
6,-1.640502,0.205644,-1.380876


In [33]:
# Punch holes into the frame in place: rows 0-3 of column 1 and rows 0-1
# of column 2 become NaN.
df.iloc[0:4, 1] = np.nan
df.iloc[0:2, 2] = np.nan
df

Unnamed: 0,0,1,2
0,-0.432698,,
1,-3.099905,,
2,0.227361,,-2.161525
3,-1.648474,,-1.750501
4,-0.241063,-0.763577,1.111396
5,-1.139065,-0.608434,-1.529794
6,-1.640502,0.205644,-1.380876


In [38]:
# thresh=2 keeps a row only if it has at least two non-NaN values, so
# rows 0 and 1 (one valid value each) are dropped.
df.dropna(axis="index", thresh=2)

Unnamed: 0,0,1,2
2,0.227361,,-2.161525
3,-1.648474,,-1.750501
4,-0.241063,-0.763577,1.111396
5,-1.139065,-0.608434,-1.529794
6,-1.640502,0.205644,-1.380876


In [39]:
# Replace every NaN in the frame with the constant 0 (returns a new frame).
df.fillna(value=0)

Unnamed: 0,0,1,2
0,-0.432698,0.0,0.0
1,-3.099905,0.0,0.0
2,0.227361,0.0,-2.161525
3,-1.648474,0.0,-1.750501
4,-0.241063,-0.763577,1.111396
5,-1.139065,-0.608434,-1.529794
6,-1.640502,0.205644,-1.380876


In [40]:
# A dict gives a separate fill value per column label: NaNs in column 1
# become 0 and NaNs in column 2 become 2.
df.fillna(value={1: 0, 2: 2})

Unnamed: 0,0,1,2
0,-0.432698,0.0,2.0
1,-3.099905,0.0,2.0
2,0.227361,0.0,-2.161525
3,-1.648474,0.0,-1.750501
4,-0.241063,-0.763577,1.111396
5,-1.139065,-0.608434,-1.529794
6,-1.640502,0.205644,-1.380876


In [41]:
# Fresh 6x3 frame of standard-normal draws (rebinds `df`). A seeded
# Generator replaces the unseeded global np.random call so the values are
# identical on every run.
df = pd.DataFrame(np.random.default_rng(54321).standard_normal((6, 3)))

# Blank out column 1 from row 2 onward and column 2 from row 4 onward,
# leaving trailing runs of NaN for the forward-fill examples.
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan
df

Unnamed: 0,0,1,2
0,1.170973,-0.87761,-0.25574
1,-0.958684,-0.361728,-1.156715
2,1.37491,,-0.621841
3,1.608864,,-0.136871
4,1.077966,,
5,-0.461113,,


In [44]:
# Forward-fill down each column: every NaN takes the last valid value above
# it. fillna(method="ffill") is deprecated; ffill() is the replacement.
df.ffill(axis=0)

Unnamed: 0,0,1,2
0,1.170973,-0.87761,-0.25574
1,-0.958684,-0.361728,-1.156715
2,1.37491,-0.361728,-0.621841
3,1.608864,-0.361728,-0.136871
4,1.077966,-0.361728,-0.136871
5,-0.461113,-0.361728,-0.136871


In [47]:
# Forward-fill across each row (left to right) instead of down columns.
# limit=1 fills at most one consecutive NaN, so in rows 4 and 5 column 1 is
# filled from column 0 while column 2 stays NaN.
df.ffill(axis="columns", limit=1)

Unnamed: 0,0,1,2
0,1.170973,-0.87761,-0.25574
1,-0.958684,-0.361728,-1.156715
2,1.37491,1.37491,-0.621841
3,1.608864,1.608864,-0.136871
4,1.077966,1.077966,
5,-0.461113,-0.461113,


## 7.2 Data Transformation

## Removing Duplicates

In [48]:
# Seven-row frame with repeated keys for the duplicate-handling examples
# (rebinds `data`); rows 5 and 6 are identical in both columns.
data = pd.DataFrame(
    dict(
        k1=["one", "two"] * 3 + ["two"],
        k2=[1, 1, 2, 3, 3, 4, 4],
    )
)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [66]:
# duplicated(keep="last") flags every repeat of a k1 value EXCEPT its final
# occurrence, so the mask selects all earlier rows and leaves out the last
# 'one' (row 4) and the last 'two' (row 6).
data.loc[data.duplicated(subset=["k1"], keep="last")]

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
5,two,4


In [73]:
# Keep the first row seen for each distinct k1 value and renumber the
# surviving rows from 0 (ignore_index=True).
data.drop_duplicates(subset=["k1"], keep="first", ignore_index=True)

Unnamed: 0,k1,k2
0,one,1
1,two,1


In [74]:
# Add a running-counter column (mutates `data` in place). The length comes
# from the frame itself instead of a hard-coded 7, so the assignment cannot
# fail with a length mismatch if the rows above are changed.
data["v1"] = range(len(data))
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [78]:
# Keep the last row seen for each distinct k1 value; the original index
# labels (4 and 6) are preserved because ignore_index is not set.
data.drop_duplicates(subset=["k1"], keep="last")

Unnamed: 0,k1,k2,v1
4,one,3,4
6,two,4,6


In [79]:
# Nine food items with their weights (rebinds `data`); 'ounces' mixes ints
# and one float, so the whole column is stored as float64.
data = pd.DataFrame(
    dict(
        food=[
            "bacon", "pulled pork", "bacon",
            "pastrami", "corned beef", "bacon",
            "pastrami", "honey ham", "nova lox",
        ],
        ounces=[4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    )
)
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


## Transforming Data Using a Function or Mapping

In [81]:
# Lookup table from each food name to the animal it comes from; used to
# derive a new column with Series.map().
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon",
}

In [83]:
# Translate each food through the lookup dict and store the result as a new
# 'animal' column (mutates `data` in place).
data["animal"] = data["food"].map(meat_to_animal, na_action=None)

In [84]:
# Display the frame, now including the 'animal' column produced by map().
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon
