In [1]:
import numpy as np
import pandas as pd

In [2]:
# Ten-element Series with a single missing value (np.nan) at index 6;
# the NaN makes the dtype float64.
data = pd.Series(
    [0, 1, 2, 3, 4, 5, np.nan, 6, 7, 8]
)
data

0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
5    5.0
6    NaN
7    6.0
8    7.0
9    8.0
dtype: float64

In [3]:
# Boolean mask: True where a value is missing (NaN), False otherwise.
# isna() performs the same check as isnull() under its other name.
data.isna()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7    False
8    False
9    False
dtype: bool

In [4]:
# Inverse mask: True where a value is present, False where it is missing.
# notna() performs the same check as notnull() under its other name.
data.notna()

0     True
1     True
2     True
3     True
4     True
5     True
6    False
7     True
8     True
9     True
dtype: bool

In [5]:
# Build a 4x4 frame whose columns 0..3 contain 1, 2, 3 and 4 NaNs respectively
# (column 3 is entirely NaN, and so is row 3).
dict1 = {
    0: [1, 4, 7, np.nan],
    1: [2, 5, np.nan, np.nan],
    2: [3, np.nan, np.nan, np.nan],
    3: [np.nan, np.nan, np.nan, np.nan],
}

df = pd.DataFrame(data=dict1)
df

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,,
2,7.0,,,
3,,,,


In [6]:
# Drop every row that contains at least one NaN (the defaults, spelled out).
# Every row of df has a NaN in column 3, so the result is empty.
df.dropna(axis=0, how='any')

Unnamed: 0,0,1,2,3


In [7]:
# Drop only the rows in which every value is NaN (row 3 here).
df.dropna(axis='index', how='all')

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,,
2,7.0,,,


In [8]:
# Drop only the columns in which every value is NaN (column 3 here).
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,5.0,
2,7.0,,
3,,,


In [9]:
# Re-display df: the dropna calls above returned new frames and left the original untouched.
df

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,,
2,7.0,,,
3,,,,


In [10]:
# thresh=N keeps a row only if it has at least N non-NaN values;
# with thresh=1 only the all-NaN row 3 is dropped.
df.dropna(axis=0, thresh=1)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,,
2,7.0,,,


In [11]:
# Same threshold applied per column: keep columns with at least one non-NaN value.
df.dropna(axis='columns', thresh=1)

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,5.0,
2,7.0,,
3,,,


In [None]:
# Original frame, shown for reference before the fillna examples below.
df

#### Filling the null values
* Syntax: **df.fillna(val)**
* Returns a new DataFrame; the original `df` is left unchanged

In [12]:
# Replace every NaN with the constant 10; the result is a new frame.
df_fill = df.fillna(value=10)
df_fill

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,10.0
1,4.0,5.0,10.0,10.0
2,7.0,10.0,10.0,10.0
3,10.0,10.0,10.0,10.0


In [13]:
# df is unchanged: fillna returned a new frame (df_fill) instead of modifying df.
df

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,,
2,7.0,,,
3,,,,


In [14]:
# Fill each column with its own value: keys are column labels,
# values are the replacements for the NaNs in that column.
df_fill = df.fillna(
    value={0: 10, 1: 20, 2: 30, 3: 40}
)
df_fill

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,40.0
1,4.0,5.0,30.0,40.0
2,7.0,20.0,30.0,40.0
3,10.0,20.0,30.0,40.0


In [15]:
# The original frame still contains its NaNs; only df_fill holds the filled values.
df

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,,
2,7.0,,,
3,,,,


In [16]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() is used instead of fillna(method='ffill'), whose `method`
# keyword is deprecated.
# Column 3 stays NaN because it has no valid value to propagate.
df_fill = df.ffill()
df_fill

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,3.0,
2,7.0,5.0,3.0,
3,7.0,5.0,3.0,


In [17]:
# limit caps how many consecutive NaNs are forward-filled in each column;
# with limit=2 the third consecutive NaN in column 2 (row 3) is left as NaN.
# DataFrame.ffill(limit=...) is used instead of fillna(method='ffill', ...),
# whose `method` keyword is deprecated.
df_fill = df.ffill(limit=2)
df_fill

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,3.0,
2,7.0,5.0,3.0,
3,7.0,5.0,,


In [18]:
# df is unchanged by the forward-fill examples above; each one returned a new frame.
df

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,,
2,7.0,,,
3,,,,


In [19]:
# axis=1 forward-fills along each row: a NaN takes the value from the column to its left.
# limit=1 fills at most one consecutive NaN per row.
# DataFrame.ffill(...) is used instead of fillna(method='ffill', ...),
# whose `method` keyword is deprecated.
df_fill = df.ffill(axis=1, limit=1)
df_fill

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,3.0
1,4.0,5.0,5.0,
2,7.0,7.0,,
3,,,,


In [20]:
# Original frame again, still with all of its NaNs.
df

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,,
2,7.0,,,
3,,,,


#### df.mean()
* Returns the mean of each column, ignoring NaN values
* To fill the missing values with the column means, pass the result to fillna: **df.fillna(df.mean())**


In [21]:
# Fill each column's NaNs with that column's mean (mean() skips NaN).
# Column 3 holds no values, so its mean is NaN and the column stays unfilled.
df_mean = df.fillna(value=df.mean(axis=0))
df_mean

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,4.0,5.0,3.0,
2,7.0,3.5,3.0,
3,4.0,3.5,3.0,


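#### .replace(old_val, new_val)
* Replaces every occurrence of `old_val` with `new_val` and returns a new object
* `old_val` can be a single value, a list of values, or a dict mapping each old value to its new value; the values themselves can be anything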

In [22]:
# Integer Series in which the value 100 appears three times.
ser1 = pd.Series(
    [1, 2, 100, 4, 5, 100, 7, 8, 100]
)
ser1

0      1
1      2
2    100
3      4
4      5
5    100
6      7
7      8
8    100
dtype: int64

In [23]:
# Swap every 100 for 1; the result is a new Series.
ser1.replace(to_replace=100, value=1)

0    1
1    2
2    1
3    4
4    5
5    1
6    7
7    8
8    1
dtype: int64

In [24]:
# ser1 is unchanged: replace returned a new Series instead of modifying ser1.
ser1

0      1
1      2
2    100
3      4
4      5
5    100
6      7
7      8
8    100
dtype: int64

In [25]:
# A list of old values maps all of them to the same new value (50).
ser1.replace(to_replace=[100, 1, 2], value=50)

0    50
1    50
2    50
3     4
4     5
5    50
6     7
7     8
8    50
dtype: int64

In [26]:
# The original Series still holds its 100s; the replacement above was not stored.
ser1

0      1
1      2
2    100
3      4
4      5
5    100
6      7
7      8
8    100
dtype: int64

In [27]:
# A dict maps each old value to its own replacement: 100 -> NaN, 1 -> 50.
# Introducing NaN turns the result into float64.
ser1.replace(to_replace={100: np.nan, 1: 50})

0    50.0
1     2.0
2     NaN
3     4.0
4     5.0
5     NaN
6     7.0
7     8.0
8     NaN
dtype: float64

#### get_dummies()
* One way to convert a categorical variable into numerical variables: each distinct category becomes its own 0/1 indicator column (one-hot encoding)

In [28]:
# Series of single-character categories: 'a'..'e', then 'a'..'d' again.
ser2 = pd.Series(data=list('abcdeabcd'))
ser2

0    a
1    b
2    c
3    d
4    e
5    a
6    b
7    c
8    d
dtype: object

In [29]:
# One indicator column per distinct category in ser2.
# dtype=int pins the 0/1 output; recent pandas versions otherwise return
# booleans (True/False) by default.
pd.get_dummies(ser2, dtype=int)

Unnamed: 0,a,b,c,d,e
0,1,0,0,0,0
1,0,1,0,0,0
2,0,0,1,0,0
3,0,0,0,1,0
4,0,0,0,0,1
5,1,0,0,0,0
6,0,1,0,0,0
7,0,0,1,0,0
8,0,0,0,1,0
