In [1]:
import numpy as np
import pandas as pd

# Finding null values and getting rid of them
### Works on a Series (one column of data) or on a complete DataFrame

In [2]:
# None is treated as a missing value, so the null check returns True.
pd.isnull(None)

True

In [4]:
# notnull is the inverse check: for None it returns False.
pd.notnull(None)

False

In [5]:
# On a Series the check is element-wise: only the np.nan entry comes back False.
pd.notnull(pd.Series([1, np.nan, 7]))

0     True
1    False
2     True
dtype: bool

In [6]:
# Float Series with two consecutive missing values (positions 3 and 4),
# reused by the null-handling cells that follow.
s = pd.Series(
    data=[1, 2, 3, np.nan, np.nan, 4],
)

In [7]:
# Summing the boolean mask counts the non-null entries (each True counts as 1).
pd.notnull(s).sum()

4

In [8]:
# Summing the null mask counts the missing entries.
pd.isnull(s).sum()

2

In [9]:
# Boolean indexing with the not-null mask keeps only the entries that hold a value.
s[pd.notnull(s)]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [10]:
# Indexing with the null mask selects only the missing entries.
s[pd.isnull(s)]

3   NaN
4   NaN
dtype: float64

In [12]:
# dropna() returns a copy without the NaN entries; s itself keeps its NaNs.
s.dropna()

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [14]:
# Demo frame for the dropna/fillna cells: row 1 is entirely NaN and
# "Column C" holds a single non-null value.
df = pd.DataFrame(
    data=[
        [1, 2, np.nan],
        [np.nan, np.nan, np.nan],
        [30, 31, 100],
    ],
    columns=["Column A", "Column B", "Column C"],
)

In [15]:
# how="all" drops only the rows in which every column is NaN (row 1 here);
# rows that still hold some data are kept.
df.dropna(how ="all")

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,
2,30.0,31.0,100.0


In [17]:
# With axis="columns", thresh=3 keeps only the columns that hold at least 3 non-null
# values. No column of df has that many, so every column is dropped and only the index remains.
df.dropna(thresh=3, axis = "columns")

0
1
2


In [18]:
# Replace every NaN with a constant value.
s.fillna(0)

0    1.0
1    2.0
2    3.0
3    0.0
4    0.0
5    4.0
dtype: float64

In [19]:
# Replace every NaN with the mean of the non-null values (2.5 for this Series).
s.fillna(s.mean())

0    1.0
1    2.0
2    3.0
3    2.5
4    2.5
5    4.0
dtype: float64

In [20]:
# Forward fill: each NaN takes the last valid value that precedes it.
# Series.ffill() is the supported spelling; the `method` argument of fillna()
# is deprecated since pandas 2.1.
s.ffill()

0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    4.0
dtype: float64

In [21]:
# Backward fill: each NaN takes the next valid value that follows it.
# Series.bfill() is the supported spelling; the `method` argument of fillna()
# is deprecated since pandas 2.1.
s.bfill()

0    1.0
1    2.0
2    3.0
3    4.0
4    4.0
5    4.0
dtype: float64

In [22]:
# Forward fill on a DataFrame works column by column, top to bottom.
# The leading NaNs of "Column C" stay NaN because no earlier value exists to copy.
# DataFrame.ffill() replaces fillna(method="ffill"), which is deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,
1,1.0,2.0,
2,30.0,31.0,100.0


In [23]:
# Backward fill on a DataFrame: each NaN takes the next valid value below it in its column.
# DataFrame.bfill() replaces fillna(method="bfill"), which is deprecated since pandas 2.1.
df.bfill()

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,100.0
1,30.0,31.0,100.0
2,30.0,31.0,100.0


In [24]:
# Filling downwards: axis=0 propagates values along each column, from one row to the next.
# DataFrame.ffill() replaces fillna(method="ffill"), which is deprecated since pandas 2.1.
df.ffill(axis=0)

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,
1,1.0,2.0,
2,30.0,31.0,100.0


In [25]:
# Filling sideways: axis=1 propagates values along each row, from left to right.
# Row 1 stays all-NaN because it has no value to propagate.
# DataFrame.ffill() replaces fillna(method="ffill"), which is deprecated since pandas 2.1.
df.ffill(axis=1)

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,2.0
1,,,
2,30.0,31.0,100.0


# Fixing invalid or outlier data

In [26]:
# Demo frame for the invalid-data cells: 'Sex' contains values other than
# 'M'/'F', and one 'Age' is far outside a plausible range.
df2 = pd.DataFrame(
    dict(
        Sex=['M', 'F', 'F', 'D', '?'],
        Age=[29, 30, 24, 290, 25],
    )
)
df2

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


In [28]:
# Listing the distinct values shows entries other than 'M' and 'F' ('D' and '?').
df2['Sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [30]:
# How often each distinct value occurs.
df2['Sex'].value_counts()

F    2
M    1
D    1
?    1
Name: Sex, dtype: int64

In [31]:
# Returns a new Series with every 'D' replaced by 'F'; df2 itself is not modified.
df2['Sex'].replace('D', 'F')

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [33]:
# A dict maps several old values to new ones in a single call; keys that do not
# occur in the data ('N' here) simply have no effect.
df2['Sex'].replace({"D":'F', 'N':'M'})

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [34]:
# Boolean filter: show the rows whose Age is above 100.
df2[df2['Age'] > 100]

Unnamed: 0,Sex,Age
3,D,290


In [38]:
# Scale ages above 100 down by a factor of ten (presumably a stray extra digit
# from data entry - confirm against the data source).
# The mask is computed once and reused on both sides of the assignment.
# Floor division keeps the results integers, so the 'Age' column keeps its integer
# dtype instead of relying on floats being cast back losslessly.
age_too_high = df2['Age'] > 100
df2.loc[age_too_high, 'Age'] = df2.loc[age_too_high, 'Age'] // 10
df2

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,29
4,?,25


In [41]:
# Series keyed by ambassador name with the represented country as value.
# Several countries occur more than once, which the duplicate-handling cells rely on.
ambassadors = pd.Series({
    'Gerard Aruad': 'France',
    'Kim Darroch': 'United Kingdom',
    'Tess Wilhelm': 'United Kingdom',
    'Armando Varricchio': 'Italy',
    'Peter Wittig': 'Germany',
    'Peter Ammon': 'Germany',
    'Klaus Scharioth': 'Germany',
})
ambassadors

Gerard Aruad                  France
Kim Darroch           United Kingdom
Tess Wilhelm          United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [42]:
# Flags every repeat of a value as True; the first occurrence of each value stays False.
ambassadors.duplicated()

Gerard Aruad          False
Kim Darroch           False
Tess Wilhelm           True
Armando Varricchio    False
Peter Wittig          False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [43]:
# keep='last' reverses the rule: the final occurrence stays False and the earlier ones are flagged.
ambassadors.duplicated(keep='last')

Gerard Aruad          False
Kim Darroch            True
Tess Wilhelm          False
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

In [44]:
# Keeps only the first entry seen for each value.
ambassadors.drop_duplicates()

Gerard Aruad                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
dtype: object

In [45]:
# Keeps only the last entry seen for each value.
ambassadors.drop_duplicates(keep='last')

Gerard Aruad                  France
Tess Wilhelm          United Kingdom
Armando Varricchio             Italy
Klaus Scharioth              Germany
dtype: object

In [51]:
# Demo frame for DataFrame-level duplicate detection: rows 0 and 2 are identical,
# while row 4 repeats the name with a different position.
players = pd.DataFrame(
    [
        ('Kobe Bryant', 'SG'),
        ('Lebron James', 'SF'),
        ('Kobe Bryant', 'SG'),
        ('Carmelo Anthony', 'SF'),
        ('Kobe Bryant', 'SF'),
    ],
    columns=['Name', 'Pos'],
)
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,Lebron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [50]:
# For a DataFrame a row is a duplicate only when all of its columns match an earlier row,
# so row 2 is flagged but row 4 (same name, different position) is not.
players.duplicated()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [52]:
# subset='Name' restricts the comparison to that column, so row 4 is now flagged as well.
players.duplicated(subset='Name')

0    False
1    False
2     True
3    False
4     True
dtype: bool