In [1]:
import numpy as np
import pandas as pd

In [2]:
falsy_values = ['', ' ', False, None, np.nan, [], {}, 0] # candidate "falsy" values -- note that ' ' (a non-empty string) and np.nan (a non-zero float) are actually truthy

In [3]:
any(falsy_values) # any() returns True if at least one element is truthy; here ' ' and np.nan are truthy, hence True


True

In [4]:
np.nan # NaN (Not a Number) is a special floating-point value in NumPy that represents undefined or unrepresentable data.

nan

In [5]:
5 + np.nan # NaN propagates: any arithmetic operation involving NaN results in NaN (NaN is also not equal to any value, including itself)

nan

In [6]:
x = np.array([1, 2, 3, np.nan, np.nan,4 , 5]) # create an array with NaN values
x

array([ 1.,  2.,  3., nan, nan,  4.,  5.])

In [7]:
x.sum() # sum of the array is NaN because NaN propagates through the addition; np.nansum(x) would skip the NaN entries

nan

In [8]:
x.mean() # mean of the array is NaN because NaN propagates through the computation; np.nanmean(x) would skip the NaN entries

nan

In [9]:
x = np.array([1, 2, 3, np.nan, np.nan,4 , 5], dtype=np.float64) # same array with the dtype spelled out explicitly; the result is identical to the float array shown above
x

array([ 1.,  2.,  3., nan, nan,  4.,  5.])

In [10]:
np.inf # Infinity (Inf) is a special floating-point value in NumPy that represents a value too large to be represented as a float. It is greater than any other numeric value.

inf

In [11]:
np.inf / 3 # infinity divided by a positive finite number is still infinity

inf

In [12]:
np.inf / np.inf # np.inf / np.inf will be NaN because it is an undefined operation.

nan

In [13]:
y = np.array([1, 2, 3, np.inf, np.nan, 4, 5], dtype=np.float64)  # float array holding both an Inf and a NaN
y

array([ 1.,  2.,  3., inf, nan,  4.,  5.])

In [14]:
y.sum() # sum of the array is NaN because the array contains a NaN (inf + nan is still nan)

nan

In [15]:
np.isnan(np.nan) # check if the value is NaN


True

In [16]:
np.isinf(np.inf) # check if the value is Inf

True

In [17]:
np.isfinite(np.nan) # check if the value is finite

False

In [18]:
np.isfinite(np.inf) # check if the value is finite

False

In [19]:
# PANDAS UTILITIES FOR MISSING DATA

# pd.isnull() detects missing values. For a scalar it returns a single bool; for an array, Series or DataFrame it returns a boolean object of the same shape. Missing means NaN, None or NaT -- falsy values such as False, 0 or '' are NOT treated as missing.
pd.isnull(np.nan) # check if the value is null

True

In [20]:
pd.isnull(None) # check if the value is null

True

In [21]:
# pd.isna() is the same function as pd.isnull() (isnull is an alias of isna): it flags NaN, None and NaT as missing, but not False, 0 or ''.

pd.isna(np.nan) # check if the value is na

True

In [22]:
pd.isna(None) # check if the value is na

True

In [23]:
# pd.notnull() is the boolean inverse of pd.isnull(): it returns True for every value that is not missing (i.e. not NaN, None or NaT). False, 0 and '' count as non-missing.

pd.notnull(np.nan) # check if the value is not null

False

In [24]:
pd.notnull(None) # check if the value is not null

False

In [25]:
pd.notnull(789) # check if the value is not null

True

In [26]:
x

array([ 1.,  2.,  3., nan, nan,  4.,  5.])

In [27]:
pd.notnull(x) # element-wise check on an array: returns a boolean array with False at the NaN positions

array([ True,  True,  True, False, False,  True,  True])

In [28]:
# FILTERING MISSING DATA

seri = pd.Series([1, 2, 3, np.nan, np.nan, 4, 5])

In [29]:
pd.notnull(seri) # check if the value is not null

0     True
1     True
2     True
3    False
4    False
5     True
6     True
dtype: bool

In [30]:
pd.isnull(seri) # check if the value is null  

0    False
1    False
2    False
3     True
4     True
5    False
6    False
dtype: bool

In [31]:
pd.notnull(seri).sum() # count of non-missing values

5

In [32]:
pd.isnull(seri).sum() # count of missing values

2

In [33]:
seri[pd.notnull(seri)] # filter out the missing values

0    1.0
1    2.0
2    3.0
5    4.0
6    5.0
dtype: float64

In [34]:
seri[pd.isnull(seri)] # filter in the missing values

3   NaN
4   NaN
dtype: float64

In [35]:
seri.notnull() # another way to use pd.notnull()

0     True
1     True
2     True
3    False
4    False
5     True
6     True
dtype: bool

In [36]:
seri.isnull()

0    False
1    False
2    False
3     True
4     True
5    False
6    False
dtype: bool

In [37]:
seri[seri.notnull()] # another way to filter out the missing values

0    1.0
1    2.0
2    3.0
5    4.0
6    5.0
dtype: float64

In [38]:
seri[seri.isnull()] # another way to keep only the missing values

3   NaN
4   NaN
dtype: float64

In [39]:
#DROPPING MISSING DATA, NULL VALUES


In [40]:
seri

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
6    5.0
dtype: float64

In [41]:
# dropna() returns a new object with the missing data removed

seri.dropna() # on a Series, drop every entry that is null

0    1.0
1    2.0
2    3.0
5    4.0
6    5.0
dtype: float64

In [42]:
#dropping null values on dataframe

df = pd.DataFrame({
    'Product ID': [101, np.nan, 103, np.nan],
    'Price': [15, 22, 35, 40],
    'Stock': [35, 50, 12, 75],
    'Sold': [3, 8, np.nan, 6]
}) # create a dataframe with null values
df

Unnamed: 0,Product ID,Price,Stock,Sold
0,101.0,15,35,3.0
1,,22,50,8.0
2,103.0,35,12,
3,,40,75,6.0


In [43]:
df.shape # check the shape of the dataframe before dropping na

(4, 4)

In [44]:
df.info() # check info about the dataframe before dropping na

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Product ID  2 non-null      float64
 1   Price       4 non-null      int64  
 2   Stock       4 non-null      int64  
 3   Sold        3 non-null      float64
dtypes: float64(2), int64(2)
memory usage: 260.0 bytes


In [45]:
df.isnull() # check for missing data in each column

Unnamed: 0,Product ID,Price,Stock,Sold
0,False,False,False,False
1,True,False,False,False
2,False,False,False,True
3,True,False,False,False


In [46]:
df.isnull().sum() # count the number of null values in each column

Product ID    2
Price         0
Stock         0
Sold          1
dtype: int64

In [47]:
df.dropna() # drop all rows with at least one null value, default axis=0 (rows)

Unnamed: 0,Product ID,Price,Stock,Sold
0,101.0,15,35,3.0


In [48]:
df.dropna(axis=1) # drop columns that contain at least one null value, axis=1 (columns)

Unnamed: 0,Price,Stock
0,15,35
1,22,50
2,35,12
3,40,75


In [49]:
# dropna() with the how='any' / how='all' keyword argument

df = pd.DataFrame({
    'Name': ['Alice', 'Bob', np.nan, 'Dave'],
    'Math': [90, np.nan, 75, 88],
    'Science': [np.nan, 85, 92, np.nan],
    'History': [78, 80, np.nan, 95]
})

In [50]:
df

Unnamed: 0,Name,Math,Science,History
0,Alice,90.0,,78.0
1,Bob,,85.0,80.0
2,,75.0,92.0,
3,Dave,88.0,,95.0


In [84]:
df.shape

(4, 4)

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Name     3 non-null      object 
 1   Math     3 non-null      float64
 2   Science  2 non-null      float64
 3   History  3 non-null      float64
dtypes: float64(3), object(1)
memory usage: 260.0+ bytes


In [51]:
df.dropna(how='any') # drop rows that contain any null values, also default behavior

Unnamed: 0,Name,Math,Science,History


In [52]:
df.dropna(how='all') # drop only the rows where all columns are null

Unnamed: 0,Name,Math,Science,History
0,Alice,90.0,,78.0
1,Bob,,85.0,80.0
2,,75.0,92.0,
3,Dave,88.0,,95.0


In [53]:
# threshold parameter in dropna() to specify a minimum number of non-null values required for a row or column to be kept
df.dropna(thresh=3) # keep rows with at least 3 non-null values, default is axis=0 (rows)

Unnamed: 0,Name,Math,Science,History
0,Alice,90.0,,78.0
1,Bob,,85.0,80.0
3,Dave,88.0,,95.0


In [54]:
df.dropna(thresh=3, axis=1) # keep columns with at least 3 non-null values (Science has only 2, so it is dropped)

Unnamed: 0,Name,Math,History
0,Alice,90.0,78.0
1,Bob,,80.0
2,,75.0,
3,Dave,88.0,95.0


In [55]:
df.dropna(thresh=3, axis='columns') # same as above

Unnamed: 0,Name,Math,History
0,Alice,90.0,78.0
1,Bob,,80.0
2,,75.0,
3,Dave,88.0,95.0


In [56]:
# FILLING NULL VALUES

In [57]:
seri

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
6    5.0
dtype: float64

In [58]:
seri.fillna(0) # fill all null values with 0

0    1.0
1    2.0
2    3.0
3    0.0
4    0.0
5    4.0
6    5.0
dtype: float64

In [59]:

seri.fillna(seri.mean()) # fill null values with the mean of the series

0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    4.0
6    5.0
dtype: float64

In [60]:
#filling null values with contiguous values - ffill and bfill

seri.ffill() # forward fill, use previous value to fill nulls

0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    4.0
6    5.0
dtype: float64

In [61]:
seri.bfill() # backward fill, use next value to fill nulls

0    1.0
1    2.0
2    3.0
3    4.0
4    4.0
5    4.0
6    5.0
dtype: float64

In [62]:
# beware of any values that lie at the beginning or end of a series where there is no preceding or following value to use for filling

pd.Series([np.nan, 5, np.nan, 7]).ffill() # forward fill nulls with previous non-null value

0    NaN
1    5.0
2    5.0
3    7.0
dtype: float64

In [63]:
pd.Series([1, np.nan, 5, np.nan, np.nan]).bfill() # backward fill nulls with next non-null value

0    1.0
1    5.0
2    5.0
3    NaN
4    NaN
dtype: float64

In [64]:
# FILLING NULL VALUES ON A DATAFRAME

df = pd.DataFrame({
    'Player': ['Messi', 'Ronaldo', np.nan, 'Neymar'],
    'Goals': [25, np.nan, 18, 22],
    'Assists': [np.nan, 10, 12, np.nan],
    'Team': ['PSG', 'MU', np.nan, 'PSG']
})

In [65]:
df

Unnamed: 0,Player,Goals,Assists,Team
0,Messi,25.0,,PSG
1,Ronaldo,,10.0,MU
2,,18.0,12.0,
3,Neymar,22.0,,PSG


In [66]:
df.fillna({"Player": "Toygar", "Goals": 30, "Assists": df["Assists"].mean(), "Team": "Fenerbahçe"}) # fill nulls per column via a dict: a fixed string for Player and Team, a fixed number for Goals, and the column mean for Assists; returns a new frame, df itself is unchanged

Unnamed: 0,Player,Goals,Assists,Team
0,Messi,25.0,11.0,PSG
1,Ronaldo,30.0,10.0,MU
2,Toygar,18.0,12.0,Fenerbahçe
3,Neymar,22.0,11.0,PSG


In [67]:
df

Unnamed: 0,Player,Goals,Assists,Team
0,Messi,25.0,,PSG
1,Ronaldo,,10.0,MU
2,,18.0,12.0,
3,Neymar,22.0,,PSG


In [68]:
df.ffill() # forward fill along axis=0 (default): within every column, each null takes the last non-null value above it; a null in the first row (Assists) has nothing above it and stays NaN. Use axis=1 to fill across columns instead

Unnamed: 0,Player,Goals,Assists,Team
0,Messi,25.0,,PSG
1,Ronaldo,25.0,10.0,MU
2,Ronaldo,18.0,12.0,MU
3,Neymar,22.0,12.0,PSG


In [69]:
pd.set_option('future.no_silent_downcasting', True) # opt into the future (no silent downcasting) behavior; this is a global option and stays in effect for the following cells
df.ffill(axis=1).infer_objects(copy=False) # forward fill along each row (left to right), so values can cross into a different column (e.g. 'Ronaldo' fills Goals); infer_objects() then does the dtype inference explicitly, avoiding the downcasting deprecation warning

Unnamed: 0,Player,Goals,Assists,Team
0,Messi,25.0,25.0,PSG
1,Ronaldo,Ronaldo,10.0,MU
2,,18.0,12.0,12.0
3,Neymar,22.0,22.0,PSG


In [70]:
df.ffill(axis=1).astype(object) # If you want to keep object dtype (no conversion)

Unnamed: 0,Player,Goals,Assists,Team
0,Messi,25.0,25.0,PSG
1,Ronaldo,Ronaldo,10.0,MU
2,,18.0,12.0,12.0
3,Neymar,22.0,22.0,PSG


In [71]:
# detecting nulls in a Series by comparing counts and lengths

seri.dropna().count() # number of non-null values left after dropping the nulls


5

In [72]:
na = len(seri.dropna()) != len(seri) # check for NA values by comparing lengths of original and cleaned series
print("There is missing data: ", na)

There is missing data:  True


In [73]:
len(seri)

7

In [74]:
seri.count()

5

In [75]:
na2 = seri.count() != len(seri) # check for NA values by comparing length of non-NA count with total length
print("There is missing data: ", na2)

There is missing data:  True


In [76]:
# isnull() returns a boolean Series of the same shape as the original, where each element tells whether the corresponding value is null. any() then returns True if at least one of those booleans is True, and False otherwise.
seri.isnull().any() # True if the Series contains at least one null

True

In [77]:
seri.isnull().all() # check if all values are null

False

In [78]:
seri

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
6    5.0
dtype: float64

In [79]:
pd.Series([True, False, False]).all() # check if all values in the series are true

False

In [80]:
pd.Series([True, True, True]).all() # check if all values in the series are true

True

In [81]:
# isnull() with .values attribute
seri.isnull().values # returns a numpy array of boolean values

array([False, False, False,  True,  True, False, False])

In [82]:
seri.isnull().values.any() # any() on the underlying numpy array returns a single bool: True if at least one value is null

True

In [83]:
seri.isnull().values.all() # all() on the underlying numpy array returns a single bool: True only if every value is null

False