# Missing values analysis in Pandas

In [1]:
import pandas as pd
import numpy as np
# Small demo frame: three numeric columns, each containing at least one NaN.
data = dict(
    A=[1, 2, np.nan, 4],
    B=[np.nan, 2, 3, 4],
    C=[1, np.nan, np.nan, 4],
)
df = pd.DataFrame(data=data)
print(df)

     A    B    C
0  1.0  NaN  1.0
1  2.0  2.0  NaN
2  NaN  3.0  NaN
3  4.0  4.0  4.0


* Checking for Missing Values           
1. isnull() : The isnull() function returns a DataFrame of the same shape as the original, with True for missing values and False for non-missing values.    
2. notnull() : The notnull() function returns True for non-missing values and False for missing values.


In [2]:
# 1. isnull(): build a boolean mask with the same shape as df,
#    True where a value is missing (NaN) and False everywhere else.
missing_values = df.isnull()
print(missing_values)

       A      B      C
0  False   True  False
1  False  False   True
2   True  False   True
3  False  False  False


In [3]:
# 2. notnull(): the inverse mask of isnull() --
#    True where a value is present and False where it is missing.
non_missing_values = df.notnull()
print(non_missing_values)

       A      B      C
0   True  False   True
1   True   True  False
2  False   True  False
3   True   True   True


In [4]:
# isna() and notna()
# isna() is an alias for isnull(), and notna() is an alias for notnull().
# Only isna() is run in this cell; it rebinds `missing_values` to a mask
# built with the alias instead of isnull().

missing_values = df.isna()
print(missing_values)

       A      B      C
0  False   True  False
1  False  False   True
2   True  False   True
3  False  False  False


In [5]:
# info()
# The info() method provides a concise summary of the DataFrame, including the number of non-null values per column.
# info() prints the summary itself, so it is called without a print() wrapper.
# Comparing each "Non-Null Count" with the number of entries shows how many values are missing per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       3 non-null      float64
 1   B       3 non-null      float64
 2   C       2 non-null      float64
dtypes: float64(3)
memory usage: 228.0 bytes


* Handling Missing Values
1. Dropping Missing Values
   * To drop rows with any missing values, use dropna().
   * To drop columns with any missing values, use dropna(axis=1).

In [6]:
# Remove every row that holds at least one missing value.
# axis and how are spelled out for readability; both are dropna()'s defaults.
df_dropped_rows = df.dropna(axis="index", how="any")
print(df_dropped_rows)

     A    B    C
3  4.0  4.0  4.0


In [7]:
# Remove every column that holds at least one missing value.
# Each column of df contains a NaN, so the result keeps the index but has no columns.
df_dropped_columns = df.dropna(axis="columns")
print(df_dropped_columns)

Empty DataFrame
Columns: []
Index: [0, 1, 2, 3]


* Filling Missing Values (mean, median)

In [10]:
import pandas as pd
import numpy as np
# Rebuild the demo frame with five rows so each column has enough values
# for the mean and the median to be meaningful.
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': [np.nan, 2, 3, 4, np.nan],
    'C': [1, np.nan, np.nan, 4, 5]
}
df = pd.DataFrame(data)
print(df)

# Filling Missing Values with Mean
# df.mean() returns one mean per column, computed over the non-missing values.
# Passing that Series to fillna() fills each column's NaNs with that column's
# own mean, so all columns are handled in one call instead of one line each.
# fillna() returns a new DataFrame; df itself keeps its NaNs.
df_filled_mean = df.fillna(df.mean())

print(df_filled_mean)

     A    B    C
0  1.0  NaN  1.0
1  2.0  2.0  NaN
2  NaN  3.0  NaN
3  4.0  4.0  4.0
4  5.0  NaN  5.0
     A    B         C
0  1.0  3.0  1.000000
1  2.0  2.0  3.333333
2  3.0  3.0  3.333333
3  4.0  4.0  4.000000
4  5.0  3.0  5.000000


In [11]:
# Filling Missing Values with Median
# df.median() returns one median per column, computed over the non-missing values.
# Passing that Series to fillna() fills each column's NaNs with that column's
# own median, so all columns are handled in one call instead of one line each.
# fillna() returns a new DataFrame; df itself is left unchanged.
df_filled_median = df.fillna(df.median())

print(df_filled_median)

     A    B    C
0  1.0  3.0  1.0
1  2.0  2.0  4.0
2  3.0  3.0  4.0
3  4.0  4.0  4.0
4  5.0  3.0  5.0
