# How to handle missing data in a pandas data frame

In [None]:
import pandas as pd

In [None]:
# Load the sample weather CSV; the bare name on the last line renders the frame.
weather_data_df = pd.read_csv(
    filepath_or_buffer="C:\\PythonTutorial\\MyPandas_Blog\\data sets\\Weather Data With Missing Info.csv"
)
weather_data_df

## Use Case : Converting the date column into date data type

In [None]:
# Without parse_dates, read_csv leaves the Date column as plain strings.
type(weather_data_df.loc[0, "Date"])

In [None]:
# To have read_csv convert columns to datetimes, pass parse_dates=[list of columns].
weather_data_df = pd.read_csv(
    "C:\\PythonTutorial\\MyPandas_Blog\\data sets\\Weather Data With Missing Info.csv",
    parse_dates=["Date"],
)
# The first Date value is now a datetime-like object rather than a string.
type(weather_data_df.loc[0, "Date"])

In [None]:
# Show the reloaded frame; a bare name on a cell's last line is rendered as its output.
weather_data_df

## To set the Date column as the index

In [None]:
# Promote the Date column to the row index (rebinding the name rather than mutating in place).
weather_data_df = weather_data_df.set_index("Date")
weather_data_df

## Use Case : To replace the NaN values with some meaningful values or some guess

In [None]:
# A scalar passed to fillna is written into every NaN cell, whatever the column.
new_weather_data_df = weather_data_df.fillna(value=0)
new_weather_data_df

# Note: all the NaN values got replaced with zero (0) irrespective of the column

In [None]:
# Passing a dictionary to fillna gives each column its own replacement value:
# the keys are column names, the values are what NaN becomes in that column.
new_weather_data_df = weather_data_df.fillna(
    value={"Temperature": 0, "Wind Speed": 0, "Condition": "Unknown"}
)
new_weather_data_df

## Use Case : Replacing the NaN values in the columns Temperature and Wind Speed with zero will be misleading, as the average value, if calculated, will be wrong. Moreover, someone might think that one day the temperature was 29 and the next day it was zero. One alternative solution is to replace the unavailable Temperature with the previous day's temperature.

In [None]:
# ffill -> forward fill: if a value is missing, fill it with the previous row's
# (i.e. the previous day's) value.
# DataFrame.ffill() is used because fillna(method="ffill") is deprecated in
# recent pandas releases.
new_weather_data_df = weather_data_df.ffill()
new_weather_data_df

## Another option will be to copy the next day's value

In [None]:
# bfill -> backward fill: if a value is missing, fill it with the next row's
# (i.e. the next day's) value.
# DataFrame.bfill() is used because fillna(method="bfill") is deprecated in
# recent pandas releases.
new_weather_data_df = weather_data_df.bfill()
new_weather_data_df

## Passing the argument axis = "columns"

In [None]:
# With axis="columns" the fill runs across each row instead of down each column:
# bfill copies the value from the next column, ffill would copy from the previous one.
# DataFrame.bfill() replaces the deprecated fillna(method="bfill") spelling.
new_weather_data_df = weather_data_df.bfill(axis="columns")
new_weather_data_df

## The argument limit sets the maximum number of consecutive missing values that will be filled

In [None]:
# limit=1 forward-fills at most one consecutive NaN after each valid value;
# any further NaN values in the same gap stay missing.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling.
new_weather_data_df = weather_data_df.ffill(limit=1)
new_weather_data_df

![fillna with limit](images/fillna_limit.jpg "fillna_limit.jpg")

## Use Case : Interpolate a missing value

In [None]:
# "linear" is interpolate's default method, spelled out here for clarity;
# other methods can be passed instead.
new_weather_data_df = weather_data_df.interpolate(method="linear")
new_weather_data_df

![interpolate](images/interpolate.jpg "interpolate.jpg")

In [None]:
# Interpolate with method="time", which uses the Date index set earlier
# when estimating the missing values.
new_weather_data_df = weather_data_df.interpolate(method="time", axis="index")
new_weather_data_df

## To drop the rows having na values in any of the columns

In [None]:
# how="any" (the default, spelled out here) drops a row as soon as one column is NA.
new_weather_data_df = weather_data_df.dropna(axis="index", how="any")
new_weather_data_df

## To drop the rows having na values in all the columns pass the argument how = "all"

In [None]:
# how="all" drops a row only when every column in it is NA.
new_weather_data_df = weather_data_df.dropna(axis="index", how="all")
new_weather_data_df

## To keep the rows which have at least one non-NA value pass the argument thresh = 1

In [None]:
# thresh=1 keeps every row that has at least one non-NA value.
new_weather_data_df = weather_data_df.dropna(axis="index", thresh=1)
new_weather_data_df

## If we pass thresh = 2, then only the rows with at least two non-NA values will be retained

In [None]:
# thresh=2 keeps only the rows that have at least two non-NA values.
new_weather_data_df = weather_data_df.dropna(axis="index", thresh=2)
new_weather_data_df

## Use Case: I added the temperature, wind speed and condition for December 13 and want to fill in the missing rows for 11th and 15th December

In [None]:
# Load the updated CSV, parse Date as datetimes and make it the row index in one chain.
weather_data_df = pd.read_csv(
    "C:\\PythonTutorial\\MyPandas_Blog\\data sets\\Weather Data With Missing Info New.csv",
    parse_dates=["Date"],
).set_index("Date")

In [None]:
# Build the full run of calendar days from 1 to 15 December 2019 ...
dt = pd.date_range(start="2019-12-01", end="2019-12-15")
idx = pd.DatetimeIndex(dt)
# ... and reindex against it: dates absent from the data show up as all-NaN rows.
new_weather_data_df = weather_data_df.reindex(index=idx)
new_weather_data_df

## Use Case : Fixing missing values in a data frame having dummy values in the cells where valid value is missing 

In [None]:
import numpy as np  # NumPy provides the NaN value needed for this use case

# Load the data set in which missing readings were recorded as dummy values.
weather_data_with_missing_value_df = pd.read_csv(
    filepath_or_buffer="C:\\PythonTutorial\\MyPandas_Blog\\data sets\\weather_data_having_dummy_values_for_misisng_info.csv",
    parse_dates=["Date"],
)

weather_data_with_missing_value_df

In [None]:
# Replace the placeholder value -100000 with NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
new_weather_data_with_missing_value_df = weather_data_with_missing_value_df.replace(-100000, np.nan)
new_weather_data_with_missing_value_df

## Use Case : Different columns have different placeholder values

In [None]:
# Load the variant of the data set where the placeholder differs from column to column.
weather_data_with_missing_value_df1 = pd.read_csv(
    filepath_or_buffer="C:\\PythonTutorial\\MyPandas_Blog\\data sets\\weather_data_having_dummy_values_for_misisng_info1.csv",
    parse_dates=["Date"],
)

weather_data_with_missing_value_df1

In [None]:
# To fix several placeholder values at once, pass them as a list; every
# occurrence of any listed value becomes NaN, whichever column it is in.
# The frame being cleaned is weather_data_with_missing_value_df1 (the one loaded
# for this use case), and np.nan is used because np.NaN was removed in NumPy 2.0.
new_weather_data_with_missing_value_df1 = weather_data_with_missing_value_df1.replace([-100000, -100], np.nan)
new_weather_data_with_missing_value_df1

## Use Case : Replacing the different placeholder values in different columns

In [None]:
# To replace different placeholder values in different columns, pass a dictionary
# (column name -> placeholder in that column) instead of a list.
# The frame being cleaned is weather_data_with_missing_value_df1 (the one loaded
# for this use case), and np.nan is used because np.NaN was removed in NumPy 2.0.

new_weather_data_with_missing_value_df1 = weather_data_with_missing_value_df1.replace({
    'Temperature': -100000,
    'Wind Speed': -100,
    'Condition': 'Not Known'
}, np.nan)
new_weather_data_with_missing_value_df1

In [None]:
# We can also pass a single dictionary that maps each placeholder value
# directly to its replacement.
# The frame being cleaned is weather_data_with_missing_value_df1 (the one loaded
# for this use case), and np.nan is used because np.NaN was removed in NumPy 2.0.
new_weather_data_with_missing_value_df1 = weather_data_with_missing_value_df1.replace({
    -100000: np.nan,
    -100: np.nan,
    'Not Known': np.nan
})
new_weather_data_with_missing_value_df1

## Use Case : Replacing the different placeholder values with different data types  in different columns

In [None]:
# Load the variant of the data set used for the regular-expression clean-up below.
weather_data_with_missing_value_df2 = pd.read_csv(
    filepath_or_buffer="C:\\PythonTutorial\\MyPandas_Blog\\data sets\\weather_data_having_dummy_values_for_misisng_info2.csv",
    parse_dates=["Date"],
)

weather_data_with_missing_value_df2

In [None]:
# Goal: strip the alphabetic characters from the Temperature and Wind Speed columns.
# A regular expression does the matching; each letter is replaced with an empty string.
# No column is named here, so the pattern is applied to the whole frame.
new_weather_data_with_missing_value_df2 = weather_data_with_missing_value_df2.replace(
    to_replace=r'[A-Za-z]',
    value='',
    regex=True,
)
new_weather_data_with_missing_value_df2

## This also erased the Condition column

*So we need to use a dictionary to replace the alphabets in specific columns*

In [None]:
# Restrict the letter-stripping pattern to the two numeric columns by keying
# the pattern on column name; columns not listed are left untouched.
new_weather_data_with_missing_value_df2 = weather_data_with_missing_value_df2.replace(
    to_replace={'Temperature': r'[A-Za-z]', 'Wind Speed': r'[A-Za-z]'},
    value='',
    regex=True,
)
new_weather_data_with_missing_value_df2

## Use Case : To replace a list of values with another list of values

In [None]:
# Small in-memory example: one qualitative score per student.
student_df = pd.DataFrame(
    data=dict(
        score=['exceptional', 'average', 'good', 'poor', 'average', 'exceptional'],
        student=['rob', 'maya', 'parthiv', 'tom', 'julian', 'erica'],
    )
)
student_df

In [None]:
# Replace each score label with a number; the two lists are matched position by
# position, so "exceptional" -> 3, "average" -> 2, "good" -> 1 and "poor" -> 0.
new_student_df = student_df.replace(
    to_replace=["exceptional", "average", "good", "poor"],
    value=[3, 2, 1, 0],
)

new_student_df