<center><h1>Working with missing data</h1></center>

In [36]:
# Standard library first, then third-party packages
import warnings

# pandas is used throughout this notebook under its conventional alias 'pd'
import pandas as pd

# Silence every FutureWarning for the rest of the session.
# NOTE(review): this also hides deprecation notices emitted by pandas itself;
# consider removing the filter once the cells below no longer trigger them.
warnings.simplefilter('ignore', FutureWarning)

In [37]:
# Build 'temps_df', the small table of temperature readings that every
# later cell in this notebook works on.
#
# Columns:
#   sequence          - running number of the reading (1..5)
#   measurement_type  - 'actual' or 'estimated'; None for the reading with sequence 4
#   temperature_f     - reading in Fahrenheit; None for the reading with sequence 4
#
# NOTE(review): the first temperature (67024) is orders of magnitude larger
# than the other readings and looks like a typo for 67.24 -- confirm. It is
# kept unchanged here because the saved outputs were produced from this value.
temps_df = pd.DataFrame(
    data={
        "sequence": list(range(1, 6)),
        "measurement_type": ['actual'] * 3 + [None, 'estimated'],
        "temperature_f": [67024, 84.56, 91.61, None, 49.64],
    }
)

# Render the freshly built frame as the cell output
temps_df

Unnamed: 0,sequence,measurement_type,temperature_f
0,1,actual,67024.0
1,2,actual,84.56
2,3,actual,91.61
3,4,,
4,5,estimated,49.64


## Using isna() to identify null values in a dataframe

In [38]:
# Flag every missing entry (None / NaN) in the DataFrame 'temps_df'

# The output is a boolean DataFrame of the same shape as 'temps_df':
# a cell holds True where the corresponding value in 'temps_df' is missing
# and False otherwise.
temps_df.isna()

Unnamed: 0,sequence,measurement_type,temperature_f
0,False,False,False
1,False,False,False
2,False,False,False
3,False,True,True
4,False,False,False


## How is missing data handled?

In [39]:
# Running total of the 'temperature_f' column of 'temps_df'
#
# Each entry of the resulting Series is the sum of all temperatures up to and
# including that row. skipna=True is the default and is spelled out here for
# contrast with the next cell: the missing reading shows up as NaN in its own
# row, but it is skipped by the running total, which carries on afterwards.
temps_df['temperature_f'].cumsum(skipna=True)

0    67024.00
1    67108.56
2    67200.17
3         NaN
4    67249.81
Name: temperature_f, dtype: float64

In [40]:
# Running total of the 'temperature_f' column of 'temps_df', this time
# without skipping missing values
#
# With skipna=False a NaN is not ignored: it is folded into the running total,
# so the row with the missing reading and every row after it become NaN.
temps_df['temperature_f'].cumsum(axis=0, skipna=False)

0    67024.00
1    67108.56
2    67200.17
3         NaN
4         NaN
Name: temperature_f, dtype: float64

In [41]:
# Per-group maximum for each 'measurement_type' in 'temps_df'
#
# The result is indexed by 'measurement_type' and holds, for every group, the
# largest 'sequence' number and the largest 'temperature_f' reading.
# dropna=True is the default and is spelled out here for contrast with the
# next cell: the row whose 'measurement_type' is missing does not form a group
# and is absent from the result.
temps_df.groupby('measurement_type', dropna=True).max()

Unnamed: 0_level_0,sequence,temperature_f
measurement_type,Unnamed: 1_level_1,Unnamed: 2_level_1
actual,3,67024.0
estimated,5,49.64


In [42]:
# Same per-group maximum, but keep the missing key as a group of its own:
# with dropna=False the row whose 'measurement_type' is missing appears in the
# result under a NaN index label.
temps_df.groupby('measurement_type', dropna=False).max()

Unnamed: 0_level_0,sequence,temperature_f
measurement_type,Unnamed: 1_level_1,Unnamed: 2_level_1
actual,3,67024.0
estimated,5,49.64
,4,


### Dealing with missing data: The blunt approach using dropna()

In [43]:
# Drop every row of 'temps_df' that contains at least one missing value.
# axis='index' (equivalent to axis=0) is the default and is written out only
# to contrast with the column-wise variant in the next cell.
temps_df.dropna(axis='index')

Unnamed: 0,sequence,measurement_type,temperature_f
0,1,actual,67024.0
1,2,actual,84.56
2,3,actual,91.61
4,5,estimated,49.64


In [44]:
# Drop every column of 'temps_df' that contains at least one missing value
#
# axis='columns' (equivalent to axis=1) makes dropna() inspect each column
# instead of each row; only columns without any missing entry survive.
temps_df.dropna(axis='columns')

Unnamed: 0,sequence
0,1
1,2
2,3
3,4
4,5


### Replace null values using fillna()

In [45]:
# Substitute 0 for every missing value (None / NaN) in 'temps_df'.
# The same fill value is used for all columns, so the text column
# 'measurement_type' receives the number 0 as well.
temps_df.fillna(value=0)

Unnamed: 0,sequence,measurement_type,temperature_f
0,1,actual,67024.0
1,2,actual,84.56
2,3,actual,91.61
3,4,0,0.0
4,5,estimated,49.64


In [46]:
# Replace missing values (NaNs) in the DataFrame 'temps_df'
# with the preceding non-null value along each column

# ffill() ("forward fill", the same operation as the older fillna(method='pad')
# spelling) copies the last valid value from an earlier row down into each gap,
# column by column. If the first row contains NaNs, they remain unchanged
# because there is no earlier value to copy.
# ffill() is called directly because the 'method' argument of fillna() is
# deprecated in recent pandas releases.
temps_df.ffill()

Unnamed: 0,sequence,measurement_type,temperature_f
0,1,actual,67024.0
1,2,actual,84.56
2,3,actual,91.61
3,4,actual,91.61
4,5,estimated,49.64


### Interpolate

In [47]:
# Fill the gap in 'temperature_f' by linear interpolation.
# Only the numeric column is interpolated: calling interpolate() on a frame
# that still contains an object-dtype (text) column is deprecated in recent
# pandas releases, and a missing 'measurement_type' cannot be interpolated anyway.
temps_df.assign(temperature_f=temps_df['temperature_f'].interpolate())

Unnamed: 0,sequence,measurement_type,temperature_f
0,1,actual,67024.0
1,2,actual,84.56
2,3,actual,91.61
3,4,,70.625
4,5,estimated,49.64


In [48]:
# Perform linear interpolation to fill in the missing value
# (NaN) in the 'temperature_f' column of 'temps_df'

# interpolate() replaces a missing number with an intermediate value computed
# from the neighbouring data points in the same column; with the default linear
# method the gap here becomes the midpoint of the readings above and below it.
#
# The call is restricted to the numeric 'temperature_f' column and the result is
# put back with assign(), which returns a new frame and leaves 'temps_df' itself
# untouched. Interpolating the whole frame would also touch the object-dtype
# column 'measurement_type', which is deprecated in recent pandas releases; its
# missing entry therefore stays missing.
temps_df.assign(temperature_f=temps_df['temperature_f'].interpolate())

Unnamed: 0,sequence,measurement_type,temperature_f
0,1,actual,67024.0
1,2,actual,84.56
2,3,actual,91.61
3,4,,70.625
4,5,estimated,49.64
