In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./data/csv_for_07.csv')
df

Unnamed: 0,employee_id,first_name,last_name,department,salary,join_date
0,101,John,Doe,HR,60000.0,2021-01-15
1,102,Jane,Smith,Engineering,85000.0,2019-05-20
2,103,Peter,Jones,,62000.0,2021-08-10
3,104,Mary,Williams,Sales,,2020-03-22
4,105,David,Brown,Engineering,90000.0,
5,106,Emily,Davis,Sales,78000.0,2020-07-11
6,107,Michael,Miller,Engineering,92000.0,2017-02-28
7,108,Sarah,Wilson,HR,65000.0,2022-04-05
8,109,Chris,Evans,HR,80000.0,
9,110,Anna,Lee,,75000.0,2020-10-10


In [3]:
# View the DataFrame to see the missing values (NaN)
print("Original DataFrame:")
print(df)
print("\n" + "="*50 + "\n")

# Use .isnull() to get a boolean DataFrame showing where values are missing
print("Boolean DataFrame showing null values:")
print(df.isnull())
print("\n" + "="*50 + "\n")

# Use .isnull().sum() to get a count of missing values per column
print("Count of null values per column:")
print(df.isnull().sum())
print("\n" + "="*50 + "\n")

Original DataFrame:
   employee_id first_name last_name   department   salary   join_date
0          101       John       Doe           HR  60000.0  2021-01-15
1          102       Jane     Smith  Engineering  85000.0  2019-05-20
2          103      Peter     Jones          NaN  62000.0  2021-08-10
3          104       Mary  Williams        Sales      NaN  2020-03-22
4          105      David     Brown  Engineering  90000.0         NaN
5          106      Emily     Davis        Sales  78000.0  2020-07-11
6          107    Michael    Miller  Engineering  92000.0  2017-02-28
7          108      Sarah    Wilson           HR  65000.0  2022-04-05
8          109      Chris     Evans           HR  80000.0         NaN
9          110       Anna       Lee          NaN  75000.0  2020-10-10


Boolean DataFrame showing null values:
   employee_id  first_name  last_name  department  salary  join_date
0        False       False      False       False   False      False
1        False       False     

In [4]:
# ====================================================================
# Step 3: Handling Missing Values with dropna()
# ====================================================================

# The .dropna() method removes rows or columns with missing data.

# Drop any row that contains at least one null value
df_dropped_any = df.dropna()
print("DataFrame after dropping rows with ANY null values:")
print(df_dropped_any)
print("\n" + "="*50 + "\n")

# Drop rows only if ALL values in the row are null (not applicable in this dataset)
df_dropped_all = df.dropna(how='all')
print("DataFrame after dropping rows with ALL null values:")
print(df_dropped_all)
print("\n" + "="*50 + "\n")

# Drop rows only if null values are present in specific columns
# Let's drop rows where 'salary' or 'join_date' are null
df_dropped_subset = df.dropna(subset=['department', 'salary'])
print("DataFrame after dropping rows with null values in 'department' or 'salary' columns:")
print(df_dropped_subset)
print("\n" + "="*50 + "\n")


DataFrame after dropping rows with ANY null values:
   employee_id first_name last_name   department   salary   join_date
0          101       John       Doe           HR  60000.0  2021-01-15
1          102       Jane     Smith  Engineering  85000.0  2019-05-20
5          106      Emily     Davis        Sales  78000.0  2020-07-11
6          107    Michael    Miller  Engineering  92000.0  2017-02-28
7          108      Sarah    Wilson           HR  65000.0  2022-04-05


DataFrame after dropping rows with ALL null values:
   employee_id first_name last_name   department   salary   join_date
0          101       John       Doe           HR  60000.0  2021-01-15
1          102       Jane     Smith  Engineering  85000.0  2019-05-20
2          103      Peter     Jones          NaN  62000.0  2021-08-10
3          104       Mary  Williams        Sales      NaN  2020-03-22
4          105      David     Brown  Engineering  90000.0         NaN
5          106      Emily     Davis        Sales  7800

In [5]:
# ====================================================================
# Step 4: Handling Missing Values with fillna()
# ====================================================================

# The .fillna() method fills missing values with a specified value or method.

# Create a fresh copy of the original DataFrame to work with
df_fill = df.copy()

# Fill all null values with a static value (e.g., 'Unknown' or 0)
df_fill['department'].fillna('Unknown', inplace=True)
print("DataFrame after filling null 'department' values with 'Unknown':")
print(df_fill)
print("\n" + "="*50 + "\n")

# Fill missing numerical values with the mean of the column
# Let's fill the missing 'salary' values with the mean salary
mean_salary = df_fill['salary'].mean()
df_fill['salary'].fillna(mean_salary, inplace=True)
print(f"Mean salary is: {mean_salary:.2f}")
print("DataFrame after filling null 'salary' values with the mean:")
print(df_fill)
print("\n" + "="*50 + "\n")

# Fill missing values using a forward-fill method (ffill)
# This propagates the last valid observation forward to the next valid one.
# Let's use this for 'join_date' assuming we want to use the previous employee's date
df_ffill = df.copy()
df_ffill['join_date'].fillna(method='ffill', inplace=True)
print("DataFrame after forward-filling null 'join_date' values:")
print(df_ffill)
print("\n" + "="*50 + "\n")

# Fill different columns with different values using a dictionary
df_dict_fill = df.copy()
fill_values = {
    'department': 'No Dept',
    'salary': df_dict_fill['salary'].median() # Use the median for salary
}
df_dict_fill.fillna(value=fill_values, inplace=True)
print("DataFrame after filling nulls with a dictionary:")
print(df_dict_fill)
print("\n" + "="*50 + "\n")

DataFrame after filling null 'department' values with 'Unknown':
   employee_id first_name last_name   department   salary   join_date
0          101       John       Doe           HR  60000.0  2021-01-15
1          102       Jane     Smith  Engineering  85000.0  2019-05-20
2          103      Peter     Jones      Unknown  62000.0  2021-08-10
3          104       Mary  Williams        Sales      NaN  2020-03-22
4          105      David     Brown  Engineering  90000.0         NaN
5          106      Emily     Davis        Sales  78000.0  2020-07-11
6          107    Michael    Miller  Engineering  92000.0  2017-02-28
7          108      Sarah    Wilson           HR  65000.0  2022-04-05
8          109      Chris     Evans           HR  80000.0         NaN
9          110       Anna       Lee      Unknown  75000.0  2020-10-10


Mean salary is: 76333.33
DataFrame after filling null 'salary' values with the mean:
   employee_id first_name last_name   department        salary   join_date
0  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_fill['department'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_fill['salary'].fillna(mean_salary, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

In [6]:
# ====================================================================
# Step 5: Interpolating missing data
# ====================================================================
# For numeric data, interpolation can be a good way to fill missing values
# It estimates values based on other values in the series.
df_interp = df.copy()

# Resetting salary to original state with NaNs to demonstrate interpolation
df_interp['salary'] = df['salary']

# Interpolate the missing salary values
df_interp['salary'].interpolate(method='linear', inplace=True)
print("DataFrame after linear interpolation of 'salary' values:")
print(df_interp)
print("\n" + "="*50 + "\n")


DataFrame after linear interpolation of 'salary' values:
   employee_id first_name last_name   department   salary   join_date
0          101       John       Doe           HR  60000.0  2021-01-15
1          102       Jane     Smith  Engineering  85000.0  2019-05-20
2          103      Peter     Jones          NaN  62000.0  2021-08-10
3          104       Mary  Williams        Sales  76000.0  2020-03-22
4          105      David     Brown  Engineering  90000.0         NaN
5          106      Emily     Davis        Sales  78000.0  2020-07-11
6          107    Michael    Miller  Engineering  92000.0  2017-02-28
7          108      Sarah    Wilson           HR  65000.0  2022-04-05
8          109      Chris     Evans           HR  80000.0         NaN
9          110       Anna       Lee          NaN  75000.0  2020-10-10




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_interp['salary'].interpolate(method='linear', inplace=True)
