In [19]:
import pandas as pd
import numpy as np

In [20]:
# Cleaning Data
# Load the raw CSV into a DataFrame; the cells below inspect and clean this frame.
df = pd.read_csv("csv1.csv") 

In [21]:
df
# NaN ("Not a Number") is the marker pandas uses for a missing value (null / None).

Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,,Canada,Female,62000.0
3,3,Alex,,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,
5,5,Li Wei,27.0,China,Male,51000.0
6,6,,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,,Mexico,Male,45000.0


# Let's handle the missing values first

In [22]:
df.isnull()            # boolean mask: True wherever a value is missing (NaN)
# df.isna() is an alias of df.isnull() and returns the same result

Unnamed: 0,id,name,age,country,gender,income
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,True,False,False,False
3,False,False,True,False,False,False
4,False,False,False,False,False,True
5,False,False,False,False,False,False
6,False,True,False,False,False,False
7,False,False,False,True,False,False
8,False,False,False,False,False,False
9,False,False,True,False,False,False


In [23]:
df.isnull().sum()      # number of missing values in each column

id         0
name       1
age        3
country    1
gender     1
income     1
dtype: int64

In [24]:
df.dropna()            # returns a new frame without the rows that contain a missing value; df itself is not modified

Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
5,5,Li Wei,27.0,China,Male,51000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0


In [25]:
df.dropna(axis = 1)    # returns a new frame without the columns that contain a missing value; df itself is not modified

Unnamed: 0,id
0,1
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [26]:
df
# df still holds the missing values: the dropna() calls returned new frames and were never assigned back.

Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,,Canada,Female,62000.0
3,3,Alex,,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,
5,5,Li Wei,27.0,China,Male,51000.0
6,6,,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,,Mexico,Male,45000.0


# Filling the missing values

In [27]:
# Impute the missing ages with the mean of the known ages.
age_mean = df["age"].mean()

# fillna with a {column: value} mapping returns a new frame and fills only
# the "age" column, so df itself keeps its missing values.
cleaned_data = df.fillna({"age": age_mean})

cleaned_data


Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,32.75,Canada,Female,62000.0
3,3,Alex,32.75,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,
5,5,Li Wei,27.0,China,Male,51000.0
6,6,,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,32.75,Mexico,Male,45000.0


In [32]:
# You can make a copy of the data and change the copy,
# or apply a change temporarily by not assigning the result back to the original data.

In [28]:
df.fillna(0)           # returns a new frame with every NaN replaced by 0; the result is not stored, so df is unchanged
# NOTE: the next line modifies df itself, and the age column then mixes numbers with text.
df["age"] = df["age"].fillna("Not Specified")           # fills the missing ages with the text "Not Specified" (not with the mean)


In [31]:
df[df['age'] == 'Not Specified']   # rows whose age is the "Not Specified" placeholder

Unnamed: 0,id,name,age,country,gender,income
2,2,Jane Smith,Not Specified,Canada,Female,62000.0
3,3,Alex,Not Specified,USA,Unknown,47000.0
9,9,Carlos Ruiz,Not Specified,Mexico,Male,45000.0


In [33]:
# forward fill  - .ffill() - fills each missing value with the previous non-missing value in the same column
# backward fill - .bfill() - fills each missing value with the next non-missing value in the same column


In [36]:
cleaned_data
# Let's try this on the name, country, gender and income columns

Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,32.75,Canada,Female,62000.0
3,3,Alex,32.75,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,
5,5,Li Wei,27.0,China,Male,51000.0
6,6,,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,32.75,Mexico,Male,45000.0


In [45]:
cleaned_data.ffill()             # forward fill: each missing value takes the last non-missing value above it; returns a new frame

Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,32.75,Canada,Female,62000.0
3,3,Alex,32.75,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,47000.0
5,5,Li Wei,27.0,China,Male,51000.0
6,6,Li Wei,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,India,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,32.75,Mexico,Male,45000.0


In [46]:
df.bfill()             # backward fill: each missing value takes the next non-missing value below it; returns a new frame
# NOTE: this runs on df (not cleaned_data), so the age column keeps its "Not Specified" placeholders.

Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,Not Specified,Canada,Female,62000.0
3,3,Alex,Not Specified,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,51000.0
5,5,Li Wei,27.0,China,Male,51000.0
6,6,Ahmed Khan,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,USA,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,Not Specified,Mexico,Male,45000.0
