# Data Cleaning

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset from NaN.csv (path is relative to the notebook's working directory).
x = pd.read_csv("NaN.csv")

In [3]:
x

Unnamed: 0,name,english_marks,maths_marks,hindi_marks,science_marks,city,id
0,sahil,45.0,39.0,,13.0,gwalior,3455.0
1,nitin,,34.0,12.0,15.0,,1642.0
2,pankaj,39.0,46.0,3.0,,delhi,
3,neeraj,32.0,,12.0,15.0,agra,3415.0
4,vinay,40.0,35.0,,43.0,pune,1512.0
5,viraj,,42.0,15.0,13.0,,
6,vijay,13.0,35.0,,,mumbai,1515.0
7,kavya,33.0,,45.0,15.0,indore,5145.0
8,gagan,49.0,43.0,15.0,12.0,,1321.0
9,deepak,50.0,24.0,50.0,12.0,kanpur,1551.0


# Count the NaN values in each column

In [4]:
# Per-column count of missing entries: isna() marks each NaN cell as True,
# and sum() adds those flags up column by column.
nan_values = x.isna().sum()

In [5]:
nan_values

name             0
english_marks    2
maths_marks      2
hindi_marks      3
science_marks    2
city             3
id               2
dtype: int64

In [6]:
# Inspect the NaN counts for individual columns

In [7]:
# NaN counts for the first five columns, selected explicitly by position.
nan_values.iloc[:5]

name             0
english_marks    2
maths_marks      2
hindi_marks      3
science_marks    2
dtype: int64

In [8]:
# Use .iloc for positional access: nan_values is labelled by column name, and
# integer keys in plain [] are deprecated as positions (pandas >= 2.1).
nan_values.iloc[5]  # the column at position 5 (city) has 3 NaN values

3

# Find total missing values in data

In [9]:
# Add up the per-column NaN counts to get the number of missing cells in the whole frame.
total_missing = nan_values.sum()

In [10]:
total_missing

14

In [11]:
# There are 14 missing values in total.

# Find total cells in data

In [12]:
# Total number of cells = rows * columns.
# np.prod replaces np.product, which was deprecated in NumPy 1.25 and removed in NumPy 2.0.
total_cells = np.prod(x.shape)

In [13]:
total_cells

70

# Find percent of data that is missing

In [14]:
# Share of missing cells, expressed as a percentage of all cells in the frame.
percent_missing = total_missing / total_cells * 100

In [15]:
percent_missing

20.0

# Remove all the rows that contain a missing value

In [16]:
# Keep only the rows that have no NaN in any column; x itself is not modified.
drop_row = x.dropna(axis=0, how="any")

In [17]:
drop_row

Unnamed: 0,name,english_marks,maths_marks,hindi_marks,science_marks,city,id
9,deepak,50.0,24.0,50.0,12.0,kanpur,1551.0


In [18]:
# Only row 9 has no NaN values, so it is the only row left after dropna().

In [19]:
# Re-display the original frame: dropna() returned a new object, so x still holds its NaN values.
x

Unnamed: 0,name,english_marks,maths_marks,hindi_marks,science_marks,city,id
0,sahil,45.0,39.0,,13.0,gwalior,3455.0
1,nitin,,34.0,12.0,15.0,,1642.0
2,pankaj,39.0,46.0,3.0,,delhi,
3,neeraj,32.0,,12.0,15.0,agra,3415.0
4,vinay,40.0,35.0,,43.0,pune,1512.0
5,viraj,,42.0,15.0,13.0,,
6,vijay,13.0,35.0,,,mumbai,1515.0
7,kavya,33.0,,45.0,15.0,indore,5145.0
8,gagan,49.0,43.0,15.0,12.0,,1321.0
9,deepak,50.0,24.0,50.0,12.0,kanpur,1551.0


# Remove all the columns that contain a missing value

In [20]:
# Keep only the columns that have no NaN in any row; x itself is not modified.
drop_col = x.dropna(axis="columns", how="any")

In [21]:
drop_col

Unnamed: 0,name
0,sahil
1,nitin
2,pankaj
3,neeraj
4,vinay
5,viraj
6,vijay
7,kavya
8,gagan
9,deepak


In [22]:
# Only the column at position 0 (name) has no NaN values, so it is the only column kept.

In [23]:
# Report how many columns the untouched frame has.
print("column in orignal data set", len(x.columns))

column in orignal data set 7


In [24]:
# Report how many columns are left once every column containing a NaN is dropped.
print("column after drop NaN value", len(drop_col.columns))

column after drop NaN value 1


In [25]:
# Show the original frame again (still unchanged) as the reference for the fill examples that follow.
x

Unnamed: 0,name,english_marks,maths_marks,hindi_marks,science_marks,city,id
0,sahil,45.0,39.0,,13.0,gwalior,3455.0
1,nitin,,34.0,12.0,15.0,,1642.0
2,pankaj,39.0,46.0,3.0,,delhi,
3,neeraj,32.0,,12.0,15.0,agra,3415.0
4,vinay,40.0,35.0,,43.0,pune,1512.0
5,viraj,,42.0,15.0,13.0,,
6,vijay,13.0,35.0,,,mumbai,1515.0
7,kavya,33.0,,45.0,15.0,indore,5145.0
8,gagan,49.0,43.0,15.0,12.0,,1321.0
9,deepak,50.0,24.0,50.0,12.0,kanpur,1551.0


# Replace all NaN values with 0

In [26]:
# Substitute 0 for every NaN in every column (text columns such as city receive 0 as well).
replace_NaN_0 = x.fillna(value=0)

In [27]:
replace_NaN_0

Unnamed: 0,name,english_marks,maths_marks,hindi_marks,science_marks,city,id
0,sahil,45.0,39.0,0.0,13.0,gwalior,3455.0
1,nitin,0.0,34.0,12.0,15.0,0,1642.0
2,pankaj,39.0,46.0,3.0,0.0,delhi,0.0
3,neeraj,32.0,0.0,12.0,15.0,agra,3415.0
4,vinay,40.0,35.0,0.0,43.0,pune,1512.0
5,viraj,0.0,42.0,15.0,13.0,0,0.0
6,vijay,13.0,35.0,0.0,0.0,mumbai,1515.0
7,kavya,33.0,0.0,45.0,15.0,indore,5145.0
8,gagan,49.0,43.0,15.0,12.0,0,1321.0
9,deepak,50.0,24.0,50.0,12.0,kanpur,1551.0


# Replace all NaN values with the next valid value below (backward fill)

In [28]:
# Backward fill: each NaN takes the next valid value further down its column.
# DataFrame.bfill() replaces fillna(method="bfill"), whose `method` argument
# is deprecated since pandas 2.1.
replace_NaN_lower = x.bfill()

In [29]:
replace_NaN_lower

Unnamed: 0,name,english_marks,maths_marks,hindi_marks,science_marks,city,id
0,sahil,45.0,39.0,12.0,13.0,gwalior,3455.0
1,nitin,39.0,34.0,12.0,15.0,delhi,1642.0
2,pankaj,39.0,46.0,3.0,15.0,delhi,3415.0
3,neeraj,32.0,35.0,12.0,15.0,agra,3415.0
4,vinay,40.0,35.0,15.0,43.0,pune,1512.0
5,viraj,13.0,42.0,15.0,13.0,mumbai,1515.0
6,vijay,13.0,35.0,45.0,15.0,mumbai,1515.0
7,kavya,33.0,43.0,45.0,15.0,indore,5145.0
8,gagan,49.0,43.0,15.0,12.0,kanpur,1321.0
9,deepak,50.0,24.0,50.0,12.0,kanpur,1551.0


# Replace all NaN values with the previous valid value above (forward fill)

In [30]:
# Forward fill: each NaN takes the last valid value above it in its column.
# A NaN in the first row (hindi_marks here) has nothing above it and stays NaN.
# DataFrame.ffill() replaces fillna(method="ffill"), whose `method` argument
# is deprecated since pandas 2.1.
replace_NaN_upper = x.ffill()

In [31]:
replace_NaN_upper

Unnamed: 0,name,english_marks,maths_marks,hindi_marks,science_marks,city,id
0,sahil,45.0,39.0,,13.0,gwalior,3455.0
1,nitin,45.0,34.0,12.0,15.0,gwalior,1642.0
2,pankaj,39.0,46.0,3.0,15.0,delhi,1642.0
3,neeraj,32.0,46.0,12.0,15.0,agra,3415.0
4,vinay,40.0,35.0,12.0,43.0,pune,1512.0
5,viraj,40.0,42.0,15.0,13.0,pune,1512.0
6,vijay,13.0,35.0,15.0,13.0,mumbai,1515.0
7,kavya,33.0,35.0,45.0,15.0,indore,5145.0
8,gagan,49.0,43.0,15.0,12.0,indore,1321.0
9,deepak,50.0,24.0,50.0,12.0,kanpur,1551.0
