# Dealing with unclean data

We're going to look at data that may require some cleansing.

In [3]:
import numpy as np
import pandas as pd

## Read the admissions data that is not so clean

In [4]:
# Public S3 copy of the college-admissions sample that still contains dirty values.
data_location = 'https://elephantscale-public.s3.amazonaws.com/data/college-admissions/admission-data-dirty.csv'

# Load the CSV, report its element count (.size = rows x columns), then display it.
admissions = pd.read_csv(filepath_or_buffer=data_location)
print("admissions size : ", admissions.size)
admissions

admissions size :  80


Unnamed: 0,admit,gre,gpa,rank
0,1.0,400.0,3.23,4
1,1.0,700.0,3.56,1
2,1.0,800.0,4.0,2
3,0.0,500.0,3.53,4
4,0.0,560.0,3.78,2
5,0.0,,3.35,
6,1.0,520.0,,3
7,0.0,440.0,3.17,2
8,1.0,760.0,3.0,2
9,,600.0,2.82,4


## Get Summary
See what we get. `describe` skips null values, so the `count` row can differ from column to column.

In [5]:
## TODO : use 'describe' functions
# include=None is the default: only the numeric columns are summarised.
admissions.describe(include=None)

Unnamed: 0,admit,gre,gpa
count,18.0,19.0,19.0
mean,0.5,594.736842,3.499474
std,0.514496,109.309368,0.353467
min,0.0,400.0,2.82
25%,0.0,510.0,3.25
50%,0.5,600.0,3.56
75%,1.0,690.0,3.715
max,1.0,800.0,4.0


In [6]:
## TODO : make describe include all columns
# include='all' adds the non-numeric columns, reported with unique / top / freq.
admissions.describe(include='all')

Unnamed: 0,admit,gre,gpa,rank
count,18.0,19.0,19.0,19.0
unique,,,,5.0
top,,,,2.0
freq,,,,10.0
mean,0.5,594.736842,3.499474,
std,0.514496,109.309368,0.353467,
min,0.0,400.0,2.82,
25%,0.0,510.0,3.25,
50%,0.5,600.0,3.56,
75%,1.0,690.0,3.715,


In [7]:
## TODO : Describe more than one column : gre and gpa
## Hint : add 'gpa' column
# Select every row of the two columns by label, then summarise just those.
admissions.loc[:, ['gre', 'gpa']].describe()

Unnamed: 0,gre,gpa
count,19.0,19.0
mean,594.736842,3.499474
std,109.309368,0.353467
min,400.0,2.82
25%,510.0,3.25
50%,600.0,3.56
75%,690.0,3.715
max,800.0,4.0


## Drop all null values

In [9]:
# .size counts every cell (rows x columns), so it shrinks by a full row per dropped record.
print("raw data size : ", admissions.size)

## TODO : use 'dropna' function
# Explicit defaults: drop a row (axis=0) when any of its values is missing.
dropped_na = admissions.dropna(axis=0, how='any')
print()
print("after drop size : ", dropped_na.size)
dropped_na


raw data size :  80

after drop size :  64


Unnamed: 0,admit,gre,gpa,rank
0,1.0,400.0,3.23,4
1,1.0,700.0,3.56,1
2,1.0,800.0,4.0,2
3,0.0,500.0,3.53,4
4,0.0,560.0,3.78,2
7,0.0,440.0,3.17,2
8,1.0,760.0,3.0,2
10,1.0,500.0,3.6,3
11,0.0,500.0,3.95,4
13,1.0,560.0,3.59,2


In [10]:
# Only rows missing 'admit' or 'gre' are dropped; nulls in other columns are kept.
print("raw data size : ", admissions.size)
print()

dropped2 = admissions.dropna(axis='index', subset=['admit', 'gre'])
print("after drop size : ", dropped2.size)
dropped2

raw data size :  80

after drop size :  68


Unnamed: 0,admit,gre,gpa,rank
0,1.0,400.0,3.23,4
1,1.0,700.0,3.56,1
2,1.0,800.0,4.0,2
3,0.0,500.0,3.53,4
4,0.0,560.0,3.78,2
6,1.0,520.0,,3
7,0.0,440.0,3.17,2
8,1.0,760.0,3.0,2
10,1.0,500.0,3.6,3
11,0.0,500.0,3.95,4


## Fill in the values

In [11]:
## TODO :  fill every thing with zero
## Hint : use 'fillna'
# A scalar value is applied to every missing cell, whatever the column.
zero_fill = admissions.fillna(value=0)
zero_fill

Unnamed: 0,admit,gre,gpa,rank
0,1.0,400.0,3.23,4
1,1.0,700.0,3.56,1
2,1.0,800.0,4.0,2
3,0.0,500.0,3.53,4
4,0.0,560.0,3.78,2
5,0.0,0.0,3.35,0
6,1.0,520.0,0.0,3
7,0.0,440.0,3.17,2
8,1.0,760.0,3.0,2
9,0.0,600.0,2.82,4


In [None]:
# or we can specify per column default value
## TODO : specify different default values per column
##        default value for gre = -100
# The dict maps each column name to the value used where that column is missing.
fill2 = admissions.fillna({'admit': -1, 'gre': -100, 'gpa': -1, 'rank': 10})
fill2

## Replace values

In [None]:
print (admissions)

# Deep copy so the replacement below never touches the original frame.
admissions2 = admissions.copy(deep=True)

## TODO : use replace to change 800 to 1000
## Hint : replace (800, 1000)
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that chained in-place form warns on recent pandas and has
# no effect on the frame under Copy-on-Write.
admissions2['gre'] = admissions2['gre'].replace(800, 1000)

print()
print (admissions2)

## Clean out RANK column

In [None]:
## TODO : filter out any thing other than 1,2,3,4  in rank column
# Ranks are compared as strings -- presumably the column is read as text because
# of its dirty entries (TODO confirm dtype). Rows with any other value are left out.
a = admissions.loc[admissions['rank'].isin(['1', '2', '3', '4'])]
a