## Cleaning and Preparing Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt

from pandas import DataFrame, Series

#### Using NaN

In [12]:
str_data = pd.Series(['aa','ar',np.nan,'av']) # converted a list to a series
str_data[1] = None
str_data

0      aa
1    None
2     NaN
3      av
dtype: object

In [13]:
clean_data = str_data.dropna()
clean_data

0    aa
3    av
dtype: object

In [15]:
from numpy import nan as NA
data = pd.DataFrame([[1., 6.5, 3., NA],[1., NA, 3.5, NA], [NA, NA, NA, NA],[NA, 6.5, 3., NA]])
data

Unnamed: 0,0,1,2,3
0,1.0,6.5,3.0,
1,1.0,,3.5,
2,,,,
3,,6.5,3.0,


In [16]:
data.dropna() # drop all rows where ANY member has NaN

Unnamed: 0,0,1,2,3


In [18]:
data.dropna(how='all') # drop all rows where ALL members have NaN
data.dropna(how='all', axis=1) # drop all columns where ALL members have NaN, default is axis=0 for rows

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,3.5
2,,,
3,,6.5,3.0


In [19]:
data.dropna(thresh=2) # threshold for dropping rows where at least 2 values are not NaN

Unnamed: 0,0,1,2,3
0,1.0,6.5,3.0,
1,1.0,,3.5,
3,,6.5,3.0,


#### Replace data

In [31]:
df = pd.DataFrame(np.random.randn(7,3))
df.iloc[3:6, 1] = NA # start at index location 3 until iloc 4 in column 1
df.iloc[:2, 2] = NA # start at index location 0 until iloc 2 in column 2
#df.dropna(thresh=2)
#df.fillna(0, inplace=True) # True persists the data, default is False ie non-persistent
#df.fillna({1:0.5, 2:0.8}) # fill col 1 with 0.5 and col 2 with 0.8
df

Unnamed: 0,0,1,2
0,-0.182692,-1.368081,
1,0.688198,-0.852702,
2,-1.011141,0.606293,-1.532373
3,-0.663864,,0.610684
4,0.031719,,-1.479221
5,0.476536,,0.59882
6,-0.567729,-0.697346,-0.999434


In [34]:
df = pd.DataFrame(np.random.randn(7,3))
df.iloc[2:, 1] = NA # start at index location 3 until iloc 4 in column 1
df.iloc[4:, 2] = NA # start at index location 0 until iloc 2 in column 2
df.fillna(method='ffill', limit=2)
#df

Unnamed: 0,0,1,2
0,2.515702,-1.031059,1.190316
1,0.364237,0.702375,0.44643
2,-0.768559,0.702375,-2.771422
3,-0.337888,0.702375,-0.536931
4,-0.252854,,-0.536931
5,-0.600841,,-0.536931
6,-0.112868,,


In [35]:
# remove duplicates
data = pd.DataFrame({'k1':['one','two']*3 +['two'],
                     'k2':[1,1,2,3,3,4,4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [36]:
data.duplicated() # spot dupes

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [39]:
data.drop_duplicates() # drop dupe rows, non-persistent
data.drop_duplicates(keep='last') # drop dupe rows, non-persistent, but keep the last found dupe

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
6,two,4


In [43]:
data.replace (1, 100) # replae all 1s with 100, non-persistent
#data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


#### Discrete data and bins

In [47]:
ages = [20,22,25,27,21,23,37,31,61,45,41,42]
bins = [18, 25, 35, 60, 100]
categories = pd.cut(ages, bins)
categories # output will be bins with notation (])
categories.codes # code bin by index 0,3
categories.categories # whcih bins
pd.value_counts(categories)

(18, 25]     5
(35, 60]     4
(25, 35]     2
(60, 100]    1
dtype: int64

In [51]:
pd.cut(ages, [18, 22, 36,61,100], right=False)
grp_names = ['Teen','Youth,','Adult','Senior']
pd.cut(ages, bins, labels=grp_names)

['Teen', 'Teen', 'Teen', 'Youth,', 'Teen', ..., 'Youth,', 'Senior', 'Adult', 'Adult', 'Adult']
Length: 12
Categories (4, object): ['Teen' < 'Youth,' < 'Adult' < 'Senior']

#### Grouping Data

In [55]:
data.groupby('k1').count() # select count(8) from data group by 'k1'

Unnamed: 0_level_0,k2
k1,Unnamed: 1_level_1
one,3
two,4


In [56]:
data.groupby('k1').sum()

Unnamed: 0_level_0,k2
k1,Unnamed: 1_level_1
one,6
two,12
