## Handling Missing Data 

In [15]:
import pandas as pd 
import numpy as np 
from numpy import nan as NA  ## To follow the R convention

In [37]:
# A Series of strings in which one entry (position 2) is missing.
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
pd.isnull(string_data)  # Boolean mask: True wherever the value is np.nan

## @note: pandas uses the floating-point NaN as the sentinel value that
## marks missing entries in its objects.

## Filtering out missing data
data = pd.Series([NA, 1, 2, NA, 4])
data.dropna()  # New Series holding only the non-missing entries

## The same filter written as a boolean mask
data[data.notnull()]  # equivalently: data[~data.isnull()]

## With DataFrames, dropna() by default drops every row that contains a missing value
data = pd.DataFrame([[1., 6.5, 3.], [2., NA, NA],
                     [NA, NA, NA], [NA, 2.5, 1.]])
data.dropna()  # Keeps only the first row, the only one without any NA
data.dropna(axis=1)  # Drops every column, since each column has at least one NA
data.dropna(how='all')  # Drops only rows that are entirely NA, i.e. the row at index 2

## Filling missing data
## Can use the same interpolation methods used for reindexing - or...
data.fillna(0)  # Same constant for every missing value
data.fillna({1: 1, 2: -1})  # With a dict, each key is a column label, giving a different fill value per column
# ffill() replaces fillna(method='ffill'): the `method` keyword of fillna is
# deprecated in recent pandas releases. Each NA takes the last valid value above it.
data.ffill()
data.fillna(data.mean(axis=0))  # Fills each NA with the mean of its column


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,2.0,4.5,2.0
2,1.5,4.5,2.0
3,1.5,2.5,1.0


## Data Transformations

In [112]:
## Removing duplicates
## duplicated() flags the rows that repeat an earlier row

data = pd.DataFrame({
    'k1': ['one', 'two'] * 3 + ['two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
data.duplicated()  # Boolean Series: True for every row identical to one already seen above it
data.drop_duplicates()  # New frame that keeps only the first occurrence of each row

## A column subset restricts the comparison to those columns
## instead of comparing across all columns
data.drop_duplicates(subset='k2')  # Keeps the first row seen for each distinct value of k2

## @note: pass keep='last' to retain the last occurrence of each duplicate instead of the first

## Transforming data using functions or mappings
# Lookup table from (lower-case) meat name to the animal it comes from
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])
data = pd.DataFrame({
    'food': ['bacon', 'pulled pork', 'bacon',
             'Pastrami', 'corned beef', 'Bacon',
             'pastrami', 'honey ham', 'nova lox'],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})

## Add a column with the mapped animal; food names are lower-cased first
## because every key of the lookup table is lower-case
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
## Alternatively, do both steps in one function. Each element handed to the
## function is a plain Python str, hence .lower() and not the Series-only .str.lower()
data['animal2'] = data['food'].map(lambda food: meat_to_animal[food.lower()])

## Replacing values, using replace()
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data.replace(to_replace=-999, value=NA)  # Every -999 becomes NA; returns a new Series

## @note: Several values can be replaced at once by passing a list
## (the substitute may itself be a list, one entry per value), or by
## passing a dict that maps each old value to its substitute.
data.replace([-999, 1], NA)  ## list example
data.replace({-999: NA, 1: -1000})  ## dict example

## Renaming axis indices (use rename() when the original object should not be modified)
data = pd.DataFrame(np.arange(12).reshape(3, 4),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
# Assigning to .index / .columns changes the labels of `data` itself
data.index = data.index.str[:4].str.upper()
data.columns = data.columns.str.upper()

# rename() returns a relabelled copy and leaves `data` untouched
data.rename(index=str.title, columns=str.title)
## @important: str.title here is the built-in str method passed as a plain function;
## rename() calls it on every label. It is unrelated to the .str accessor of a Series/Index.


## Discretization and binning
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
cuts = list(range(10, 60, 10)) + [np.max(ages)]  # Edges 10, 20, ..., 50, then the largest age
cats = pd.cut(ages, bins=cuts)

## An integer instead of explicit edges asks for that many equal-width bins
## spanning the range of the data
cats = pd.cut(ages, bins=4)

## Bins based on sample quantiles (qcut gives roughly equally populated bins):
data = np.random.randn(100)
cats = pd.qcut(data, q=4)
cats

## Detecting and filtering outliers
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()
col = data[2]
col[np.abs(col) > 3]  # Entries of column 2 whose magnitude exceeds 3

# The axis is passed by keyword: pandas >= 2.0 no longer accepts the positional form any(1)
data[(np.abs(data) > 3).any(axis=1)]  # Rows in which at least one column holds an outlier
## Cap the outliers at the threshold: np.sign gives -1/+1, so values beyond +/-3 become -3 or +3
data[np.abs(data) > 3] = np.sign(data) * 3


## Permutation and random sampling:
df = pd.DataFrame(np.arange(20).reshape(5, 4))
sampler = np.random.permutation(len(df))  # Random ordering of the 5 row positions
df.take(sampler)  # Rows reordered with take()
df.iloc[sampler]  # Same reordering through positional indexing

## Random sample of 3 rows, drawn with replacement
df.sample(n=3, replace=True)

## Computing dummy (indicator) variables
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
# The second positional argument of get_dummies is `prefix`, not the column to
# encode, so the column is named explicitly through `columns`.
# dtype=int yields 0/1 indicators on every pandas version (pandas >= 2.0 defaults to bool).
pd.get_dummies(df, columns=['key'], dtype=int)  # Encodes 'key' and replaces it with the indicator columns
df.join(pd.get_dummies(df['key'], prefix='key', dtype=int))  # Keeps 'key' and appends the indicators, so an explicit join is needed



Unnamed: 0,data1,key,key_a,key_b,key_c
0,0,b,0,1,0
1,1,b,0,1,0
2,2,a,1,0,0
3,3,c,0,0,1
4,4,a,1,0,0
5,5,b,0,1,0
