### Notes and scratch pad from Python for Data Analysis, 2nd Edition, Wes McKinney

#### Chapter 2

In [None]:
%matplotlib inline  # magic function that integrates matplotlib with jupyter

import matplotlib.pyplot as plt
import numpy as np

plt.plot(5,1)

In [None]:
def append_element(some_list, element):
    """Add ``element`` to the end of ``some_list``, mutating the caller's list."""
    some_list.append(element)


# the function receives a reference, so `data` itself grows by one item
data = list(range(1, 5))
append_element(data, 5)
data

In [None]:
a, b = 3.5, 2
# a tuple of types means "is it an instance of any of these?"
isinstance(a, (int, float))

In [None]:
def isiterable(obj):
    """Return True if ``iter(obj)`` succeeds, False if it raises TypeError."""
    try:
        iter(obj)
    except TypeError:
        return False
    return True

# sample input so this cell runs on a fresh kernel: a tuple is iterable but is not a list
x = (1, 2, 3)

# if x is not a list, but is iterable, convert it to a list
if not isinstance(x, list) and isiterable(x):
    x = list(x)

In [None]:
a = [1,2,3]
b = a   # reference to same object
a is b  # are they the same object?

In [None]:
c = list(a)  # builds a new list object holding the same elements (a shallow copy)
c is a       # are they the same object?

In [None]:
c == a      # are their contents equal?

In [None]:
# positional fields: {0} -> float with 2 decimal places,
# {1} -> string, {2} -> integer
template = '{0:.2f} {1:s} are worth US ${2:d}'
template.format(67.123, 'Canadian dollars', 30)

In [None]:
x = 5
'non-negative' if x >=0 else 'negative'

### Chapter 3, data structures: tuples
Tuples and lists are semantically similar. Tuples use parentheses and lists use square brackets. The difference is that tuples are immutable: the objects a tuple holds may themselves be mutable (and so can be changed in place), but which objects the tuple refers to cannot change. As a result, tuples are simpler to use and offer far fewer methods than lists.

In [None]:
tup = (1,2,3) # this is a tuple
tup2 = tuple([1,2,3])  # this converts the list (or iterator) into a tuple
tup is tup2

In [None]:
tup == tup2

In [None]:
values = (1, 2, 3, 4, 5)
# star-unpacking: the first two items get names, the remainder is collected into a list
# (the conventional name for a remainder you intend to ignore is _ as in a,b,*_ = values)
a, b, *everything_else = values
a

In [None]:
b

In [None]:
everything_else

In [None]:
replist = (1,2,3,1,2,1,0)
replist.count(1)

In [None]:
replist.count(2)

### Lists
Lists are often used to materialize iterators or generators

In [None]:
a = range(5)
list(a)
#tuple(a)

In [None]:
b = list(a)
b.append(5)
b

In [None]:
b.pop(3)

In [None]:
b

In [None]:
b.insert(3,'red balloon')
b

In [None]:
2 in b  # this is more expensive in lists than in dicts (linear scan vs. hash lookup)

In [None]:
2 not in b

In [None]:
b.append(2)
b

In [None]:
b.remove(2)  # removes the first instance of the value 2
b

In [None]:
b + b  # this concatenates a copy of b onto b

In [None]:
b.extend(b)  # this also concatenates a copy of b onto b, but is more efficient. use this.
b

In [None]:
c = ['abc','bc','c']
c.sort()  # sorts in place using the default ordering; here, alphabetically
c

In [None]:
c.sort(key=len)  # sorts by a different function; here, length
c

In [None]:
import bisect

# enumerate pairs every item with its position; here it builds a letter -> index lookup
tracy = ['t','r','a','c','y']
mapping = {value: i for i, value in enumerate(tracy)}
mapping['a'] # where did the value 'a' appear in the list?

sorted(tracy)

# sorted(list) hands back a sorted copy and leaves the original alone
stracy = sorted(tracy)
bisect.bisect(stracy, 'd')  # position at which 'd' would keep the list sorted
stracy
bisect.insort(stracy, 's')  # actually insert 's' at its sorted position
stracy

In [None]:
tracy = list('tracy')
frank = list('frank')
# zip is lazy, so wrap it in list() to materialize the pairs
trank = list(zip(tracy, frank))
trank

In [None]:
me,him = zip(*trank)   # let's unzip that list and pull all the first elements back out
me
him

In [None]:
# two great tastes that taste great together (think data formatting)
for i,(a,b) in enumerate(zip(tracy,frank)):
    print('{0}: {1}, {2}'.format(i,a,b))

### dicts - hashtables

In [None]:
empty_dict = {}     # you can make an empty dict to populate later
d1 = {1:'a',2:'b'}  # you can create one with curly braces and colons
d1[7]='c'           # you can assign a new value to a new or existing key
del d1[7]           # remove by key
d1.pop(1)           # another way to remove by key, and return value
d2 = {4:'t',6:'r'}
d1.update(d2)       # appending one dict's entries to another dict
d1.update({4:'s'})  # but watch out because repeated keys will get overwritten
d1

tracy = ('t','r','a','c','y')
frank = ('f','r','a','n','k')
trank = zip(tracy,frank)
list(trank)

# one way to create a dict from two lists
mapping = {}
for key,value in zip(tracy,frank):
    mapping[key] = value

# another way
mapping = dict(zip(tracy,frank))   # you can use the dict() function with a list of 2-tuples
mapping

# return a value or a default value
val = mapping.get('q','z')  # get the value associated with the key 'q', or return 'z'
val

# creating a dict from a default rule
# ie, creating a dict of words where the key is the first letter and the value is the
# list of words

# 3 ways - the Python long way that works
words = ('a','as','apple','bee','bat')
by_letters = {}
for word in words:
    letter = word[0]
    if letter not in by_letters:
        by_letters[letter] = [word]
    else:
        by_letters[letter].append(word)

# another way - using the setdefault dict method
by_letters = {}   # start from an empty dict again, otherwise every word is appended a second time
for word in words:
    letter = word[0]
    by_letters.setdefault(letter,[]).append(word)

# one more way using a class from the collections module
from collections import defaultdict
by_letters = defaultdict(list)   # a missing key is created on first access as an empty list
for word in words:
    by_letters[word[0]].append(word)
by_letters

# keys must be hashable (immutable), so if you have a list, convert it to a tuple first
#tracy = {['t','r','a','c','y']: 'tracy'} # doesn't work! a list cannot be a dict key
tracy = {tuple(['t','r','a','c','y']): 'tracy'}
tracy



### Sets and set functions

Sets are unordered collections of unique values, written as a comma-separated list inside curly braces

In [None]:
set1 = {1,2,2,2,3}   # repeated values collapse, leaving {1, 2, 3}
set1
set2 = {3,4,4,4,5}
set2
set1.intersection(set2)          # operator form: set1 & set2
set1.union(set2)                 # operator form: set1 | set2
set1.symmetric_difference(set2)  # operator form: set1 ^ set2
set1.difference(set2)            # operator form: set1 - set2
set1.add(4)      # mutates set1 in place
set1.remove(2)   # mutates set1 in place; raises KeyError if the value is absent

### Comprehensions - filtering and defining lists, sets, dicts in one line

In [None]:
# example: convert words to uppercase if they are longer than 2 letters
# this is a LIST COMPREHENSION because it makes a list
words = {'a','as','apple','bee','bat'}
lc = [word.upper() for word in words if len(word)>2]
lc

# let's make a dict comprehension
# note this doesn't make a list of words as a value: a later word with the same
# first letter overwrites the earlier one
dc = {word[0] : word for word in words if len(word)>2}
dc

# set comprehension is like a list comprehension but creates a set
words = ['a','as','apple','apple','bee','bat']
lc = [word.upper() for word in words if len(word)>2]
lc  # this gives you duplicates of APPLE because it's a list
sc = {word.upper() for word in words if len(word)>2}
sc  # this gives you one APPLE because it's a set

# we can go nuts with it...
# two lists of names, together in a list
names1 = ['tracy','frank']
names2 = ['jonas','trey','kit']
family = [names1,names2]
family
# let's make a list of all names with 'r' in them
r_names = []
for groups in family:
    r_people = [name for name in groups if name.count('r')>=1]
    r_names.extend(r_people)
r_names

# a crazy one-liner version!
# this says...
# for groups in family, for name in groups, if name has an 'r', then it's part of the list

r_names = [name for groups in family for name in groups if name.count('r')>=1 ]
r_names

short_names = [name for groups in family for name in groups if len(name)<5]
short_names

# you can use this idea to "flatten" a list of lists or tuples, eg
# (the loop variable is called tup so the builtin name tuple is not shadowed)
some_tuples = [(1,2,3),(4,5,6),(7,8,9)]
flattened = [x for tup in some_tuples for x in tup]
flattened
# just think of it as taking the expanded form of nested for and if statements,
# and collapsing them onto a single line

# you can also use a comprehension inside a comprehension, eg
nested = [[x for x in tup] for tup in some_tuples]
nested
# this produces a list of lists (one inner list per tuple) rather than the single
# flattened list from before

### functions as objects
Treating functions as objects means you can use them as arguments in a couple handy ways

In [None]:
# goal: run a pipeline of cleaning functions over a list of strings (think
# messy user input)
import re

clean_op1 = str.strip # trims leading and trailing whitespace
clean_op2 = str.title # capitalizes the first letter of each word and lowercases the rest


def remove_punctuation(value):
    """Return ``value`` with every '!', '#' and '?' character deleted."""
    stripped = re.sub('[!#?]','',value)
    return stripped


clean_op3 = remove_punctuation
clean_ops = [clean_op1, clean_op2, clean_op3]


def clean_strings(strings,ops):
    """Pass each string through every function in ``ops``, in order.

    Returns a new list holding the cleaned strings; ``strings`` is not modified.
    """
    cleaned = []
    for text in strings:
        for op in ops:
            text = op(text)
        cleaned.append(text)
    return cleaned


sample_strings = ['tracy?','  frank!   ','TREY']
clean_strings(sample_strings,clean_ops)


### Lambda (anonymous) functions

In [None]:
def short_function(x):
    """Return ``x`` multiplied by 2."""
    doubled = x*2
    return doubled

# the same function, written anonymously
equiv_lambda = lambda x: x*2

# lambdas are handy for handing a small function to another function,
# for example as a custom sort key

# custom sort key - order strings by how many distinct letters they contain,
# i.e. by the size of the set of their characters
words = ['aaaa','tracy','abc','bbaabbaabbaa']
words.sort(key=lambda w: len(set(w)))
words

### Generators produce a sequence lazily, one element at a time

In [None]:
def squares(n=10):
    """Generator over the squares 1**2 .. n**2; the banner prints when iteration begins."""
    print('Generating squares:')
    yield from (i**2 for i in range(1, n+1))

# calling squares() only builds the generator object; none of its body has run yet
gen = squares()

# the body starts executing once elements are requested, e.g. by a for loop:
for n in gen:
    print(n, end=' ')


In [None]:
# a generator expression is the lazy counterpart of a list comprehension:
gen = (x**2 for x in range(10))
for y in gen:
    print(y,' ')

# generator expressions can be passed straight to functions that consume an iterable
sumsquares = sum(i*i for i in range(10))
sumsquares
dict((i, i*i) for i in range(1,11))  # dict() consumes the generated 2-tuples as key/value pairs

### Exception Handling

In [None]:
# catch exceptions and/or do stuff regardless
try:
    #the stuff you are going to try
    pass
except:
    #a bare except catches every exception, including KeyboardInterrupt and SystemExit,
    #so in real code name the types you expect: one type as in except ValueError,
    #or a tuple of types as in except (Type1, Type2)
    pass
else:
    #runs only when the try block raised no exception
    pass
finally:
    #runs regardless of what happened above
    #like closing a file if you've opened it
    pass

### Files in Python

In [None]:
# easy to open files, default is read-only
# use the 'with' statement to ensure the file is closed after being used
path = '/dir/to/file.txt'  # placeholder - point this at a real file before running the cell
with open(path) as file:
    # strip trailing whitespace (including the newline) from each line as it is read
    lines = [line.rstrip() for line in file]
# this gives you a list of the lines in the file, with newline removed, and closes the file

In [None]:
# open(path) - opens file read-only
# open(path,'w') - creates a new file for writing at that path
# open(path,'x') - creates a new file for writing at that path unless it already exists
# open(path,'a') - appends to the file
# open(path,'r+') - opens for read/write
# file = open(path)
# file.read(24) - reads 24 characters or bytes if opened with 'rb'
# file.tell() - tells you which byte the file handle is sitting at
# file.seek(3) - moves to the 3rd byte in the file
# file.close()

# use 'write' and 'writelines' to write to a file, eg
# with open(outpath,'w') as outfile:
#     outfile.writelines(line for line in open(infile) if len(line)>1)

### Chapter 4 - NumPy

In [None]:
import numpy as np
data = np.random.randn(2,3)
data * 100
data.shape
data.dtype

In [None]:
# np.array turns any sequence-like object into an array
mylist = [1, 2, 4, 5, 3]
myarray = np.array(mylist)

# nested lists of equal length become a multidimensional array
mylist2 = [[1, 2, 3], [4, 5, 6]]
myarray2 = np.array(mylist2)
myarray2.shape # (rows, columns)
myarray2.ndim  # number of dimensions

# constructors for pre-filled arrays
np.ones(10)
np.ones((2, 3))
np.ones_like(myarray2)   # same shape and dtype as the argument
np.zeros(4)
np.zeros_like(myarray)
np.empty((4, 3))         # allocated but not initialized
count_to_100 = np.arange(100)   # 0 .. 99

# numbers stored as strings can be converted with astype:
numstring = ['1.2', '3.4', '5']
numarray = np.array(numstring)
numarray.dtype
floatarray = numarray.astype(float)
floatarray.dtype

In [None]:
# indexing and slicing
# unlike a Python list slice, a NumPy slice is a view onto the original data,
# so writing through the slice changes the original array:
arr = np.array([1, 2, 3, 4, 5, 6, 7])
arr_slice = arr[2:4]
arr_slice[:] = 1
arr

# several dimensions can be indexed at once
# note: for element access, [a][b] and [a,b] reach the same element
arr2dim = np.array([[1, 2, 3], [4, 5, 6]])
arr2dim[1][2] == arr2dim[1, 2]

In [None]:
# boolean arrays can be used as an index
names = np.array(['tracy','frank','trey','jonas','kit','tracy'])
names == 'tracy'  # element-wise comparison -> boolean array, one flag per name
data = np.random.randn(6, 5)
data[names == 'tracy']   # keeps the rows whose flag is True
# same as
cond = names == 'tracy'
data[cond]

data[data < 0] = 0            # a boolean array built from the data itself
data[names == 'tracy', 2:]    # boolean row selection combined with a column slice
data[names != 'tracy'] = 3    # negated condition
# note you cannot use 'and' or 'or' in boolean arrays, you have to use & or |
data[(names == 'tracy') | (names == 'frank')] = 5
data

In [None]:
# fancy indexing
# pass a list of indices to pick rows (or columns) in exactly the order listed
# arr[1,2] means give me the element at arr[1][2]
# but arr[[1,2]] means give me the rows 1 and 2
# example: a 4x4 array whose row r is filled with the value r
arr44 = np.empty((4, 4))
for row in range(4):
    arr44[row] = row
arr44
# the single element at row 2, column 1:
arr44[2, 1]
# or
arr44[2][1]
# rows 2 and 1, in that order:
arr44[[2, 1]]
# two index lists are paired up element by element, eg
# this picks the elements at (row 2, col 0) and (row 1, col 2):
arr44[[2, 1], [0, 2]]
# an index list can be combined with a slice
arr44[[2, 1], :2]

In [None]:
# reshape & transpose
# each array has a .T which is its transpose
arr = np.random.randn(2,5)
arr.shape
arr.T.shape

# .T is swapping axes. there is another function 'swapaxes' you can use to complicate things
arr.swapaxes(0,1) # gives you the .T function back

# you can swapaxes WHILE doing a transpose with the .transpose function
arr3 = np.random.randn(2,3,4) #gives you a 2x3x4
arr3
arr3.transpose(1,0,2) # gives you a 3x2x4

### Universal Functions (ufuncs)

In [None]:
# optimized functions that work element-wise over an array
import numpy as np
arr = np.random.randn(2,4)
np.sqrt(arr)   # negative entries come back as nan (with a RuntimeWarning)
np.exp(arr)
arr2 = np.random.randn(2,4)
np.maximum(arr,arr2)
remainder,whole = np.modf(arr)   # fractional and integral parts, as two arrays
# you can add an output argument to enable the function to modify the array in place
# (np.float was only an alias for the builtin float and was removed in NumPy 1.24)
tracy1 = np.array([1,2,3,4,5],float)
np.sqrt(tracy1,out=tracy1)
tracy1
# other UNARY (operating on one array) functions include:
# abs, fabs, square, log, log10, log2, log1p, sign, ceil, floor,
# rint - round to the nearest int, isnan, isfinite/isinf, cos etc.,
# arccos etc, logical_not

# other BINARY (operating on multiple arrays) functions include:
# add, subtract, multiply, divide, floor_divide, power, maximum, fmax, minimum, fmin
# mod, copysign, greater, greater_equal, less, less_equal, equal, not_equal,
# logical_and, logical_or, logical_xor

In [None]:
# meshgrid takes two 1D arrays and returns two 2D arrays (here x and y) that together
# enumerate every pairing of an element of the first with an element of the second
list1 = [1,2,3,4,5]
list2 = [3,3,3,3,3]
x,y = np.meshgrid(list1, list2)
z = np.sqrt(x**2+y**2)  # efficient way to evaluate a function over the whole grid at once
z

import matplotlib.pyplot as plt
%matplotlib inline
plt.imshow(z,cmap=plt.cm.gray); plt.colorbar()
plt.title("Image plot of metric")


In [None]:
# np.where is a vectorized conditional
# equivalent to "if condition, then x, else y", evaluated element by element

arr = np.random.randn(3,3)
condition = arr > 0
np.where(condition,arr,0)  # if positive, use the value, else assign 0
# gives the same values as arr[arr<0] = 0 would, except that np.where returns a
# new array and leaves arr itself unchanged

In [None]:
# arrays have some aggregator functions like min, max, mean, etc
# but so does NumPy itself. you can use either

arr = np.random.randn(5,6)
arr.mean() == np.mean(arr)
arr.max() == np.max(arr)
arr.mean(axis=0) == np.mean(arr,axis=0)  # element-wise comparison: one boolean per column

arr.cumsum() # cumulative sum over the flattened array
arr.cumsum(axis=1)  # cumulative sum along each row
arr.cumprod()

# sum, mean, std, var, min, max, argmin, argmax, cumsum, cumprod

# for boolean arrays, helpful functions include sum, any, and all
(arr>0).sum()  # how many values are greater than 0?
arr.any()  # on a float array, any non-zero value counts as True
arr.all()  # True only if no element is zero

# you can sort arrays in place
arr.sort()  # with no argument, sorts along the last axis
arr
arr.sort(axis=1)  # for this 2D array axis=1 is the last axis, so nothing changes here
arr

# as before, this overwrites the existing array. using np.sort(arr) would provide
# a copy

# np.unique(arr) will return the unique values, sorted
# equivalent to sorted(set(somelist))

# np.in1d(givenarray,testvalues) will return a boolean array stating whether, for each
# element of givenarray, its value is one of the testvalues
# (newer NumPy versions spell this np.isin)

In [None]:
# reading input and writing output with arrays

### Pandas - Chapter 5

NumPy is good for numerical data sets.
Use Pandas for mixed data sets and for data manipulation.

In [None]:
# pandas has two major data structures, Series and DataFrame
# let's look at Series first

import pandas as pd
from pandas import Series,DataFrame

obj = pd.Series([4,7,-5,3])
obj.index
obj.values

obj2 = pd.Series([4,7,-5,3],index=['d','b','a','c'])
obj2.index
obj2['a']
obj2*2
'c' in obj2  # checking the indices
-5 in obj2   

# you can create a Series from a dict
casabiancas = {'tracy':42,'frank':47,'trey':10,'jonas':8,'kit':6}
casabiancas['tracy']
obj3 = pd.Series(casabiancas)
obj3

kids_only = ['trey','jonas','kit','dog']   # you can specify a subset of your indices
                                           # when you create a Series from a dict
                                           # even if a given key didn't appear before
obj4 = pd.Series(casabiancas,index=kids_only)
obj4

pd.isnull(obj4)
pd.notnull(obj4)
#or, another way:
obj4.isnull()
obj4.notnull()

# kinda like a join...
obj5 = obj3+obj4
obj5  # will add values where they match, and set to NaN if they didn't match

# Series have names (for the values column) and index names, eg
obj5.name = 'age'
obj5.index.name = 'person'
obj5

# and you can override the index values if you'd like:
obj5.index = ['pet','dad','kid2','kid3','mom','kid1']
obj5

In [1]:
import pandas as pd
from pandas import Series, DataFrame

# now we'll look at DataFrames
# think of these as a dict of Series all sharing a common index
# it's basically a spreadsheet of columns which can have different types

# one common construction
family = {'name':['frank','tracy','trey','jonas','kit'],
          'age':[47,42,10,8,6],
          'job':['work','work','school','school','school']}
frame = pd.DataFrame(family)
frame.head(2)
# you can rearrange the columns
frame2 = pd.DataFrame(family,columns=['name','job','age'])
frame2
# you can change the index
frame3 = pd.DataFrame(family,index=['one','two','three','four','five'])
frame3
# you can add new columns; a column with no data in the dict ('gender') is filled with NaN
frame4 = pd.DataFrame(family,columns=['name','age','gender'])
frame4
# you can retrieve columns by [] or by .
frame4.age
frame4['age']
# you can retrieve rows by loc
frame4.loc[2]
# you can assign values to columns
# (attribute-style assignment only targets the column because 'gender' already exists;
# frame4['gender'] = 'M' is the form that also works for new columns)
frame4.gender = 'M'
frame4

# you can add a whole column, and then delete it
frame4['lies'] = frame4['age']<40
frame4
del frame4['lies']
frame4

frame4.values
frame4.columns.name = 'data'   # meta heading on the column headers
frame4.index.name = 'number'   # meta heading on the index
frame4.T

3 in frame4.index
'name' in frame4.columns

# what can you do with the index? it is immutable and set-like. you can:
# append, difference, intersection, union, isin, delete, drop, insert, is_monotonic,
# is_unique, unique

True

In [2]:
# manipulating and cleaning data in a Series or DataFrame
# you can reindex either one (the examples here start from a Series)
data = pd.Series(['tracy','frank','trey','jonas','kit'],index=[1,3,2,4,5])
data2 = data.reindex([1,2,3,4,5,6])
data2  # reorders the rows to conform to this index, using NaN for new values

data3 = pd.Series(['blue','purple','yellow'],index=[0,2,4])
data4 = data3.reindex(range(6),method='ffill')  # forward fills missing values, but
                                                # existing index must be monotonic first
data4
# you can drop rows (the default) or columns (specify axis=1)
data5 = data4.drop(2)  # defaults to dropping the row with index = 2
data5
data5 = pd.DataFrame(data4)  # rebinds data5: the dropped-row Series above is discarded
data5.index.name = 'i'
data5.columns.name = 'color'  # names the column axis (shown above the column labels);
                              # the single column itself is still labelled 0
data5['name'] = 'tracy'       # adding a column to drop
data5.drop('name',axis=1)     # dropping a column - need to specify axis=1
                              # drop returns a new frame; data5 itself keeps the column

color,0
i,Unnamed: 1_level_1
0,blue
1,blue
2,purple
3,purple
4,yellow
5,yellow


In [3]:
import numpy as np

# slicing with Pandas is a lot like slicing with NumPy
obj = pd.Series(np.arange(4.0),index=['a','b','c','d'])
obj[1:2]  # positional slice: end-exclusive, so only the entry at position 1

# except you can slice by the label value, too
# note slicing with pandas by label is INCLUSIVE:
obj['a':'c']

data = pd.DataFrame(np.arange(16).reshape(4,4), index=['Ohio','Maryland','Florida','Texas'],columns=['one','two','three','four'])
data
data[:2]  # row selection
data[data['three']>5]  # a boolean Series selects whole rows
data[data>5] = 8  # a boolean DataFrame selects individual cells; this overwrites data in place
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Maryland,4,5,8,8
Florida,8,8,8,8
Texas,8,8,8,8


In [9]:
# you can select from a DataFrame using index (iloc) or labels (loc)
# NOTE: this cell uses the `data` frame built in the previous cell
data.loc['Maryland',['two','three']]
data.iloc[1,[1,2]]
data.iloc[:,:3]
data.loc[:'Florida',:]
# you can index this result, too:
data.loc[:'Florida',:][data<=5]

data2 = pd.DataFrame(np.arange(12).reshape(3,4),index=['Ohio','Colorado','Florida'],columns=['one','two','four','five'])
data2+data  # notice what happens when you add
            # where there is data in both, it adds
            # where one set is missing data, there is no result

# but you can also specify a fill value
data3 = data2.add(data,fill_value=0)
data3

# you can add, subtract, divide, multiply, floor_divide, etc
# 1 / data is equivalent to data.rdiv(1)

# you can add a Series to a DataFrame. it will add the Series to each of the rows,
# adding entries from the column in the DataFrame with a label that matches the label
# in the Series. where there aren't matching labels, NaN will be returned.

# to add the Series to each of the columns instead where the row labels match the
# labels in the Series, you have to use frame.add(series, axis='index')

# you can use NumPy's ufuncs (universal functions / element-wise array methods) on
# Pandas structures like np.abs(data)

# you can apply functions to Pandas structure with apply: for a defined function f, you
# can data.apply(f). eg
f = lambda x: x.max() - x.min()
data4 = pd.DataFrame(np.random.randn(4,3))
data4.apply(f)
# notice the function was applied to the columns, and a Series of results was returned
# you can apply it to the rows instead using data.apply(f,axis='columns')

def func(x):
    """Return the min and max of ``x`` as a two-entry Series labelled 'min' and 'max'."""
    return pd.Series([x.min(),x.max()],index=['min','max'])

data4.apply(func)  # notice you get a DataFrame back: one 'min' row and one 'max' row,
                   # with one column per original column

# you can use map for Series and applymap for DataFrames to apply a function to
# each element (rather than each row or column). eg to format a string, you could
# write a string formatter
# (named two_decimals so the builtin format() is not shadowed;
# pandas 2.1+ calls applymap DataFrame.map)
two_decimals = lambda x: '%.2f' % x
data4.applymap(two_decimals)

# you can sort by either index. for rows, data.sort_index(). for columns,
# data.sort_index(axis=1)
# default is ascending order. to change that:
# data.sort_index(axis=1, ascending=False)

# you can sort by values - series_data.sort_values()
# missing values are sorted to the end
# you can sort by values in a particular column - frame_data.sort_values(by='b')

# similarly, there is a rank function that lets you rank and assign a rank
# by different functions for tie-breakers, like average rank, min rank, max rank,
# and first appearance

# labels do not need to be unique in Pandas
# accessing data by a label that has a duplicate will return a Series of results

# summarizing and computing descriptive statistics:
data4.mean(axis='columns',skipna=False)  # statistics ignore NaN by default (skipna=True)
                                         # note by default, if the whole row or col is
                                         # NaN, the answer will be NaN even if skipna=True

# descriptive and summary stats include:
# count, describe, min, max, argmin, argmax, idxmin, idxmax, quantile, sum,
# mean, median, mad, prod, var, std, skew, kurt, cumsum, cummin, cummax, cumprod,
# diff, pct_change

# Series can do correlation and covariance, as data_series.corr() and data_series.cov()
# DataFrame can correlate with Series (looking for matching labels)
# default is to correlate DF columns with the Series, but of course you can axis=1

# you can data_series.unique() to get unique values; data_series.value_counts() to get
# counts of the unique values; check membership with data_series.isin(['val1','val2']) to
# return a Series of Booleans

# you can also get an index array that checks membership - look up Index.get_indexer
# for more information

# a clever way to make a histogram
data5 = pd.DataFrame({'Q1':[1,3,4,3,4],'Q2':[2,3,1,2,3],'Q3':[1,5,2,4,4]})
data5
# the top-level pd.value_counts is deprecated; the Series method gives the same counts
result = data5.apply(pd.Series.value_counts).fillna(0)
# this will return the value counts on each of the columns, filling missing data with 0
# essentially creating a histogram!
result

Unnamed: 0,Q1,Q2,Q3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


### Chapter 7: Data Cleaning and Preparation

In [1]:
# Pandas handles missing data as gracefully as possible. Series and DFs can:
# dropna - drop rows or columns with missing data; can set a tolerance for how much
# fillna - fills in a given value (scalar, per-column dict, Series of means, ...)
# ffill / bfill - fill by carrying the previous / next valid value along
# isnull - Booleans identifying missing data
# notnull - opposite of isnull

import pandas as pd
from pandas import Series,DataFrame
import numpy as np

tracy = pd.DataFrame(np.random.randn(4,3),index=['a','b','c','d'])
tracy2 = tracy[tracy>0]   # entries that fail the condition become NaN
tracy3 = tracy2.dropna()  # drops all rows with an NA
tracy4 = tracy2.dropna(how='all')   # drops rows with all NA
tracy5 = tracy2.dropna(axis=1)    # drops all columns with an NA
tracy6 = tracy2.dropna(axis=1,how='all')  # drops columns with all NA
tracy7 = tracy2.dropna(thresh=2)   # keeps only the rows with at least 2 non-NA values
#tracy7 = tracy2.fillna(0)
tracy2.fillna(0)   # creates a new version filled in, but if you want to memorialize it:
#tracy2.fillna(0,inplace=True)
tracy2.fillna({1:.1,2:.2})   # sets the fill value by column
tracy2.fillna(tracy2.mean())  # sets the fill value to the mean
# forward fill: fillna(method='ffill') is deprecated, .ffill() is the supported spelling
tracy2.ffill()
tracy2.ffill(limit=2)   # fill at most 2 consecutive NA values

Unnamed: 0,0,1,2
a,,0.681386,1.071944
b,,1.816869,1.071944
c,,1.816869,0.998188
d,,1.816869,0.998188


In [2]:
# data transformation

# identifying and removing duplicates
tracy = pd.DataFrame({'k1':['one','two']*3 + ['two'],'k2':[1,1,2,3,3,4,4]})
tracy[tracy.duplicated()]   # finds the duplicates
tracy.drop_duplicates()   # drops the duplicates (returns a new frame; tracy is unchanged)
tracy['v1'] = range(7)
tracy.drop_duplicates(['k1','k2'])  # drops duplicates in the specified columns
tracy.drop_duplicates(['k1'],keep='last')  # keeps the last one instead of the first one

# transforming using a map or function
# you can transform data using Python or NumPy functions
tracy['k3'] = tracy['k2']+tracy['v1']
# you can use .map with a function
tracy['v2'] = tracy['k1'].map(lambda x:x.upper())
# you can use .map with a map
tracy_map = {1:'a',2:'b',3:'f',4:'p'}
tracy['q'] = tracy['k2'].map(tracy_map)
tracy

# replacing data
tracy.replace(4,np.nan)   # returns a new frame; tracy is unchanged
tracy.replace([1,2,3,4],np.nan)

# you can also map or rename index and column labels
tracy.index = tracy.index.map(lambda x:x-1)   # this one assigns back, so tracy's index changes
tracy

# binning: you can create bins and count your data!
ages = [4, 10, 18, 6, 37, 20, 5, 50]
bins = [0,10,18,25,100]
ages_by_bin = pd.cut(ages,bins)   # bins are right-closed by default, e.g. (0, 10] includes 10
ages_by_bin
ages_by_bin.codes   # gives the code for which bin each age fell into
ages_by_bin.categories   # shows you the categories in order alluded to with .codes
ages_by_bin.value_counts()  # a handy histogram
group_names = ['child','teen','young_adult','adult']
ages_by_bin = pd.cut(ages,bins,labels=group_names)
ages_by_bin.value_counts()

ages_by_bin2 = pd.cut(ages,3)  # cut this data into 3 bins of equal width
# note this does not mean bins have same number of entries!!! unless uniformly distro
ages_by_bin2.value_counts()

ages_by_bin3 = pd.qcut(ages,3)  # cut this data into bins with roughly equal membership
ages_by_bin3.value_counts()

(3.999, 7.333]     3
(7.333, 19.333]    2
(19.333, 50.0]     3
dtype: int64

In [3]:
# handling outliers
data = pd.DataFrame(np.random.randn(1000,4))
data.describe()  # shows you mean, std, min, quantiles, max
col = data[2]
col[np.abs(col)>3]
# return all rows where the abs val of any entry in that row is greater than 3
# (axis must be passed by keyword: pandas 2.x no longer accepts it positionally)
data[(np.abs(data)>3).any(axis=1)]

# if you wanted to cap the data at +/-3, use something like this:
data[np.abs(data)>3] = np.sign(data)*3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.028605,-0.026657,0.04482,-0.008158
std,1.009663,0.986713,1.003974,0.987752
min,-3.0,-3.0,-3.0,-2.886351
25%,-0.672149,-0.692741,-0.609328,-0.656018
50%,0.013178,-0.066074,0.056123,-0.022131
75%,0.701586,0.657008,0.786417,0.664041
max,3.0,2.935015,2.677308,3.0


In [46]:
# permutation and random sampling
df = pd.DataFrame(np.arange(5*4).reshape(5,4))
sampler = np.random.permutation(5)  # NumPy provides a permutation function
sampler
df.take(sampler)          # takes a sample according to the sampler
df.iloc[sampler]          # does the same thing, a different way. notice square brackets.

df.sample(n=3)            # takes a random subset without replacement
df.sample(n=10,replace=True)  # takes a random subset with replacement


# one-hot encoding
# Pandas offers some built-in functions, but you can also roll-your-own

# built-in - use pd.get_dummies
df = pd.DataFrame({'key':['b','b','a','c','a','b'],'data1': range(6)})
# make a one-hot encoder for this data
dummies = pd.get_dummies(df['key'],prefix='val')
dummies
df2 = df[['data1']].join(dummies)  # joins the integer data with the one-hot data
df2

# roll-your-own
df3 = pd.DataFrame({'fam':['tracy','frank','trey','jonas','kit'],'age':[42,47,10,8,6],'colors':['purple|orange','orange|green','orange|green|red','blue|red|black','pink|purple|baby blue']})
# how many unique colors do we have?
colors = []
for x in df3['colors']:
    colors.extend(x.split('|'))
# pd.unique on a plain list is deprecated, so go through a Series;
# unique() keeps the values in order of first appearance
uniq_colors = pd.Series(colors).unique()
zeros = np.zeros((len(df3),len(uniq_colors)))  # a column for each color
dummies = pd.DataFrame(zeros,columns=uniq_colors)  # turn it into a dataframe
for i,x in enumerate(df3['colors']):
    indices = dummies.columns.get_indexer(x.split('|'))  # figure out which cols you want
    dummies.iloc[i,indices] = 1   # set those entries to 1
df4 = df3[['fam','age']].join(dummies)
df4


# you can combine get_dummies with cut to create one-hot vectors with bins
np.random.seed(12345)
values = np.random.rand(10)
bins = [0,0.2,0.4,0.6,0.8,1]
pd.get_dummies(pd.cut(values,bins))  # remember this guy?

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


In [14]:
# string operations
# Python has built-in string operations like .split and .strip
# also: count, endswith, startswith, join, index, find, rfind, replace,
# strip, rstrip, lstrip, split, lower, upper, casefold, ljust, rjust

# for more, using the regex library re
# (patterns are written as raw strings so \s reaches the regex engine untouched and
# Python does not warn about an invalid escape sequence)
import re
text = "Tracy is    the\tbest!\n"
text2 = text.split(r'\s+')   # str.split takes its argument literally: this looks for the characters \s+
text3 = re.split(r'\s+',text)   # this is actually splitting on whitespace

# if you are using a regex often, compile it once and reuse
regex = re.compile(r'\s+')
text4 = regex.split(text)   # same result but you can reuse this version

#regex can do:
# findall (including with groups), finditer, match, search, split, sub, subn
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'  # the r prefix makes a raw string: backslashes are kept as-is
regex = re.compile(pattern, flags=re.IGNORECASE)
text = """Frank frank@google.com
Trey trey@gmail.com
Jonas jonas@yahoo.com"""

regex.findall(text)

for x in regex.finditer(text):
    print(x)

regex.sub(r'User: \1, Domain: \2, Suffix: \3',text)  # substitution using groups

<re.Match object; span=(6, 22), match='frank@google.com'>
<re.Match object; span=(28, 42), match='trey@gmail.com'>
<re.Match object; span=(49, 64), match='jonas@yahoo.com'>


'Frank User: frank, Domain: google, Suffix: com\nTrey User: trey, Domain: gmail, Suffix: com\nJonas User: jonas, Domain: yahoo, Suffix: com'

In [22]:
# Pandas has added string functions that gracefully handle missing data
# use these by invoking a Series' .str attribute
import pandas as pd

data = {'Frank':'frank@google.com','Trey':'trey@gmail.com','Jonas':'jonas@yahoo.com'}
data = pd.Series(data)  # the name data now refers to the Series, not the dict
data.str.contains('gmail')
data.str.findall(regex)  # using the regex we built in the previous block (run that cell first)
# notice this returns, for each entry, a list that only has one element
# we can extract that one element like:
data.str.findall(regex).str[0]

# using .str, you can:
# cat, contains, count, extract, endswith, startswith, findall, get, isalnum
# isalpha, isdecimal, isdigit, islower, isnumeric, isupper, join, len, lower, upper,
# match, pad, center, repeat, replace, slice, split, strip, rstrip, lstrip

Frank    (frank, google, com)
Trey       (trey, gmail, com)
Jonas     (jonas, yahoo, com)
dtype: object