# This notebook is a practice on ["Python Data Science Handbook" (Jake VanderPlas)](https://jakevdp.github.io/PythonDataScienceHandbook/index.html)

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# **1 PYTHON DATA STRUCTURES**

# 1.1 List

In [2]:
# Build a list holding the integers 0..9.
l = [n for n in range(10)]
l

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [3]:
# Convert every integer of `l` into its string form.
l2 = list(map(str, l))
l2

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [5]:
# A Python list may mix element types freely; show the type of each element.
l3 = [True, "2", 3.0, 4]
list(map(type, l3))

[bool, str, float, int]

# 1.2 Array

In [6]:
import array
# Build a typed array from the list `l`; type code 'i' stands for signed integers.
a = array.array('i')
a.extend(l)
a

array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

# **2 NUMPY**

# 2.1 Basics of Array

# 2.1.1 Initialization

In [7]:
# Build a NumPy array from a Python list, forcing 32-bit floats.
np.array([1, 2, 3, 4], dtype=np.float32)

array([1., 2., 3., 4.], dtype=float32)

In [8]:
# Nested sequences become a 2-D array: each inner list is one row.
np.array([list(range(first, first + 3)) for first in (2, 4, 6)])

array([[2, 3, 4],
       [4, 5, 6],
       [6, 7, 8]])

In [9]:
# Length-10 integer array filled with zeros.
np.zeros(shape=10, dtype=int)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [10]:
# 3 rows x 5 columns of floating-point ones.
np.ones(shape=(3, 5), dtype=float)

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [11]:
# 3 rows x 5 columns, every element set to 3.14.
np.full(shape=(3, 5), fill_value=3.14)

array([[3.14, 3.14, 3.14, 3.14, 3.14],
       [3.14, 3.14, 3.14, 3.14, 3.14],
       [3.14, 3.14, 3.14, 3.14, 3.14]])

In [12]:
# Linear sequence: starts at 0, stops before 20 (the end value is excluded),
# advancing in steps of 2.
np.arange(0, 20, step=2)

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

In [13]:
# Five evenly spaced points from 0 to 100; both end points are included.
np.linspace(start=0, stop=100, num=5)

array([  0.,  25.,  50.,  75., 100.])

In [14]:
# 3x3 array of *uniformly* distributed random values in the half-open
# interval [0, 1).
np.random.random(size=(3, 3))

array([[0.5945694 , 0.05750434, 0.10197721],
       [0.88783268, 0.33627386, 0.99774796],
       [0.76770659, 0.0121771 , 0.21281814]])

In [15]:
# 3x3 array of *normally* distributed random values
# (mean `loc` = 0, standard deviation `scale` = 1).
np.random.normal(loc=0, scale=1, size=(3, 3))

array([[ 1.92031335,  0.49199309, -0.58797629],
       [-0.18021428,  0.77939681, -0.19392965],
       [-0.00284028, -0.10461239,  0.90558781]])

In [16]:
# 3x3 array of random integers drawn from [0, 10): `high` is excluded.
np.random.randint(low=0, high=10, size=(3, 3))

array([[9, 4, 4],
       [8, 4, 0],
       [4, 4, 2]])

In [17]:
# 4x4 identity matrix: ones on the main diagonal, zeros elsewhere.
np.eye(N=4)

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [18]:
#create an uninitialized array of five values
#np.empty defaults to float64, so these are floats (pass dtype=int to get integers)
#the value will be whatever happens to already exist at that memory
np.empty(5)

array([  0.,  25.,  50.,  75., 100.])

# 2.1.2 Attributes

Each array has attributes: <br>
`ndim`, the number of dimensions <br>
`shape`, the size of each dimension <br>
`size`, the total size of the array <br>
`dtype`, the data type of the array <br>
`itemsize`, the size (in bytes) of each item <br>
`nbytes`, the total size (in bytes) of the array <br>


In [19]:
# Seed the global generator so the sample arrays are reproducible.
np.random.seed(0)

# Draw integers in [0, 10) for a 1-D, a 2-D and a 3-D array, in that order.
x1, x2, x3 = (np.random.randint(0, 10, size=dims)
              for dims in (6, (3, 4), (3, 4, 5)))

In [20]:
# Print ndim, shape, size, dtype, itemsize and nbytes of x3 on one line.
attr_names = ("ndim", "shape", "size", "dtype", "itemsize", "nbytes")
print("x3's attibutes: ", *(getattr(x3, attr) for attr in attr_names))

x3's attibutes:  3 (3, 4, 5) 60 int64 8 480


# 2.1.3 Indexing

In [21]:
# 1-D indexing: positions count from 0; negative positions count from the end.
print(x1)
fifth, last = x1[4], x1[-1]
print(fifth, last)

[5 0 3 3 7 9]
7 9


In [22]:
# N-D indexing: give one index per axis, separated by commas.
print(x2)
top_left = x2[0, 0]
last_row_second_last_col = x2[-1, -2]
print(top_left, last_row_second_last_col)

[[3 5 2 4]
 [7 6 8 8]
 [1 6 7 7]]
3 7


# 2.1.4 Slicing

To access a subarray, we can use *slice* notation (`:`) in this way: <br>
`x[start:stop:step]`, by default start=0, stop=len(x), step=1 <br>
<br>
*Note*: Slicing returns a *view* (a reference to the same memory), so any modification of a subarray of `x` also changes the original array. To get an independent *copy* (a newly allocated block of memory), call the `.copy()` method on the slice. <br>
Generally speaking, operations that produce new values (e.g. arithmetic such as `x * 2`, or boolean/fancy indexing) return a new array, whereas operations such as `reshape()` that only change how the existing data is addressed return a reference (view) whenever possible.

In [23]:
# 1-D slicing with x[start:stop:step]
x = np.arange(10)
for view in (
    x,         # the whole array
    x[::2],    # every other element, starting at index 0
    x[1::2],   # every other element, starting at index 1
    x[::-1],   # negative step: a convenient way to reverse
):
    print(view)

[0 1 2 3 4 5 6 7 8 9]
[0 2 4 6 8]
[1 3 5 7 9]
[9 8 7 6 5 4 3 2 1 0]


In [24]:
# N-D slicing: one slice per axis
print(x2)
first_two_rows = x2[:2]
print(first_two_rows[:, :3])   # first two rows, first three columns
print(first_two_rows[:, ::2])  # first two rows, every other column
print(x2[::-1, ::-1])          # both axes reversed

[[3 5 2 4]
 [7 6 8 8]
 [1 6 7 7]]
[[3 5 2]
 [7 6 8]]
[[3 2]
 [7 8]]
[[7 7 6 1]
 [8 8 6 7]
 [4 2 5 3]]


In [25]:
# Pull out single rows and columns by combining an index with a full slice.
first_column = x2[:, 0]
first_row = x2[0, :]
print(first_column)
print(first_row)
print(x2[0])   # shorthand: a trailing full slice may be omitted

[3 7 1]
[3 5 2 4]
[3 5 2 4]


In [26]:
# A slice is a view onto the same memory; .copy() allocates independent memory.
corner = (slice(None, 2), slice(None, 3))   # same as [:2, :3]
x2_ref = x2[corner]             # view
x2_copy = x2[corner].copy()     # independent copy

x2_ref[0, 0] = 0                # writes through to x2
print(x2)

x2_copy[0, 0] = 99              # x2 is left untouched
print(x2)

[[0 5 2 4]
 [7 6 8 8]
 [1 6 7 7]]
[[0 5 2 4]
 [7 6 8 8]
 [1 6 7 7]]


# 2.1.5 Reshaping

Use `reshape()` method to reshape to any dimension <br>
Also use `np.newaxis` to add a new axis<br>

In [None]:
# Reshape the 6-element x1 into 2 rows x 3 columns (sizes must agree: 2*3 == 6).
grid_ref = x1.reshape(2, 3)
print(x1)
# reshape usually returns a view of the same data; with non-contiguous memory
# buffers that is not always possible and a copy is returned instead.
print(grid_ref)

In [None]:
# Two ways to turn a 1-D array into a 2-D row or column
print(x1)

as_row = x1.reshape(1, 6)
print(as_row)                  # reshape into a row
print(x1[np.newaxis, :])       # newaxis in front: also a row

as_column = x1.reshape(6, 1)
print(as_column)               # reshape into a column
print(x1[:, np.newaxis])       # newaxis at the end: also a column

# 2.1.6 Joining and splitting

For concatenation of arrays:<br>
`np.concatenate`, can concatenate at defined axis <br>
`np.vstack`, concatenate vertically <br>
`np.hstack`, horizontally <br>
`np.dstack`, concatenate in the third axis <br>
For separation (splitting) of arrays: <br>
`np.split`, split an array along specified axis (default=0) <br>
`np.hsplit`, horizontally <br>
`np.vsplit`, vertically <br>
`np.dsplit`, along the 3rd axis <br>

In [None]:
# np.concatenate takes a sequence of arrays: np.concatenate([a, b, c, ...])
a = np.arange(0, 5)
b = np.arange(5, 10)
c = np.concatenate((a, b))
print("a+b\n",c)
print("a+b+a+b\n",np.concatenate((a, b, a, b))) # any number of arrays can be joined at once
c = c.reshape(2, 5)
print("c\n", c)
print("c+c\n", np.concatenate((c, c), axis=0)) # axis 0 (rows) is the default
print("c+c, axis=1\n",np.concatenate((c, c), axis=1)) # join along the second axis

In [None]:
# np.vstack stacks as rows, np.hstack joins end to end, np.dstack stacks in depth
a = np.arange(5)
b = np.arange(5, 10)
pair = (a, b)
print("a+b, vertical\n",np.vstack(pair))
print("a+b, horizontal\n", np.hstack(pair))
print("a+b, in 3rd axis\n", np.dstack(pair))

In [None]:
# np.split(array, [split points]): the array is cut before each listed index,
# so N split points give N+1 pieces.
c = np.arange(10)
a, b, d = np.split(c, [2, 7])
for piece in (a, b, d):
    print(piece)

In [None]:
# np.vsplit cuts between rows, np.hsplit cuts between columns
c = np.arange(16).reshape(4, 4)
print(c)
for label, splitter in (("vsplit", np.vsplit), ("hsplit", np.hsplit)):
    a, b = splitter(c, [2])
    print(label + "\na", a, "\nb", b)

# 2.2 Computation on NumPy Array: Universal Functions (ufuncs)

Vectorized operations are implemented via ufuncs, whose main purpose is to QUICKLY execute repeated operations on values in NumPy arrays. Ufuncs exist in two flavors: unary ufuncs (single input) and binary ufuncs (two inputs).

In [None]:
# Arithmetic on arrays is element-wise; each operator maps to a ufunc.
x = np.arange(1, 6)
reciprocal = np.divide(1, x)
print("x:\n",x, "\n1/x:\n", reciprocal)
y = np.arange(5, 0, -1)
print("y:\n",y, "\nx/y:\n", np.divide(x, y))
stacked = np.vstack((x, y))
print("2**np.vstack([x,y]):\n", np.power(2, stacked))

# 2.2.1 Array arithmetic

In [None]:
# Each arithmetic operator is shorthand for the ufunc named in its comment.
x = np.arange(5)
print("x:\n",x)
operator_results = (
    ("x+5:\n", x + 5),     # np.add(x,5)
    ("x-5:\n", x - 5),     # np.subtract(x,5)
    ("x*2:\n", x * 2),     # np.multiply(x,2)
    ("x/2:\n", x / 2),     # np.divide(x,2)
    ("x//2:\n", x // 2),   # np.floor_divide(x,2)
    ("-x:\n", -x),         # np.negative(x)
    ("x**2:\n", x ** 2),   # np.power(x,2)
    ("2**x:\n", 2 ** x),   # np.power(2,x)
    ("x%2:\n", x % 2),     # np.mod(x,2)
)
for label, value in operator_results:
    print(label, value)

# 2.2.2 Absolute value

In [None]:
# Element-wise absolute value.
x = np.arange(-6, -1)
print("x:\n",x)
print("abs(x):\n", np.abs(x)) # the built-in abs(x) gives the same result

# 2.2.3 Trigonometric functions

`sin(x) cos(x) tan(x) arcsin(x) arccos(x) arctan(x)`, `x` is in rad

# 2.2.4 Exponents and logarithms

`np.exp(x)` = e^x <br>
`np.exp2(x)` = 2^x <br>
`np.power(3,x)` = 3^x <br>
<br>
`np.log(x)` = ln(x) <br>
`np.log2(x)` = log2(x) <br>
`np.log10(x)` = log10(x) <br>
<br>
Those are more precise when `x` is very small: <br>
`np.expm1(x)` = exp(x) - 1 <br>
`np.log1p(x)` = ln(1+x) <br>

# 2.2.5 Specialized ufuncs

In [None]:
from scipy import special

In [None]:
# Gamma function (generalized factorial), its log, and the related beta function
x = [1, 5, 10]
for label, func in (
    ("gamma(x):\n", special.gamma),
    ("ln|gamma(x):\n", special.gammaln),
):
    print(label, func(x))
print("beta(x, 2):\n", special.beta(x, 2))

In [None]:
# Error function (integral of the Gaussian), its complement, and its inverse
x = np.array([0, 0.3, 0.7, 1.0])
for label, func in (
    ("erf(x):\n", special.erf),
    ("erfc(x):\n", special.erfc),
    ("erfinv(x):\n", special.erfinv),
):
    print(label, func(x))

# 2.2.6 Advanced Ufunc Features

Specify the `out` argument to write a ufunc's result directly into an existing array.

In [None]:
# `out=` makes the ufunc write its result straight into an existing array
# instead of allocating a fresh one for the result.

x = np.arange(5)
y = np.empty(x.shape)
np.multiply(x, 5, out=y)
print(y)

# `out` may also be a slice (view) of a larger array; the equivalent
# `y[5::2] = x + 10` would first build a temporary array and then copy it over.
y = np.zeros(15)
np.add(x, 10, out=y[5::2])
print(y)

In [None]:
# Aggregates built from binary ufuncs

## `reduce` folds the array down to one value (like the reduce half of
## map-reduce); `accumulate` keeps every intermediate result.
x = np.arange(1, 10)
print(x)
for label, aggregate in (
    ("The sum is:", np.add.reduce),
    ("The product is:", np.multiply.reduce),
    ("The accumulated sum is:", np.add.accumulate),
    ("The accumulated product is", np.multiply.accumulate),
):
    print(label, aggregate(x))

In [None]:
# Inner product collapses two vectors to a scalar; `ufunc.outer` instead
# applies the ufunc to every pair of elements, giving a 2-D table.

x = np.arange(1, 6)
print(x)
squares = np.multiply(x, x)
print("inner product\n", np.add.reduce(squares))
print("outer product\n", np.multiply.outer(x, x))

# 2.3 Aggregations: Min, Max, and Everything In Between

**Those are available aggregation functions**<br>
Func NaN-safe Version	Description<br>
`np.sum`	` np.nansum`	Compute sum of elements<br>
`np.prod`	 `np.nanprod`	Compute product of elements<br>
`np.mean`	 `np.nanmean`	Compute mean of elements<br>
`np.std`	 `np.nanstd`	Compute standard deviation<br>
`np.var`	`np.nanvar`	Compute variance<br>
`np.min`	`np.nanmin`	Find minimum value<br>
`np.max`	`np.nanmax`	Find maximum value<br>
`np.argmin`	`np.nanargmin`	Find index of minimum value<br>
`np.argmax`	`np.nanargmax`	Find index of maximum value<br>
`np.median`	`np.nanmedian`	Compute median of elements<br>
`np.percentile`	`np.nanpercentile`	<br>Compute rank-based statistics of elements. E.g., `np.percentile(x, 25)` compute 25th percentile<br>
`np.any`	`N/A`	Evaluate whether any elements are true<br>
`np.all`	`N/A`	Evaluate whether all elements are true<br>

# 2.3.1 Sum

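`np.sum(x)` is faster than the built-in `sum(x)`. On multi-dimensional arrays the two also give different results: `sum(x)` adds the rows together (one total per column), while `np.sum(x)` adds up every element.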

In [None]:
# Built-in sum() iterates over the first axis (adds the rows together),
# whereas np.sum() without an axis adds up every element.
x = np.random.randint(low=1, high=10, size=(4, 4))
print(x)
print("sum(x):\n", sum(x))
print("np.sum(x) or x.sum():\n", np.sum(x))

# 2.3.2 Min & Max

`np.min` faster than `min`
`np.max` faster than `max`

In [None]:
# Global min/max, then min along each axis (axis=0: per column, axis=1: per row).
x = np.arange(10).reshape(2, 5)
print(x)
print("np.min(x) or x.min():\n", x.min())
print("np.max(x) or x.max():\n", x.max())
print("x.min(axis=0)\n", np.min(x, axis=0))
print("x.min(axis=1)\n", np.min(x, axis=1))

# 2.4 Broadcasting

Broadcasting rules apply to any binary `ufunc`. <br>
Rules:<br>
(a) If the two arrays differ in their number of dimensions, the shape of the one with fewer dimensions is padded with ones on its leading (left) side.<br>
(b) If the shape of the two arrays does not match in any dimension, the array with shape equal to 1 in that dimension is stretched to match the other shape.<br>
(c) If in any dimension the sizes disagree and neither is equal to 1, an error is raised.<br>

In [None]:
# Same-shape addition (a+b) versus broadcasting a 1-D array over a 2-D one (c+a).
a = np.arange(3)
b = np.arange(3, 6)
c = np.ones((3, 3))
for label, value in (
    ("a\n", a),
    ("b\n", b),
    ("c\n", c),
    ("a+b\n", a + b),
    ("c+a\n", c + a),
):
    print(label, value)

In [None]:
a = np.ones(shape=(2, 3))
b = np.arange(3)
for label, arr in (("a.shape=", a), ("b.shape=", b)):
    print(label, arr.shape)
    print(arr)

# rule (a) pads b on the left:   (3,)  --> (1,3)
# rule (b) stretches that axis:  (1,3) --> (2,3), which now equals a.shape
total = a + b
print("a+b.shape=", total.shape)
print(total)

In [None]:
a = np.arange(3)
b = np.arange(3, 6).reshape(3, 1)
for label, arr in (("a.shape=", a), ("b.shape=", b)):
    print(label, arr.shape)
    print(arr)

# rule (a) pads a on the left:    a: (3,)  --> (1,3)
# rule (b) stretches both arrays: a: (1,3) --> (3,3)
#                                 b: (3,1) --> (3,3)
total = a + b
print("(a+b).shape=", total.shape)
print(total)

In [None]:
a = np.ones(shape=(3, 2))
b = np.arange(3)
for label, arr in (("a.shape=", a), ("b.shape=", b)):
    print(label, arr.shape)
    print(arr)

# rule (a) pads b: (3,) --> (1,3); rule (b) would stretch it to (3,3),
# but a.shape is (3,2): the last axes are 2 vs 3 and neither is 1,
# so by rule (c) the addition below raises an error
#print(a+b)

# 2.5 Comparisons, Masks, and Boolean Logic

# 2.5.1 Comparison Operations as ufuncs:<br>
`>`,`>=`,`<`,`<=`,`==`,`!=`; or<br>
`np.greater`,`np.greater_equal`,`np.less`,`np.less_equal`,`np.equal`,`np.not_equal`

In [None]:
# Comparison operators are ufuncs too: each returns a boolean array.
x = np.arange(5)
print("x\n",x)
print("x<3\n", np.less(x, 3))
print("x>=2\n", np.greater_equal(x, 2))
doubled, squared = 2 * x, x ** 2
print("2*x==x**2\n", np.equal(doubled, squared))

# 2.5.2 Working with Boolean Arrays

We can apply `np.sum()`,`np.any()`and `np.all()`

In [None]:
# Counting True entries: True counts as 1 when summed.
x = np.arange(5)
print("x\n",x)
below_three = x < 3
print("np.count_nonzero(x<3)\n", np.count_nonzero(below_three))
print("np.sum(x<3)\n", np.sum(below_three)) # no axis given: sums over ALL elements
x1 = x[:, np.newaxis]
print("x1\n",x1)
print("np.sum(x1<3,axis=1)\n", np.sum(x1 < 3, axis=1)) # axis=1: one count per row

# 2.5.3 Boolean operators
* `&` `np.bitwise_and`
* `|` `np.bitwise_or`
* `^` `np.bitwise_xor`
* `~` `np.bitwise_not`
<br>E.g., `np.sum((x>5) & (x<10))`

# 2.5.4 Boolean Arrays as Masks

In [None]:
# Masking: indexing with a boolean array keeps only the elements where it is True
x = np.arange(10).reshape(2, 5)
print("x=\n",x)
# the mask selects values strictly between 3 and 7
in_range = (x > 3) & (x < 7)
print("x[(x>3) & (x<7)]=\n", x[in_range])

# 2.6 Fancy Indexing

Different types of indexing:
* Simple indices (e.g., `x[0]`)
* Slices (e.g., `x[:5]`)
* Boolean masks (e.g., `x[x>5]`)
* Fancy Indexing

In [None]:
# With fancy indexing the result takes the shape of the index array, not of x.
x = np.arange(10)
print("x\n", x)
fancy_index = np.array([3, 7, 5, 4]).reshape(2, 2)
x[fancy_index]

In [None]:
x = np.arange(20).reshape(4, 5)
print("x\n",x)
row, col = np.array([0, 3, 2]), np.array([1, 3, 4])
# the index arrays pair up element-wise: (0,1), (3,3), (2,4)
print("x[row,col]\n",x[row,col])

# index arrays follow the broadcasting rules: a (3,1) row index combined
# with a (3,) column index gives a (3,3) result
row = row.reshape(-1, 1)
print("x[row,col]\n", x[row,col])

In [None]:
# Fancy indexing combined with the other indexing schemes
x = np.arange(20).reshape(4, 5)
print("x=\n", x)
print("x[2,[2,1,4]]=\n", x[2, [2, 1, 4]])    # with a simple index
print("x[1:,[3,0,2]]=\n", x[1:, [3, 0, 2]])  # with slicing
mask = np.array([True, False, False, True, False])  # with masking
row = np.array([[2], [1], [3]])
# the mask needs one entry per column of x
print("x[row,mask]=\n", x[row, mask])

In [None]:
# Fancy indexing also works on the left-hand side of an assignment.
x = np.arange(10)
print(x)
i = np.array((4, 2, 8))
x[i] = 666   # overwrite positions 4, 2 and 8
print(x)

In [None]:
# Repeated indices: `x[i] += 1` versus `np.add.at`
i = np.array([1, 2, 2, 3, 3, 3])

x = np.zeros(10)
x[i] += 1 # evaluates x[i] + 1 once, then assigns: repeats do NOT accumulate
print(x)

x = np.zeros(10)
np.add.at(x, i, 1) # unbuffered in-place add: every occurrence of an index counts
print(x)
# TODO find `reduceat()`

# 2.7 Sorting Arrays
Use the built-in function `np.sort()`, whose default algorithm is *quicksort*. For multi-dimensional arrays, we can specify the axis to be sorted by using `np.sort(x, axis=0)` or `np.sort(x, axis=1)`. <br>
To select the *k* smallest values, use `np.partition(x, k)`, which moves the k smallest values to the left side of the array (in no particular order). `axis` can also be specified.

In [None]:
# np.sort / np.argsort return new arrays; the .sort() method sorts in place.
x = np.random.randint(low=0, high=100, size=10)
print(x)
print("Sorted array\n", np.sort(x))
print("Sorted indices\n", np.argsort(x))

print("Sort in place")
x.sort()
print(x)

In [None]:
# k smallest values
x = np.random.randint(low=0, high=100, size=10)
np.partition(x, kth=3) # the three smallest values end up on the left (unordered)

# 2.8 Structured Array

In [None]:
# One record per person, then transposed into three parallel lists.
people = [
    ('Alice', 25, 55.0),
    ('Bob', 45, 85.5),
    ('Cathy', 37, 68.0),
    ('Doug', 19, 61.5),
]
name, age, weight = (list(column) for column in zip(*people))

In [None]:
# Compound dtype: 10-char unicode name, 4-byte int age, 8-byte float weight.
person_dtype = np.dtype([('name', 'U10'), ('age', 'i4'), ('weight', 'f8')])
data = np.zeros(4, dtype=person_dtype)
print(data.dtype)

In [None]:
# Fill the structured array one field (column) at a time.
for field, values in (('name', name), ('age', age), ('weight', weight)):
    data[field] = values
print(data)

In [None]:
# Structured arrays can be indexed by field name, by position, or both.
print(data['name'])      # one field across all records
first_record = data[0]
print(first_record)      # one whole record
last_record = data[-1]
print(last_record['name'])

# 2.9 Additional Notes

In [None]:
# Pick 10 distinct row positions at random.
x = np.arange(20).reshape(-1, 1)
indices = np.random.choice(x.shape[0], size=10, replace=False) # replace=False ==> no duplicates
print(indices)
print(x[indices])

In [None]:
# select data not in indices
# `~indices` would be a bitwise NOT on an integer array (~i == -i - 1): it
# yields positions counted from the end, not the complement of the selection.
# Build a boolean mask over the rows, switch off the chosen ones, and index
# with what remains.
keep = np.ones(x.shape[0], dtype=bool)
keep[indices] = False
print(x[keep])

In [None]:
# notes of function `np.searchsorted`
#help(np.searchsorted)

# 3 PANDAS

# 3.1 Introduction

# 3.1.1 Initialization of a Series object:<br>
`pd.Series(data, index=index)` <br>
data: can be a list, a scalar, or a dictionary<br>
index: optional

In [None]:
# Four ways to build a Series: from a list, a scalar, a dict, and a dict
# restricted to (and ordered by) an explicit index.
letters = {2: 'a', 1: 'b', 3: 'c'}
series_examples = (
    pd.Series([2, 4, 6]),
    pd.Series(5, index=[10, 20, 30]),
    pd.Series(letters),
    pd.Series(letters, index=[3, 2]),
)
for example in series_examples:
    print(example, "\n")

In [None]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
print("data:\n",data,'\n')
# A Series wraps an array of `values` together with an `index`
print("data.values:\n",data.values)
print("data.index:\n",data.index)

# Access data: label lookup and positional slice
print("data[1]:\n", data.loc[1])
print("data[1:3]\n", data.iloc[1:3])

# Change index: assigning to .index relabels the same values in place;
# afterwards each lookup is by the new label
for new_index, key in ((list('abcd'), 'b'), ([4, 2, 3, 1], 4)):
    data.index = new_index
    print("\ndata:\n",data)
    print(f"data[{key!r}]:\n", data.loc[key])
    print(data.index)

# 3.1.2 Initialization of DataFrame


In [None]:
# A list of dicts becomes a DataFrame: keys turn into columns, one row per dict.
data = [dict(a=n, b=2 * n) for n in range(3)]
pd.DataFrame(data)

In [None]:
# Keys missing from a record are filled with NaN in the resulting frame.
records = [dict(a=1, b=2), dict(b=3, c=4)]
pd.DataFrame(records)

In [None]:
# Three Series sharing the same label index.
name = ['A', 'b', 'Hello', 'Coffin', 'dance']
age, height, boo = (
    pd.Series(values, index=name)
    for values in (
        [1, 2, 5, 999, 666],
        [150, 200, 130, 97, 55],
        [5, 4, 3, 2, 1],
    )
)

In [None]:
# Combine the Series into a frame: each Series becomes one column.
df = pd.DataFrame(dict(age=age, height=height, boo=boo))
print("df\n",df)

# row labels and column labels
print("\ndf.index\n",df.index)
print("df.columns\n",df.columns)

# selecting a single column returns a Series
height_column = df['height']
print("\ndf['height']\n", height_column)

# 3.1.3 Initialization of Index
<br>
Note: `Index` is *immutable*

In [None]:
# An Index behaves like an immutable array of labels.
prime_labels = [2, 3, 5, 7, 11]
ind = pd.Index(prime_labels)
ind

# 3.2 Data Indexing and Selection

# 3.2.1 Series

In [None]:
# Series as dictionary: the index labels act as keys, the values as mapped items.


data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=['a', 'b', 'c', 'd'])
print("data:\n", data)

# dictionary-like interfaces
print(data['b'])            # lookup by key
print('a' in data)          # membership tests the index labels
print(data.keys())          # the index
# printed explicitly: a bare expression in the middle of a cell displays nothing
print(list(data.items()))   # (label, value) pairs
data['e'] = 1.25            # assigning to a new key appends an entry
print(data)

In [None]:
# Series as one-dimensional array: slices, masks and fancy indexing all work.

labels = ['a', 'b', 'c', 'd']
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=labels)
print("data:\n", data)

# interfaces
print(data['a':'c'])   # slicing by EXPLICIT index: the end label 'c' is included
print(data[0:2])       # slicing by IMPLICIT integer position: data[2] is excluded
in_band = (data > 0.3) & (data < 0.8)
print(data[in_band])   # masking
print(data[['a', 'd']]) # fancy indexing

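Due to the confusion between *EXPLICIT* and *IMPLICIT* indexing, pandas provides dedicated indexers: `loc` and `iloc` (plus the legacy `ix`)
* `loc` ==> explicit index (labels)
* `iloc` ==> implicit index (integer positions)
* `ix` ==> hybrid of the two, same as `[]`-based indexing; it was deprecated in pandas 0.20 and removed in pandas 1.0, so use `loc`/`iloc` instead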

# 3.2.2 DataFrame

In [None]:
# DataFrame as dictionary: column name -> Series
states = ['California', 'Texas', 'New York', 'Florida', 'Illinois']
area = pd.Series([423967, 695662, 141297, 170312, 149995], index=states)
pop = pd.Series([38332521, 26448193, 19651127, 19552860, 12882135],
                index=states)
data = pd.DataFrame(dict(area=area, pop=pop))
print("data\n",data,"\n")

# dictionary-style column access (`data.area` also works for string names)
print(data['area'])
# dictionary-style assignment adds a new column
data['density'] = data['pop'] / data['area']
print("\n",data)

In [None]:
# DataFrame as two-dimensional array

raw = data.values
print(raw,"\n")                             # underlying 2-D array
print(data.T,"\n")                          # transpose
print(raw[0],"\n")                          # first row of the raw array
print(data['area'],"\n")                    # a column, by name
print(data.iloc[:3,:2], "\n")               # positional: first 3 rows, first 2 columns
print(data.loc[:'Illinois', :'pop'], "\n")  # label-based: end labels included
is_dense = data.density > 100
print(data.loc[is_dense, ['pop','density']]) # mask on rows + fancy index on columns

In [None]:
# Additional notes on plain `[]` with a DataFrame

# a single label refers to a *column*
print(data['area'])
#print(data['New York']) ==> wrong: 'New York' is a row label

# a slice, however, refers to *rows* (by label or by position)
print(data['New York':'Florida'])
#print(data['area':'pop']) ==> wrong: these are column names, not row labels
print(data[1:3])

# a boolean mask also selects *rows*
is_dense = data.density > 100
print(data[is_dense])

# 3.3 Operating on Data

|     |                                 |
|:---:|:-------------------------------:|
|`+`  |`add()`|
|`-`  | `sub()`,`subtract()`|
|`*`  | `mul()`, `multiply()`|
|`/`  | `truediv()`, `div()`, `divide()`|
|`//` | `floordiv()`|
|`%`  | `mod()`|
|`**` | `pow()`|

## 3.3.1 Ufuncs: Index Preservation

NumPy ufunc is also applicable to DataFrame

In [None]:
# A NumPy ufunc applied to a Series returns a Series with the same index.
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(low=0, high=10, size=4))
print(ser)
print(np.exp(ser))

In [None]:
# The same holds for a DataFrame: index and columns are preserved.
df = pd.DataFrame(rng.randint(low=0, high=10, size=(3, 4)), columns=list('abcd'))
print(df)
print(np.sin(df * np.pi / 4))

## 3.3.2 UFuncs: Index Alignment

### Index alignment in Series

In [None]:
# Two Series whose indices only partly overlap (labels 1 and 2 are shared).
A = pd.Series([2, 4, 6], index=range(0, 3))
B = pd.Series([1, 3, 5], index=range(1, 4))
for series in (A, B):
    print(series)

In [None]:
# Values are aligned on index labels; labels present on one side only give NaN.
A.add(B)

In [None]:
# fill_value stands in for an entry that is missing on one side.
A.add(other=B, fill_value=0)

### Index alignment in DataFrame

In [None]:
# Two frames that differ in shape, column order and column set.
A = pd.DataFrame(rng.randint(low=0, high=20, size=(2, 2)), columns=list('ab'))
B = pd.DataFrame(rng.randint(low=0, high=10, size=(3, 3)), columns=list('bac'))
print(A,"\n")
print(B)

In [None]:
# Alignment happens on both the row index and the column labels.
A.add(B)

In [None]:
# Use the mean of all values in A for entries missing on one side.
mean_of_A = A.stack().mean()
A.add(B, fill_value=mean_of_A)

## 3.3.3 Ufuncs: Operations Between DataFrame and Series

Recall the rules of broadcasting!

In [None]:
# Subtract the first row from every row (the Series is aligned on column labels).
first_row = B.iloc[0]
B.sub(first_row, axis='columns')

In [None]:
# Subtract column 'b' from every column (the Series is aligned on the row index).
b_column = B['b']
B.sub(b_column, axis='index')

# 3.4 Handling Missing Data

In [None]:
# Small frame with missing values (None) scattered over three of its columns.
df_na = pd.DataFrame({
    'Name': ['A', 'b', 'Hello', 'Coffin', 'dance'],
    'Age': [1, 2, 5, None, 666],
    'Height': [150, None, 130, 97, 55],
    10: [None, 4, 3, 2, None],
})
df_na

In [None]:
# Boolean frame: True wherever a value is missing.
df_na.isna()

In [None]:
# Masking with a boolean frame keeps the shape; entries where the mask is
# False become NaN.
present = df_na.notnull()
df_na.where(present)

In [None]:
#default: axis=0 (or 'index', i.e. drop rows), how='any'
#df_na.dropna() 

#drop every column that contains at least one NaN
#df_na.dropna(axis=1) #or axis = 'columns'

#only drop columns w/ all NaN's
#df_na.dropna(axis=1, how='all') 
 
#if the axis has # of non-null < thresh, drop it!
#In other words, the threshold is the minimum # of valid data
#here: keep only the columns that hold at least 4 non-null values
df_na.dropna(axis=1, thresh=4)

In [None]:
df_na.fillna(0) #fill w/ 0; returns a new frame, df_na itself is unchanged
#df_na.fillna(0, inplace=True) #Modify the original data, not a copy
#df_na.fillna(df_na.mean(numeric_only=True)) #fill w/ mean value of each numeric column
#NOTE: fillna(method=...) is deprecated since pandas 2.1; prefer .ffill() / .bfill()
#df_na.ffill() #forward fill, fill w/ previous value
#df_na.bfill() #back fill, fill w/ next value
#df_na.ffill(axis=1) #forward fill along each row

In [None]:
# Column means; NaN entries are skipped by default.
# numeric_only=True leaves out the string column 'Name': without it,
# pandas >= 2.0 raises a TypeError instead of silently dropping that column.
df_na.mean(numeric_only=True)

# 3.5 Hierarchical Indexing

# 3.10 Additional notes

In [None]:
# Keep the rows of df whose index label is NOT listed in `indices`.
indices = [0, 7, 8, 9]
excluded = df.index.isin(indices)
df.loc[~excluded]