# Slack
You need to fill in the [form](https://docs.google.com/forms/d/1OmT8ODmVBNgl0eOmZT51JMTHUSA_eNrHTcDRnmNDMgQ) to get invited.

Slack url: https://rt-portal.slack.com/

# Numpy Cheat Sheet
http://www.datasciencefree.com/numpy.pdf

# NumPy Basics: Arrays and Vectorized Computation

In [None]:
# IPython magic: show matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
from __future__ import division
from numpy.random import randn
import numpy as np
# Print floats with 4 digits of precision and suppress scientific notation
# for small values.
np.set_printoptions(precision=4, suppress=True)

# 6 Important things you should know about Numpy and Pandas

1. The data manipulation capabilities of pandas are built on top of the NumPy library. In a way, NumPy is a dependency of the pandas library.
2. Pandas is best at handling tabular data sets comprising different variable types (integer, float, double, etc.). In addition, the pandas library can also be used to perform even the most basic of tasks, such as loading data or doing feature engineering on time series data.
3. NumPy is most suitable for performing basic numerical computations such as mean, median, range, etc. It also supports the creation of multi-dimensional arrays.
4. The NumPy library can also be used to integrate C/C++ and Fortran code.
5. Remember, Python is a zero-indexed language, unlike R, where indexing starts at one.
6. The best part of learning pandas and NumPy is the strong, active community support you'll get from around the world.

## The NumPy ndarray: a multidimensional array object

In [None]:
# 2x3 array of samples drawn from the standard normal distribution.
data = randn(2, 3)

In [None]:
# Arithmetic on an ndarray is applied element by element.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(data)
print(data * 10)    # every element multiplied by 10
print(data + data)  # every element added to itself

In [None]:
# Inspect the array's metadata (single-argument print() works on Python 2 and 3).
print(data.shape)  # tuple holding the size of each dimension
print(data.dtype)  # type of the elements stored in the array

### Creating ndarrays

In [None]:
# Build a 1-D array from a Python list; np.array picks one dtype for all
# elements, so mixing ints with 7.5 gives a floating-point array.
data1 = [6, 7.5, 8, 0, 1]
arr1 = np.array(data1)
arr1

In [None]:
# A list of equal-length lists is converted into a 2-D array.
# print() with a single argument behaves the same on Python 2 and Python 3.
data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]
arr2 = np.array(data2)
print(arr2)
print(arr2.ndim)   # number of dimensions
print(arr2.shape)  # size of each dimension

In [None]:
# np.array infers a dtype from the data it is given; show what was chosen.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(arr1.dtype)
print(arr2.dtype)

In [None]:
# Array-creation helpers accept either a length or a shape tuple.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(np.zeros(10))
print(np.zeros((3, 6)))
# np.empty allocates without initialising, so the printed values are arbitrary.
print(np.empty((2, 3, 2)))

In [None]:
# np.arange is the array-valued counterpart of range: integers 0 through 14.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(np.arange(15))

### Data Types for ndarrays

In [None]:
# The dtype argument overrides the type NumPy would otherwise infer.
# print() with a single argument behaves the same on Python 2 and Python 3.
arr1 = np.array([1, 2, 3], dtype=np.float64)
arr2 = np.array([1, 2, 3], dtype=np.int32)
print(arr1.dtype)
print(arr2.dtype)

In [None]:
# astype returns a new array converted to the requested dtype.
# print() with a single argument behaves the same on Python 2 and Python 3.
arr = np.array([1, 2, 3, 4, 5])
print(arr.dtype)
float_arr = arr.astype(np.float64)
print(float_arr.dtype)

In [None]:
# Casting floats to an integer dtype drops the fractional part.
# print() with a single argument behaves the same on Python 2 and Python 3.
arr = np.array([3.7, -1.2, -2.6, 0.5, 12.9, 10.1])
print(arr)
print(arr.astype(np.int32))

In [None]:
# An array of byte strings that all represent numbers can be converted to
# floats with astype. np.bytes_ is the portable name of the byte-string
# dtype (the np.string_ alias no longer exists in NumPy 2.0).
# print() with a single argument behaves the same on Python 2 and Python 3.
numeric_strings = np.array(['1.25', '-9.6', '42'], dtype=np.bytes_)
print(numeric_strings.astype(float))

In [None]:
# The dtype attribute of one array can be passed to astype on another.
# print() with a single argument behaves the same on Python 2 and Python 3.
int_array = np.arange(10)
calibers = np.array([.22, .270, .357, .380, .44, .50], dtype=np.float64)
print(int_array.astype(calibers.dtype))

In [None]:
# 'u4' is the short type code for a 4-byte unsigned integer (uint32).
# The array is uninitialised, so the printed contents are arbitrary.
# print() with a single argument behaves the same on Python 2 and Python 3.
empty_uint32 = np.empty(8, dtype='u4')
print(empty_uint32)

### Operations between arrays and scalars

In [None]:
# Arithmetic between equal-shaped arrays is applied element by element.
# print() with a single argument behaves the same on Python 2 and Python 3.
arr = np.array([[1., 2., 3.], [4., 5., 6.]])
print(arr)
print(arr * arr)  # element-wise product, not a matrix product
print(arr - arr)

In [None]:
# A scalar operand is applied to every element of the array.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(1 / arr)
print(arr ** 0.5)

### Basic indexing and slicing

In [None]:
# Indexing and slicing a 1-D array works like a Python list, except that
# assigning a scalar to a slice writes that value to every selected element.
# print() with a single argument behaves the same on Python 2 and Python 3.
arr = np.arange(10)
print(arr)
print(arr[5])
print(arr[5:8])
arr[5:8] = 12
print(arr)

In [None]:
# A slice is a view on the original array, not a copy: writes made through
# arr_slice are visible in arr.
# print() with a single argument behaves the same on Python 2 and Python 3.
arr_slice = arr[5:8]
arr_slice[1] = 12345
print(arr)
arr_slice[:] = 64  # assign to every element of the view
print(arr)

In [None]:
# Indexing a 2-D array with a single integer returns the whole row.
# print() with a single argument behaves the same on Python 2 and Python 3.
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(arr2d[2])

In [None]:
# Two equivalent ways to reach a single element: chained indexing, or one
# comma-separated index.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(arr2d[0][2])
print(arr2d[0, 2])

In [None]:
# Nested lists three levels deep produce a 3-D array of shape (2, 2, 3).
# print() with a single argument behaves the same on Python 2 and Python 3.
arr3d = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
print(arr3d)

In [None]:
# Indexing only the first axis returns the lower-dimensional sub-array.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(arr3d[0])

In [None]:
# Both scalars and arrays can be assigned to a sub-array.
# .copy() is needed because arr3d[0] alone would be a view that the
# assignment below overwrites.
# print() with a single argument behaves the same on Python 2 and Python 3.
old_values = arr3d[0].copy()
arr3d[0] = 42
print(arr3d)
arr3d[0] = old_values  # restore the saved contents
print(arr3d)

In [None]:
# Two indices on a 3-D array leave a 1-D array.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(arr3d[1, 0])

#### Indexing with slices

In [None]:
# Slice positions 1 through 5 of the 1-D array (the stop index is exclusive).
# print() with a single argument behaves the same on Python 2 and Python 3.
print(arr[1:6])

In [None]:
# A single slice on a 2-D array slices along the first axis (rows).
# print() with a single argument behaves the same on Python 2 and Python 3.
print(arr2d)
print(arr2d[:2])

In [None]:
# One slice per axis: the first two rows, and every column from index 1 on.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(arr2d[:2, 1:])

In [None]:
# Mixing an integer index with a slice removes the integer-indexed axis,
# leaving a 1-D result.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(arr2d[1, :2])
print(arr2d[2, :1])

In [None]:
# A bare colon takes the whole axis; slicing (rather than indexing) the
# columns keeps the result two-dimensional.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(arr2d[:, :1])

In [None]:
# Assigning to a slice expression writes to the whole selected region.
# print() with a single argument behaves the same on Python 2 and Python 3.
arr2d[:2, 1:] = 0
print(arr2d)

### Boolean indexing

In [None]:
# Seven names, and a 7x4 array of normally distributed values with one row
# per name.
# print() with a single argument behaves the same on Python 2 and Python 3.
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
data = randn(7, 4)
print(names)
print(data)

In [None]:
# Comparing the array with a string is done element-wise and yields a
# boolean array.
names == 'Bob'

In [None]:
# A boolean array used as an index selects the rows where it is True.
data[names == 'Bob']

In [None]:
# A boolean row mask can be combined with a slice or an integer on the
# column axis.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(data[names == 'Bob', 2:])
print(data[names == 'Bob', 3])

In [None]:
# Two ways to select everything except 'Bob': the != comparison, or ~ to
# invert a boolean mask. The first result is printed explicitly because a
# bare expression that is not the last line of a cell is never displayed.
print(names != 'Bob')
data[~(names == 'Bob')]

In [None]:
# Combine boolean conditions element-wise with | (or); the Python keyword
# `or` does not work on arrays.
# print() with a single argument behaves the same on Python 2 and Python 3.
mask = (names == 'Bob') | (names == 'Will')
print(mask)
print(data[mask])

In [None]:
# Assigning through a boolean mask: every negative entry is set to 0.
data[data < 0] = 0
data

In [None]:
# Overwrite every row whose name is not 'Joe' with the value 7.
data[names != 'Joe'] = 7
data

### Fancy indexing

In [None]:
# Build an 8x4 array in which every entry of row k holds the value k.
arr = np.empty((8, 4))
for row_number, row in enumerate(arr):
    # Each row is a view on arr, so writing into it fills arr itself.
    row[...] = row_number
arr

In [None]:
# Indexing with a list of integers selects those rows, in the order given.
arr[[4, 3, 0, 6]]

In [None]:
# Negative indices select rows counted from the end.
arr[[-3, -5, -7]]

In [None]:
# more on reshape in Chapter 12
# Two index lists are paired up element by element, so this picks the four
# elements (1, 0), (5, 3), (7, 1) and (2, 2) rather than a rectangular block.
# print() with a single argument behaves the same on Python 2 and Python 3.
arr = np.arange(32).reshape((8, 4))
print(arr)
print(arr[[1, 5, 7, 2], [0, 3, 1, 2]])

In [None]:
# Select rows 1, 5, 7, 2 first, then reorder the columns of that result to
# obtain the rectangular region.
arr[[1, 5, 7, 2]][:, [0, 3, 1, 2]]

In [None]:
# np.ix_ turns the two index lists into an indexer that selects the
# rectangular region formed by those rows and columns.
arr[np.ix_([1, 5, 7, 2], [0, 3, 1, 2])]

### Transposing arrays and swapping axes

In [None]:
# .T gives the transpose of the 3x5 array. The original array is printed
# explicitly because a bare expression that is not the last line of a cell
# is never displayed.
arr = np.arange(15).reshape((3, 5))
print(arr)
arr.T

In [None]:
# Matrix product of the transposed 6x3 array with itself (a 3x3 result).
arr = np.random.randn(6, 3)
np.dot(arr.T, arr)

In [None]:
# transpose takes a tuple of axis numbers giving the new axis order; here
# the first two axes are exchanged. The original array is printed explicitly
# because a bare expression that is not the last line of a cell is never
# displayed.
arr = np.arange(16).reshape((2, 2, 4))
print(arr)
arr.transpose((1, 0, 2))

In [None]:
# swapaxes exchanges the two given axes. The original array is printed
# explicitly because a bare expression that is not the last line of a cell
# is never displayed.
print(arr)
arr.swapaxes(1, 2)

## Universal Functions: Fast element-wise array functions

In [None]:
# Unary ufuncs apply a function to every element. The square roots are
# printed explicitly because a bare expression that is not the last line of
# a cell is never displayed.
arr = np.arange(10)
print(np.sqrt(arr))
np.exp(arr)

In [None]:
# Binary ufuncs combine two arrays element by element. x and y are printed
# explicitly because bare expressions that are not the last line of a cell
# are never displayed.
x = randn(8)
y = randn(8)
print(x)
print(y)
np.maximum(x, y)  # element-wise maximum

In [None]:
# np.modf returns two arrays: the fractional parts and the integral parts
# of the input values.
arr = randn(7) * 5
np.modf(arr)

## Data processing using arrays

In [None]:
# np.meshgrid expands the 1-D coordinate vector into two 2-D arrays holding
# the x and the y coordinate of every point on the grid.
points = np.arange(-5, 5, 0.01) # 1000 equally spaced points
xs, ys = np.meshgrid(points, points)
ys

In [None]:
from matplotlib.pyplot import imshow, title

In [None]:
import matplotlib.pyplot as plt

# Distance of every grid point from the origin. Printed explicitly because a
# bare expression that is not the last line of a cell is never displayed.
z = np.sqrt(xs ** 2 + ys ** 2)
print(z)
# Draw the values as a grayscale image with a colour scale next to it.
plt.imshow(z, cmap=plt.cm.gray)
plt.colorbar()
# Raw string: keeps the backslash of the LaTeX \sqrt command without relying
# on "\s" being an unrecognised (and, on Python 3, deprecated) escape.
plt.title(r"Image plot of $\sqrt{x^2 + y^2}$ for a grid of values")
plt.draw()

### Expressing conditional logic as array operations

In [None]:
# Two value arrays and a boolean array of the same length; cond decides,
# position by position, which of the two arrays a value is taken from.
xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])

In [None]:
# Pure-Python selection: walk the three arrays in lockstep and keep the
# element of xarr where cond is true, otherwise the element of yarr.
result = []
for x, y, c in zip(xarr, yarr, cond):
    if c:
        result.append(x)
    else:
        result.append(y)
result

In [None]:
# Vectorised selection: take from xarr where cond is True, else from yarr.
result = np.where(cond, xarr, yarr)
result

In [None]:
# The second and third arguments of np.where may be scalars, arrays, or a
# mix of both.
# print() with a single argument behaves the same on Python 2 and Python 3.
arr = randn(4, 4)
print(arr)
print(np.where(arr > 0, 2, -2))   # 2 where positive, -2 elsewhere
print(np.where(arr > 0, 2, arr))  # set only positive values to 2

In [None]:
# Not to be executed

# Pure-Python reference: encode the combination of two boolean sequences as
#   0 -> both True, 1 -> only cond1, 2 -> only cond2, 3 -> neither.
# n, cond1 and cond2 are not defined in the visible cells (illustration only).
result = []
for i in range(n):
    if cond1[i] and cond2[i]:
        result.append(0)
    elif cond1[i]:
        result.append(1)
    elif cond2[i]:
        result.append(2)
    else:
        result.append(3)

In [None]:
# Not to be executed

# Nested np.where: 0 where both conditions hold, otherwise 1 where cond1
# holds, otherwise 2 where cond2 holds, otherwise 3.
np.where(cond1 & cond2, 0,
         np.where(cond1, 1,
                  np.where(cond2, 2, 3)))

In [None]:
# Not to be executed

# Arithmetic form of the four-way encoding: each term is non-zero for
# exactly one case, so the sum is
#   1 -> only cond1 is True
#   2 -> only cond2 is True
#   3 -> neither is True
#   0 -> both are True (all three terms vanish)
# Boolean arrays are inverted with ~; unary minus is not a logical NOT and
# is rejected for boolean arrays by current NumPy.
result = (1 * (cond1 & ~cond2)
          + 2 * (cond2 & ~cond1)
          + 3 * ~(cond1 | cond2))

### Mathematical and statistical methods

In [None]:
# Aggregations are available both as array methods and as top-level NumPy
# functions.
# print() with a single argument behaves the same on Python 2 and Python 3.
arr = np.random.randn(5, 4)  # normally-distributed data
print(arr.mean())
print(np.mean(arr))
print(arr.sum())

In [None]:
# An axis argument reduces along that axis only, giving an array with one
# dimension fewer.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(arr.mean(axis=1))  # one mean per row
print(arr.sum(0))        # one sum per column

In [None]:
# cumsum / cumprod keep the array's shape and accumulate along the given
# axis instead of reducing it.
# print() with a single argument behaves the same on Python 2 and Python 3.
arr = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
print(arr.cumsum(0))   # running sum down each column
print(arr.cumprod(1))  # running product along each row

### Methods for boolean arrays

In [None]:
# True counts as 1 and False as 0 when summed, so summing a boolean array
# counts its True entries.
arr = randn(100)
(arr > 0).sum() # Number of positive values

In [None]:
# any() tests whether at least one value is True; all() whether every one is.
# print() with a single argument behaves the same on Python 2 and Python 3.
bools = np.array([False, False, True, False])
print(bools.any())
print(bools.all())

### Sorting

In [None]:
# The sort method sorts the array in place (it does not return a new array).
# print() with a single argument behaves the same on Python 2 and Python 3.
arr = randn(8)
print(arr)
arr.sort()
print(arr)

In [None]:
# Passing an axis number sorts each 1-D section along that axis in place;
# axis 1 sorts the values within every row.
# print() with a single argument behaves the same on Python 2 and Python 3.
arr = randn(5, 3)
print(arr)
arr.sort(1)
print(arr)

In [None]:
# Quick empirical quantile: sort the sample, then read the value at the
# position 5% of the way into the sorted array.
large_arr = randn(1000)
large_arr.sort()
large_arr[int(0.05 * len(large_arr))] # 5% quantile

### Unique and other set logic

In [None]:
# np.unique returns the distinct values of an array in sorted order.
# print() with a single argument behaves the same on Python 2 and Python 3.
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
print(np.unique(names))
ints = np.array([3, 3, 3, 2, 2, 1, 1, 4, 4])
print(np.unique(ints))

In [None]:
# Pure-Python way to get the sorted distinct values: deduplicate with a set,
# then sort.
sorted(set(names))

In [None]:
# Element-wise membership test: True where the element of values occurs in
# the given list. np.isin replaces the deprecated np.in1d and gives the same
# result for 1-D input.
values = np.array([6, 0, 0, 3, 2, 5, 6])
np.isin(values, [2, 3, 6])

## File input and output with arrays

### Storing arrays on disk in binary format

In [None]:
# np.save writes one array to disk in NumPy's binary format; the .npy
# extension is appended to the given name.
arr = np.arange(10)
np.save('some_array', arr)

In [None]:
# Read the saved array back from disk.
np.load('some_array.npy')

In [None]:
# Store several arrays in one archive, each under the keyword name used here.
np.savez('array_archive.npz', a=arr, b=arr)

In [None]:
# Loading an .npz archive gives a dict-like object; each array is looked up
# by the keyword name it was saved under.
arch = np.load('array_archive.npz')
arch['b']

In [None]:
# Remove the files written by the save examples (IPython shell escapes).
!rm some_array.npy
!rm array_archive.npz

### Saving and loading text files

In [None]:
# Show the raw contents of the example text file (IPython shell escape).
!cat data/array_ex.txt

In [None]:
# Parse the comma-delimited text file into an array.
arr = np.loadtxt('data/array_ex.txt', delimiter=',')
arr

## Linear algebra

In [None]:
# Matrix multiplication uses dot (method or function); * would multiply
# element by element. A 2x3 matrix times a 3x2 matrix gives a 2x2 result.
# print() with a single argument behaves the same on Python 2 and Python 3.
x = np.array([[1., 2., 3.], [4., 5., 6.]])
y = np.array([[6., 23.], [-1, 7], [8, 9]])
print(x)
print(y)
print(x.dot(y))  # equivalently np.dot(x, y)

In [None]:
# Product of a 2-D array with a 1-D array of ones: a 1-D result holding the
# sum of each row of x.
np.dot(x, np.ones(3))

In [None]:
# Seed NumPy's global random generator so the random values drawn after
# this point are reproducible.
np.random.seed(12345)

In [None]:
from numpy.linalg import inv, qr

# Build a symmetric 5x5 matrix X^T X, invert it, and check the inverse by
# multiplying back (the product should be close to the identity). The
# intermediate results are printed explicitly because bare expressions that
# are not the last line of a cell are never displayed.
X = randn(5, 5)
mat = X.T.dot(X)
print(inv(mat))
print(mat.dot(inv(mat)))
# QR decomposition: mat == q.dot(r) with r upper triangular.
q, r = qr(mat)
print(r)

## Random number generation

In [None]:
# 4x4 array of draws from the normal distribution with the default
# parameters (mean 0, standard deviation 1).
samples = np.random.normal(size=(4, 4))
samples

In [None]:
from random import normalvariate
N = 1000000
%timeit samples = [normalvariate(0, 1) for _ in xrange(N)]
%timeit np.random.normal(size=N)

## Example: Random Walks

In [None]:
import random

# Pure-Python random walk: start at 0 and move +1 or -1 with equal
# probability at every step, recording the position after each move.
position = 0
walk = [position]
steps = 1000
# range() works on both Python 2 and Python 3 (xrange is Python 2 only).
for i in range(steps):
    step = 1 if random.randint(0, 1) else -1  # randint(0, 1) is a coin flip
    position += step
    walk.append(position)

In [None]:
# Seed NumPy's global random generator so the simulated walks are
# reproducible.
np.random.seed(12345)

In [None]:
# Vectorised random walk: draw all coin flips at once, map them to +1/-1
# steps, and take the cumulative sum to get the position after each step.
nsteps = 1000
draws = np.random.randint(0, 2, size=nsteps)
steps = np.where(draws > 0, 1, -1)
walk = steps.cumsum()

In [None]:
# Lowest and highest position reached along the walk.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(walk.min())
print(walk.max())

In [None]:
# argmax on a boolean array returns the index of the first True, i.e. the
# first step at which the walk is at least 10 away from the origin.
(np.abs(walk) >= 10).argmax()

### Simulating many random walks at once

In [None]:
# Simulate many walks at once: one row per walk, one column per step.
nwalks = 5000
nsteps = 1000
draws = np.random.randint(0, 2, size=(nwalks, nsteps)) # 0 or 1
steps = np.where(draws > 0, 1, -1)
# Cumulative sum along axis 1 gives the position of every walk at each step.
walks = steps.cumsum(1)
walks

In [None]:
# Highest and lowest position reached over all walks.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(walks.max())
print(walks.min())

In [None]:
# For each walk (row): did it ever get at least 30 away from the origin?
# print() with a single argument behaves the same on Python 2 and Python 3.
hits30 = (np.abs(walks) >= 30).any(1)
print(hits30)
print(hits30.sum())  # Number that hit 30 or -30

In [None]:
# Keep only the walks that reached the threshold; argmax along axis 1 then
# gives, per walk, the index of the first step with |position| >= 30.
# print() with a single argument behaves the same on Python 2 and Python 3.
crossing_times = (np.abs(walks[hits30]) >= 30).argmax(1)
print(crossing_times.mean())

In [None]:
# Alternative step distribution: normally distributed steps with mean 0 and
# standard deviation 0.25, one row per walk and one column per step.
steps = np.random.normal(loc=0, scale=0.25,
                         size=(nwalks, nsteps))