## NumPy

#### “add[s] support for large, multidimensional arrays and matrices, along with a large library of high-level mathematical functions to operate on these arrays”


In [206]:
import numpy as np

In [208]:
a = np.array([0, 1, 5, 7, 6, 5, 2, 3, 8, 9])
a[3], a

(7, array([0, 1, 5, 7, 6, 5, 2, 3, 8, 9]))

In [209]:
a[3:7]

array([7, 6, 5, 2])

In [210]:
b=np.array([1,5,7])
b

array([1, 5, 7])

In [211]:
a[[b]]

array([1, 5, 3])

In [213]:
a[a > 5]

array([7, 6, 8, 9])

## Working with matrices: 

In [215]:
a = np.ones((3, 3)) #or a1 = np.ones([3,3])
a , a.shape

(array([[ 1.,  1.,  1.],
        [ 1.,  1.,  1.],
        [ 1.,  1.,  1.]]), (3, 3))

In [217]:
b = np.zeros((4, 2))
b , b.shape

(array([[ 0.,  0.],
        [ 0.,  0.],
        [ 0.,  0.],
        [ 0.,  0.]]), (4, 2))

## RULE: 
Two matrices can be multiplied only when the number of columns in the first equals the number of rows in the second	

In [218]:
a=np.array([[1,2,3],[4,5,6]])
a , a.shape

(array([[1, 2, 3],
        [4, 5, 6]]), (2, 3))

In [219]:
a.T  , a.T.shape

(array([[1, 4],
        [2, 5],
        [3, 6]]), (3, 2))

In [220]:
b=np.array([6, 7])
b , b.shape

(array([6, 7]), (2,))

$$ \bf{A} \dot \bf{B} = \bf{C}$$

In [225]:
c = np.dot(a.T,b) #matrix multiplication : How to (a.T,b,a)?
c , c.shape

(array([34, 47, 60]), (3,))

#### Moving on : 

In [232]:
b = np.arange(12).reshape(6,2)
b , b.shape

(array([[ 0,  1],
        [ 2,  3],
        [ 4,  5],
        [ 6,  7],
        [ 8,  9],
        [10, 11]]), (6, 2))

In [234]:
b.sum(axis=0) , b.sum(axis=1) , b.sum()

(array([30, 36]), array([ 1,  5,  9, 13, 17, 21]), 66)

### Matric object and 2-d matrix indexing and slicing

In [236]:
b=np.mat('1 2 3 4; 5 6 7 8; 9 10 11 12') #Mat
b , b.shape

(matrix([[ 1,  2,  3,  4],
         [ 5,  6,  7,  8],
         [ 9, 10, 11, 12]]), (3, 4))

In [252]:
b , b.shape

(matrix([[ 1,  2,  3,  4],
         [ 5,  6,  7,  8],
         [ 9, 10, 11, 12]]), (3, 4))

In [251]:
c= b[:,:3] 

In [248]:
c ,c.shape

(matrix([[ 1,  2],
         [ 5,  6],
         [ 9, 10]]), (3, 2))

In [254]:
a=np.mat('1;2;3;4')
a , a.shape

(matrix([[1],
         [2],
         [3],
         [4]]), (4, 1))

In [255]:
b , b.shape

(matrix([[ 1,  2,  3,  4],
         [ 5,  6,  7,  8],
         [ 9, 10, 11, 12]]), (3, 4))

In [256]:
c= np.dot(a,b.T)   # how to multiple a*b?

ValueError: objects are not aligned

In [257]:
aa=np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])
aa.shape # # page, row, col → 3D

(2, 2, 3)

In [258]:
bb=np.array([[[3],[4],[6]],[[6],[5],[7]]])
bb.shape # # page, row, col → 3D

(2, 3, 1)

In [260]:
c = np.dot(aa,bb) 

In [261]:
c.shape # --> 4D ! 

(2, 2, 2, 1)

In [262]:
e = np.dot(aa,bb.T) 

In [263]:
e.shape

(2, 2, 1, 2)

### ndarray element-wise operations – scalar and matrix


In [264]:
aa = np.arange(5)
aa , aa.shape

(array([0, 1, 2, 3, 4]), (5,))

In [266]:
aa = aa * 8
aa , aa.shape

(array([  0,  64, 128, 192, 256]), (5,))

In [267]:
bb = np.array([2, 4, 6, 8, 10])
bb , bb.shape

(array([ 2,  4,  6,  8, 10]), (5,))

In [268]:
c = np.multiply(aa, bb)
c , c.shape

(array([   0,  256,  768, 1536, 2560]), (5,))

In [269]:
cc = np.divide(aa.astype(float), bb.astype(float)) #just to make sure
cc , cc.shape

(array([  0.        ,  16.        ,  21.33333333,  24.        ,  25.6       ]),
 (5,))

### Excerise:

####1. Create a 3x3 identity matrix  == A
####2. Create a 4x4x4 array with random values == B
####3. Find A*B

In [278]:
I = np.identity(3)
I , I.shape

(array([[ 1.,  0.,  0.],
        [ 0.,  1.,  0.],
        [ 0.,  0.,  1.]]), (3, 3))

In [280]:
R = np.random.rand(4,4,4,4)
R , R.shape

(array([[[[  5.52217393e-01,   4.51297376e-01,   9.24469009e-01,
             8.20136359e-01],
          [  9.09356953e-01,   3.82545754e-01,   4.28531626e-01,
             1.46147024e-01],
          [  7.32296123e-01,   9.36425282e-01,   2.31274637e-01,
             4.02581955e-01],
          [  5.89390800e-01,   6.51792734e-01,   5.03363719e-01,
             4.09579983e-02]],
 
         [[  1.51219722e-01,   3.85939105e-01,   2.87590528e-01,
             4.13375984e-01],
          [  9.21139228e-02,   1.44632271e-01,   5.27822702e-01,
             4.51297027e-01],
          [  2.60656198e-01,   8.32398417e-01,   1.49100425e-01,
             1.16967302e-01],
          [  1.86780879e-01,   2.95813595e-01,   1.06209158e-01,
             5.84355775e-01]],
 
         [[  9.27081858e-01,   9.92481030e-01,   8.74708639e-01,
             7.03831973e-01],
          [  3.78084596e-01,   4.88297611e-01,   6.97152293e-01,
             2.05738245e-01],
          [  6.59937345e-01,   5.98143460e-0

In [284]:
PROD = np.dot(I,R)

ValueError: objects are not aligned

In [None]:
M = np.dot()

### Inner product 

In [285]:
aa = np.arange(5)
aa , aa.shape

(array([0, 1, 2, 3, 4]), (5,))

In [286]:
bb = np.arange(9)
bb , bb.shape

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]), (9,))

In [289]:
cc = np.inner?
cc , cc.shape

(array([  0.        ,  16.        ,  21.33333333,  24.        ,  25.6       ]),
 (5,))

In [None]:
cc = np.inner

### Compute the eigenvalues and right eigenvectors of a square array

In [290]:
a = np.array([[1,2],[3,4]])
a , a.shape

(array([[1, 2],
        [3, 4]]), (2, 2))

In [291]:
c = np.linalg.eig(a)
c 

(array([-0.37228132,  5.37228132]), array([[-0.82456484, -0.41597356],
        [ 0.56576746, -0.90937671]]))

### Compute the determinant of an array

In [292]:
cc = np.linalg.det(a)

In [293]:
cc 

-2.0000000000000004

### Excerise:

####1. Multiply a 5x3 matrix by a 3x2 matrix (real matrix product)

####3. Create a 5x5 matrix with values 1,2,3,4 just below the diagonal using a function (Extra Credit)!

## Pandas

### Series - one-dimensional labeled array


In [296]:
import pandas as pd
import numpy as np

In [298]:
s = pd.Series(np.random.randn(3))
s , s.shape

(0    1.214346
 1    0.580148
 2    0.133858
 dtype: float64, (3,))

In [299]:
index = ['A','B','C']

In [300]:
SS = pd.Series(np.random.randn(3), index=index)
SS , SS.shape

(A    0.863543
 B    0.350230
 C    0.043459
 dtype: float64, (3,))

In [301]:
SS['A'] # using the labled index

0.86354348053966601

In [302]:
d = {'a' : 0, 'b' : 1, 'c' : 2} # dic style 
d

{'a': 0, 'b': 1, 'c': 2}

In [303]:
pd.Series(d) #convert to DF

a    0
b    1
c    2
dtype: int64

### Series – vector operation support and index alignment


In [304]:
t = pd.Series([30,40], index=['b', 'c' ]) #another way 
t

b    30
c    40
dtype: int64

In [305]:
s = pd.Series(np.arange(4), index=['a', 'b', 'c', 'd'])
s

a    0
b    1
c    2
d    3
dtype: int64

In [306]:
C = t+s

In [307]:
C 

a   NaN
b    31
c    42
d   NaN
dtype: float64

### DataFrame – 2-d labeled data structure


In [308]:
d = {'one' : [10., 20., 30., 40.], 'two' : [4., 3., 2., 1.]}
d

{'one': [10.0, 20.0, 30.0, 40.0], 'two': [4.0, 3.0, 2.0, 1.0]}

In [309]:
DF = pd.DataFrame(d, index=['a','b','c','d']) #convert to DF
DF

Unnamed: 0,one,two
a,10,4
b,20,3
c,30,2
d,40,1


In [310]:
DF.T

Unnamed: 0,a,b,c,d
one,10,20,30,40
two,4,3,2,1


In [311]:
dd={'0':pd.Series([1,2],index=['a','b']),
...     '1':pd.Series([15,25,35],index=['a','b','c'])} #another way using pd.Serise
dd

{'0': a    1
 b    2
 dtype: int64, '1': a    15
 b    25
 c    35
 dtype: int64}

In [312]:
pd.DataFrame(dd)

Unnamed: 0,0,1
a,1.0,15
b,2.0,25
c,,35


### DataFrame – data alignment and arithmetic operations


In [317]:
df = pd.DataFrame(np.floor(np.random.randn(3,4)*10), columns=['K', 'B', 'C', 'D'])

In [318]:
df 

Unnamed: 0,K,B,C,D
0,-6,-11,-8,-1
1,0,14,8,-1
2,-4,-6,22,-4


In [None]:
df = pd.DataFrame(np.floor?

In [319]:
df2 = pd.DataFrame(np.floor(np.random.randn(3,2)*10), columns=['B', 'C'])

In [320]:
df2

Unnamed: 0,B,C
0,-3,-7
1,6,16
2,17,-2


In [321]:
C = df + df2
C

Unnamed: 0,B,C,D,K
0,-14,-15,,
1,20,24,,
2,11,20,,


In [322]:
panel = pd.Panel(np.random.randn(5,3,2).round(decimals=1),
...                  items=['one', 'two', 'three','four','five'],
...                  major_axis=pd.date_range('1/1/2000', periods=3),
...                  minor_axis=['a', 'b'])

In [323]:
panel.to_frame()#(np.random.randn(5,3,2) →  (start,end,steps)

Unnamed: 0_level_0,Unnamed: 1_level_0,one,two,three,four,five
major,minor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-01,a,0.3,2.0,-1.4,1.0,1.2
2000-01-01,b,0.1,0.7,-0.8,-0.5,0.7
2000-01-02,a,-0.6,-1.0,-1.3,-0.0,1.2
2000-01-02,b,-0.2,-0.8,1.2,0.2,-1.9
2000-01-03,a,0.9,-0.6,0.2,-0.7,-0.3
2000-01-03,b,-0.4,1.2,1.0,0.0,-1.2


In [324]:
panel.to_frame().T


major,2000-01-01,2000-01-01,2000-01-02,2000-01-02,2000-01-03,2000-01-03
minor,a,b,a,b,a,b
one,0.3,0.1,-0.6,-0.2,0.9,-0.4
two,2.0,0.7,-1.0,-0.8,-0.6,1.2
three,-1.4,-0.8,-1.3,1.2,0.2,1.0
four,1.0,-0.5,-0.0,0.2,-0.7,0.0
five,1.2,0.7,1.2,-1.9,-0.3,-1.2


### CREATING A DATAFRAME

In [325]:
df = pd.DataFrame({'int_col' : [1,2,6,8,-1], 
                   'float_col' : [0.1, 0.2,0.2,10.1,None], 
                   'str_col' : ['a','b',None,'c','a']})
df

Unnamed: 0,float_col,int_col,str_col
0,0.1,1,a
1,0.2,2,b
2,0.2,6,
3,10.1,8,c
4,,-1,a


In [327]:
df.ix[:3,['float_col','int_col']] #[rows, col]

Unnamed: 0,float_col,int_col
0,0.1,1
1,0.2,2
2,0.2,6
3,10.1,8


In [328]:
df.ix[:2,['float_col','int_col']] #[Start:End,[rows, col]]

Unnamed: 0,float_col,int_col
0,0.1,1
1,0.2,2
2,0.2,6


In [329]:
df[(df['float_col'] > 0.1) & (df['int_col']>2)] #and  "|" or

Unnamed: 0,float_col,int_col,str_col
2,0.2,6,
3,10.1,8,c


In [330]:
df[~(df['float_col'] > 0.1)] #invert 

Unnamed: 0,float_col,int_col,str_col
0,0.1,1,a
4,,-1,a


In [331]:
df2 = df.rename(columns={'int_col' : 'some_other_name'})
df

Unnamed: 0,float_col,int_col,str_col
0,0.1,1,a
1,0.2,2,b
2,0.2,6,
3,10.1,8,c
4,,-1,a


In [332]:
df2.dropna()

Unnamed: 0,float_col,some_other_name,str_col
0,0.1,1,a
1,0.2,2,b
3,10.1,8,c


In [333]:
df3 = df.copy()
df

Unnamed: 0,float_col,int_col,str_col
0,0.1,1,a
1,0.2,2,b
2,0.2,6,
3,10.1,8,c
4,,-1,a


In [335]:
mean = df3['float_col'].mean()
df3['float_col'].fillna(mean)

0     0.10
1     0.20
2     0.20
3    10.10
4     2.65
Name: float_col, dtype: float64

### MAP & APPLY FUNCTIONS

In [336]:
df['str_col'].dropna().map(lambda x : 'map_' + x)


0    map_a
1    map_b
3    map_c
4    map_a
Name: str_col, dtype: object

In [337]:
df[['int_col','float_col']].apply(np.sqrt)

Unnamed: 0,int_col,float_col
0,1.0,0.316228
1,1.414214,0.447214
2,2.44949,0.447214
3,2.828427,3.17805
4,,


### STATISTICS

In [338]:
df.describe()

Unnamed: 0,float_col,int_col
count,4.0,5.0
mean,2.65,3.2
std,4.96689,3.701351
min,0.1,-1.0
25%,0.175,1.0
50%,0.2,2.0
75%,2.675,6.0
max,10.1,8.0


In [339]:
df3.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
float_col,4,2.65,4.96689,0.1,0.175,0.2,2.675,10.1
int_col,5,3.2,3.701351,-1.0,1.0,2.0,6.0,8.0


# Reading Data Frame, Converting to Pandas  

In [340]:
from sklearn.datasets import load_iris

iris = load_iris()

print iris.DESCR

Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:
                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)
    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML iris dat

In [341]:
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

In [343]:
iris_df.head(100)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


### Exercise:

1. Print the summary statistics for all features 
2. Group by 'petal width' the 'petal length' feature
3. Using Plot function in Pandas - plot 'sepal length'
4. Are there any missing values in this data frame? - write a function to cheack that there are no missing values in this DF.
5. 