# Intro to numpy

This notebook's gonna teach you to play around with data using numpy, pandas and matplotlib.

Let's get started!

#### Jupyter primer:
* The blocks below are cells. Click on a cell to select it; press Ctrl+Enter to run the code in it.
* Click the "+" button in the top bar to create a new cell.
* Sometimes we'll ask you to add `<your code here doing such and such things>`

In [None]:
#execute this cell

from __future__ import print_function

import math

In [None]:
#type "math." or "math.s" and press tab
<here>

In [None]:
#type math.atan2() and press shift+tab inside the brackets
<here>

In [None]:
# Unbind the name `math` from this namespace: any later `math.` lookup fails
# with NameError until the import cell above is executed again.
del math

#oops, i destroyed math :)

# Gettin' all vectorized with Numpy

In [None]:
import numpy as np

# The ndarray is numpy's core container: a multidimensional grid of values
# that all share a single type.
a = np.array([1, 2, 3, 4, 5])
b = np.array([5, 4, 3, 2, 1])

print("a = ", a)
print("b = ", b)

In [None]:
# Arithmetic between an array and a scalar is applied to every element.
print("a+1 =", a + 1)
print("a*2 =", a * 2)

# Arithmetic between two same-shaped arrays pairs the elements up by position.
print("a+b =", a + b)
print("a*b + b**a =", a * b + b ** a)


Numpy is also rich in functions that compute all kinds of stuff

In [None]:
#np.sum, np.mean, np.min and np.argmax each reduce the whole array to a single number;
#np.log and np.arctan are applied to every element.
print ("numpy.sum(a) = ",np.sum(a))
print ("numpy.mean(a) = ", np.mean(a))
print ("numpy.min(a) = ", np.min(a))
print ("numpy.argmax(a) = ", np.argmax(a)) #index of maximal element
print ("numpy.log(b) = ",np.log(b))
print ("numpy.arctan(b) = ",np.arctan(b))
print ("numpy.unique(['b','c','d','c','b']) = ",np.unique(['b','c','d','c','b'])) #distinct values, duplicates dropped


### Wax on, wax off

In [None]:
#The basics of numpy
print ("Difference between a and b:", <code>)
print ("Sum of squares of a and b:", <code>)
print ("a minus 2 divided by square root of b", <code>)

In [None]:
print ("difference between minimum and maximum of a", <code>)

In [None]:
print ("mean squared error between a and b", <code>)

## Hello, data

In [None]:
import pandas as pd

# -999 entries are parsed as missing (NaN) while reading, and then every
# missing value in the table is replaced with 0.
data = (
    pd.read_csv("small_higgs.csv", na_values=-999)
    .fillna(0)
)

In [None]:
# `data` is a DataFrame, i.e. a 2-D table; show its first 10 rows.
data.head(10)

In [None]:
# Size and layout of the table: row count, (rows, columns) shape, column labels.
print("len(data) = ", len(data))
print("data.shape = ", data.shape)
print("data.columns = ", data.columns)

In [None]:
# .iloc selects by integer position: position 4 is the fifth row.
print(data.iloc[4])

In [None]:
# Slicing a DataFrame with [start:stop] selects rows; these are the first 5.
print(data[:5])

In [None]:
# Indexing the table with a column name returns that single column.
print(data["PRI_jet_leading_eta"])

In [None]:
# Both at once: pick three columns by name, then the rows at positions 5..9.
data[
    ["Label", "PRI_jet_leading_phi", "PRI_jet_leading_eta"]
].iloc[5:10]

### Column's basically an array

You can do anything you've done before with arrays - now with actual data.

In [None]:
# numpy functions accept a table column directly; show only the first 5 results.
np.cos(data["PRI_jet_leading_phi"]).head(5)

In [None]:
phi = data["PRI_jet_leading_phi"]
print ("sin^2(phi) + cos^2(phi) = ",<code>)

### Numpy booleans and indexing

In [None]:
# Comparisons are elementwise too: the result is a boolean array, one flag per element.
print("a==2", a == 2)

In [None]:
# Boolean arrays can be combined either with the np.logical_* functions or
# with the ~, & and | operators. The operator form needs parentheses around
# each comparison, because &, | and ~ bind tighter than > and <.
print("Boolean ops")

print("a>2", a > 2)                       # supports <, !=, <=, etc.
print("numpy.logical_not(a>2) = ",     np.logical_not(a > 2))
print("numpy.logical_and(a>2,b>2) = ", np.logical_and(a > 2, b > 2))
print("numpy.logical_or(a>2,b<3) = ",  np.logical_or(a > 2, b < 3))

print("\nOr simpler")
print("~(a>2) = ", ~(a > 2))                 # like logical_not
print("(a>2)&(b>2) = ", (a > 2) & (b > 2))   # like logical_and
print("(a>2)|(b<3) = ", (a > 2) | (b < 3))   # like logical_or




In [None]:
print("\n\nSelecting with booleans")

# A boolean mask used as an index keeps only the positions where it is True.
# The mask built from `a` can index `b` as well.
print("a = ", a)
print("a >= 3  =", a >= 3)
print("a[a>=3] =", a[a >= 3])
print("b[a>=3] =", b[a >= 3])
print("numpy.where(a>=3) =", np.where(a >= 3))  # indices where the mask is True


print("\nYou can also select elements by their index")
print("a[[1,3,4]] = ", a[[1, 3, 4]])
print("a[[1,2,2,3]] = ", a[[1, 2, 2, 3]])                        # a[2] appears twice
print("a[numpy.where(a%2!=0)] = ", a[np.where(a % 2 != 0)])      # same as a[a%2!=0]

### Doing stuff

In [None]:
#Compute mean and standard deviation of PRI_jet_all_pt
#of only those events where PRI_jet_num equals 2

In [None]:
<your code here>

In [None]:
#Find the data row with largest PRI_jet_all_pt among those where PRI_jet_num equals 1,
#Print that row
<your code here>

### More numpy

* Full reference aka docs [[here]](https://docs.scipy.org/doc/numpy/reference/)
* Cheat sheet - [[here]](https://pbs.twimg.com/media/C23mQfaWEAAGzBn.jpg)
* Usually you can make your way with tab-ing and shift-tab-ing and following your gut feeling.
 * This is likely not the most efficient way to do so.

In [None]:
#More useful things in numpy:
# Every printed label below spells out exactly the call that produces the value next to it.

print("Special numpy array creators:")
print("np.zeros(6) = ", np.zeros(6))
print("np.ones(6) = ", np.ones(6))                 # same values as np.zeros(6) + 1
print("np.zeros_like(a) = ", np.zeros_like(a))     # zeros with the shape and dtype of a

print("\n")
print("np.arange(5) = ", np.arange(5))
print("np.linspace(0,5,num=11) = ", np.linspace(0, 5, num=11))                 # 11 evenly spaced points, both ends included
print("np.random.normal(0,0.1,size=3) = ", np.random.normal(0, 0.1, size=3))   # 3 random draws; values differ on every run


print("Shapes")
v = np.arange(6)
print("v = ", v)
print("np.reshape(v,[2,3]) = \n", np.reshape(v, [2, 3]))
print("np.reshape(v,[2,-1]) = \n", np.reshape(v, [2, -1]))   # -1 means "figure that dimension out by yourself"

print("np.reshape(v,[6,1]) = \n", np.reshape(v, [6, 1]))
print("v[:,None] = \n", v[:, None])                          # indexing with None adds an axis: same (6,1) column

In [None]:
#Multi-dimensional arrays work exactly as 1d ones, but with more axes
mat = np.reshape(v,[3,2])
print("mat = \n",mat)
print("mat.shape =",mat.shape)
print("mat[2,1] = ",mat[2,1])  #row index 2, column index 1 (third row, second element; indices start from 0)
print("mat[:,0] = ",mat[:,0])  #first column
print("mat[0:2,1] = ",mat[0:2,1])  #first two elems of second column


print("\nAxes:")
print("np.sum(mat) = ",np.sum(mat))                #no axis: sum of every element
print("np.sum(mat,axis=0) = ",np.sum(mat,axis=0))  #collapses axis 0: one sum per column
print("np.sum(mat,axis=1) = ",np.sum(mat,axis=1))  #collapses axis 1: one sum per row


In [None]:
print("A glimpse into broadcasting")
# A (6,1) column plus a (1,6) row is stretched into a (6,6) table of pairwise sums.
print(
    "np.reshape(v,[6,1]) + np.reshape(v,[1,6]) = \n",
    v.reshape(6, 1) + v.reshape(1, 6),
)

In [None]:
# Broadcasting lines shapes up starting from the last axis, so the matrix can be
# combined elementwise with a vector whose length equals the matrix's last axis:
# here every row of the matrix is divided by the vector of column means.
print("matrix shape =", mat.shape)
print("vector shape =", mat.mean(axis=0).shape)
print("Result:")

print(mat / mat.mean(axis=0))



In [None]:
#So for example, you can't divide matrix by a column vector...
#The vector of row means has 3 entries but the matrix's last axis has length 2,
#so the shapes cannot be broadcast together and numpy raises ValueError.
#The error is caught and printed so that "Run All" continues past this cell.

print("matrix shape =",mat.shape)
print("vector shape =",np.mean(mat,axis=1).shape)
print("Result:")

try:
    print(mat / np.mean(mat,axis=1))
except ValueError as err:
    print("ValueError:", err)

In [None]:
# ...but a single-column matrix works: once the row means are reshaped to (3,1),
# the length-1 last axis is broadcast across the columns of the matrix.

print("matrix shape =", mat.shape)
print("vector shape =", mat.mean(axis=1).reshape(3, 1).shape)
print("Result:")

mat / mat.mean(axis=1).reshape(3, 1)

# The quest

In [None]:
# Take four columns of the table and expose them as a plain numpy array.
matrix = data[
    ["DER_mass_MMC", "DER_mass_transverse_met_lep", "DER_mass_vis", "DER_pt_h"]
].values
matrix

__Normalize the matrix:__ subtract from each column its mean and divide by the standard deviation. 

I suggest np.mean, np.std with axis parameter.

In [None]:
matrix_normalized = <your code>

In [None]:
print("Sanity check, print new mean and new std for each column")
<your code>

For the following matrix, print the numbers (indices) of the rows whose sum is greater than 1. 
I suggest np.sum and np.where.

In [None]:
<your code>

```

```
```

```
```

```
```

```
```

```
```

```
```

```
```

```
```

```
```

```
```

```
```

```
```

```


## Performance stuff: "how fast is it, Harry?"
![img](http://statics.viralizalo.com/virs/2016/02/VIR_155188_11939_harry_potter_y_el_prisionero_de_azkaban_test_solo_para_fans.jpg?cb=383)


Let's see if numpy is any faster than loops

* Doing some elementwise ops
* Three options
 * Pure python
 * Start python, convert to numpy
 * Pure numpy



In [None]:
%%time
#This cell magic prints the time it took to run the cell


#Pure python
# Baseline for the timing comparison: everything is done with a Python-level loop.

arr_1 = range(1000000)
arr_2 = range(99,1000099)


a_sum = []
a_prod = []
sqrt_a1 = []

#elementwise sum/prod/sqrt
for x, y in zip(arr_1, arr_2):
    a_sum.append(x + y)
    a_prod.append(x * y)
    sqrt_a1.append(x ** 0.5)   # square roots go to their own list, not to a_sum

#sum of all elements of arr_1
arr_1_sum = sum(arr_1)


In [None]:
%%time

#start with list, convert to numpy
# list(...) materialises real Python lists (on Python 3 a bare range is a lazy
# sequence, not a list), so the timing includes the list -> array conversion.
list_1 = list(range(1000000))
list_2 = list(range(99,1000099))

arr_1, arr_2 = np.array(list_1), np.array(list_2)


a_sum = arr_1 + arr_2
a_prod = arr_1*arr_2
sqrt_a1 = arr_1**.5

np.sum(arr_1)


In [None]:
%%time

# Pure numpy: both inputs are created directly as arrays and every
# operation below is a single vectorised call.
arr_1 = np.arange(1000000)
arr_2 = np.arange(99, 1000099)

a_sum = arr_1 + arr_2
a_prod = arr_1 * arr_2
sqrt_a1 = arr_1 ** 0.5

arr_1.sum()


```

```
```

```
```

```
```

```
```

```
```

```
```

```
```

```
```

```
```

```
```

```
```

```
```

```


# Matplotlib

Simple and powerful plotting to make sense of data.

![img](https://imgs.xkcd.com/comics/extrapolating.png)


In [None]:
import matplotlib.pyplot as plt
#VV This is an official "magic" to force plots inside your notebook
%matplotlib inline                

# Line plot: the first list holds the x coordinates, the second the matching y values.
plt.plot([0, 1, 2, 3, 4, 5], [0, 1, 4, 9, 16, 25])

In [None]:
# Line plot with better formatting.
# Draw the curve first (its label feeds the legend), then decorate the same axes.
plt.plot([0, 1, 2, 3, 4, 5], [0, 1, 4, 9, 16, 25], marker='o', label='super-curve')

plt.title("Super-plot")
plt.xlabel("Number of layers")
plt.ylabel("Coolness")

plt.legend(loc='best')
plt.grid()

In [None]:
# Scatter plots

# First figure: numpy arrays can be passed straight to matplotlib.
plt.scatter(np.arange(10), np.arange(10) ** 2)

# Render the first figure; anything drawn after this goes to a new one.
plt.show()

# Second figure: two scatter calls before the next show() share the same axes.
plt.scatter([1, 1, 2, 3], [10, 12, 6, 20], c=["red", "blue", "blue", "green"], marker="x")
plt.scatter([0, 1, 2, 3, 4, 5], [0, 1, 4, 9, 16, 25], c="black")


In [None]:
# Histogram: the same sample is drawn twice, first with the default binning
# and then with exactly 5 bins.
hist_values = [0, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 4, 5, 5, 5, 6, 7, 7, 8, 9, 10]

plt.hist(hist_values)
plt.show()


plt.hist(hist_values, bins=5)


#now gently press shift+tab+tab with cursor inside plt.hist  -^^

In [None]:
#plot a histogram of PRI_jet_leading_pt

<your code>

#bonus: separate histograms for PRI_jet_leading_pt where PRI_jet_num equals 0,1,2 and 3 respectively


In [None]:
#make a scatter-plot of DER_mass_MMC vs DER_mass_jet_jet

<your code here>

#bonus: make it as clear as possible with formatting

### More matplotlib

* Docs - [[url]](https://matplotlib.org/2.0.2/contents.html)
* Cheat sheet - [[url]](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/Python_Matplotlib_Cheat_Sheet.pdf)
* Examples - [[url]](https://matplotlib.org/examples/)

In [None]:
#Final boss - find a way to display a 2-dimensional histogram 
#of DER_mass_MMC vs DER_mass_jet_jet

```

```
```

```
```

```
```

```
```

```
```

```
```

```
```

```
```

```
```

```
```

```
```

```
```

```

<center>You made it!</center>

![img](https://pbs.twimg.com/media/CrHblh9WgAEoH1u.jpg)