In [None]:
!pip install fortran-magic
%matplotlib inline
%load_ext fortranmagic

import sys; sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Default figure size (width, height in inches) for every matplotlib figure.
mpl.rc('figure', figsize=(12, 7))

# Sentinel: a later cell asserts on this name to confirm this setup cell was run.
ran_the_first_cell = True

# 21 UTC-stamped dates from 2017-01-03 through 2017-02-01. Weekends, 2017-01-02 and
# 2017-01-16 are absent -- presumably a trading-day calendar (TODO confirm).
jan2017 = pd.to_datetime(['2017-01-03 00:00:00+00:00',
 '2017-01-04 00:00:00+00:00',
 '2017-01-05 00:00:00+00:00',
 '2017-01-06 00:00:00+00:00',
 '2017-01-09 00:00:00+00:00',
 '2017-01-10 00:00:00+00:00',
 '2017-01-11 00:00:00+00:00',
 '2017-01-12 00:00:00+00:00',
 '2017-01-13 00:00:00+00:00',
 '2017-01-17 00:00:00+00:00',
 '2017-01-18 00:00:00+00:00',
 '2017-01-19 00:00:00+00:00',
 '2017-01-20 00:00:00+00:00',
 '2017-01-23 00:00:00+00:00',
 '2017-01-24 00:00:00+00:00',
 '2017-01-25 00:00:00+00:00',
 '2017-01-26 00:00:00+00:00',
 '2017-01-27 00:00:00+00:00',
 '2017-01-30 00:00:00+00:00',
 '2017-01-31 00:00:00+00:00',
 '2017-02-01 00:00:00+00:00'])
# Same dates as a NumPy array cast to day precision (datetime64[D]).
calendar = jan2017.values.astype('datetime64[D]')

# Three consecutive event dates, also cast to datetime64[D]. Only 2017-01-06 appears
# in jan2017 above; 2017-01-07 and 2017-01-08 do not.
event_dates = pd.to_datetime(['2017-01-06 00:00:00+00:00', 
                             '2017-01-07 00:00:00+00:00', 
                             '2017-01-08 00:00:00+00:00']).values.astype('datetime64[D]')
# One value per entry of event_dates (same length, same order).
event_values = np.array([10, 15, 20])

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fortran-magic
  Downloading fortran_magic-0.7-py3-none-any.whl (9.6 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi, fortran-magic
Successfully installed fortran-magic-0.7 jedi-0.18.2


  self._lib_dir = os.path.join(get_ipython_cache_dir(), 'fortran')


<center>
  <h1>The PyData Toolbox</h1>
  <h3>Scott Sanderson (Twitter: @scottbsanderson, GitHub: ssanderson)</h3>
  <h3><a href="https://github.com/ssanderson/pydata-toolbox">https://github.com/ssanderson/pydata-toolbox</a></h3>
</center>

# About Me:

<img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/me.jpg" alt="Drawing" style="width: 300px;"/>

- Senior Engineer at [Quantopian](https://www.quantopian.com)
- Background in Mathematics and Philosophy
- **Twitter:** [@scottbsanderson](https://twitter.com/scottbsanderson)
- **GitHub:** [ssanderson](https://github.com/ssanderson)

## Outline

- Built-in Data Structures
- Numpy `array`
- Pandas `Series`/`DataFrame`
- Plotting and "Real-World" Analyses

# Data Structures

> Rule 5. Data dominates. If you've chosen the right data structures and organized things well, the algorithms
will almost always be self-evident. Data structures, not algorithms, are central to programming.

- *Notes on Programming in C*, by Rob Pike.

# Lists

In [None]:
assert ran_the_first_cell, "Oh noes!"

In [None]:
l = [1, 'two', 3.0, 4, 5.0, "six"]
l

[1, 'two', 3.0, 4, 5.0, 'six']

In [None]:
# Lists can be indexed like C-style arrays.
first = l[0]
second = l[1]
print("first:", first)
print("second:", second)

first: 1
second: two


In [None]:
# Negative indexing gives elements relative to the end of the list.
last = l[-1]
penultimate = l[-2]
print("last:", last)
print("second to last:", penultimate)

last: six
second to last: 5.0


In [None]:
# Lists can also be sliced, which makes a copy of elements between 
# start (inclusive) and stop (exclusive)
sublist = l[1:3]
sublist

['two', 3.0]

In [None]:
# l[:N] is equivalent to l[0:N].
first_three = l[:3]
first_three

[1, 'two', 3.0]

In [None]:
# l[3:] is equivalent to l[3:len(l)].
after_three = l[3:]
after_three

[4, 5.0, 'six']

In [None]:
# There's also a third parameter, "step", which gets every Nth element.
l = ['a', 'b', 'c', 'd', 'e', 'f', 'g','h']
l[1:7:2]

['b', 'd', 'f']

In [None]:
# This is a cute way to reverse a list.
l[::-1]

['h', 'g', 'f', 'e', 'd', 'c', 'b', 'a']

In [None]:
# Lists can be grown efficiently (in O(1) amortized time).
l = [1, 2, 3, 4, 5]
print("Before:", l)
l.append('six')
print("After:", l)

Before: [1, 2, 3, 4, 5]
After: [1, 2, 3, 4, 5, 'six']


In [None]:
# Comprehensions let us perform elementwise computations.
l = [1, 2, 3, 4, 5]
[x * 2 for x in l]

[2, 4, 6, 8, 10]

## Review: Python Lists

- Zero-indexed sequence of arbitrary Python values.
- Slicing syntax: `l[start:stop:step]` copies elements at regular intervals from `start` to `stop`.
- Efficient (`O(1)`) appends and removes from end.
- Comprehension syntax: `[f(x) for x in l if cond(x)]`.

# Dictionaries

In [None]:
# Dictionaries are key-value mappings.
philosophers = {'David': 'Hume', 'Immanuel': 'Kant', 'Bertrand': 'Russell'}
philosophers

{'David': 'Hume', 'Immanuel': 'Kant', 'Bertrand': 'Russell'}

In [None]:
# Like lists, dictionaries are size-mutable.
philosophers['Ludwig'] = 'Wittgenstein'
philosophers

{'David': 'Hume',
 'Immanuel': 'Kant',
 'Bertrand': 'Russell',
 'Ludwig': 'Wittgenstein'}

In [None]:
del philosophers['David']
philosophers

{'Immanuel': 'Kant', 'Bertrand': 'Russell', 'Ludwig': 'Wittgenstein'}

In [None]:
# No slicing.
philosophers['Bertrand':'Immanuel']

TypeError: ignored

## Review: Python Dictionaries

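- Key-value mapping from (almost) arbitrary keys to arbitrary values. Keys must be hashable.
- Efficient (`O(1)` on average) lookup, insertion, and deletion.
- No slicing: entries are found by key, not by position. (Since Python 3.7 dictionaries preserve insertion order, but that order still cannot be indexed or sliced.)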

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/pacino.gif" alt="Drawing" style="width: 100%;"/></center>


In [None]:
# Suppose we have some matrices...
a = [[1, 2, 3],
     [2, 3, 4],
     [5, 6, 7],
     [1, 1, 1]]

b = [[1, 2, 3, 4],
     [2, 3, 4, 5]]

In [None]:
def matmul(A, B):
    """Return the product of A and B, each given as a list of row lists.

    The result has len(A) rows and len(B[0]) columns. Each entry is the sum of
    A[i][k] * B[k][j] for k over the rows of B (len(B) terms). No check is made
    that the number of columns of A matches the number of rows of B.
    """
    n_rows = len(A)
    n_cols = len(B[0])
    n_terms = len(B)

    product = []
    for i in range(n_rows):
        row = []
        for j in range(n_cols):
            # Accumulate the (i, j) entry term by term.
            acc = 0
            for k in range(n_terms):
                acc += A[i][k] * B[k][j]
            row.append(acc)
        product.append(row)
    return product

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/gross.gif" alt="Drawing" style="width: 50%;"/></center>


In [None]:
%%time

matmul(a, b)

CPU times: user 29 µs, sys: 4 µs, total: 33 µs
Wall time: 37.2 µs


[[5, 8, 11, 14], [8, 13, 18, 23], [17, 28, 39, 50], [3, 5, 7, 9]]

**My own example 0 - cpu info**

In [None]:
!cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 79
model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
stepping	: 0
microcode	: 0xffffffff
cpu MHz		: 2199.998
cache size	: 56320 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 1
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap xsaveopt arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa mmio_stale_data retbleed
bogomips	: 4399.99
clflush size	: 64
cache_alignment	: 64
addres

**My own example 1 - Replacing len(B) (# of rows of B) with len(A[0]) (# of columns of A) in the Python matmul(A, B)**

In [None]:
A = np.array([[1, 2, 3], [4, 5, 6]])  
B = np.array([[7, 8], [9, 10], [11, 12]])  

C = np.matmul(A, B)
print("Producto de matrices con las dimensiones originales", C)

Producto de matrices con las dimensiones originales [[ 58  64]
 [139 154]]


In [None]:
A = np.array([[1, 2, 3], [4, 5, 6]])
B = np.array([[7, 8, 13], [9, 10, 14], [11, 12, 15]])
C = np.matmul(A, B.T)

print("Producto de matrices con B transpuesta", C)

Producto de matrices con B transpuesta [[ 62  71  80]
 [146 170 194]]


**My own example 2 - Verifying the error in the Python matmul(A, B) with the original matrices after replacing len(B) (# of rows of B) with len(A[0]) (# of columns of A)**

In [None]:
#El error ocurrira por que las matrices tienen las dimensiones incorrectas para ser operadas.
A = np.array([[1, 2, 3], [4, 5, 6]])
B = np.array([[7, 8, 13], [9, 10, 14]])
C = np.matmul(A, B)

ValueError: ignored

**My own example 3 - Checking the matrix multiplication compatibility condition len(A[0]) == len(B)**

In [None]:
A = np.array([[1, 2, 3], [4, 5, 6]])
B = np.array([[7, 8, 13], [9, 10, 14]])

if A.shape[1] == B.shape[0]:
  print("la multiplicacion entre A y B es posible, victoria.")
else:
  print("no se puede realizar la multiplicacion")

no se puede realizar la multiplicacion


**My own example 4 - Verifying the error in the Python matmul(A, B) when checking the matrix multiplication compatibility condition len(A[0]) == len(B)**

In [None]:
# A is 2x3 and B is 3x2, so the product A @ B is defined and has shape 2x2.
A = np.array([[1, 2, 3], [4, 5, 6]])
B = np.array([[7, 8], [9, 10], [11, 12]])

# The product exists only when the column count of A equals the row count of B.
if A.shape[1] != B.shape[0]:
    print("Las matrices no tienen una forma compatible que permita la multiplicacion.")
else:
    C = np.matmul(A, B)
    # Print inside the branch: C is only (re)computed when the shapes are
    # compatible, so printing it unconditionally could show a stale value
    # or raise NameError.
    print(C)

[[ 58  64]
 [139 154]]


**My own example 5 - Defining A and B that are compatible for multiplication**

In [None]:
A = np.array([[1, 2], [3, 4], [5, 6]])  
B = np.array([[7, 8, 9], [10, 11, 12]])  

C = np.matmul(A, B)
print("El resultado de la multiplicacion entre estas matrices sera una matris de 3x3", C)

El resultado de la multiplicacion entre estas matrices sera una matris de 3x3 [[ 27  30  33]
 [ 61  68  75]
 [ 95 106 117]]


**My own example 6 - Running the correct Python matrix multiplication code with matrices whose dimensions are compatible for multiplication**

In [None]:
import random

In [None]:
random.normalvariate(0,1)

0.5225605886179646

In [None]:
a1 = [[11, 12, 13],
     [16, 15, 14],
     [17, 18, 19]]

b1 = [[11, 21, 31],
     [24, 36, 45],
      [23, 24, 12]]

def matmul(A, B):
    """Multiply matrix A by matrix B, each given as a list of row lists.

    Args:
        A: left operand with len(A) rows and len(A[0]) columns.
        B: right operand with len(B) rows and len(B[0]) columns.

    Returns:
        A new list of lists with len(A) rows and len(B[0]) columns.

    Raises:
        ValueError: if the number of columns of A differs from the number of
            rows of B, i.e. the product is not defined.
    """
    rows_out = len(A)
    cols_out = len(B[0])
    inner = len(A[0])
    if inner != len(B):
        # Stop here: carrying on would either index out of range or silently
        # return a wrong product.
        raise ValueError("Los tamaños de las matrices no son compatibles.")

    out = [[0 for col in range(cols_out)] for row in range(rows_out)]
    for x in range(rows_out):
        for z in range(cols_out):
            for k in range(inner):
                out[x][z] += A[x][k] * B[k][z]
    return out


matmul(a1,b1)

[[708, 975, 1037], [858, 1212, 1339], [1056, 1461, 1565]]

**My own example 7 - Running matmul(randa, randb) 10 times, with randa and randb random matrices of 600 x 100 and 100 x 600, and calculating the average execution time**

In [None]:
import time

# Benchmark inputs: a random 600x100 matrix and a random 100x600 matrix.
m = 600
n = 100
randa = np.random.rand(m, n)
randb = np.random.rand(n, m)

# NOTE(review): this loop times np.matmul, not the pure-Python matmul that the
# section heading refers to -- confirm which one was intended.
num_runs = 10
total_time = 0.0
for i in range(num_runs):
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short intervals; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    np.matmul(randa, randb)
    end_time = time.perf_counter()
    total_time += end_time - start_time

# Compute the average once, after all runs have been accumulated.
avg_time = total_time / num_runs

print(f"Tiempo promedio de operacion {num_runs} son: {avg_time:.6f} seconds")

Tiempo promedio de operacion 10 son: 0.003609 seconds


**My own example 8 - Creating the average execution time data frame and adding Python's average execution time**

In [None]:
# Fresh random operands: 600x100 times 100x600.
shape_a, shape_b = (600, 100), (100, 600)
randa = np.random.rand(*shape_a)
randb = np.random.rand(*shape_b)

# Wall-clock duration of each of 100 np.matmul calls.
times = []
for i in range(100):
    start_time = time.time()
    result = np.matmul(randa, randb)
    end_time = time.time()
    elapsed = end_time - start_time
    times.append(elapsed)

custom_avg_time = sum(times) / len(times)

# Two-row summary table; the 'Python' row has no measurement yet (NaN).
columns = {
    'Metodo': ['Custom', 'Python'],
    'Tiempo de ejecucion': [custom_avg_time, np.nan],
}
df = pd.DataFrame(columns)
print(df)



   Metodo  Tiempo de ejecucion
0  Custom             0.005285
1  Python                  NaN


**My own example 9 - Multiplying randa and randb 10 times as NumPy arrays and adding NumPy's average execution time**

In [None]:
# Fresh random operands: 600x100 times 100x600.
randa = np.random.rand(600, 100)
randb = np.random.rand(100, 600)

# Time ten NumPy matrix products, keeping one wall-clock duration per run.
np_times = []
i = 0
while i < 10:
    start_time = time.time()
    result = np.matmul(randa, randb)
    stop_time = time.time()
    np_times.append(stop_time - start_time)
    i += 1
# Leave the loop index at the same final value a `for i in range(10)` would.
i = 9

np_avg_time = np.mean(np_times)

print("Tiempo promedio de ejecucion en NumPy", np_avg_time)

Tiempo promedio de ejecucion en NumPy 0.004503726959228516


In [None]:
%%time
randa = random_matrix(600, 100)
randb = random_matrix(100, 600)
x = matmul(randa, randb)

CPU times: user 11.4 s, sys: 1.98 ms, total: 11.4 s
Wall time: 11.5 s


In [None]:
# Maybe that's not that bad?  Let's try a simpler case.
def python_dot_product(xs, ys):
    """Return the sum of x * y over the paired elements of xs and ys.

    Pairing uses zip, so it stops at the end of the shorter input; two empty
    inputs give 0.
    """
    total = 0
    for x, y in zip(xs, ys):
        total = total + x * y
    return total

In [None]:
%%fortran
subroutine fortran_dot_product(xs, ys, result)
    ! Dot product of two double-precision vectors.
    !   xs, ys : rank-1 assumed-shape input arrays (read only)
    !   result : scalar output, the sum of the element-wise products
    ! Assumes xs and ys have the same length; nothing here checks it.
    double precision, intent(in) :: xs(:)
    double precision, intent(in) :: ys(:)
    double precision, intent(out) :: result
    
    ! Whole-array multiply, then reduce with the SUM intrinsic.
    result = sum(xs * ys)
end

In [None]:
list_data = [float(i) for i in range(100000)]
array_data = np.array(list_data)

In [None]:
%%time
python_dot_product(list_data, list_data)

CPU times: user 12.1 ms, sys: 0 ns, total: 12.1 ms
Wall time: 12.8 ms


333328333350000.0

In [None]:
%%time
fortran_dot_product(array_data, array_data)

CPU times: user 175 µs, sys: 0 ns, total: 175 µs
Wall time: 180 µs


333328333350000.0

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/sloth.gif" alt="Drawing" style="width: 1080px;"/></center>


**My own example 10 - Defining A (2x2) and B (2x2)**

In [None]:
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

**My own example 11 - Defining Fortran subroutine matmul(A,B) for 2x2 matrices**

In [None]:
fortran_dot_product(A, B)

70.0

**My own example 12 -Run Fortran subroutine matmul(A,B) with a and b 2x2 matrices**

In [None]:
fortran_dot_product(A,B)

70.0

**My own example 13 - Defining Fortran subroutine matmul(A,B) for 600x100 and 100x600 matrices**

In [None]:
fortran_dot_product(A,B)

15005.96714841066

**My own example 14 -Run Fortran subroutine matmul(A,B) with 600x100 and 100x600 matrices**

In [None]:
a = random_matrix(600, 100)
b = random_matrix(100, 600)
fortran_dot_product(a, b)

15075.966898495348

**My own example 15 - Running 10 times the  Fortran subroutine matmul(A,B) with 600x100 and 100x600 matrices and adding Fortran magic average execution time to the data frame**

In [None]:
%%time
for i in range(10):
  A = random_matrix(600, 100)
  B = random_matrix(100, 600)
  total = fortran_dot_product(A,B)
  print(total)

14983.254903621562
14958.17590583753
14941.077666011563
15047.819820642266
15171.638318731972
14895.774885780364
15083.556874568376
14947.2541232383
14997.009464309227
15020.315097826779
CPU times: user 236 ms, sys: 5.5 ms, total: 242 ms
Wall time: 259 ms


**My own example 16 - Creating a Fortran program that multiplies the A (600x100) and B (100x600) matrices 10 times**

In [None]:
def diez(A,B):
  """Call fortran_dot_product(A, B) ten times, printing each returned value."""
  for _ in range(10):
    value = fortran_dot_product(A,B)
    print(value)

A = random_matrix(600, 100)
B = random_matrix(100, 600)
diez(A,B)

**My own example 17 - Running the Fortran program that multiplies the A (600x100) and B (100x600) matrices 10 times**

In [None]:
def diez(A,B):
  for i in range(10):
    print(fortran_dot_product(A,B))

A = random_matrix(600, 100)
B = random_matrix(100, 600)
diez(A,B)

14965.78903442156
14965.78903442156
14965.78903442156
14965.78903442156
14965.78903442156
14965.78903442156
14965.78903442156
14965.78903442156
14965.78903442156
14965.78903442156


**My own example 18 - Adding Fortran average execution time to the data frame**

In [None]:
%%time
def diez(A,B):
  for i in range(10):
    print(fortran_dot_product(A,B))

A = random_matrix(600, 100)
B = random_matrix(100, 600)
diez(A,B)

15005.96714841066
15005.96714841066
15005.96714841066
15005.96714841066
15005.96714841066
15005.96714841066
15005.96714841066
15005.96714841066
15005.96714841066
15005.96714841066
CPU times: user 161 ms, sys: 1.74 ms, total: 162 ms
Wall time: 359 ms


**My own example 19 - Creating a C program that multiplies the A (600x100) and B (100x600) matrices 10 times**

In [None]:

%%writefile multma.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>       // for clock_t, clock(), CLOCKS_PER_SEC, time()
#include <unistd.h>     // for sleep()

#define N_RUNS 10       /* number of timed multiplications */

/*
 * Multiply the row-major matrix A (n x m) by the row-major matrix B (m x p)
 * and store the row-major result in C (n x p).
 */
static void matmul(const double *A, const double *B, double *C,
                   int n, int m, int p) {
    int i, j, k;
    for (i = 0; i < n; i++) {
        for (j = 0; j < p; j++) {
            double sum = 0.0;
            for (k = 0; k < m; k++) {
                sum += A[i * m + k] * B[k * p + j];
            }
            C[i * p + j] = sum;
        }
    }
}

/*
 * Benchmark: fill A (600x100) and B (100x600) with random values in [0, 1],
 * multiply them N_RUNS times and report the CPU time of each run and the average.
 * Returns 0 on success, 1 if memory could not be allocated.
 */
int main() {
    int i, run;
    const int n = 600, m = 100, p = 600;
    double *A = (double *) malloc((size_t) n * m * sizeof(double));
    double *B = (double *) malloc((size_t) m * p * sizeof(double));
    double *C = (double *) malloc((size_t) n * p * sizeof(double));
    double total_time = 0.0;
    double checksum = 0.0;

    /* malloc may fail; free(NULL) is a no-op, so releasing all three is safe. */
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "Memory allocation failed\n");
        free(A);
        free(B);
        free(C);
        return 1;
    }

    // Initialize matrices A and B with random values
    srand((unsigned) time(NULL));
    for (i = 0; i < n * m; i++) {
        A[i] = (double) rand() / RAND_MAX;
    }
    for (i = 0; i < m * p; i++) {
        B[i] = (double) rand() / RAND_MAX;
    }

    // Multiply matrices A and B N_RUNS times, timing each multiplication
    for (run = 0; run < N_RUNS; run++) {
        clock_t start = clock();
        matmul(A, B, C, n, m, p);
        clock_t end = clock();
        double cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
        printf("Run %d execution time: %.4f seconds\n", run, cpu_time_used);
        total_time += cpu_time_used;
    }

    /*
     * Print a one-number summary of C instead of all 360000 entries: it shows
     * the product was computed without flooding the notebook output.
     */
    for (i = 0; i < n * p; i++) {
        checksum += C[i];
    }
    printf("Sum of all elements of C: %.2f\n", checksum);
    printf("Average execution time: %.4f seconds\n", total_time / N_RUNS);

    // Free memory
    free(A);
    free(B);
    free(C);

    return 0;
}

Overwriting multma.c


**My own example 20 - Running the C program that multiplies the A (600x100) and B (100x600) matrices 10 times**

In [None]:
%%time
!g++ multma.c -o matrix_mul
!./matrix_mul

Matrix C:
21.13 26.15 25.56 26.68 26.06 26.32 22.81 23.81 24.05 24.33 25.25 26.09 25.12 27.05 24.97 26.92 24.27 25.63 25.64 24.20 27.11 24.63 27.16 23.37 20.33 23.18 23.99 24.65 23.47 26.62 22.79 23.86 26.71 25.06 24.90 22.82 21.38 23.58 26.09 24.42 23.76 25.74 23.96 24.33 24.35 26.07 25.19 27.21 25.93 29.02 23.43 26.79 28.03 23.20 22.99 25.07 25.57 24.77 24.47 24.56 25.32 25.76 26.39 23.41 28.09 26.47 24.89 24.66 23.93 26.05 24.28 23.76 22.60 24.46 27.56 25.72 27.55 22.50 22.70 24.82 24.06 22.68 24.70 26.30 26.11 26.48 26.47 28.59 25.92 22.64 25.59 23.96 25.10 23.03 22.23 26.70 24.28 27.56 22.67 27.06 23.35 21.65 25.74 24.61 26.57 27.25 23.32 27.08 24.55 26.78 25.47 24.43 23.90 23.98 23.77 24.88 26.63 24.13 23.02 26.45 25.43 23.40 23.08 24.93 23.11 27.04 26.90 27.60 22.32 22.01 25.96 28.55 21.62 22.45 24.93 23.88 25.49 23.28 24.72 24.46 23.02 25.38 25.25 23.84 22.41 24.72 22.99 23.55 23.39 20.84 26.78 22.81 25.07 23.64 24.03 23.31 26.80 28.05 26.20 26.28 28.21 21.88 23.30 25.93 22.18 

**My own example 21 - Adding c average execution time to the data frame**

In [None]:
%%time
!g++ multma.c -o matrix_mul
!./matrix_mul

Matrix C:
23.88 27.02 25.10 25.49 23.94 26.88 24.50 21.64 23.17 24.10 26.77 25.90 25.98 28.04 23.46 22.26 23.94 24.40 26.48 28.77 23.91 26.66 22.42 25.28 22.59 21.98 25.58 24.48 24.34 24.19 25.63 25.77 24.65 27.17 24.38 24.97 25.40 24.78 26.26 23.45 27.81 27.24 24.56 20.98 23.88 23.87 24.39 24.05 22.84 21.03 24.74 24.45 24.43 23.12 25.46 23.70 25.26 22.88 23.76 23.07 25.06 26.61 23.41 25.47 23.45 25.00 24.08 25.12 23.32 23.60 24.71 25.43 23.85 22.93 26.44 26.33 25.94 24.07 24.57 26.34 24.66 25.67 23.83 22.27 23.28 26.59 27.18 27.87 27.86 25.25 24.96 24.50 25.80 23.86 24.36 24.43 26.86 26.29 25.44 24.96 25.39 22.73 24.23 25.97 23.18 24.47 26.77 23.11 24.43 25.85 24.18 24.06 22.75 28.43 22.65 26.59 25.97 26.18 25.38 26.30 22.80 25.85 26.05 21.48 24.72 21.98 25.91 25.96 21.81 22.98 24.05 26.22 28.09 24.74 23.57 23.48 26.76 24.07 27.70 24.53 25.29 24.37 23.10 25.13 27.41 26.44 25.44 24.74 23.31 25.36 25.55 23.19 23.89 24.32 23.35 22.43 26.68 23.16 23.57 25.72 26.54 23.17 24.47 25.26 25.00 

**My own example 22 - Creating a C++ program that multiplies the A (600x100) and B (100x600) matrices 10 times**

In [None]:
%%writefile multma.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>       // for clock_t, clock(), CLOCKS_PER_SEC
#include <unistd.h>     // for sleep()

using namespace std;

int main() {
    int N = 10;
    int m1 = 600, n1 = 100, m2 = 100, n2 = 600;
    double* A = (double*) malloc(m1 * n1 * sizeof(double));
    double* B = (double*) malloc(m2 * n2 * sizeof(double));
    double* C = (double*) malloc(m1 * n2 * sizeof(double));

    
    for (int i = 0; i < m1 * n1; i++) {
        A[i] = (double) rand() / RAND_MAX;
    }
    for (int i = 0; i < m2 * n2; i++) {
        B[i] = (double) rand() / RAND_MAX;
    }

    
    double total_time = 0.0;
    for (int k = 0; k < N; k++) {
        auto start_time = chrono::high_resolution_clock::now();
        for (int i = 0; i < m1; i++) {
            for (int j = 0; j < n2; j++) {
                double sum = 0.0;
                for (int h = 0; h < n1; h++) {
                    sum += A[i*n1+h] * B[h*n2+j];
                }
                C[i*n2+j] = sum;
            }
        }
        auto end_time = chrono::high_resolution_clock::now();
        double time = chrono::duration_cast<chrono::microseconds>(end_time - start_time).count() / 1000000.0;
        total_time += time;

Overwriting multma.c


**My own example 23 - Running the C++ program that multiplies the A (600x100) and B (100x600) matrices 10 times**

In [None]:
%%writefile multma.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>       // for clock_t, clock(), CLOCKS_PER_SEC
#include <unistd.h>     // for sleep()
#include <chrono>       // for chrono::steady_clock, chrono::duration
#include <iostream>     // for cout, cerr, endl

using namespace std;

// Benchmark: multiply a random 600x100 matrix by a random 100x600 matrix N times
// and print the average wall-clock time of one multiplication.
// The file is written out as multma.c but contains C++ and is compiled with g++.
// Returns 0 on success, 1 if memory could not be allocated.
int main() {
    const int N = 10;
    const int m1 = 600, n1 = 100, m2 = 100, n2 = 600;
    double* A = (double*) malloc(m1 * n1 * sizeof(double));
    double* B = (double*) malloc(m2 * n2 * sizeof(double));
    double* C = (double*) malloc(m1 * n2 * sizeof(double));

    // malloc may fail; free(NULL) is a no-op, so releasing all three is safe.
    if (A == NULL || B == NULL || C == NULL) {
        cerr << "Memory allocation failed" << endl;
        free(A);
        free(B);
        free(C);
        return 1;
    }

    // Fill both operands with random values in [0, 1].
    for (int i = 0; i < m1 * n1; i++) {
        A[i] = (double) rand() / RAND_MAX;
    }
    for (int i = 0; i < m2 * n2; i++) {
        B[i] = (double) rand() / RAND_MAX;
    }

    double total_time = 0.0;
    for (int k = 0; k < N; k++) {
        // steady_clock is monotonic, which is what interval timing needs.
        auto start_time = chrono::steady_clock::now();
        for (int i = 0; i < m1; i++) {
            for (int j = 0; j < n2; j++) {
                double sum = 0.0;
                for (int h = 0; h < n1; h++) {
                    sum += A[i*n1+h] * B[h*n2+j];
                }
                C[i*n2+j] = sum;
            }
        }
        auto end_time = chrono::steady_clock::now();
        // duration<double> counts in seconds as a floating-point value.
        chrono::duration<double> elapsed = end_time - start_time;
        total_time += elapsed.count();
    }

    // Calculate average execution time and print result
    double avg_time = total_time / N;
    cout << "Average execution time: " << avg_time << " seconds" << endl;

    // Free memory
    free(A);
    free(B);
    free(C);

    return 0;
}

Overwriting multma.c


**My own example 24 - Adding C++ average execution time to the data frame**

In [None]:
%%time
!g++ multma.c -o matrix_mul
!./matrix_mul

[01m[Kmultma.c:[m[K In function ‘[01m[Kint main()[m[K’:
[01m[Kmultma.c:26:27:[m[K [01;31m[Kerror: [m[K‘[01m[Kchrono[m[K’ has not been declared
   26 |         auto start_time = [01;31m[Kchrono[m[K::high_resolution_clock::now();
      |                           [01;31m[K^~~~~~[m[K
[01m[Kmultma.c:36:25:[m[K [01;31m[Kerror: [m[K‘[01m[Kchrono[m[K’ has not been declared
   36 |         auto end_time = [01;31m[Kchrono[m[K::high_resolution_clock::now();
      |                         [01;31m[K^~~~~~[m[K
[01m[Kmultma.c:37:23:[m[K [01;31m[Kerror: [m[K‘[01m[Kchrono[m[K’ has not been declared
   37 |         double time = [01;31m[Kchrono[m[K::duration_cast<chrono::microseconds>(end_time - start_time).count() / 1000000.0;
      |                       [01;31m[K^~~~~~[m[K
[01m[Kmultma.c:37:45:[m[K [01;31m[Kerror: [m[K‘[01m[Kchrono[m[K’ has not been declared
   37 |         double time = chrono::duration_cast<[01;31m

**My own example 25 - Creating a Java program that multiplies the A (600x100) and B (100x600) matrices 10 times**

In [None]:
%%writefile multma.java

import java.util.Random;

/**
 * Benchmark: multiplies a random 600x100 integer matrix by a random 100x600
 * integer matrix several times and prints the time of each run and the average.
 */
class Main {

  /** Number of timed multiplications. */
  private static final int RUNS = 10;

  /** Returns a rows x cols matrix filled with random integers in [0, 100). */
  private static int[][] randomMatrix(Random r, int rows, int cols) {
    int[][] m = new int[rows][cols];
    for (int i = 0; i < rows; i++) {
      for (int j = 0; j < cols; j++) {
        m[i][j] = r.nextInt(100);
      }
    }
    return m;
  }

  /**
   * Returns the product of a (n x m) and b (m x p) as a new n x p matrix.
   *
   * @throws IllegalArgumentException if the column count of a differs from
   *     the row count of b
   */
  static int[][] multiply(int[][] a, int[][] b) {
    if (a.length == 0) {
      return new int[0][0];
    }
    int inner = a[0].length;
    if (inner != b.length) {
      throw new IllegalArgumentException(
          "Incompatible shapes: a has " + inner + " columns, b has " + b.length + " rows");
    }
    int cols = b.length == 0 ? 0 : b[0].length;
    int[][] mul = new int[a.length][cols];
    for (int i = 0; i < a.length; i++) {
      // The column loop runs over the columns of b, not the columns of a.
      for (int j = 0; j < cols; j++) {
        int sum = 0;
        for (int x = 0; x < inner; x++) {
          sum += a[i][x] * b[x][j];
        }
        mul[i][j] = sum;
      }
    }
    return mul;
  }

  public static void main(String[] args) {

    Random r = new Random();
    int f1, c1, f2, c2;
    f1 = 600;
    c1 = 100;
    f2 = 100;
    c2 = 600;

    if (c1 != f2) {
      return;
    }

    long totalMillis = 0;
    long checksum = 0;

    for (int p = 0; p < RUNS; p++) {

      int[][] a = randomMatrix(r, f1, c1);
      int[][] b = randomMatrix(r, f2, c2);

      long startTime = System.nanoTime();
      int[][] mul = multiply(a, b);
      long endTime = System.nanoTime();

      // Read one entry so the product is observably used.
      checksum += mul[0][0];

      long duration = (endTime - startTime) / 1000000;
      totalMillis += duration;
      System.out.println("Tiempo " + p + ": " + duration + " milisegundos.");
    }

    System.out.println("Tiempo promedio: " + (totalMillis / (double) RUNS)
        + " milisegundos (checksum " + checksum + ").");
  }
}
// Shell commands do not belong in this file: %%writefile saves the whole cell
// body as Java source. Compile and run from a separate notebook cell:
//   !javac multma.java
//   !java Main

Overwriting multma.java


**My own example 26 - Running the Java program that multiplies the A (600x100) and B (100x600) matrices 10 times**

In [None]:
%%writefile matmul.js

const { performance } = require('perf_hooks');

/** Return Math.random() scaled by max and rounded down to an integer. */
function getRandomInt(max) {
  const scaled = Math.random() * max;
  return Math.floor(scaled);
}

/**
 * Build a matrix with f rows and c columns as an array of row arrays,
 * each entry set to getRandomInt(100).
 */
function matriz_gen(f,c){
    const matriz = new Array(f);

    // Give every slot of the outer array its own row of length c.
    for (const fila of matriz.keys()) {
        matriz[fila] = new Array(c);
    }

    // Fill the entries row by row, left to right.
    for (let fila = 0; fila < f; fila++) {
        for (let col = 0; col < c; col++) {
            matriz[fila][col] = getRandomInt(100);
        }
    }

    return matriz;
}

var f1 = 600;
var c1 = 100;
var f2 = 100;
var c2 = 100; 

for( var p = 0; p < 10; p ++){
    var matriz_a = matriz_gen(f1,c1);
    var matriz_b = matriz_gen(f2,c2);
    var mul = matriz_gen(f1,c2);

    if( c1 !== f2){
        return 0; 
    }


    var startTime = performance.now()
    
        for(var i=0; i < f1;i++)    
                {    
                    for(var j=0; j < c1; j++)    
                        {    
                            mul[i][j]=0;    
                            for(var x=0;x<f2;x++)    
                            {    
                                mul[i][j]+=matriz_a[i][x]*matriz_b[x][j];    
                            }    
                        }    
                }
    var endTime = performance.now();
    console.log(`The multiplication number ${p} took  ${endTime - startTime} milliseconds`);

}

Overwriting matmul.js


**My own example 27 - Adding Java average execution time to the data frame**

In [None]:
%%writefile matmul.js
}


    var startTime = performance.now()
    
        for(var i=0; i < f1;i++)    
                {    
                    for(var j=0; j < c1; j++)    
                        {    
                            mul[i][j]=0;    
                            for(var x=0;x<f2;x++)    
                            {    
                                mul[i][j]+=matriz_a[i][x]*matriz_b[x][j];    
                            }    
                        }    
                }
    var endTime = performance.now();
    console.log(`The multiplication number ${p} took  ${endTime - startTime} milliseconds`);

}

Overwriting matmul.js


**My own example 28 - Creating a JavaScript program that multiplies the A (600x100) and B (100x600) matrices 10 times**

In [None]:
%%javascript
// NOTE: a `!pip install ...` line is shell syntax, not JavaScript. Inside a
// %%javascript cell it is a syntax error that keeps the whole cell from running,
// so any package installation belongs in a separate Python cell.

/**
 * Multiply matrix a (n x m) by matrix b (m x p), both arrays of row arrays.
 * @returns a new n x p array of row arrays.
 * @throws {RangeError} if the column count of a differs from the row count of b.
 */
function matmul(a, b) {
  if (a.length === 0) {
    return [];
  }
  const inner = a[0].length;
  if (inner !== b.length) {
    throw new RangeError(
      `Incompatible shapes: a has ${inner} columns but b has ${b.length} rows`);
  }
  const cols = b.length === 0 ? 0 : b[0].length;
  const result = new Array(a.length);
  for (let i = 0; i < a.length; i++) {
    result[i] = new Array(cols).fill(0);
    for (let j = 0; j < cols; j++) {
      for (let k = 0; k < inner; k++) {
        result[i][j] += a[i][k] * b[k][j];
      }
    }
  }
  return result;
}

<IPython.core.display.Javascript object>

**My own example 29 - Running the JavaScript program that multiplies the A (600x100) and B (100x600) matrices 10 times**

In [None]:
%%javascript
// NOTE: a `!pip install ...` line is shell syntax, not JavaScript. Inside a
// %%javascript cell it is a syntax error that keeps the whole cell from running,
// so any package installation belongs in a separate Python cell.

/**
 * Multiply matrix a (n x m) by matrix b (m x p), both arrays of row arrays.
 * @returns a new n x p array of row arrays.
 * @throws {RangeError} if the column count of a differs from the row count of b.
 */
function matmul(a, b) {
  if (a.length === 0) {
    return [];
  }
  const inner = a[0].length;
  if (inner !== b.length) {
    throw new RangeError(
      `Incompatible shapes: a has ${inner} columns but b has ${b.length} rows`);
  }
  const cols = b.length === 0 ? 0 : b[0].length;
  const result = new Array(a.length);
  for (let i = 0; i < a.length; i++) {
    result[i] = new Array(cols).fill(0);
    for (let j = 0; j < cols; j++) {
      for (let k = 0; k < inner; k++) {
        result[i][j] += a[i][k] * b[k][j];
      }
    }
  }
  return result;
}

// Small worked example: a 2x3 matrix times a 3x2 matrix gives a 2x2 matrix.
let a = [[1, 2, 3], [4, 5, 6]];
let b = [[7, 8], [9, 10], [11, 12]];
let c = matmul(a, b);
console.log(c);

<IPython.core.display.Javascript object>

**My own example 30 - Adding Javascript average execution time to the data frame**

In [None]:
%%javascript
// NOTE: a `!pip install ...` line is shell syntax, not JavaScript. Inside a
// %%javascript cell it is a syntax error that keeps the whole cell from running,
// so any package installation belongs in a separate Python cell.

/**
 * Multiply matrix a (n x m) by matrix b (m x p), both arrays of row arrays.
 * @returns a new n x p array of row arrays.
 * @throws {RangeError} if the column count of a differs from the row count of b.
 */
function matmul(a, b) {
  if (a.length === 0) {
    return [];
  }
  const inner = a[0].length;
  if (inner !== b.length) {
    throw new RangeError(
      `Incompatible shapes: a has ${inner} columns but b has ${b.length} rows`);
  }
  const cols = b.length === 0 ? 0 : b[0].length;
  const result = new Array(a.length);
  for (let i = 0; i < a.length; i++) {
    result[i] = new Array(cols).fill(0);
    for (let j = 0; j < cols; j++) {
      for (let k = 0; k < inner; k++) {
        result[i][j] += a[i][k] * b[k][j];
      }
    }
  }
  return result;
}

// Small worked example: a 2x3 matrix times a 3x2 matrix gives a 2x2 matrix.
let a = [[1, 2, 3], [4, 5, 6]];
let b = [[7, 8], [9, 10], [11, 12]];
let c = matmul(a, b);
console.log(c);

<IPython.core.display.Javascript object>

**My own example 31 - Finding the minimum average execution time in the data frame**

In [None]:
"""Python
---average attempt time : 1228.378 ms ---
"""

"""C
    Total time taken by CPU 0 attempt: 38.977000 miliseconds
    Total time taken by CPU 1 attempt: 37.991000 miliseconds
    Total time taken by CPU 2 attempt: 37.784000 miliseconds
    Total time taken by CPU 3 attempt: 41.053000 miliseconds
    Total time taken by CPU 4 attempt: 37.759000 miliseconds
"""

"""C++
    Total time taken by CPU 0 attempt: 28.418000 miliseconds
    Total time taken by CPU 1 attempt: 29.017000 miliseconds
    Total time taken by CPU 2 attempt: 28.608000 miliseconds
    Total time taken by CPU 3 attempt: 31.458000 miliseconds
    Total time taken by CPU 4 attempt: 28.195000 miliseconds
"""

""" Java (take 3 by 3)
    El tiempo total de la multiplicación es del intento 0: 27 milisegundos.
    El tiempo total de la multiplicación es del intento 1: 55 milisegundos.
    El tiempo total de la multiplicación es del intento 2: 14 milisegundos.
    El tiempo total de la multiplicación es del intento 0: 34 milisegundos.
    El tiempo total de la multiplicación es del intento 1: 48 milisegundos.
"""

""" Javascript
    The multiplication number 0 took  48.9186999201775 milliseconds
    The multiplication number 1 took  44.84457999952137 milliseconds
    The multiplication number 2 took  40.27608999930322 milliseconds
    The multiplication number 3 took  41.07909899994731 milliseconds
    The multiplication number 4 took  46.56920500122011 milliseconds
"""

# The closing triple quote below was missing, which made the whole cell fail
# with a SyntaxError (unterminated string literal).
""" Numpy
    ---attempt 0 : 0.00758999930359228517  ---
    ---attempt 1 : 0.00630790989999057123  ---
    ---attempt 2 : 0.00730759009592759056  ---
    ---attempt 3 : 0.00890989999478728516  ---
    ---attempt 4 : 0.00657895422959228516  ---
"""


SyntaxError: ignored

**My own example 32 - Adding the speed factor column to the data frame**

In [None]:
# import pandas as pd
import pandas as pd

# Execution times in milliseconds, one row per run and one column per
# implementation, in the column order given below. `lst` was not defined anywhere
# in the notebook, so the cell failed on a fresh kernel; the values here are the
# ones shown in this cell's saved output.
lst = [
    [1244.379, 28.977, 28.418, 27.0, 48.391, 0.709],
    [1220.918, 27.991, 29.017, 55.0, 76.794, 0.732],
    [1229.609, 27.784, 28.608, 14.0, 44.216, 0.629],
    [1232.513, 31.053, 32.458, 33.0, 45.059, 0.628],
    [1220.301, 27.759, 28.195, 38.0, 44.559, 0.629],
    [1238.533, 27.799, 28.902, 72.0, 43.881, 0.695],
    [1239.356, 28.577, 28.536, 31.0, 46.651, 0.664],
    [1246.045, 28.820, 28.172, 45.0, 44.184, 0.633],
    [1224.682, 28.702, 28.754, 73.0, 44.276, 0.624],
    [1234.916, 27.803, 30.983, 28.0, 44.240, 0.697],
]

print("Tiempo (ms)")

times = pd.DataFrame(lst, columns =['Python', 'C', 'C++', 'Java', 'Javascript', 'Numpy'])
print(times)

Tiempo (ms)
     Python       C     C++  Java  Javascript  Numpy
0  1244.379  28.977  28.418  27.0      48.391  0.709
1  1220.918  27.991  29.017  55.0      76.794  0.732
2  1229.609  27.784  28.608  14.0      44.216  0.629
3  1232.513  31.053  32.458  33.0      45.059  0.628
4  1220.301  27.759  28.195  38.0      44.559  0.629
5  1238.533  27.799  28.902  72.0      43.881  0.695
6  1239.356  28.577  28.536  31.0      46.651  0.664
7  1246.045  28.820  28.172  45.0      44.184  0.633
8  1224.682  28.702  28.754  73.0      44.276  0.624
9  1234.916  27.803  30.983  28.0      44.240  0.697


**My own example 33 - Sorting the data frame by average execution time**

In [None]:
# Column-wise mean of the six language columns of `times`.
# NOTE(review): the result keeps the column order given here and is not sorted;
# chain `.sort_values()` if a ranking by average time is wanted.
times[['Python', 'C', 'C++', 'Java', 'Javascript', 'Numpy']].mean()

Python        1233.1252
C               28.5265
C++             29.2043
Java            41.6000
Javascript      48.2251
Numpy            0.6640
dtype: float64

## Why is the Python Version so Much Slower?

In [None]:
# Dynamic typing.
def mul_elemwise(xs, ys):
    """Multiply two sequences pairwise and return the products as a list.

    Pairs are formed with ``zip``, so the result is as long as the shorter
    input. ``*`` is dispatched on the runtime types of each pair, so the
    elements may be of mixed types.
    """
    products = []
    for left, right in zip(xs, ys):
        products.append(left * right)
    return products

mul_elemwise([1, 2, 3, 4], [1, 2 + 0j, 3.0, 'four'])
#[type(x) for x in _]

In [None]:
# Interpretation overhead.
import dis

# Compile a small arithmetic expression in 'eval' mode and print its bytecode,
# one line per instruction.
source_code = 'a + b * c'
bytecode = compile(source_code, '', 'eval')
dis.dis(bytecode)

## Why is the Python Version so Slow?
- Dynamic typing means that every single operation requires dispatching on the input type.
- Having an interpreter means that every instruction is fetched and dispatched at runtime.
- Other overheads:
  - Arbitrary-size integers.
  - Reference-counted garbage collection.

> This is the paradox that we have to work with when we're doing scientific or numerically-intensive Python. What makes Python fast for development -- this high-level, interpreted, and dynamically-typed aspect of the language -- is exactly what makes it slow for code execution.

- Jake VanderPlas, [*Losing Your Loops: Fast Numerical Computing with NumPy*](https://www.youtube.com/watch?v=EEUXKG97YRw)

# What Do We Do?

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/runaway.gif" alt="Drawing" style="width: 50%;"/></center>

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/thisisfine.gif" alt="Drawing" style="width: 1080px;"/></center>

- Python is slow for numerical computation because it performs dynamic dispatch on every operation we perform...

- ...but often, we just want to do the same thing over and over in a loop!

- If we don't need Python's dynamism, we don't want to pay (much) for it.

- **Idea:** Dispatch **once per operation** instead of **once per element**.

In [None]:
import numpy as np

data = np.array([1, 2, 3, 4])
data

In [None]:
data + data

In [None]:
%%time
# Naive dot product
(array_data * array_data).sum()

In [None]:
%%time
# Built-in dot product.
array_data.dot(array_data)

In [None]:
%%time
fortran_dot_product(array_data, array_data)

In [None]:
# Numpy won't allow us to write a string into an int array.
# The failure is the point of this cell, so catch and display it instead of
# letting the exception abort a "Restart & Run All".
try:
    data[0] = "foo"
except ValueError as exc:
    print("ValueError:", exc)

In [None]:
# We also can't grow an array once it's created.
# The failure is the point of this cell, so catch and display it instead of
# letting the exception abort a "Restart & Run All".
try:
    data.append(3)
except AttributeError as exc:
    print("AttributeError:", exc)

In [None]:
# We **can** reshape an array though.
two_by_two = data.reshape(2, 2)
two_by_two

Numpy arrays are:

- Fixed-type

- Size-immutable

- Multi-dimensional

- Fast\*

\* If you use them correctly.

# What's in an Array?

In [None]:
arr = np.array([1, 2, 3, 4, 5, 6], dtype='int16').reshape(2, 3)
print("Array:\n", arr, sep='')
print("===========")
print("DType:", arr.dtype)
print("Shape:", arr.shape)
print("Strides:", arr.strides)
print("Data:", arr.data.tobytes())

# Core Operations

- Vectorized **ufuncs** for elementwise operations.
- Fancy indexing and masking for selection and filtering.
- Aggregations across axes.
- Broadcasting

# UFuncs

UFuncs (universal functions) are functions that operate elementwise on one or more arrays.

In [None]:
data = np.arange(15).reshape(3, 5)
data

In [None]:
# Binary operators.
data * data

In [None]:
# Unary functions.
np.sqrt(data)

In [None]:
# Comparison operations
(data % 3) == 0

In [None]:
# Boolean combinators.
((data % 2) == 0) & ((data % 3) == 0)

In [None]:
# as of python 3.5, @ is matrix-multiply
data @ data.T

# UFuncs Review

- UFuncs provide efficient elementwise operations applied across one or more arrays.
- Arithmetic Operators (`+`, `*`, `/`)
- Comparisons (`==`, `>`, `!=`)
- Boolean Operators (`&`, `|`, `^`)
- Trigonometric Functions (`sin`, `cos`)
- Transcendental Functions (`exp`, `log`)

# Selections

We often want to perform an operation on just a subset of our data.

In [None]:
sines = np.sin(np.linspace(0, 3.14, 10))
cosines = np.cos(np.linspace(0, 3.14, 10))
sines

In [None]:
# Slicing works with the same semantics as Python lists.
sines[0]

In [None]:
sines[:3]  # First three elements  

In [None]:
sines[5:]  # Elements from 5 on.

In [None]:
sines[::2]  # Every other element.

In [None]:
# More interesting: we can index with boolean arrays to filter by a predicate.
print("sines:\n", sines)
print("sines > 0.5:\n", sines > 0.5)
print("sines[sines > 0.5]:\n", sines[sines > 0.5])

In [None]:
# We index with lists/arrays of integers to select values at those indices.
print(sines)
sines[[0, 4, 7]]

In [None]:
# Index arrays are often used for sorting one or more arrays.
unsorted_data = np.array([1, 3, 2, 12, -1, 5, 2])

In [None]:
sort_indices = np.argsort(unsorted_data)
sort_indices

In [None]:
unsorted_data[sort_indices]

In [None]:
market_caps = np.array([12, 6, 10, 5, 6])  # Presumably in dollars?
assets = np.array(['A', 'B', 'C', 'D', 'E'])

In [None]:
# Sort assets by market cap by using the permutation that would sort market caps on ``assets``.
sort_by_mcap = np.argsort(market_caps)
assets[sort_by_mcap]

In [None]:
# Indexers are also useful for aligning data.
print("Dates:\n", repr(event_dates))
print("Values:\n", repr(event_values))
print("Calendar:\n", repr(calendar))

In [None]:
print("Raw Dates:", event_dates)
print("Indices:", calendar.searchsorted(event_dates))
print("Forward-Filled Dates:", calendar[calendar.searchsorted(event_dates)])

On multi-dimensional arrays, we can slice along each axis independently.

In [None]:
data = np.arange(25).reshape(5, 5)
data

In [None]:
data[:2, :2]  # First two rows and first two columns.

In [None]:
data[:2, [0, -1]]  # First two rows, first and last columns.

In [None]:
data[(data[:, 0] % 2) == 0]  # Rows where the first column is divisible by two.

# Selections Review

- Indexing with an integer removes a dimension.
- Slicing operations work on Numpy arrays the same way they do on lists.
- Indexing with a boolean array filters to True locations.
- Indexing with an integer array selects indices along an axis.
- Multidimensional arrays can apply selections independently along different axes.

## Reductions

Functions that reduce an array to a scalar.

$Var(X) = \frac{1}{N}\sum_{i=1}^N (x_i - \bar{x})^2$

In [None]:
def variance(x):
    """Return the population variance of all elements of ``x``.

    Computes the mean squared deviation from ``x.mean()``: the sum of squared
    deviations divided by the total number of elements.

    Parameters
    ----------
    x : array-like exposing ``mean()``, ``size`` and elementwise arithmetic
        (e.g. a numpy array of any dimensionality).

    Returns
    -------
    A scalar: ``sum((x - mean) ** 2) / N`` where ``N`` is the element count.
    """
    # Use ``x.size`` rather than ``len(x)``: ``len`` only counts the first
    # axis, which gives the wrong divisor for multi-dimensional input.
    return ((x - x.mean()) ** 2).sum() / x.size

In [None]:
variance(np.random.standard_normal(1000))

- `sum()` and `mean()` are both **reductions**.

- In the simplest case, we use these to reduce an entire array into a single value...

In [None]:
data = np.arange(30)
data.mean()

- ...but we can do more interesting things with multi-dimensional arrays.

In [None]:
data = np.arange(30).reshape(3, 10)
data

In [None]:
data.mean()

In [None]:
data.mean(axis=0)

In [None]:
data.mean(axis=1)

## Reductions Review

- Reductions allow us to perform efficient aggregations over arrays.
- We can do aggregations over a single axis to collapse a single dimension.
- Many built-in reductions (`mean`, `sum`, `min`, `max`, `median`, ...).

# Broadcasting

In [None]:
row = np.array([1, 2, 3, 4])
column = np.array([[1], [2], [3]])
print("Row:\n", row, sep='')
print("Column:\n", column, sep='')

In [None]:
row + column

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/broadcasting.png" alt="Drawing" style="width: 60%;"/></center>

<h5>Source: http://www.scipy-lectures.org/_images/numpy_broadcasting.png</h5>

In [None]:
# Broadcasting is particularly useful in conjunction with reductions.
print("Data:\n", data, sep='')
print("Mean:\n", data.mean(axis=0), sep='')
print("Data - Mean:\n", data - data.mean(axis=0), sep='')

# Broadcasting Review

- Numpy operations can work on arrays of different dimensions as long as the arrays' shapes are still "compatible".
- Broadcasting works by "tiling" the smaller array along the missing dimension.
- The result of a broadcasted operation is always at least as large in each dimension as the largest array in that dimension.

# Numpy Review

- Numerical algorithms are slow in pure Python because the overhead of dynamic dispatch dominates our runtime.

- Numpy solves this problem by:
  1. Imposing additional restrictions on the contents of arrays.
  2. Moving the inner loops of our algorithms into compiled C code.

- Using Numpy effectively often requires reworking an algorithm to use vectorized operations instead of for-loops, but the resulting operations are usually simpler, clearer, and faster than the pure Python equivalent.

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/unicorn.jpg" alt="Drawing" style="width: 75%;"/></center>

Numpy is great for many things, but...

- Sometimes our data is equipped with a natural set of **labels**:
  - Dates/Times
  - Stock Tickers
  - Field Names (e.g. Open/High/Low/Close)

- Sometimes we have **more than one type of data** that we want to keep grouped together.
  - Tables with a mix of real-valued and categorical data.

- Sometimes we have **missing** data, which we need to ignore, fill, or otherwise work around.

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/panda-wrangling.gif" alt="Drawing" style="width: 75%;"/></center>

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/pandas_logo.png" alt="Drawing" style="width: 75%;"/></center>


Pandas extends Numpy with more complex data structures:

- `Series`: 1-dimensional, homogeneously-typed, labelled array.
- `DataFrame`: 2-dimensional, semi-homogeneous, labelled table.

Pandas also provides many utilities for: 
- Input/Output
- Data Cleaning
- Rolling Algorithms
- Plotting

# Selection in Pandas

In [None]:
s = pd.Series(index=['a', 'b', 'c', 'd', 'e'], data=[1, 2, 3, 4, 5])
s

In [None]:
# There are two pieces to a Series: the index and the values.
print("The index is:", s.index)
print("The values are:", s.values)

In [None]:
# We can look up values out of a Series by position...
s.iloc[0]

In [None]:
# ... or by label.
s.loc['a']

In [None]:
# Slicing works as expected...
s.iloc[:2]

In [None]:
# ...but it works with labels too!
s.loc[:'c']

In [None]:
# Fancy indexing works the same as in numpy.
s.iloc[[0, -1]]

In [None]:
# As does boolean masking.
s.loc[s > 2]

In [None]:
# Element-wise operations are aligned by index.
other_s = pd.Series({'a': 10.0, 'c': 20.0, 'd': 30.0, 'z': 40.0})
other_s

In [None]:
s + other_s

In [None]:
# We can fill in missing values with fillna().
(s + other_s).fillna(0.0)

In [None]:
# Most real datasets are read in from an external file format.
aapl = pd.read_csv('AAPL.csv', parse_dates=['Date'], index_col='Date')
aapl.head()

In [None]:
# Slicing generalizes to two dimensions as you'd expect:
aapl.iloc[:2, :2]

In [None]:
aapl.loc[pd.Timestamp('2010-02-01'):pd.Timestamp('2010-02-04'), ['Close', 'Volume']]

# Rolling Operations

<center><img src="https://raw.githubusercontent.com/ssanderson/pydata-toolbox/master/notebooks/images/rolling.gif" alt="Drawing" style="width: 75%;"/></center>

In [None]:
aapl.rolling(5)[['Close', 'Adj Close']].mean().plot();

In [None]:
# Drop `Volume`, since it's way bigger than everything else.
aapl.drop('Volume', axis=1).resample('2W').max().plot();

In [None]:
# 30-day rolling exponentially-weighted stddev of returns.
aapl['Close'].pct_change().ewm(span=30).std().plot();

# "Real World" Data

In [None]:
from demos.avocados import read_avocadata

avocados = read_avocadata('2014', '2016')
avocados.head()

In [None]:
# Unlike numpy arrays, pandas DataFrames can have a different dtype for each column.
avocados.dtypes

In [None]:
# What's the regional average price of a HASS avocado every day?
hass = avocados[avocados.Variety == 'HASS']
hass.groupby(['Date', 'Region'])['Weighted Avg Price'].mean().unstack().ffill().plot();

In [None]:
def _organic_spread(group):

    if len(group.columns) != 2:
        return pd.Series(index=group.index, data=0.0)
    
    is_organic = group.columns.get_level_values('Organic').values.astype(bool)
    organics = group.loc[:, is_organic].squeeze()
    non_organics = group.loc[:, ~is_organic].squeeze()
    diff = organics - non_organics
    return diff

def organic_spread_by_region(df):
    """What's the difference between the price of an organic
    and non-organic avocado within each region?

    Pivots ``df['Weighted Avg Price']`` so that rows are dates and columns are
    (Region, Organic) pairs, forward-fills gaps, then applies
    ``_organic_spread`` to each region's group of columns.
    """
    prices = df.set_index(['Date', 'Region', 'Organic'])['Weighted Avg Price']
    wide = prices.unstack(level=['Region', 'Organic']).ffill()
    # NOTE(review): column-wise grouping via ``axis=1`` is deprecated in recent
    # pandas releases -- confirm against the installed version.
    by_region = wide.groupby(level='Region', axis=1)
    return by_region.apply(_organic_spread)

In [None]:
organic_spread_by_region(hass).plot();
plt.gca().set_title("Daily Regional Organic Spread");
plt.legend(bbox_to_anchor=(1, 1));

In [None]:
spread_correlation = organic_spread_by_region(hass).corr()
spread_correlation

In [None]:
import seaborn as sns

# Cluster the correlation matrix and annotate each cell with its value.
grid = sns.clustermap(spread_correlation, annot=True)

# Take the heatmap axes by name instead of indexing `fig.axes[2]`, which
# depends on the order in which seaborn happens to create its axes.
ax = grid.ax_heatmap
# `fig` and `axes` are kept so the names this cell used to define still exist.
fig = ax.figure
axes = fig.axes
ax.set_xticklabels(ax.get_xticklabels(), rotation=45);

# Pandas Review

- Pandas extends numpy with more complex data structures and algorithms.
- If you understand numpy, you understand 90% of pandas.
- `groupby`, `set_index`, and `unstack` are powerful tools for working with categorical data.
- Avocado prices are surprisingly interesting :)

# Thanks!