# Introduction to NumPy and SciPy


In [1]:
import numpy as np
import pandas as pd

import scipy.special

import iqplot

import bokeh.io
import bokeh.plotting

bokeh.io.output_notebook()

# Intro to NumPy arrays

The central data type in NumPy is the `ndarray`, usually referred to as a NumPy array.

In [2]:
# Create a NumPy array from a list
my_ar = np.array([1,2,3,4])

# Look at it
my_ar

array([1, 2, 3, 4])

In [3]:
my_ar.dtype

dtype('int64')

In [4]:
my_ar.shape

(4,)

In [5]:
# converts the data type of the array
my_ar.astype(float)

array([1., 2., 3., 4.])

In [6]:
print(my_ar.max())
print(my_ar.min())
print(my_ar.sum())
print(my_ar.mean())
print(my_ar.std())

4
1
10
2.5
1.118033988749895


In [7]:
# arrays can also be arguments to NumPy functions
print(np.max(my_ar))
print(np.min(my_ar))
print(np.sum(my_ar))
print(np.mean(my_ar))
print(np.std(my_ar))

4
1
10
2.5
1.118033988749895


# Other ways to make NumPy arrays


In [8]:
n = 10

np.zeros(n)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [9]:
np.ones(n)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [10]:
# np.empty() only allocates memory and does not initialize it, so the
# displayed values are arbitrary (whatever happened to be in memory)
np.empty(n)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [11]:
my_ar = np.array([[1,2], [3,4]])
np.zeros_like(my_ar)

array([[0, 0],
       [0, 0]])

# Extracting NumPy arrays from Pandas data frames
NumPy has primitive functions for loading data from text files, but we typically read data with Pandas and then extract NumPy arrays from the resulting data frames. The reason for extracting the arrays is almost always speed.



In [12]:
df = pd.read_csv('data/c_elegans_egg_xa.csv', comment='#')

df.head()

Unnamed: 0,food,area (sq. um)
0,high,1683
1,high,2061
2,high,1792
3,high,1852
4,high,2091


In [13]:
p = iqplot.strip(
    data=df,
    q='area (sq. um)',
    cats='food',
    order=['low','high'],
    spread='jitter',
    y_axis_label='amount of food',
    frame_height=200,
)

bokeh.io.show(p)

In [14]:
# Extract measurements for worms with high food

xa_high = df.loc[df['food']=='high', 'area (sq. um)']

# Look at data type
type(xa_high)

pandas.core.series.Series

A Pandas Series is like a single-column data frame.

In [15]:
# Convert to a NumPy array (to_numpy() is the accessor pandas recommends over .values)

xa_high = df.loc[df['food'] == 'high', 'area (sq. um)'].to_numpy()

type(xa_high)

numpy.ndarray

In [16]:
# Same extraction for the low-food worms; to_numpy() returns a NumPy array
xa_low = df.loc[df['food'] == 'low', 'area (sq. um)'].to_numpy()

In [17]:
xa_high

array([1683, 2061, 1792, 1852, 2091, 1781, 1912, 1802, 1751, 1731, 1892,
       1951, 1809, 1683, 1787, 1840, 1821, 1910, 1930, 1800, 1833, 1683,
       1671, 1680, 1692, 1800, 1821, 1882, 1642, 1749, 1712, 1661, 1701,
       2141, 1863, 1752, 1740, 1721, 1660, 1930, 2030, 1851, 2131, 1828])

In [18]:
xa_low

array([1840, 2090, 2169, 1988, 2212, 2339, 1989, 2144, 2290, 1920, 2280,
       1809, 2158, 1800, 2133, 2060, 2160, 2001, 2030, 2088, 1951, 2460,
       2021, 2010, 2139, 2160, 2106, 2171, 2113, 2179, 1890, 2179, 2021,
       1969, 2150, 1900, 2267, 1711, 1901, 2114, 2112, 2361, 2130, 2061,
       2121, 1832, 2210, 2130, 2153, 2009, 2100, 2252, 2143, 2252, 2222,
       2121, 2409])

# Slicing NumPy arrays

NumPy arrays can be sliced just like lists and tuples.

In [19]:
# reversed
xa_high[::-1]

array([1828, 2131, 1851, 2030, 1930, 1660, 1721, 1740, 1752, 1863, 2141,
       1701, 1661, 1712, 1749, 1642, 1882, 1821, 1800, 1692, 1680, 1671,
       1683, 1833, 1800, 1930, 1910, 1821, 1840, 1787, 1683, 1809, 1951,
       1892, 1731, 1751, 1802, 1912, 1781, 2091, 1852, 1792, 2061, 1683])

In [20]:
# Every 5th starting at 3
xa_high[3::5]

array([1852, 1751, 1683, 1930, 1680, 1642, 2141, 1660, 1828])

In [21]:
# Entries 10 to 20
xa_high[10:21]

array([1892, 1951, 1809, 1683, 1787, 1840, 1821, 1910, 1930, 1800, 1833])

## Fancy indexing
Fancy indexing slices out specific values by indexing with a list or array of indices, or with a Boolean array.

In [22]:
xa_high[[1,19,6]]

array([2061, 1800, 1912])

In [23]:
xa_high[np.array([1, 19, 6])]

array([2061, 1800, 1912])

In [24]:
# Just slice out the big ones

xa_high[xa_high > 2000]

array([2061, 2091, 2141, 2030, 2131])

In [25]:
# Get the indices of the entries that exceed 2000

np.where(xa_high > 2000)

(array([ 1,  4, 33, 40, 42]),)

# NumPy arrays are mutable



In [26]:
# make an array

my_ar = np.array([1,2,3,4])

# Change an element
my_ar[2] = 6

# see the result
my_ar

array([1, 2, 6, 4])

In [27]:
# Bind a second name to the same array; the assignment does not copy
my_ar2 = my_ar

# Modify an entry through the second name
my_ar2[3] = 9

# The change is visible through the original name
my_ar

array([1, 2, 6, 9])

In [28]:
# Re-instantiate my_ar
my_ar = np.array([1, 2, 3, 4]).astype(float)

# Normalize x in place (the augmented assignment /= mutates the array it is given)
def normalize(x):
    """Rescale ``x`` in place so that its entries sum to one.

    The division is done with ``/=``, so a NumPy array passed in is
    modified directly; nothing is returned.
    """
    total = np.sum(x)
    x /= total

# Pass it through a function
normalize(my_ar)

# Is it normalized even though we didn't return anything? (Yes.)
my_ar

array([0.1, 0.2, 0.3, 0.4])

## Slices of NumPy arrays are views, not copies!!!!!!


In [29]:
my_list = [1,2,3,4]
my_ar = np.array(my_list)

# Slice out of each
my_list_slice = my_list[1:-1]
my_ar_slice = my_ar[1:-1]

# Mess with the slices
my_list_slice[0] = 9
my_ar_slice[0] = 9

# look at originals
print(my_list)
print(my_ar)

[1, 2, 3, 4]
[1 9 3 4]


Because slices are views, use `np.copy()` when you need an independent copy.

In [30]:
# Make a copy
xa_high_copy = np.copy(xa_high)

# mess with an entry
xa_high_copy[10] = 2000

# Check whether the copy still matches the original
# (np.allclose compares elementwise within a tolerance)
np.allclose(xa_high, xa_high_copy)

False

# Mathematical operations with arrays

Mathematical operations on arrays are performed elementwise.


In [31]:
# Divide one array by another
np.array([5,6,7,8]) / np.array([1,2,3,4])

array([5.        , 3.        , 2.33333333, 2.        ])

In [32]:
# multiply by scalar
-4 * xa_high

array([-6732, -8244, -7168, -7408, -8364, -7124, -7648, -7208, -7004,
       -6924, -7568, -7804, -7236, -6732, -7148, -7360, -7284, -7640,
       -7720, -7200, -7332, -6732, -6684, -6720, -6768, -7200, -7284,
       -7528, -6568, -6996, -6848, -6644, -6804, -8564, -7452, -7008,
       -6960, -6884, -6640, -7720, -8120, -7404, -8524, -7312])

In [33]:
xa_high ** 2

array([2832489, 4247721, 3211264, 3429904, 4372281, 3171961, 3655744,
       3247204, 3066001, 2996361, 3579664, 3806401, 3272481, 2832489,
       3193369, 3385600, 3316041, 3648100, 3724900, 3240000, 3359889,
       2832489, 2792241, 2822400, 2862864, 3240000, 3316041, 3541924,
       2696164, 3059001, 2930944, 2758921, 2893401, 4583881, 3470769,
       3069504, 3027600, 2961841, 2755600, 3724900, 4120900, 3426201,
       4541161, 3341584])

# Indexing 2D NumPy arrays


In [34]:
# New 2D array using the reshape() method
my_ar = xa_high.reshape((11, 4))

# Look at it
my_ar

array([[1683, 2061, 1792, 1852],
       [2091, 1781, 1912, 1802],
       [1751, 1731, 1892, 1951],
       [1809, 1683, 1787, 1840],
       [1821, 1910, 1930, 1800],
       [1833, 1683, 1671, 1680],
       [1692, 1800, 1821, 1882],
       [1642, 1749, 1712, 1661],
       [1701, 2141, 1863, 1752],
       [1740, 1721, 1660, 1930],
       [2030, 1851, 2131, 1828]])

In [36]:
my_ar[0, 1]

2061

In [37]:
my_ar[2, :]

array([1751, 1731, 1892, 1951])

In [38]:
np.where(my_ar > 2000)

(array([ 0,  1,  8, 10, 10]), array([1, 0, 1, 0, 2]))

In [39]:
my_ar[(np.array([ 0,  1,  8, 10, 10]), np.array([1, 0, 1, 0, 2]))]

array([2061, 2091, 2141, 2030, 2131])

# Concatenating arrays


In [40]:
combined = np.concatenate((xa_high, xa_low))

In [41]:
combined

array([1683, 2061, 1792, 1852, 2091, 1781, 1912, 1802, 1751, 1731, 1892,
       1951, 1809, 1683, 1787, 1840, 1821, 1910, 1930, 1800, 1833, 1683,
       1671, 1680, 1692, 1800, 1821, 1882, 1642, 1749, 1712, 1661, 1701,
       2141, 1863, 1752, 1740, 1721, 1660, 1930, 2030, 1851, 2131, 1828,
       1840, 2090, 2169, 1988, 2212, 2339, 1989, 2144, 2290, 1920, 2280,
       1809, 2158, 1800, 2133, 2060, 2160, 2001, 2030, 2088, 1951, 2460,
       2021, 2010, 2139, 2160, 2106, 2171, 2113, 2179, 1890, 2179, 2021,
       1969, 2150, 1900, 2267, 1711, 1901, 2114, 2112, 2361, 2130, 2061,
       2121, 1832, 2210, 2130, 2153, 2009, 2100, 2252, 2143, 2252, 2222,
       2121, 2409])

# NumPy has other useful math functions


In [42]:
np.exp(xa_high/1000)

array([5.38167681, 7.8538197 , 6.00144336, 6.37255189, 8.09300412,
       5.93578924, 6.76660849, 6.06175887, 5.76036016, 5.64629738,
       6.63262067, 7.03571978, 6.10434004, 5.38167681, 5.97151103,
       6.29653826, 6.1780334 , 6.7530888 , 6.88951024, 6.04964746,
       6.2526164 , 5.38167681, 5.31748262, 5.36555597, 5.43033051,
       6.04964746, 6.1780334 , 6.56662499, 5.16549017, 5.74885095,
       5.54003047, 5.26457279, 5.47942408, 8.50794132, 6.44303692,
       5.7661234 , 5.69734342, 5.59011579, 5.25931084, 6.88951024,
       7.61408636, 6.36618252, 8.42328589, 6.22143134])

In [43]:
np.cos(xa_high)

array([ 0.62656192,  0.9933696 ,  0.27501843,  0.03112568,  0.26681725,
       -0.96021239, -0.33430744,  0.29228295, -0.42404251, -0.99984597,
        0.72399324, -0.99748325,  0.84865001,  0.62656192, -0.84393482,
        0.56257847,  0.43231386,  0.99610114,  0.48702972, -0.99122275,
       -0.11903049,  0.62656192,  0.94691648, -0.73027654, -0.24968607,
       -0.99122275,  0.43231386, -0.98275172, -0.49500319, -0.64703425,
       -0.98592179, -0.61963892, -0.17156886,  0.00460656, -0.99936794,
        0.53296056,  0.90375673,  0.82939405,  0.3256673 ,  0.48702972,
        0.86222727, -0.824246  ,  0.5401501 ,  0.91834245])

In [44]:
np.sqrt(xa_high)

array([41.02438299, 45.39823785, 42.33202098, 43.03486958, 45.72745346,
       42.20189569, 43.72642222, 42.44997055, 41.84495191, 41.60528813,
       43.49712634, 44.17012565, 42.53234064, 41.02438299, 42.27292278,
       42.89522118, 42.67317659, 43.70354677, 43.93176527, 42.42640687,
       42.81354926, 41.02438299, 40.87786687, 40.98780306, 41.1339276 ,
       42.42640687, 42.67317659, 43.38202393, 40.52159918, 41.82104733,
       41.37632173, 40.75536774, 41.24318125, 46.27094121, 43.16248371,
       41.85689907, 41.71330723, 41.48493703, 40.74309757, 43.93176527,
       45.0555213 , 43.02324953, 46.16275555, 42.75511665])

# SciPy has even more functions (organized in submodules)

In [45]:
# Note: scipy.special was already imported in the first cell, so this
# repeated import is harmless but unnecessary
import scipy.special

In [46]:
scipy.special.erf(xa_high / 2000)

array([0.76597747, 0.8549794 , 0.7948931 , 0.80965587, 0.86074212,
       0.79209865, 0.8236209 , 0.79740973, 0.78433732, 0.77904847,
       0.81905337, 0.83227948, 0.79915793, 0.76597747, 0.7936263 ,
       0.80676772, 0.8021292 , 0.82316805, 0.8276577 , 0.79690821,
       0.80506817, 0.76597747, 0.76262579, 0.76514271, 0.76846912,
       0.79690821, 0.8021292 , 0.81673693, 0.7543863 , 0.78381257,
       0.77393853, 0.75980693, 0.77094188, 0.86995276, 0.81227529,
       0.78459935, 0.78143985, 0.77636944, 0.75952376, 0.8276577 ,
       0.84883448, 0.80941641, 0.86814949, 0.80384751])

scipy.special: Special functions.

scipy.stats: Functions for statistical analysis.

scipy.optimize: Numerical optimization.

scipy.integrate: Numerical integration and numerical solutions to differential equations.

scipy.interpolate: Smooth interpolation of functions.

# NumPy and SciPy are very fast!!!