# Assessment 3 Review


* Be able to estimate probabilities with simulations in Python

* Review A/B testing and the beta distribution
    * A good resource for the beta distribution:
      https://stats.stackexchange.com/questions/47771/what-is-the-intuition-behind-beta-distribution

* Review hypothesis testing

* Review SQL

### Numpy, Pandas, Linear Regression

In [1]:
import numpy as np
import pandas as pd

In [2]:
# bar: flat (1-D) array of 10 random integers drawn from [0, 50)
bar = np.random.randint(0, 50, size=10)
# baz: the same kind of values, but as a 2-D array with one row and ten columns
baz = np.random.randint(0, 50, size=(1, 10))

In [3]:
bar

array([16,  9, 20, 25,  5, 48, 10, 34, 37, 19])

In [4]:
bar.shape

(10,)

In [5]:
baz

array([[18, 19, 13, 34, 43, 22, 35, 24, 23, 31]])

In [6]:
baz.shape

(1, 10)

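What's the difference?

`bar` is a 1-dimensional array with shape `(10,)`, while `baz` is a 2-dimensional array with shape `(1, 10)` (one row, ten columns). A 1-dimensional array such as `bar` has no row/column orientation, which can cause problems when an operation expects a 2-dimensional input.
You can change the shape of your data using `reshape`: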

In [7]:
bar.reshape((1,10))

array([[16,  9, 20, 25,  5, 48, 10, 34, 37, 19]])

Indexing
https://docs.scipy.org/doc/numpy/user/basics.indexing.html

Filtering with masks

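A masked array (`numpy.ma`) is the combination of a standard numpy.ndarray and a mask. A mask is either nomask, indicating that no value of the associated array is invalid, or an array of booleans that determines for each element of the associated array whether the value is valid or not. The cells in this section use the simpler, related idea of boolean-mask indexing: a boolean array is built from a condition (e.g. `foo < 10`) and then used directly to select rows or columns of an ordinary `ndarray`.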

In [8]:
bar

array([16,  9, 20, 25,  5, 48, 10, 34, 37, 19])

In [9]:
bar[[0,1,5]]

array([16,  9, 48])

In [10]:
bar[[0,1,1, 5,1]]

array([16,  9,  9, 48,  9])

In [11]:
# Random integer matrix used by the masking and indexing examples:
# 10 rows x 5 columns, values drawn from [0, 50)
foo = np.random.randint(0, 50, size=(10, 5))
foo

array([[44, 13, 21, 29, 29],
       [13, 41, 22, 23, 36],
       [18, 44,  1, 24, 25],
       [20, 49, 10, 23, 11],
       [29, 27,  5, 37, 36],
       [15, 35, 47, 45, 41],
       [21, 44, 24, 17, 34],
       [ 7, 40, 23, 46,  7],
       [27, 20, 42, 30, 24],
       [23, 24, 33, 44, 34]])

In [12]:
# create boolean array -- True for indexes with values that satisfy the criteria
foo < 10

array([[False, False, False, False, False],
       [False, False, False, False, False],
       [False, False,  True, False, False],
       [False, False, False, False, False],
       [False, False,  True, False, False],
       [False, False, False, False, False],
       [False, False, False, False, False],
       [ True, False, False, False,  True],
       [False, False, False, False, False],
       [False, False, False, False, False]], dtype=bool)

In [13]:
(foo < 10).any(axis=0)

array([ True, False,  True, False,  True], dtype=bool)

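Axes are defined for arrays with more than one dimension. A 2-dimensional array has two corresponding axes: the first running vertically downwards across rows (axis 0), and the second running horizontally across columns (axis 1). For example, `(foo < 10).any(axis=0)` collapses the rows and returns one value per column (5 values here), while `(foo < 10).any(axis=1)` collapses the columns and returns one value per row (10 values here).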

In [14]:
(foo < 10).any(axis=1)

array([False, False,  True, False,  True, False, False,  True, False, False], dtype=bool)

In [15]:
foo > 1

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True, False,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]], dtype=bool)

In [16]:
(foo > 1).all(axis=0)

array([ True,  True, False,  True,  True], dtype=bool)

In [17]:
(foo > 1).all(axis=1)

array([ True,  True, False,  True,  True,  True,  True,  True,  True,  True], dtype=bool)

### Indexing (subsetting data)

In [18]:
foo[[1, 0, 3, 1]]

array([[13, 41, 22, 23, 36],
       [44, 13, 21, 29, 29],
       [20, 49, 10, 23, 11],
       [13, 41, 22, 23, 36]])

In [19]:
foo

array([[44, 13, 21, 29, 29],
       [13, 41, 22, 23, 36],
       [18, 44,  1, 24, 25],
       [20, 49, 10, 23, 11],
       [29, 27,  5, 37, 36],
       [15, 35, 47, 45, 41],
       [21, 44, 24, 17, 34],
       [ 7, 40, 23, 46,  7],
       [27, 20, 42, 30, 24],
       [23, 24, 33, 44, 34]])

In [20]:
foo[:,[1, 3, 3, 3, 1]]

array([[13, 29, 29, 29, 13],
       [41, 23, 23, 23, 41],
       [44, 24, 24, 24, 44],
       [49, 23, 23, 23, 49],
       [27, 37, 37, 37, 27],
       [35, 45, 45, 45, 35],
       [44, 17, 17, 17, 44],
       [40, 46, 46, 46, 40],
       [20, 30, 30, 30, 20],
       [24, 44, 44, 44, 24]])

In [21]:
(foo < 10).any(axis=1)


array([False, False,  True, False,  True, False, False,  True, False, False], dtype=bool)

In [22]:
mask = (foo < 10).any(axis=1)
mask

array([False, False,  True, False,  True, False, False,  True, False, False], dtype=bool)

In [23]:
foo[mask]

array([[18, 44,  1, 24, 25],
       [29, 27,  5, 37, 36],
       [ 7, 40, 23, 46,  7]])

In [24]:
foo

array([[44, 13, 21, 29, 29],
       [13, 41, 22, 23, 36],
       [18, 44,  1, 24, 25],
       [20, 49, 10, 23, 11],
       [29, 27,  5, 37, 36],
       [15, 35, 47, 45, 41],
       [21, 44, 24, 17, 34],
       [ 7, 40, 23, 46,  7],
       [27, 20, 42, 30, 24],
       [23, 24, 33, 44, 34]])

In [25]:
column_mask = (foo < 10).any(axis=0)

In [26]:
foo[:,column_mask]

array([[44, 21, 29],
       [13, 22, 36],
       [18,  1, 25],
       [20, 10, 11],
       [29,  5, 36],
       [15, 47, 41],
       [21, 24, 34],
       [ 7, 23,  7],
       [27, 42, 24],
       [23, 33, 34]])

In [27]:
foo

array([[44, 13, 21, 29, 29],
       [13, 41, 22, 23, 36],
       [18, 44,  1, 24, 25],
       [20, 49, 10, 23, 11],
       [29, 27,  5, 37, 36],
       [15, 35, 47, 45, 41],
       [21, 44, 24, 17, 34],
       [ 7, 40, 23, 46,  7],
       [27, 20, 42, 30, 24],
       [23, 24, 33, 44, 34]])

In [28]:
# Column vector holding 0..9, shape (10, 1): one entry per row of foo
fizz = np.arange(10)[:, np.newaxis]
fizz

array([[0],
       [1],
       [2],
       [3],
       [4],
       [5],
       [6],
       [7],
       [8],
       [9]])

In [29]:
np.concatenate((foo, fizz), axis=1)

array([[44, 13, 21, 29, 29,  0],
       [13, 41, 22, 23, 36,  1],
       [18, 44,  1, 24, 25,  2],
       [20, 49, 10, 23, 11,  3],
       [29, 27,  5, 37, 36,  4],
       [15, 35, 47, 45, 41,  5],
       [21, 44, 24, 17, 34,  6],
       [ 7, 40, 23, 46,  7,  7],
       [27, 20, 42, 30, 24,  8],
       [23, 24, 33, 44, 34,  9]])

In [30]:
# Row vector holding 0..4, shape (1, 5): one entry per column of foo
fuzz = np.arange(5)[np.newaxis, :]
fuzz

array([[0, 1, 2, 3, 4]])

In [31]:
foo

array([[44, 13, 21, 29, 29],
       [13, 41, 22, 23, 36],
       [18, 44,  1, 24, 25],
       [20, 49, 10, 23, 11],
       [29, 27,  5, 37, 36],
       [15, 35, 47, 45, 41],
       [21, 44, 24, 17, 34],
       [ 7, 40, 23, 46,  7],
       [27, 20, 42, 30, 24],
       [23, 24, 33, 44, 34]])

In [32]:
fuzz

array([[0, 1, 2, 3, 4]])

### Add rows or columns

In [33]:
np.concatenate((foo, fuzz), axis=0)

array([[44, 13, 21, 29, 29],
       [13, 41, 22, 23, 36],
       [18, 44,  1, 24, 25],
       [20, 49, 10, 23, 11],
       [29, 27,  5, 37, 36],
       [15, 35, 47, 45, 41],
       [21, 44, 24, 17, 34],
       [ 7, 40, 23, 46,  7],
       [27, 20, 42, 30, 24],
       [23, 24, 33, 44, 34],
       [ 0,  1,  2,  3,  4]])

In [34]:
foo

array([[44, 13, 21, 29, 29],
       [13, 41, 22, 23, 36],
       [18, 44,  1, 24, 25],
       [20, 49, 10, 23, 11],
       [29, 27,  5, 37, 36],
       [15, 35, 47, 45, 41],
       [21, 44, 24, 17, 34],
       [ 7, 40, 23, 46,  7],
       [27, 20, 42, 30, 24],
       [23, 24, 33, 44, 34]])

In [35]:
# Alternative ways to append columns... just use concatenate!
# For 2-D inputs like foo and fizz, hstack and column_stack both give the same
# result as np.concatenate(..., axis=1).
# Only the last expression of a cell is displayed, so the hstack result is
# checked explicitly here instead of being computed and thrown away.
np.testing.assert_array_equal(np.hstack((foo, fizz)),
                              np.concatenate((foo, fizz), axis=1))
np.column_stack((foo, fizz))

array([[44, 13, 21, 29, 29,  0],
       [13, 41, 22, 23, 36,  1],
       [18, 44,  1, 24, 25,  2],
       [20, 49, 10, 23, 11,  3],
       [29, 27,  5, 37, 36,  4],
       [15, 35, 47, 45, 41,  5],
       [21, 44, 24, 17, 34,  6],
       [ 7, 40, 23, 46,  7,  7],
       [27, 20, 42, 30, 24,  8],
       [23, 24, 33, 44, 34,  9]])

### PANDAS

In [36]:
# Toy dataset: 100 rows of random integers in [0, 10), a target column 'y'
# followed by four feature columns
df = pd.DataFrame(
    data=np.random.randint(0, 10, size=(100, 5)),
    columns=['y', 'x1', 'x2', 'x3', 'x4'],
)

In [37]:
df.head()

Unnamed: 0,y,x1,x2,x3,x4
0,8,1,3,5,2
1,5,8,0,8,4
2,2,3,4,2,2
3,4,1,1,3,9
4,7,7,0,9,6


In [38]:
df.groupby('x1')

<pandas.core.groupby.DataFrameGroupBy object at 0x7f44116d6710>

In [39]:
df.groupby('x1').min()

Unnamed: 0_level_0,y,x2,x3,x4
x1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,0,2,2
1,1,1,0,0
2,1,1,0,1
3,0,0,0,0
4,1,4,1,1
5,4,1,1,0
6,1,2,0,2
7,1,0,0,0
8,1,0,0,0
9,1,0,0,0


In [40]:
df.groupby('x1').max()

Unnamed: 0_level_0,y,x2,x3,x4
x1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,8,9,9,8
1,9,9,9,9
2,7,9,9,9
3,8,9,7,9
4,9,8,9,9
5,9,9,9,5
6,9,8,9,9
7,9,7,9,9
8,9,9,9,9
9,9,6,9,8


In [41]:
df.groupby('x1').median()

Unnamed: 0_level_0,y,x2,x3,x4
x1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,6.5,4.5,7.5,4.5
1,4.5,5.5,4.0,3.5
2,3.0,2.0,7.0,4.0
3,2.5,4.5,2.5,2.0
4,7.5,4.5,7.0,5.5
5,6.0,9.0,7.0,1.0
6,5.0,5.0,4.0,6.0
7,4.5,3.0,6.0,4.0
8,7.0,5.0,4.0,5.0
9,3.5,4.0,4.0,2.5


In [42]:
df.groupby('x1').median()['x2']

x1
0    4.5
1    5.5
2    2.0
3    4.5
4    4.5
5    9.0
6    5.0
7    3.0
8    5.0
9    4.0
Name: x2, dtype: float64

In [43]:
df[['x1', 'x2']].groupby('x1').median()

Unnamed: 0_level_0,x2
x1,Unnamed: 1_level_1
0,4.5
1,5.5
2,2.0
3,4.5
4,4.5
5,9.0
6,5.0
7,3.0
8,5.0
9,4.0


In [44]:
df[['x1', 'x2']].groupby('x1').median().sort_values('x2')

Unnamed: 0_level_0,x2
x1,Unnamed: 1_level_1
2,2.0
7,3.0
9,4.0
0,4.5
3,4.5
4,4.5
6,5.0
8,5.0
1,5.5
5,9.0


In [45]:
# This doesn't work because the groupby column became our new index!!
# 'x1' is no longer a regular column of the aggregated frame, so sorting by it
# raised KeyError when this notebook was recorded. The error is the point of the
# cell, so it is caught and shown rather than halting a top-to-bottom run.
# NOTE(review): newer pandas releases may resolve index level names in
# sort_values, in which case no error is raised here -- confirm for your version.
try:
    df[['x1', 'x2']].groupby('x1').median().sort_values('x1')
except KeyError as err:
    print('KeyError: {}'.format(err))

KeyError: 'x1'

In [46]:
df[['x1', 'x2']].groupby('x1').median().reset_index()

Unnamed: 0,x1,x2
0,0,4.5
1,1,5.5
2,2,2.0
3,3,4.5
4,4,4.5
5,5,9.0
6,6,5.0
7,7,3.0
8,8,5.0
9,9,4.0


In [47]:
df[['x1', 'x2']].groupby('x1').median().reset_index().sort_values('x1')

Unnamed: 0,x1,x2
0,0,4.5
1,1,5.5
2,2,2.0
3,3,4.5
4,4,4.5
5,5,9.0
6,6,5.0
7,7,3.0
8,8,5.0
9,9,4.0


In [48]:
df.groupby('x1').mean()

Unnamed: 0_level_0,y,x2,x3,x4
x1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5.5,4.5,6.833333,4.666667
1,5.3,5.6,4.3,4.0
2,4.0,4.285714,6.0,4.571429
3,3.0,4.666667,3.0,3.388889
4,6.25,5.375,5.875,5.125
5,6.4,7.0,5.8,2.0
6,4.714286,4.571429,3.857143,5.428571
7,4.928571,3.214286,5.285714,4.0
8,5.692308,5.230769,5.153846,4.769231
9,4.083333,3.916667,4.333333,3.833333


In [49]:
df.sort_values('x1').head(10)

Unnamed: 0,y,x1,x2,x3,x4
49,7,0,0,7,5
5,7,0,0,9,5
6,6,0,5,8,2
60,4,0,4,7,4
38,1,0,9,2,4
19,8,0,9,8,8
91,5,1,9,7,5
80,3,1,3,2,1
76,8,1,5,9,1
67,7,1,9,2,0


In [50]:
df.sort_values(['x1', 'x2']).head(10)

Unnamed: 0,y,x1,x2,x3,x4
5,7,0,0,9,5
49,7,0,0,7,5
60,4,0,4,7,4
6,6,0,5,8,2
19,8,0,9,8,8
38,1,0,9,2,4
3,4,1,1,3,9
0,8,1,3,5,2
80,3,1,3,2,1
20,9,1,4,6,6


In [51]:
df.sort_values(['x2', 'x1']).head(10)

Unnamed: 0,y,x1,x2,x3,x4
5,7,0,0,9,5
49,7,0,0,7,5
40,4,3,0,5,2
72,0,3,0,5,9
4,7,7,0,9,6
32,9,7,0,5,1
1,5,8,0,8,4
89,4,9,0,0,1
3,4,1,1,3,9
39,5,2,1,8,4


### Linear Regression

In [52]:
# Obtaining one column from a pandas dataframe
# Returns a series
df['x1'].head()

0    1
1    8
2    3
3    1
4    7
Name: x1, dtype: int64

In [53]:
# pop returns the 'x1' column as a Series and, as a side effect,
# DELETES THE COLUMN FROM YOUR DATAFRAME IN PLACE!
# (.head() only limits how much of the returned Series is displayed)
df.pop('x1').head()

0    1
1    8
2    3
3    1
4    7
Name: x1, dtype: int64

In [54]:
df.head()

Unnamed: 0,y,x2,x3,x4
0,8,3,5,2
1,5,0,8,4
2,2,4,2,2
3,4,1,3,9
4,7,0,9,6


:(

In [55]:
# So, make a copy first!
# df is rebuilt with fresh random values (the previous one lost its 'x1' column),
# then duplicated so that destructive calls can be tried on the copy only.
df = pd.DataFrame(
    data=np.random.randint(0, 10, size=(100, 5)),
    columns=['y', 'x1', 'x2', 'x3', 'x4'],
)
df_copy = df.copy(deep=True)

In [56]:
df_copy.pop('x1').head()

0    8
1    1
2    7
3    9
4    5
Name: x1, dtype: int64

In [57]:
df_copy.head()

Unnamed: 0,y,x2,x3,x4
0,7,4,8,4
1,8,0,9,8
2,1,7,5,4
3,5,4,9,2
4,5,9,4,1


In [58]:
df.head()

Unnamed: 0,y,x1,x2,x3,x4
0,7,8,4,8,4
1,8,1,0,9,8
2,1,7,7,5,4
3,5,9,4,9,2
4,5,5,9,4,1


In [59]:
# Turn dataframe or series into a numpy array
# (here I'm selecting just the first 5 rows to display)
df.values[:5,:]

array([[7, 8, 4, 8, 4],
       [8, 1, 0, 9, 8],
       [1, 7, 7, 5, 4],
       [5, 9, 4, 9, 2],
       [5, 5, 9, 4, 1]])

In [60]:
# Turn dataframe or series into a numpy array
df['x1'].values

array([8, 1, 7, 9, 5, 0, 5, 4, 0, 9, 5, 0, 7, 3, 4, 6, 5, 5, 2, 0, 0, 9, 3,
       3, 5, 0, 5, 7, 3, 2, 6, 5, 9, 3, 4, 1, 3, 0, 8, 1, 9, 9, 5, 7, 9, 4,
       1, 6, 9, 7, 8, 2, 0, 2, 9, 9, 9, 4, 9, 8, 2, 4, 9, 6, 4, 9, 5, 0, 7,
       5, 5, 7, 1, 3, 7, 3, 4, 7, 4, 9, 6, 3, 8, 5, 2, 5, 2, 9, 4, 8, 5, 2,
       7, 5, 7, 8, 3, 7, 5, 9])

In [61]:
# not inplace unless inplace=True
new_df = df.drop('y', axis=1)

Okay... that's nice... but why?

Linear Regression

In [62]:
from sklearn.linear_model import LinearRegression

In [63]:
df.head()

Unnamed: 0,y,x1,x2,x3,x4
0,7,8,4,8,4
1,8,1,0,9,8
2,1,7,7,5,4
3,5,9,4,9,2
4,5,5,9,4,1


In [64]:
y = df['y']
X = df.drop('y', axis=1)

In [65]:
lin_model = LinearRegression()

In [66]:
# Split data into training and test sets; test_size=0.3 holds out 30% of the
# rows for testing.
# NOTE(review): no random_state is passed, so the split presumably differs on
# every run -- confirm, and consider fixing it for reproducible scores.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [67]:
lin_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [68]:
y_predict = lin_model.predict(X_test)

In [69]:
lin_model.coef_

array([ 0.10081322, -0.13877898,  0.12673312,  0.09284327])

In [70]:
lin_model.score(X_test, y_test)

-0.029579950041483949

In [71]:
# can get scoring metrics using sklearn.metrics
from sklearn.metrics import r2_score, mean_squared_error

In [72]:
r2_score(y_test, y_predict)

-0.029579950041483949

In [73]:
mean_squared_error(y_test, y_predict)

7.2573946700701928

### Matrix multiplication and dot products with NumPy

In [74]:
# Two independent 1 x 10 row vectors of random integers drawn from [0, 50)
arr1, arr2 = (np.random.randint(0, 50, size=(1, 10)) for _ in range(2))

In [75]:
# Won't work: .dot is matrix multiplication and needs appropriate dimensions!
# (1, 10) x (1, 10) does not line up -- the 10 columns of arr1 would have to
# match the number of rows of arr2, which is 1.
# The error is the point of this cell, so it is caught and shown rather than
# halting a top-to-bottom run of the notebook.
try:
    arr1.dot(arr2)
except ValueError as err:
    print('ValueError: {}'.format(err))

ValueError: shapes (1,10) and (1,10) not aligned: 10 (dim 1) != 1 (dim 0)

In [76]:
arr1.dot(arr2.T)

array([[4171]])

In [77]:
# Shapes chosen so the inner dimensions agree: (5, 2) x (2, 10) -> (5, 10)
mat1 = np.random.randint(0, 50, size=(5, 2))
mat2 = np.random.randint(0, 50, size=(2, 10))

In [78]:
mat1.dot(mat2)

array([[ 792, 1092,  288,  600, 2292, 1728,  288, 1476, 1140,  900],
       [1036, 2070,  596, 1465, 3660, 2102,  395, 2351, 2281, 1025],
       [1064, 1716,  472, 1070, 3336, 2260,  394, 2146, 1838, 1150],
       [ 348, 1198,  372, 1025, 1748,  582,  147, 1119, 1385,  225],
       [1128, 2216,  636, 1555, 3946, 2298,  429, 2535, 2437, 1125]])