# Chapter 5: Random Variables, Probability, Distributions


Let's simulate tossing a coin 10 times.

## Multiple Distributions

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# for 3D plots
from mpl_toolkits.mplot3d import Axes3D
from scipy import stats


In [None]:
# Seed NumPy's global generator so the simulated draws below are reproducible.
np.random.seed(87654321)

# Coin: outcomes 0 and 1, each with probability 0.5.
coindomain = np.array([0, 1])
coinsupport = np.array([0, 1])
coinprob = np.array([.5, .5])

# Six-sided die: faces 1..6, each with probability 1/6.
diesupport = np.arange(1, 7)
dieprob = np.full(6, 1/6)

# 10,000 draws from each distribution. The coin is sampled before the die,
# so the order of these two calls determines the values obtained from the seed.
coinedf = np.random.choice(coinsupport, size=10_000, replace=True, p=coinprob)
dieedf = np.random.choice(diesupport, size=10_000, replace=True, p=dieprob)

In [None]:
# Relative frequency of each coin outcome in the simulated tosses:
# entry k is the share of draws equal to k (np.bincount counts from 0).
np.bincount(coinedf) / coinedf.size

In [None]:
# Relative frequency of each die face in the simulated rolls.
# np.bincount counts from 0, so its first entry would be a spurious "face 0"
# with frequency 0. Drop it so that entry k corresponds to face k + 1;
# minlength=7 keeps six entries even if the highest face was never rolled.
np.bincount(dieedf, minlength=7)[1:] / dieedf.size

In [None]:
# Joint relative frequencies of the simulated coin and die draws:
# cross-tabulate the paired draws, then divide every count by the number of draws.
pd.crosstab(
    index=coinedf,
    columns=dieedf,
    rownames=['Coins'],
    colnames=['Dies'],
).div(len(coinedf))

In [None]:
# Load the transactions data set (path is relative to the notebook's directory).
Transactions = pd.read_csv('../data/Transactions.csv')

# Relative frequency of each value in the 'Toothbrush' column.
# np.bincount needs non-negative integers - presumably a 0/1 purchase
# indicator; confirm against the data.
np.bincount(Transactions['Toothbrush']) / Transactions['Toothbrush'].size

In [None]:
# Relative frequency of each value in the 'Perfume' column
# (non-negative integers, as np.bincount requires).
np.bincount(Transactions['Perfume']) / Transactions['Perfume'].size

In [None]:
# Joint relative frequencies of 'Toothbrush' and 'Perfume':
# cross-tabulated counts divided by the number of rows.
pd.crosstab(
    index=Transactions['Toothbrush'],
    columns=Transactions['Perfume'],
).div(len(Transactions['Toothbrush']))

## Univariate and Multivariate Distributions

In [None]:
# Mean vector and covariance matrix of a bivariate normal distribution.
M = np.array([4, 10])
S = np.array([[2, 1],
              [1, 3]])

# Draw 10 observations; each row of x is one two-component draw.
x = np.random.multivariate_normal(mean=M, cov=S, size=10)
x

In [None]:
# Sample correlation between the two columns of x.
# np.corrcoef returns the 2x2 correlation matrix; the off-diagonal entry at
# row 0, column 1 (first row, second column) is the correlation between the
# first and the second column.
np.corrcoef(x[:, 0], x[:, 1])[0][1]

In [None]:
# Evaluate the bivariate normal density with mean M and covariance S on a
# 500 x 500 grid and draw it as a 3D surface.

# Create grid and multivariate normal
# NOTE: this rebinds `x` (the 10 x 2 sample drawn earlier) to the 1-D grid,
# so the correlation cell above cannot be re-run after this one.
x = np.linspace(0, 7, 500)
y = np.linspace(5, 20, 500)
X, Y = np.meshgrid(x, y)
# Stack the grid coordinates so that pos[i, j] == (X[i, j], Y[i, j]).
pos = np.dstack((X, Y))
rv = stats.multivariate_normal(M, S)

# Make a 3D plot
fig = plt.figure()
# fig.gca(projection='3d') was deprecated in Matplotlib 3.4 and removed in
# 3.6; add_subplot is the supported way to create the 3D axes.
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(X, Y, rv.pdf(pos), cmap='viridis', linewidth=0)
ax.set_xlabel('X axis')
ax.set_ylabel('Y axis')
ax.set_zlabel('Z axis')
plt.show()

## Transformations and Convolutions

In [None]:
# 10,000 draws from the Uniform(0, 1) distribution and their kernel density estimate.
u1 = np.random.uniform(low=0, high=1, size=10_000)
sns.kdeplot(u1)
plt.show()

In [None]:
# Transformation: density estimate of the element-wise square of the uniform draws.
u2 = np.square(u1)
sns.kdeplot(u2)
plt.show()

In [None]:
# Transformation: density estimate of the element-wise square root of the uniform draws.
u3 = np.sqrt(u1)
sns.kdeplot(u3)
plt.show()


In [None]:
# 10,000 draws from the standard normal distribution and their kernel density estimate.
n1 = np.random.normal(0, 1, 10000)
# Plot the normal sample just drawn; the original plotted the uniform sample
# u1 here, which was a copy-paste slip.
sns.kdeplot(n1)
plt.show()

In [None]:
# Transformation: density estimate of the exponential of the normal draws.
n2 = np.exp(n1)
sns.kdeplot(n2)
plt.show()

In [None]:
# Convolution demo: start from a Uniform(0, 1) sample, then twice add a fresh
# Uniform(0, 1) sample, rescaling to [0, 1] and overlaying the density
# estimate after each step.
palette = sns.color_palette()  # looked up once instead of on every iteration

newdist = np.random.uniform(0, 1, 10000)
# Min-max rescaling: divide by the range (max - min). Dividing by the max, as
# the original did, does not map the largest value to 1. The array methods
# also avoid the slow element-by-element builtin min()/max().
newdist = (newdist - newdist.min()) / (newdist.max() - newdist.min())
sns.kdeplot(newdist)
for i in range(1, 3):
    newdist = newdist + np.random.uniform(0, 1, 10000)
    newdist = (newdist - newdist.min()) / (newdist.max() - newdist.min())
    sns.kdeplot(newdist, color=palette[i])
plt.show()

In [None]:
# Convolution demo, continued: same procedure with 99 further additions of a
# fresh Uniform(0, 1) sample, overlaying the density estimate after each step.
palette = sns.color_palette()  # looked up once instead of on every iteration

newdist = np.random.uniform(0, 1, 10000)
# Min-max rescaling: divide by the range (max - min). Dividing by the max, as
# the original did, does not map the largest value to 1. The array methods
# also avoid the slow element-by-element builtin min()/max().
newdist = (newdist - newdist.min()) / (newdist.max() - newdist.min())
sns.kdeplot(newdist)
for i in range(1, 100):
    newdist = newdist + np.random.uniform(0, 1, 10000)
    newdist = (newdist - newdist.min()) / (newdist.max() - newdist.min())
    # Cycle through the first ten palette colours.
    sns.kdeplot(newdist, color=palette[i % 10])
plt.show()


## Sampling Distributions

In [None]:
# Load the women data set (path is relative to the notebook's directory).
women = pd.read_csv(filepath_or_buffer='../data/women.csv')

In [None]:
# Mean of the 'weight' column over the full data set.
women.loc[:, 'weight'].mean()

In [None]:
# Standard deviation of the 'weight' column (pandas default, ddof=1).
women.loc[:, 'weight'].std()

In [None]:
# Re-seed so this single sample is reproducible.
np.random.seed(87654321)
# One sample of 5 weights drawn with replacement, and its mean.
s = np.random.choice(women['weight'], size=5, replace=True)
np.mean(s)

In [None]:
def f1():
    """Return the mean of one random sample of 5 values of women['weight'].

    The sample is drawn without replacement using NumPy's global random state.
    """
    return np.random.choice(a=women['weight'], size=5, replace=False).mean()


# Simulated sampling distribution of the mean: 10,000 sample means.
samplingdist = np.array([f1() for _ in range(10000)])

In [None]:
# Smallest simulated sample mean.
np.min(samplingdist)

In [None]:
# Largest simulated sample mean.
np.max(samplingdist)

In [None]:
# Average of the simulated sample means.
np.mean(samplingdist)

In [None]:
# Kernel density estimate of the simulated sample means.
sns.kdeplot(samplingdist)
plt.show()

In [None]:
def f2():
    """Return the median of one random sample of 5 values of women['weight'].

    The sample is drawn without replacement using NumPy's global random state.
    """
    s2 = np.random.choice(a=women['weight'], size=5, replace=False)
    # The original returned s2.mean(), an exact copy of f1, although the
    # following cells compare this distribution with women['weight'].median().
    return np.median(s2)


# Simulated sampling distribution of the median: 10,000 sample medians.
samplingdist2 = np.array([f2() for _ in range(10000)])

In [None]:
# Median of the 'weight' column over the full data set.
women.loc[:, 'weight'].median()

In [None]:
# Average of the values stored in samplingdist2.
np.mean(samplingdist2)

In [None]:
# Kernel density estimate of the values stored in samplingdist2.
sns.kdeplot(samplingdist2)
plt.show()

In [None]:
def f3():
    """Return the first quartile of one random sample of 5 values of women['weight'].

    The sample is drawn without replacement using NumPy's global random state.
    """
    s3 = np.random.choice(a=women['weight'], size=5, replace=False)
    # The 25th percentile is the 0.25 quantile.
    return np.percentile(s3, 25)


# Simulated sampling distribution of the first quartile: 10,000 sample quartiles.
samplingdist3 = np.array([f3() for _ in range(10000)])

In [None]:
# First quartile (0.25 quantile) of the 'weight' column over the full data set.
women['weight'].quantile(q=0.25)

In [None]:
# Average of the simulated sample first quartiles.
np.mean(samplingdist3)

## Start Tillman

In [None]:
import numpy as np

# Coin: outcomes 0 and 1, each with probability 0.5.
coindomain = [0, 1]
coinprob = [0.5, 0.5]

# Ten independent tosses (sampling with replacement).
np.random.choice(a=coindomain, size=10, replace=True, p=coinprob)

Now, let's simulate rolling a die 15 times. 

In [None]:
# Six-sided die: faces 1..6, each with probability 1/6.
diedomain = range(1, 7)
dieprob = np.full(6, 1/6)

# Fifteen independent rolls (sampling with replacement).
np.random.choice(a=diedomain, size=15, replace=True, p=dieprob)

Let's simulate 5 coin tosses, i.e. draw 5 sample values, and calculate the relative frequencies:

In [None]:
import pandas as pd

# Five coin tosses (sampling with replacement).
res1 = np.random.choice(a=coindomain, size=5, replace=True, p=coinprob)
# Relative frequency of each observed outcome.
pd.DataFrame(res1).value_counts(normalize=True)

And now the same but with 1000 draws:

In [None]:
# One thousand coin tosses (sampling with replacement); res1 is overwritten.
res1 = np.random.choice(a=coindomain, size=1000, replace=True, p=coinprob)
# Relative frequency of each observed outcome.
pd.DataFrame(res1).value_counts(normalize=True)