In [None]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Binomial Distribution
# Draw `size` values from Binomial(n, p) and plot their normalised histogram.

n = 10       # number of trials per draw
p = 0.1      # success probability of a single trial
size = 1000  # number of draws

binomial_dist = np.random.binomial(n=n,p=p,size=size)

fig,ax = plt.subplots()
ax.hist(binomial_dist,density=True)
plt.show()

In [None]:
# Uniform Distribution
# Histogram of the means of repeated samples drawn from Uniform(0, 1).

num_samples = 1000
sample_size = 300
distribution_range = (0,1)

# One row per sample, one column per observation inside that sample.
low,high = distribution_range
samples = np.random.uniform(low,high,(num_samples,sample_size))
sample_means = samples.mean(axis=1)

fig,ax = plt.subplots()
ax.hist(sample_means,bins=30,density=True,edgecolor='black')
ax.set_title("Histogram of Sample Means")
ax.set_xlabel("Sample Mean")
ax.set_ylabel("Density")
plt.show()

In [None]:
# Exponential Distribution
# Histogram of the means of repeated samples drawn from Exponential(rate=lambda_param).

num_samples = 1000
sample_size = 50
lambda_param = 2

# NumPy parameterises the exponential by its scale, i.e. 1 / rate.
samples = np.random.exponential(scale=1/lambda_param,size=(num_samples,sample_size))
sample_means = samples.mean(axis=1)

fig,ax = plt.subplots()
ax.hist(sample_means,bins=30,density=True,edgecolor='black')
ax.set_title("Histogram of Sample Means")
ax.set_xlabel("Sample Mean")
ax.set_ylabel("Density")
plt.show()

In [None]:
# Sample means for Poisson, Gamma and Binomial distributions
# For each distribution, draw `num_samples` samples of `sample_size`
# observations and plot the histogram of the per-sample means.

num_samples = 1000
sample_size = 50

poisson_lambda = 5

gamma_shape = 2
gamma_scale = 1

binomial_n = 10
binomial_p = 0.5

# Every array has one row per sample and one column per observation.
shape_2d = (num_samples,sample_size)
poisson_samples = np.random.poisson(lam=poisson_lambda,size=shape_2d)
gamma_samples = np.random.gamma(shape=gamma_shape,scale=gamma_scale,size=shape_2d)
binomial_samples = np.random.binomial(n=binomial_n,p=binomial_p,size=shape_2d)

poisson_means = poisson_samples.mean(axis=1)
gamma_means = gamma_samples.mean(axis=1)
binomial_means = binomial_samples.mean(axis=1)

fig,ax = plt.subplots(3,1,figsize=(8,12))

# One panel per distribution, stacked vertically in the order listed here.
panels = (
    ("Poisson",poisson_means),
    ("Gamma",gamma_means),
    ("Binomial",binomial_means),
)
for axis,(dist_name,means) in zip(ax,panels):
    axis.hist(means,bins=30,density=True,edgecolor='black')
    axis.set_title(f"Histogram of Sample Means ({dist_name} Distribution)")
    axis.set_xlabel("Sample Mean")
    axis.set_ylabel("Density")

fig.tight_layout()
plt.show()

In [None]:
# CLT On Gamma Distribution
# Compare the mean and variance of the simulated sample means with the
# values the central limit theorem predicts for them.

num_samples = 1000
sample_size = 50

gamma_shape = 2
gamma_scale = 1

# Population moments of Gamma(shape, scale).
theoretical_mean = gamma_shape * gamma_scale
theoretical_variance = (gamma_shape * gamma_scale ** 2)
# The mean of `sample_size` observations has the same expectation as the
# population but a variance that is smaller by a factor of `sample_size`.
theoretical_variance_of_mean = theoretical_variance / sample_size

samples = np.random.gamma(shape=gamma_shape,scale=gamma_scale,size=(num_samples,sample_size))
sample_means = np.mean(samples,axis=1)

empirical_mean = np.mean(sample_means)
empirical_variance = np.var(sample_means)

# The mean of the sample means estimates the population mean directly (no
# rescaling); the variance of the sample means is compared to variance / n.
print(f"Theoretical Mean : {theoretical_mean:0.4f}")
print(f"Empirical Mean : {empirical_mean:0.4f}")
print(f"Theoretical Variance of Sample Mean : {theoretical_variance_of_mean:0.4f}")
print(f"Empirical Variance of Sample Means : {empirical_variance:0.4f}")

In [None]:
# CLT On Log Normal Distribution
# Compare the mean and variance of the simulated sample means with the
# values the central limit theorem predicts for them.

num_samples = 1000
sample_size = 50

mu = 0.5
sigma = 0.7

# Population moments of LogNormal(mu, sigma).
theoretical_mean = np.exp(mu + (sigma**2)/2)
theoretical_variance = (np.exp(sigma**2) - 1) * np.exp(2 * mu + sigma ** 2)
# The mean of `sample_size` observations has the same expectation as the
# population but a variance that is smaller by a factor of `sample_size`.
theoretical_variance_of_mean = theoretical_variance / sample_size

samples = np.random.lognormal(mean=mu,sigma=sigma,size=(num_samples,sample_size))
sample_means = np.mean(samples,axis=1)

empirical_mean = np.mean(sample_means)
empirical_variance = np.var(sample_means)

# The mean of the sample means estimates the population mean directly (no
# rescaling); the variance of the sample means is compared to variance / n.
print(f"Theoretical Mean : {theoretical_mean:0.4f}")
print(f"Empirical Mean : {empirical_mean:0.4f}")
print(f"Theoretical Variance of Sample Mean : {theoretical_variance_of_mean:0.4f}")
print(f"Empirical Variance of Sample Means : {empirical_variance:0.4f}")

In [None]:
# CLT Case Study
# Estimate the average salary of a simulated population from repeated random
# samples and report a 95% interval around the average of the sample means.

population_size = 100000
sample_size = 50
num_samples = 100

# Seed the global generator before simulating the population.
np.random.seed(42)
population_salaries = np.random.lognormal(mean=4.5,sigma=0.8,size=population_size)

# One np.random.choice call per sample, in order.
drawn_samples = [
    np.random.choice(population_salaries,size=sample_size)
    for _ in range(num_samples)
]
sample_means = [np.mean(drawn) for drawn in drawn_samples]
sample_std_devs = [np.std(drawn) for drawn in drawn_samples]

average_sample_mean = np.mean(sample_means)
# Standard error of the average of `num_samples` sample means.
standard_error = np.std(sample_means) / np.sqrt(num_samples)

# 1.96 is the two-sided 95% critical value of the standard normal.
margin_of_error = 1.96 * standard_error
lower_limit,upper_limit = (
    average_sample_mean - margin_of_error,
    average_sample_mean + margin_of_error,
)

print(f"Estimated Average Salary (in thousands) : {average_sample_mean:0.2f}")
print(f"95% Confidence Interval (in thousands): ({lower_limit:0.2f},{upper_limit:0.2f})")

<h3> Question On Central Limit Theorem </h3>

In [None]:
# A company claims that their email marketing campaign has a 15% click-through rate. If you randomly select 100 people to 
# receive the email, what is the probability that exactly 20 will click through to the website?

p = .15   # click-through probability for one recipient
n = 100   # number of recipients
x = 20    # exact number of click-throughs of interest

# P(X = x) for X ~ Binomial(n, p), via the frozen distribution object.
click_dist = scipy.stats.binom(n,p)
prob = click_dist.pmf(x)
print(prob)

## Solution-1

This is a binomial probability problem. The probability of exactly 20 people clicking through to the website out of 100 people who received the email can be calculated using the binomial formula:

P(X=k) = (n choose k) * p^k * (1-p)^(n-k)

where n is the number of trials (100 in this case), k is the number of successes (20 in this case), and p is the probability of success on a single trial (0.15 in this case).

Substituting these values into the formula, we get:

P(X=20) = (100 choose 20) * 0.15^20 * 0.85^80 ≈ 0.04

So, the probability that exactly 20 out of 100 people will click through to the website is approximately 0.04 or about 4%.

In [None]:
# A researcher is investigating whether a new medication improves patient outcomes. The medication has a success rate of 
# 75%. If the researcher enrolls 50 patients in the study, what is the probability that fewer than 35 will have a positive
# outcome?

p = 0.75
n = 50
x = 34   # "fewer than 35" means at most 34 successes

# Library answer: P(X <= x) for X ~ Binomial(n, p).
prob = scipy.stats.binom.cdf(x,n,p)
print(prob)

# Cross-check: add up the binomial probabilities of 0..x successes by hand.
result = 0
for successes in range(x + 1):
    ways = scipy.special.comb(n,successes,exact=True)
    result += ways * (p**successes) * (1-p)**(n-successes)
print(result)

## Solution-2

This is another binomial probability problem. The probability of fewer than 35 patients having a positive outcome out of 50 patients enrolled in the study can be calculated using the cumulative distribution function (CDF) of the binomial distribution.

P(X ≤ k) = ∑[i=0 to k] (n choose i) * p^i * (1-p)^(n-i)

The CDF gives the probability that the number of successes in n independent trials is less than or equal to a given value k. In this case, n is the number of patients enrolled in the study (50), k is the maximum number of patients with a positive outcome (34), and p is the probability of success on a single trial (0.75).

In [None]:
# A website offers a premium subscription service with a 20% sign-up rate. If you randomly select 500 visitors to the 
# website, what is the probability that between 90 and 110 will sign up for the premium service?

p = 0.2
n = 500
x1 = 90
x2 = 110

# P(x1 <= X <= x2) = F(x2) - F(x1 - 1). Subtracting F(x1) instead would drop
# the outcome X = x1 from the interval.
prob = scipy.stats.binom.cdf(x2,n,p) - scipy.stats.binom.cdf(x1 - 1,n,p)
print(prob)

def probBetween(x,n,p):
    """Return P(X <= x) for X ~ Binomial(n, p) by summing the pmf over 0..x.

    x : largest number of successes included in the sum; a negative x
        gives an empty sum and therefore 0.
    n : number of trials.
    p : success probability of a single trial.
    """
    result = 0
    for i in range(x+1):
        result = result + scipy.special.comb(n,i,exact=True) * (p**i) * (1-p)**(n-i)
    return result

# Same inclusive interval, computed from the hand-rolled cumulative sum.
print(probBetween(x2,n,p) - probBetween(x1 - 1,n,p))

## Solution-3

This is another binomial probability problem. The probability of between 90 and 110 visitors signing up for the premium service out of 500 visitors to the website can be calculated using the cumulative distribution function (CDF) of the binomial distribution.

The CDF gives the probability that the number of successes in n independent trials is less than or equal to a given value k. In this case, n is the number of visitors to the website (500), k1 is the minimum number of visitors who sign up for the premium service (90), k2 is the maximum number of visitors who sign up for the premium service (110), and p is the probability of success on a single trial (0.20).

The probability that between k1 and k2 visitors will sign up for the premium service out of n visitors to the website can be calculated as:

P(k1 ≤ X ≤ k2) = P(X ≤ k2) - P(X < k1) = F(k2) - F(k1-1)

where F(k) is the CDF of the binomial distribution with parameters n and p at k.

In [None]:
# A school district is investigating the effectiveness of a new reading program. The program has a success rate of 70%. If
# the district enrolls 200 students in the program, what is the probability that more than 140 will show significant 
# improvement in reading skills?

p = .7
n = 200
x = 140

# P(X > x) is the complement of the cumulative probability P(X <= x).
cumulative = scipy.stats.binom.cdf(x,n,p)
prob = 1 - cumulative
print(prob)

# Same helper as in the previous exercise; redefined here so that this cell
# can be run on its own.
def probBetween(x,n,p):
    """Return P(X <= x) for X ~ Binomial(n, p) by summing the pmf over 0..x."""
    q = 1 - p
    total = 0
    for k in range(x + 1):
        total = total + scipy.special.comb(n,k,exact=True) * (p**k) * q**(n-k)
    return total

print(1 - probBetween(x,n,p))

## Solution-4
This is another binomial probability problem. The probability of more than 140 students showing significant improvement in reading skills out of 200 students enrolled in the program can be calculated using the cumulative distribution function (CDF) of the binomial distribution.

The CDF gives the probability that the number of successes in n independent trials is less than or equal to a given value k. In this case, n is the number of students enrolled in the program (200), k is the number of students showing significant improvement that must be exceeded (140), and p is the probability of success on a single trial (0.70).

The probability that more than k students will show significant improvement in reading skills out of n students enrolled in the program can be calculated as:

P(X > k) = 1 - P(X ≤ k) = 1 - F(k)

where F(k) is the CDF of the binomial distribution with parameters n and p at k.

In [None]:
# A factory produces electronic components with a defect rate of 5%. If a shipment of 200 components is sent out, what is 
# the probability that fewer than 10 will be defective?

p = 0.05   # defect probability of one component
n = 200    # components in the shipment
x = 9      # "fewer than 10" means at most 9 defective components

# P(X <= x) for X ~ Binomial(n, p), via the frozen distribution object.
defect_dist = scipy.stats.binom(n,p)
prob = defect_dist.cdf(x)
print(prob)

## Solution-5

The CDF gives the probability that the number of successes in n independent trials is less than or equal to a given 
value k. In this case, n is the number of components in the shipment (200), k is the maximum number of defective 
components (9), and p is the probability of success on a single trial (0.05).

In [None]:
# A survey shows that 70% of people prefer chocolate ice cream over vanilla ice cream. If you randomly survey one person, 
# what is the probability that they prefer vanilla ice cream?

p = .7   # probability that a person prefers chocolate

# Preferring vanilla is the complement of preferring chocolate.
vanilla_prob = 1 - p
print(vanilla_prob)

## Solution-6
This is a Bernoulli trial with p = 0.7. The probability of the person preferring vanilla ice cream is 1 - p = 0.3.

In [None]:
# A software company releases a new product with a bug rate of 2%. If 10,000 copies of the product are sold, what is the 
# probability that at least 250 will have a bug?

p = 0.02
n = 10000
x = 249   # "at least 250" is the complement of "at most 249"

# P(X >= 250) = 1 - P(X <= 249) for X ~ Binomial(n, p).
bug_dist = scipy.stats.binom(n,p)
prob = 1 - bug_dist.cdf(x)
print(prob)

## Solution-7

This problem can be solved using the binomial distribution. We can model the number of copies with bugs out of 10,000 as a 
binomial random variable with n=10,000 and p=0.02, where p is the probability of a single copy having a bug. We want to 
find the probability that at least 250 copies have a bug, which can be written as:

P(X >= 250) = 1 - P(X < 250) = 1 - P(X <= 249)

where X is the number of copies with bugs.

To calculate this probability, we can use the cumulative distribution function (CDF) of the binomial distribution.

In [None]:
# According to data from the National Center for Health Statistics (NCHS), the average height for adult men aged 20 years 
# and over in the United States is approximately 69.2 inches with a standard deviation of approximately 2.9 inches. If you 
# randomly select a sample of 50 adult men aged 20 years and over, what is the probability that the sample mean height is 
# greater than 70 inches?

mu = 69.2     # population mean height (inches)
sigma = 2.9   # population standard deviation (inches)
x = 70        # threshold for the sample mean
n = 50        # sample size

# Standardise the threshold using the standard error of the sample mean.
standard_error = sigma/n**0.5
z = (x - mu)/standard_error

# Upper-tail probability P(Z > z) under the standard normal.
prob = 1 - scipy.stats.norm.cdf(z)
print(prob)

## Solution-8

This is a problem that can be solved using the central limit theorem. The central limit theorem states that the distribution of sample means approaches a normal distribution as the sample size increases, regardless of the shape of the population distribution.

In this case, we are given that the population mean height is 69.2 inches and the population standard deviation is 2.9 inches. If we randomly select a sample of 50 adult men, the sample mean height will have a normal distribution with a mean equal to the population mean (69.2) and a standard deviation equal to the population standard deviation divided by the square root of the sample size (2.9 / sqrt(50) ≈ 0.41).

We can use this information to calculate the probability that the sample mean height is greater than 70 inches. This probability is equivalent to the probability that a standard normal variable Z is greater than (70 - 69.2) / 0.41 ≈ 1.95.

In [None]:
# A company claims that the average salary of its employees is 75,000 with a standard deviation of 10,000. If you randomly 
# select 100 employees, what is the probability that the sample mean salary is less than 72,500?

mu = 75000      # claimed population mean salary
sigma = 10000   # population standard deviation
n = 100         # sample size
x = 72500       # threshold for the sample mean

# Standardise the threshold using the standard error of the sample mean.
standard_error = sigma/n**0.5
z = (x - mu)/standard_error

# Lower-tail probability P(Z < z) under the standard normal.
prob = scipy.stats.norm.cdf(z)
print(prob)

## Solution-9

This is a problem that can be solved using the central limit theorem. The central limit theorem states that the 
distribution of sample means approaches a normal distribution as the sample size increases, regardless of the shape of 
the population distribution.

In this case, we are given that the population mean salary is 75,000 and the population standard deviation is 10,000. If 
we randomly select a sample of 100 employees, the sample mean salary will have a normal distribution with a mean equal to 
the population mean (75,000) and a standard deviation equal to the population standard deviation divided by the square 
root of the sample size (10,000 / sqrt(100) = 1,000).

We can use this information to calculate the probability that the sample mean salary is less than 72,500. This probability 
is equivalent to the probability that a standard normal variable Z is less than (72,500 - 75,000) / 1,000 = -2.5.

In [None]:
# A restaurant claims that the average wait time for a table is 15 minutes with a standard deviation of 3 minutes. If 
# you randomly survey 50 customers, what is the probability that the sample mean wait time is greater than 16 minutes?

mu = 15     # claimed population mean wait (minutes)
sigma = 3   # population standard deviation (minutes)
n = 50      # sample size
x = 16      # threshold for the sample mean

# Standardise the threshold using the standard error of the sample mean.
standard_error = sigma/n**0.5
z = (x - mu)/standard_error

# Upper-tail probability P(Z > z) under the standard normal.
prob = 1 - scipy.stats.norm.cdf(z)
print(prob)

## Solution-10

In this case, we are given that the population mean wait time is 15 minutes and the population standard deviation is 3 minutes. If we randomly select a sample of 50 customers, the sample mean wait time will have a normal distribution with a mean equal to the population mean (15) and a standard deviation equal to the population standard deviation divided by the square root of the sample size (3 / sqrt(50) ≈ 0.424).

We can use this information to calculate the probability that the sample mean wait time is greater than 16 minutes. This probability is equivalent to the probability that a standard normal variable Z is greater than (16 - 15) / 0.424 ≈ 2.36.

In [None]:
# What is the average and standard deviation for the salary in our data set? Show the distribution of the salary 
# (Histogram and kde both).

# Now verify the dataset mean and std from using central limit theorem using following sample and sample size:
# a) 200 samples of size 30
# b) 100 samples of size 50
# Plot samples means distribution and show horizontal line for both mean value: Dataset mean value and means of sample 
# mean.

df = pd.read_csv("ds_salaries.csv",index_col = [0])
salaries = df["salary_in_usd"]

# Dataset-level statistics that the sampling experiment below should recover.
mean = salaries.mean()
std_dev = salaries.std()
print(f"Dataset Mean Salary : {mean:0.2f}")
print(f"Dataset Std Dev of Salary : {std_dev:0.2f}")

# Distribution of the salaries themselves: histogram with a KDE curve on top.
sns.histplot(salaries,kde=True)
plt.title("Distribution of Salary")
plt.xlabel("Salary")
plt.ylabel("Frequency")
plt.show()

sample_sizes = [30,50]
total_samples = [200,100]

for sample_size,total_sample in zip(sample_sizes,total_samples):
    # Means of `total_sample` samples (drawn with replacement) of `sample_size` salaries each.
    list_of_sample_means = [
        salaries.sample(n=sample_size,replace=True).mean()
        for _ in range(total_sample)
    ]

    mean_of_sample_means = np.mean(list_of_sample_means)
    # CLT: std(sample means) is about std_dev / sqrt(sample_size), so scaling
    # the spread of the sample means by sqrt(sample_size) estimates std_dev.
    estimated_std_dev = np.std(list_of_sample_means,ddof=1) * np.sqrt(sample_size)
    print(f"{total_sample} Samples of {sample_size} -> Mean of Sample Means : {mean_of_sample_means:0.2f} (Dataset Mean : {mean:0.2f})")
    print(f"{total_sample} Samples of {sample_size} -> Std Dev Estimated via CLT : {estimated_std_dev:0.2f} (Dataset Std Dev : {std_dev:0.2f})")

    plt.hist(list_of_sample_means,bins=25)
    plt.title(f"Distribution of Mean of Salaries ({total_sample} Samples of {sample_size})")
    plt.xlabel("Salary")
    plt.ylabel("Frequency")
    plt.axvline(x=mean_of_sample_means,label='Sample Mean',color="green")
    # Reuse the dataset mean computed once above instead of recomputing it per plot.
    plt.axvline(x=mean,label='Population Mean',color="red")
    plt.legend(loc = "upper right")
    plt.show()