# 6  - Vectors

In [None]:
import numpy as np
import matplotlib.pyplot as plt 

def create_np_array(r, N, STD, size):
    empty_list = []
    for i in range(r):
        rand = np.random.normal(N, STD, size) 
        x = rand.mean()
        empty_list.append(x) 
    return np.array(empty_list) 
   
def plot_histogram(data):
    fig, ax = plt.subplots()
    for i in data:
        ax.hist(i[0], label = i[1], histtype = i[2])
    ax.legend() 
    plt.show()

x1 = create_np_array(100, 10, 5, 100)
x2 = create_np_array(100, 10, 20, 100)
plot_histogram([(x1, "x1 - 100", "step"), (x2, "x2 - 100", "step")])

x3 = create_np_array(1000, 10, 5, 100)
x4 = create_np_array(1000, 10, 20, 100)
plot_histogram([(x3, "x3 - 1,000", "step"), (x4, "x4- 1,000", "step")])

x5 = create_np_array(10000, 10, 5, 100)
x6 = create_np_array(10000, 10, 20, 100)
plot_histogram([(x5, "x5 - 10,000", "step"), (x6, "x6- 10,000", "step")])

# 7 - part1 - Confidence Intervals

In [1]:
#Calc_ci for calculating Conficdence Intervals
#returns tuple (lower_ci, upper_ci)
#Calc_relfreq 
#returns a double of the samples falling outside the CI

import numpy as np

def calc_ci(N, STD, Z):
    return((N-STD*Z), N+STD*Z)

def calc_relfreq(N , STD, size, Z):
    array =  np.random.normal(N, STD, size=size)
    ci_lower, ci_upper = calc_ci(N, STD, Z)
    count = 0
    for i in array:
        if(ci_lower < i < ci_upper):
            count += 1
    return 100-((count/len(array)*100))
        
rel_10 = calc_relfreq(10,3,10, 1.96)
rel_100 = calc_relfreq(10,3,100, 1.96)
rel_1000 = calc_relfreq(10,3,1000, 1.96)
rel_10000 = calc_relfreq(10,3,10000, 1.96)


# 7 - part1 - Confidence Intervals

In [46]:
#
import scipy.stats
import numpy as np

def calc_montecarlo(N, STD, size, confidence):
    monte_carlo =  np.random.normal(N, STD, size=size)
    monte_carlo_mean = monte_carlo.mean()
    monte_carlo_std = monte_carlo.std()
    dof = len(monte_carlo) - 1
    t_crit = np.abs(scipy.stats.t.ppf((1-confidence)/2, dof))
    lower_ci = monte_carlo_mean-monte_carlo_std/np.sqrt(len(monte_carlo))
    upper_ci = monte_carlo_mean+monte_carlo_std/np.sqrt(len(monte_carlo))
    count = 0
    for i in monte_carlo:
        if(lower_ci < i < upper_ci):
            count += 1
    real_freq = count /len(monte_carlo)*100
    return ((lower_ci), (upper_ci), (real_freq))

def calc_montecarlo_trials(N, STD, size, confidence, trials):
    for i in range(trials):
        lower_ci, upper_ci = calc_montecarlo(N, STD, size, confidence)
        print(lower_ci, upper_ci)
    

print(calc_montecarlo(10, 3, 1000000, 0.95))

    
    
    
#print(calc_montecarlo_trials(10, 3, 100, 0.95, 5))

(9.9911207041854, 9.997119149028295, 0.0843)


# 7 - part2 - Confidence Intervals for proportions

In [None]:
#How to calculate CI when we know:
#population and favourable_outcome || missing: standardDIV, x_bar
# N = number of samplesize
# x = favourable_outcome
# CL = Confidencelevel in fraction
# Z = from z-table; 95% = 1.9600, 90% = 1.6449
# return is a tuple (lower_ci, upper_ci)
# Calulator to check that the function works as expected || https://sample-size.net/confidence-interval-proportion/ 

import numpy as np
def calc_ci(N, x, CL, Z):
    SEM = np.sqrt((x*(N-x)/N**3))
    PPR = x/N
    return ((PPR-(Z*SEM)),(PPR+(Z*SEM)))

lower_ci, upper_ci = calc_ci(1000, 42, 0.90, 1.6449)
print(lower_ci)
print(upper_ci)

# 7 - part2 - Significant difference

In [None]:
#Calculate Significant difference
# p1 = number of favourable_outcome sample1
# n1 = number of samplesize sample1
# p2 = number of favourable_outcome sample2
# n2 = number of samplesize sample2
# return is a boolean 
# Z = from z-table, 95% CL = 1.96, 90% = 1.6449
# source | https://help.surveymonkey.com/en/analyze/significant-differences/

import numpy as np
def calc_significant_diff(p1, n1, p2, n2, Z):
    a1 = p1/n1*n1
    b1 = p2/n2*n2
    p = (p1+p2) / (n1+n2)
    SE = np.sqrt((p*(1-p)) * ((1/n1) +(1/n2)))
    t = (p1/n1-p2/n2) / SE
    if(t > Z):
        return True
    else:
        return False

calc_significant_diff(42, 1000, 45, 1000, 1.6449)