In [None]:
!pip install nestle corner emcee

# Imports

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy
import pandas
import scipy
import scipy.integrate
import scipy.stats

import Library_GraphTwoDimensionDensityColorMap
warnings.simplefilter('ignore')

# Read in the data:

In [None]:
# Load the generated samples: a space-delimited table whose first row holds the column names.
data = pandas.read_csv(
    "RandomVariable_Generated_Data.dat",
    sep=' ',
    header=0,
)
print(data)

# Some useful helper functions:

In [None]:
def plot_1D_function(
    Functions = None,
    minp = None,
    maxp = None,
    nump = None,
    Labels = None,
    ):
    """Plot one or more scalar functions of one variable on the current axes.

    Parameters
    ----------
    Functions : sequence of callables
        Each callable is evaluated one point at a time, so it does not need
        to accept arrays.
    minp, maxp : float
        Lower and upper ends of the sampled interval (both included).
    nump : int
        Number of evenly spaced sample points.
    Labels : sequence of (str or None), optional
        One legend label per function; a None entry leaves that curve
        unlabelled. Defaults to no labels at all.

    Raises
    ------
    ValueError
        If any of Functions, minp, maxp or nump is not supplied.
    """
    # Identity checks (not `None in [...]`) so array-like arguments are never
    # compared element-wise against None.
    Required = {'Functions': Functions, 'minp': minp, 'maxp': maxp, 'nump': nump}
    Missing = [Name for Name, Value in Required.items() if Value is None]
    if Missing:
        raise ValueError("arg missing..." + str(Missing))

    if Labels is None:
        Labels = [None] * len(Functions)

    # The sample grid is shared by every curve and honours the requested range.
    TrialPoints = numpy.linspace(minp, maxp, nump)

    AnyLabel = False
    for Label, Function in zip(Labels, Functions):
        Values = [Function(TrialPoint) for TrialPoint in TrialPoints]
        plt.plot(TrialPoints, Values, label=Label)
        if Label is not None:
            AnyLabel = True

    # Draw the legend once, and only if there is something to show in it.
    if AnyLabel:
        plt.legend()

    return

# Marginal Distributions:

### Make a histogram of the data from column A

In [None]:
# Normalized histogram of column A: with density=True the bars show a
# probability density (total area 1), not raw counts.
Adata = data['A']
plt.hist(Adata, density=True, bins=50, label='A')
plt.xlabel('A')
plt.ylabel('Probability Density')
plt.legend()
plt.show()

### Make a histogram of the data from column B

In [None]:
# Normalized histogram of column B: with density=True the bars show a
# probability density (total area 1), not raw counts.
Bdata = data['B']
plt.hist(Bdata, density=True, bins=50, label='B')
plt.xlabel('B')
plt.ylabel('Probability Density')
plt.legend()
plt.show()

### Approximate the data from column A as a univariate gaussian: (MARGINAL)

In [None]:
# Fit a univariate Gaussian to column A by matching its mean and standard deviation.
A_standard_deviation = numpy.sqrt( numpy.var( Adata ) )
A_mean = numpy.mean(Adata)

# Evaluate the fitted density on a fixed grid for plotting.
ATrialPoints = numpy.linspace(-20, 20, 100)
AValuePoints = scipy.stats.norm.pdf(ATrialPoints, loc = A_mean, scale = A_standard_deviation )

# Overlay the fit on the normalized histogram; both are probability densities.
plt.hist(Adata, density=True, bins=50,label='Binned A')
plt.plot(ATrialPoints, AValuePoints,label='Gaussian A')
plt.xlabel('A')
plt.ylabel('Probability Density')
plt.legend()
plt.show()

### Approximate the data from column B as a univariate gaussian: (MARGINAL)

In [None]:
# STUDENTS WILL WORK ON THIS CELL
# Fit a univariate Gaussian to column B by matching its mean and standard deviation.
B_standard_deviation = numpy.sqrt( numpy.var( Bdata ) )
B_mean = numpy.mean(Bdata)

# Evaluate the fitted density on a fixed grid for plotting.
BTrialPoints = numpy.linspace(-20, 20, 100)
BValuePoints = scipy.stats.norm.pdf(BTrialPoints, loc = B_mean, scale = B_standard_deviation )

# Overlay the fit on the normalized histogram; both are probability densities.
plt.hist(Bdata, density=True, bins=50,label='Binned B')
plt.plot(BTrialPoints, BValuePoints,label='Gaussian B')
plt.xlabel('B')
plt.ylabel('Probability Density')
plt.legend()
plt.show()

# Conditional distributions

### (1) Approximate the joint probability density function of A and B with a multivariate gaussian


In [None]:

# One row per sample, columns ordered (A, B).
ABdata = numpy.vstack((data['A'], data['B'])).T
print(ABdata)

# Per-column sample mean.
ABMean = ABdata.mean(axis=0)
print(ABMean)

# Covariance between the columns; rowvar=False because each column is a variable.
ABCovarianceMatrix = numpy.cov(ABdata, rowvar=False)
print(ABCovarianceMatrix)

def jointGaussian(ABpoint):
    """Density of the fitted multivariate Gaussian (ABMean, ABCovarianceMatrix) at ABpoint."""
    return scipy.stats.multivariate_normal.pdf(
        ABpoint,
        mean = ABMean,
        cov = ABCovarianceMatrix,
    )
# Spot-check the fitted density at the origin.
print(jointGaussian([0, 0]))

# Colour map with contours of the fitted joint density.
# NOTE(review): the domain corners are presumably given in (A, B) order,
# matching jointGaussian's argument -- confirm against the plotting library.
Library_GraphTwoDimensionDensityColorMap.Main(
    Function=jointGaussian,
    DomainMinimumPoint=numpy.array([-5, -10]),
    DomainMaximumPoint=numpy.array([10, 15]),
    ShowContours=True,
    PrintExtra=False,
    #PlotThreeDimensional = True
)
plt.title("Probability Density of A & B", fontsize=40)
plt.xlabel('A',fontsize=40)
plt.ylabel('B',fontsize=40)
plt.draw()

### (1)  Fixing A = 0, plot the unnormalized conditional probability density of B:
P(B|A)
=====


In [None]:
def BdensityConditionalOnA0_unnormalized(Bpoint):
    """Joint Gaussian density along the slice A = 0, as a function of B (not normalized over B)."""
    SlicePoint = [0, Bpoint]
    return jointGaussian( SlicePoint )


# Draw the A = 0 slice of the joint density against B.
plot_1D_function(
    Functions=[BdensityConditionalOnA0_unnormalized],
    minp=-20, maxp=20, nump=100,
    Labels=['P(B|A=0)'],
)
plt.xlabel('B')
plt.ylabel('P(B|A)')
plt.legend()
plt.show()

### (1) Fixing A = 0, plot the normalized probability density function of B:


In [None]:
# Area under the A = 0 slice over B in [-100, 100]; quad returns a tuple whose
# first entry is the integral, which becomes the normalization constant.
FullIntegrationResult = scipy.integrate.quad(
    BdensityConditionalOnA0_unnormalized, -100, 100, full_output=False
)[0]
print(FullIntegrationResult)

def BdensityConditionalOnA0_normalized(Bpoint):
    """The A = 0 slice of the joint density divided by its integral over B."""
    Unnormalized = BdensityConditionalOnA0_unnormalized(Bpoint)
    return Unnormalized / FullIntegrationResult



In [None]:
# Compare the raw slice with its normalized version on the same axes.
plot_1D_function(
    Functions=[
        BdensityConditionalOnA0_unnormalized,
        BdensityConditionalOnA0_normalized,
    ],
    minp=-20, maxp=20, nump=100,
    Labels=['UnNormed', 'Normed'],
)
plt.xlabel('B')
plt.ylabel('P(B|A)')
plt.legend()
plt.show()


### (1) Fixing A = 1, 3, 5, and 7, plot the unnormalized conditional probability density of B:


In [None]:
# STUDENTS WILL WORK ON THIS CELL

# Values of A at which the joint density is sliced.
A_fixed_values = [1, 3, 5, 7]

def generateBdensityConditionalOnA_unormalized(Afixed):
    """Return a function of B giving the joint Gaussian density along the slice A = Afixed."""
    def BdensityConditionalOnA_unormalized(Bpoint):
        # Afixed is captured from the enclosing call, so each generated
        # function keeps its own slice position.
        SlicePoint = [Afixed, Bpoint]
        return jointGaussian( SlicePoint )
    return BdensityConditionalOnA_unormalized

# One slice function per fixed value of A.
function_list = [
    generateBdensityConditionalOnA_unormalized(Afixed)
    for Afixed in A_fixed_values
]

plot_1D_function(
    Functions=function_list,
    minp=-20, maxp=20, nump=100,
    Labels=['P(B|A=%i)'%Afixed for Afixed in A_fixed_values],
)
plt.xlabel('B')
plt.ylabel('P(B|A)')
plt.legend()
plt.show()


### (1) Fixing B = 0 plot the unnormalized probability density of A:


In [None]:
def AdensityConditionalOnB0_unnormalized(Apoint):
    """Joint Gaussian density along the slice B = 0, as a function of A (not normalized over A)."""
    SlicePoint = [Apoint, 0]
    return jointGaussian( SlicePoint )


# Draw the B = 0 slice of the joint density against A.
plot_1D_function(
    Functions=[AdensityConditionalOnB0_unnormalized],
    minp=-20, maxp=20, nump=100,
    Labels=['P(A|B=0)'],
)
plt.xlabel('A')
plt.ylabel('P(A|B)')
plt.legend()
plt.show()


### (1) Fixing B = 7, plot the unnormalized probability density of A:


In [None]:
def AdensityConditionalOnB7_unnormalized(Apoint):
    """Joint Gaussian density along the slice B = 7, as a function of A (not normalized over A)."""
    SlicePoint = [Apoint, 7]
    return jointGaussian( SlicePoint )


# Draw the B = 7 slice of the joint density against A.
plot_1D_function(
    Functions=[AdensityConditionalOnB7_unnormalized],
    minp=-20, maxp=20, nump=100,
    Labels=['P(A|B=7)'],
)
plt.xlabel('A')
plt.ylabel('P(A|B)')
plt.legend()
plt.show()

### (2) Approximate the joint probability density function of A and B with a kernel density estimation


In [None]:
# gaussian_kde wants one row per dimension, so pass the (A, B) samples transposed.
kernel_object = scipy.stats.gaussian_kde(ABdata.T)

# Shorthand for evaluating the estimated joint density.
kernel_pdf = kernel_object.pdf

Library_GraphTwoDimensionDensityColorMap.Main(
    Function=kernel_pdf,
    DomainMinimumPoint=numpy.array([-10, -10]),
    DomainMaximumPoint=numpy.array([20, 20]),
    ShowContours=True,
    PluginPointCount=10000,
    PrintExtra=False,
)
plt.xlabel("A",fontsize=14)
plt.ylabel("B",fontsize=14)
plt.show()

### (2) Approximate the conditional probability density function of B, fixing A to 5:


In [None]:
# STUDENTS WILL WORK ON THIS CELL
def BdensityConditionalOnA5_unnormalized_kde(Bpoint):
    """KDE joint density along the slice A = 5, as a function of B (not normalized over B)."""
    # The KDE was fitted on (A, B) samples, so A is the first coordinate:
    # hold it at 5 and let B vary.
    return (kernel_pdf( [ 5, Bpoint]) )


plot_1D_function( 
    Functions = [BdensityConditionalOnA5_unnormalized_kde],
    minp = -20,
    maxp = 20,
    nump = 100,
    Labels = ['P(B|A=5)']
    )
plt.ylabel('P(B|A)')
plt.xlabel('B')
plt.legend()
plt.show()

### (2) Approximate the conditional probability density function of A, fixing B to 5:


In [None]:
def AdensityConditionalOnB5_unnormalized_kde(Apoint):
    """KDE joint density along the slice B = 5, as a function of A (not normalized over A)."""
    # The KDE was fitted on (A, B) samples, so B is the second coordinate:
    # hold it at 5 and let A vary.
    return (kernel_object.pdf( [ Apoint, 5]) )


plot_1D_function( 
    Functions = [AdensityConditionalOnB5_unnormalized_kde],
    minp = -20,
    maxp = 20,
    nump = 100,
    Labels = ['P(A|B=5)']
    )
plt.ylabel('P(A|B)')
plt.xlabel('A')
plt.legend()
plt.show()

## Note how the double-peaked nature of the distribution could be missed by assuming Gaussianity


# Final Exercise: Code up your own 1D kernel density estimation function against the data in Column A:

In [None]:
def Triangle_Kernel( Point, Mean, StandardDeviation):
    """Triangular kernel centred on Mean with half-width StandardDeviation.

    Returns 0 outside [Mean - StandardDeviation, Mean + StandardDeviation],
    rises linearly to 1/StandardDeviation at Mean and falls linearly back to
    0, so the triangle has unit area. Arguments are scalars.

    If the comparisons cannot place Point in any region (e.g. Point is NaN),
    NaN is returned so the result stays numeric rather than becoming None.
    """
    LowerEdge = Mean - StandardDeviation
    UpperEdge = Mean + StandardDeviation

    if Point < LowerEdge or Point > UpperEdge:
        return 0
    if LowerEdge <= Point <= Mean:
        # Rising edge.
        return (Point-Mean)/StandardDeviation**2 + 1/StandardDeviation
    if Mean <= Point <= UpperEdge:
        # Falling edge.
        return -(Point-Mean)/StandardDeviation**2 + 1/StandardDeviation
    # Every comparison was False: the input is not orderable (NaN).
    return float('nan')

# Compare the triangular kernel (half-width 1) with a Gaussian (standard deviation 0.5), both centred on 5.
plt.plot(
    numpy.arange(0,10,.01),
    [Triangle_Kernel(x,5,1) for x in numpy.arange(0,10,.01)],
    label='Triangle',
)
plt.plot(
    numpy.arange(0,10,.01),
    scipy.stats.norm.pdf( numpy.arange(0,10,.01), 5, .5 ),
    label="Gaussian",
)
plt.legend()
plt.xlabel('A')
plt.ylabel('Probability Density')
plt.show()

def GenerateKernelDensityEstimationFunction1D_Triangle( Data ):
    """Build a 1D kernel density estimate of Data using Triangle_Kernel.

    The bandwidth is the square root of numpy.var(Data) divided by 6. It
    depends only on Data, so it is computed once here instead of on every
    evaluation of the returned function.

    Returns a function KDE(Value) that averages the kernels centred on each
    data point, evaluated at the scalar Value.
    """
    Bandwidth = numpy.sqrt( numpy.var(Data) ) / 6 #Bandwidth calculation

    def KDE(  Value  ):
        Result = 0
        for Datapoint in Data:
            Result += Triangle_Kernel( Value, Datapoint, Bandwidth )
        Probability = Result / len(Data)
        return Probability

    return KDE

In [None]:
# Hand-rolled triangular-kernel KDE of column A versus scipy's Gaussian KDE.
Personal_KDE_Function_triangle = GenerateKernelDensityEstimationFunction1D_Triangle(Adata)
scipy_KDE_Function = scipy.stats.gaussian_kde(Adata).pdf

plot_1D_function(
    Functions=[Personal_KDE_Function_triangle, scipy_KDE_Function],
    minp=-20, maxp=20, nump=100,
    Labels=['Personal_Triangle', 'Scipy'],
)
plt.xlabel('A')
plt.ylabel('P(A)')
plt.legend()
plt.show()


In [None]:
# STUDENTS WILL WORK ON THIS CELL
def UnivariateGaussian( Point, Mean, StandardDeviation):
    """Density at Point of a normal distribution with the given Mean and StandardDeviation."""
    return scipy.stats.norm.pdf( Point, loc = Mean, scale = StandardDeviation )


def GenerateKernelDensityEstimationFunction1D( Data ):
    """Build a 1D kernel density estimate of Data using UnivariateGaussian kernels.

    The bandwidth is the square root of numpy.var(Data) divided by 6. It
    depends only on Data, so it is computed once here instead of on every
    evaluation of the returned function.

    Returns a function KDE(Value) that averages the kernels centred on each
    data point, evaluated at Value.
    """
    Bandwidth = numpy.sqrt( numpy.var(Data) ) / 6

    def KDE(  Value  ):
        Result = 0
        for Datapoint in Data:
            Result += UnivariateGaussian( Value, Datapoint, Bandwidth )
        Probability = Result / len(Data)
        return Probability

    return KDE
    
    
    

In [None]:
# Hand-rolled Gaussian-kernel KDE of column A versus scipy's Gaussian KDE.
Personal_KDE_Function = GenerateKernelDensityEstimationFunction1D(Adata)
scipy_KDE_Function = scipy.stats.gaussian_kde(Adata).pdf

plot_1D_function(
    Functions=[Personal_KDE_Function, scipy_KDE_Function],
    minp=-20, maxp=20, nump=100,
    Labels=['Personal', 'Scipy'],
)
plt.xlabel('A')
plt.ylabel('P(A)')
plt.legend()
plt.show()

# Discuss choice of bandwidth, delta functions, and understand the overfitting problem:

In [None]:
# STUDENTS WILL WORK ON THIS CELL
def GenerateKernelDensityEstimationFunction1D_bad( Data ):
    """Build a deliberately over-fitted 1D Gaussian KDE of Data.

    The bandwidth is the square root of numpy.var(Data) divided by 100, far
    narrower than the /6 used by the other estimators in this notebook. It
    depends only on Data, so it is computed once here instead of on every
    evaluation of the returned function.

    Returns a function KDE(Value) that averages the UnivariateGaussian
    kernels centred on each data point, evaluated at Value.
    """
    Bandwidth = numpy.sqrt( numpy.var(Data) ) / 100

    def KDE(  Value  ):
        Result = 0
        for Datapoint in Data:
            Result += UnivariateGaussian( Value, Datapoint, Bandwidth )
        Probability = Result / len(Data)
        return Probability

    return KDE

# Over-fitted (tiny bandwidth) KDE of column A versus scipy's Gaussian KDE.
Personal_KDE_Function_bad = GenerateKernelDensityEstimationFunction1D_bad( Adata )

scipy_KDE_Function = scipy.stats.gaussian_kde( Adata ).pdf

plot_1D_function( 
    Functions = [Personal_KDE_Function_bad, scipy_KDE_Function],
    minp = -20,
    maxp = 20,
    nump = 100,
    Labels = ['Personal_Bad','Scipy']
    )
# A runs along the horizontal axis and the density along the vertical one,
# as in the other KDE comparison plots.
plt.ylabel('P(A)')
plt.xlabel('A')
plt.legend()
plt.show()
