<h1> Probability Distribution Function </h1>

In [None]:
# Importing Packages

import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from numpy.random import normal
from scipy.stats import norm
from scipy import integrate
import seaborn as sns
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV

In [None]:
# Create Series
# Simulate 10,000 rolls of two fair dice and plot the empirical PMF of their sum.

L = [random.randint(1, 6) + random.randint(1, 6) for _ in range(10000)]

# normalize=True divides every count by the total, giving relative frequencies;
# sort_index orders the bars by the value of the sum.
s = pd.Series(L).value_counts(normalize=True).sort_index()

s.plot(kind="bar")

In [None]:
# Plot Cumulative

# The running total of the relative frequencies is the empirical CDF of the sum
s.cumsum().plot(kind="bar")

<h3> Parametric Density Estimation </h3>

In [None]:
# Draw 1000 observations from a normal distribution with mean 50 and sd 5
sample = normal(50, 5, 1000)

# Plot Histogram To Understand The Distribution Of Data

fig, ax = plt.subplots()
ax.hist(sample, bins=10)
plt.show()

In [None]:
# Calculate Sample Mean And Sample Standard Deviation

# np.std uses ddof=0 by default, matching ndarray.std()
sample_mean = np.mean(sample)
sample_std = np.std(sample)

In [None]:
# Fit The Distribution

# Parametrise the normal with the moments estimated from the sample (not with
# hard-coded guesses) so the fitted curve actually matches the data.
dist = norm(sample_mean, sample_std)
values = np.linspace(sample.min(), sample.max(), 100)
# The frozen distribution's pdf is vectorised: evaluate the whole grid in one call.
probabilities = dist.pdf(values)

In [None]:
# Plot The Histogram And PDF

# density=True rescales the histogram so it is comparable with the PDF curve
plt.hist(sample, bins=10, density=True)
plt.plot(values, probabilities)
plt.show()

# sns.distplot is deprecated; histplot with a KDE overlay on the density scale
# is the replacement recommended by seaborn.
sns.histplot(sample, kde=True, stat="density", kde_kws=dict(cut=3))

<h3> Kernel Density Estimation </h3>

In [None]:
# Generate Sample

# Two normal components (sd 5 each): 300 draws centred at 20, 700 centred at 40
sample1 = normal(20, 5, 300)
sample2 = normal(40, 5, 700)

# Join both components into one 1-D sample
sample = np.concatenate([sample1, sample2])

In [None]:
# Plot Histogram

fig, ax = plt.subplots()
ax.hist(sample, bins=50)
plt.show()

In [None]:
# Fit The Distribution

model = KernelDensity(kernel="gaussian", bandwidth=5)

# Convert Data To 2D Array (one column, one row per observation)
sample = sample.reshape(-1, 1)
model.fit(sample)

# Evaluation grid over the observed range, also shaped as a single column
values = np.linspace(sample.min(), sample.max(), 100).reshape(-1, 1)
# score_samples yields log-densities; exponentiate to obtain densities
probabilities = np.exp(model.score_samples(values))

`score_samples(values)` returns the logarithm of the estimated probability density at each point in `values`, not the density itself, so the result is passed through `np.exp` to recover the actual density estimate.

In [None]:
# Plot The Histogram

# Density-scaled histogram with the fitted KDE curve on top
plt.hist(sample, bins=50, density=True)
plt.plot(values, probabilities)
plt.show()

# Flatten the column vector back to 1-D for seaborn; ravel() works for any
# sample size, unlike a hard-coded reshape(1000).
# bw_adjust multiplies seaborn's default bandwidth, so 0.02 gives a very rough curve.
sns.kdeplot(sample.ravel(), bw_adjust=0.02)

In [None]:
# Importing Dataset

# Fetch seaborn's "iris" example dataset as a DataFrame
iris = sns.load_dataset(name="iris")

In [None]:
# Plot Kde Plot
# Density of sepal_length, one curve per species

sns.kdeplot(x="sepal_length", hue="species", data=iris)

In [None]:
# Plot Kde Plot
# Density of sepal_width, one curve per species

sns.kdeplot(x="sepal_width", hue="species", data=iris)

In [None]:
# Plot Kde Plot
# Density of petal_length, one curve per species

sns.kdeplot(x="petal_length", hue="species", data=iris)

In [None]:
# Plot Kde Plot
# Density of petal_width, one curve per species

sns.kdeplot(x="petal_width", hue="species", data=iris)

In [None]:
# Plot Kdeplot And Ecdf Plot
# No ax is passed, so both plots are drawn on the current axes and overlap:
# per-species density curves plus per-species empirical CDFs of petal_width.

sns.kdeplot(x="petal_width", hue="species", data=iris)
sns.ecdfplot(x="petal_width", hue="species", data=iris)

In [None]:
# Importing Dataset

# Fetch seaborn's "titanic" example dataset as a DataFrame
titanic = sns.load_dataset(name="titanic")

In [None]:
# Plot Kdeplot
# Density of passenger age, one curve per sex

sns.kdeplot(x="age", hue="sex", data=titanic)

In [None]:
# Plot Joint Plot
# Filled 2-D KDE of petal_length vs sepal_length with marginal densities

sns.jointplot(
    data=iris,
    x="petal_length",
    y="sepal_length",
    kind="kde",
    fill=True,
    cbar=True,
)

In [None]:
# Plot Kde Plot
# Density of passenger age over the whole dataset

sns.kdeplot(x="age", data=titanic)

In [None]:
# Plot Kdeplot
# Standardise age to z-scores (subtract the mean, divide by the std) and plot the density

age = titanic["age"]
x = (age - age.mean()) / age.std()
sns.kdeplot(x)

In [None]:
# Detect Outlier Using Normal Distribution

# Under a normal model about 99.7% of values fall within 3 standard deviations
# of the mean, so anything outside [xmin, xmax] is flagged as an outlier.
age = titanic["age"]
xmin = age.mean() - 3 * age.std()
xmax = age.mean() + 3 * age.std()

# Use the computed bounds rather than pasted constants, and OR the two tails:
# an outlier is either above xmax or below xmin.
titanic[(age > xmax) | (age < xmin)]

In [None]:
# Skewness Of Data

# skipna=True (the pandas default) leaves missing ages out of the calculation
titanic["age"].skew(skipna=True)

<h3> Question on Probability Distribution Function </h3>

In [None]:
# Read the insurance data from the working directory and summarise its columns
insurance = pd.read_csv(filepath_or_buffer="InsuranceData.csv")
insurance.info()

In [None]:
# 1. What is the probability distribution of age in the insurance dataset?

# Drop missing ages once so the histogram and the KDE are built from the same
# observations (the original only removed NaNs for the KDE fit).
ages_observed = insurance["age"].dropna()

plt.hist(ages_observed, density=True)
# Gaussian KDE with bandwidth 2; scikit-learn needs a 2-D (n, 1) array
kde = KernelDensity(kernel="gaussian", bandwidth=2).fit(ages_observed.to_numpy().reshape(-1, 1))
age_range = np.linspace(ages_observed.min(), ages_observed.max(), 1000)
# score_samples returns log-densities, hence the exp
pdf = np.exp(kde.score_samples(age_range.reshape(-1, 1)))
plt.plot(age_range, pdf)
plt.title("Age PDF")
plt.show()

In [None]:
# 2. What is the probability of a patient having a BMI greater than 30?

# Count the rows whose BMI exceeds 30, then divide by the total number of rows
patient_bmi_greater_than_30 = int((insurance["bmi"] > 30).sum())
print("Probability of patient having a BMI greater than 30 is ", round(patient_bmi_greater_than_30 / len(insurance), 2))

In [None]:
# 3. Plot distribution plot of claim for Smoker and non smoker. What changes you see in the plot?

# One density curve of claim per value of the smoker column
sns.kdeplot(data=insurance, x="claim", hue="smoker")

- Claims of non-smokers are lower than those of smokers.
- Most non-smoker claims lie in the range 0-15000, while smokers' claims are mostly greater than 15000.

In [None]:
# 4. Plot the 2D probability density plot of claim and age in the insurance dataset?

# Filled bivariate KDE: age on the x axis, claim on the y axis
sns.kdeplot(data=insurance, x="age", y="claim", fill=True, thresh=0.05, cmap="Blues")

In [None]:
# 5. How does the distribution of claim change for different regions? Plot density plot and note down the observations

# One density curve of claim per region
sns.kdeplot(data=insurance, x="claim", hue="region")

- Except for northeast, all regions have their highest density around the same claim amount.
- All regions follow the same trend in claims: a global peak around 6000-10000 and a local peak around 4000.

In [None]:
# 6. Plot PDF and CDF of claim in insurance data

# Approach1 - Histogram approach to see which distribution the data follows
fig, ax = plt.subplots()
ax.hist(insurance["claim"], bins=30, density=True, alpha=0.5)
plt.show()

- Since the histogram above does not look normally distributed, we use a non-parametric approach.

In [None]:
# Two panels side by side: KDE-based PDF on the left, its cumulative version on the right
fig, (ax_pdf, ax_cdf) = plt.subplots(1, 2, figsize=(10, 4))

# plot pdf using KDE
sns.kdeplot(insurance["claim"], ax=ax_pdf)
ax_pdf.set_xlabel("Claim")
ax_pdf.set_ylabel("Density")
ax_pdf.set_title("Probability Density Function (PDF)")

# plot cdf using the cumulative form of the KDE
sns.kdeplot(insurance["claim"], cumulative=True, ax=ax_cdf)
ax_cdf.set_xlabel("Claim")
ax_cdf.set_ylabel("Cumulative Probability")
# The correct name is cumulative *distribution* function
ax_cdf.set_title("Cumulative Distribution Function (CDF)")

fig.tight_layout()
plt.show()

In [None]:
# 7. Given a probability density function f(x) = 2x for 0 <= x <= 1 and f(x) = 0 otherwise, compute the cumulative 
#distribution function F(x) and plot it. Use this to find the probability that X is greater than 0.5.

def pdf(x):
    """Return the density f(x) = 2x for 0 <= x <= 1 and 0 for any other x.

    x is a single scalar; the chained comparison does not work on arrays.
    """
    return 2 * x if 0 <= x <= 1 else 0

def cdf(x):
    """Return F(x) = P(X <= x) for the density f(x) = 2x on [0, 1].

    F(x) is 0 below the support, x**2 on [0, 1] (the integral of 2t from 0 to x),
    and 1 above the support. x is a single scalar.
    """
    if x < 0:
        # No probability mass lies below 0 (the original returned 1 here).
        return 0
    if x <= 1:
        return x ** 2
    return 1

# 100 evenly spaced points on [0, 1]
x = np.linspace(0, 1, 100)
# cdf() is called once per grid point
y_cdf = np.array(list(map(cdf, x)))

fig, ax = plt.subplots()
ax.plot(x, y_cdf)
ax.set_xlabel('x')
ax.set_ylabel('F(x)')
ax.set_title("Cumulative Distribution Function")
ax.grid(True)
plt.show()

# P(X > 0.5) is the complement of F(0.5)
print("Probability that X is greater than 0.5 is ",1 - cdf(0.5))

In [None]:
# 8. In a manufacturing process, the thickness of a certain material is known to be normally distributed with a mean of 
# 1.2mm and a standard deviation of 0.05 mm. What is the probability density function of the thickness? Plot the PDF and 
# use it to compute the probability that the thickness is between 1.1 mm and 1.3 mm.

def pdf(x,mean,std_dev):
    """Evaluate the normal density with the given mean and std_dev at x.

    Computes 1 / (std_dev * sqrt(2*pi)) * exp(-(x - mean)^2 / (2 * std_dev^2)).
    Built from numpy operations, so x may be a scalar or a numpy array.
    """
    coefficient = 1 / (std_dev * np.sqrt(2 * np.pi))
    exponent = -((x - mean) ** 2) / (2 * std_dev ** 2)
    return coefficient * np.exp(exponent)

# Parameters from the question: mean 1.2 mm, standard deviation 0.05 mm
mean, std_dev = 1.2, 0.05

# Grid of 100 points from 1.0 to 1.4
x = np.linspace(1.0, 1.4, 100)
y_pdf = pdf(x, mean, std_dev)

fig, ax = plt.subplots()
ax.plot(x, y_pdf)
ax.set_xlabel("Thickness")
ax.set_ylabel("f(x)")
ax.set_title("Probability Density Function (PDF) of Thickness")
ax.grid(True)
plt.show()

# Integrate the density numerically over [1.1, 1.3]; quad returns (value, error estimate)
probability, _ = integrate.quad(pdf, 1.1, 1.3, args=(mean, std_dev))
print("Probability that the thickness is between 1.1mm and 1.3mm is ",round(probability,2))

In [None]:
# 9. A data scientist is investigating the distribution of customer ages in a retail store. She collects a sample of 100 
# ages and estimates the probability density function using kernel density estimation. What bandwidth should she choose to 
# obtain the best estimate?

# A seeded generator makes the simulated ages, and so the selected bandwidth,
# identical on every run.
rng = np.random.default_rng(42)
ages = rng.normal(loc=40, scale=10, size=100)
# 100 candidate bandwidths, log-spaced between 0.1 and 10
bandwidths = np.logspace(-1, 1, 100)
params = {"bandwidth": bandwidths, "kernel": ["gaussian"]}

# 5-fold cross-validation over the candidates, scored with KernelDensity's own score method
grid = GridSearchCV(KernelDensity(), params, cv=5)
grid.fit(ages.reshape(-1, 1))

# The printed value is the winning bandwidth, so the label says so
best_bandwidth = grid.best_params_["bandwidth"]
print("Best bandwidth : ", best_bandwidth)

Choosing the best bandwidth for KDE involves finding a balance between overfitting and underfitting. If the bandwidth is too small, the estimated PDF may have a lot of small, spurious oscillations or noise, which may not accurately represent the underlying distribution of the data. On the other hand, if the bandwidth is too large, the estimated PDF may be overly smooth and may not capture the finer details or variations in the data.

There are several methods that can be used to select the optimal bandwidth for KDE, including cross-validation, rule-of-thumb methods (e.g., Scott's rule, Silverman's rule), and optimization techniques (e.g., maximum likelihood estimation).

One common rule-of-thumb method for choosing the bandwidth in KDE is Scott's rule, which is given by:

h = 1.06 * sigma * n^(-1/5)

where h is the bandwidth, sigma is the standard deviation of the data, and n is the number of data points in the sample. Scott's rule is often used as a default bandwidth choice in many KDE implementations.

In [None]:
# Scott rule bandwidth
# h = 1.06 * sigma * n^(-1/5)
# sigma and n come from the observed sample (sample standard deviation and
# sample size), not from the parameters used to simulate it.
sigma_hat = ages.std(ddof=1)
h = 1.06 * sigma_hat * len(ages) ** (-1 / 5)
print("Scott rule bandwidth:", h)