## Chi Square Goodness Of Fit Test On Uniform Distribution

In [None]:
# Importing Basic Packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
from scipy.stats import norm,uniform,poisson,chisquare,chi2
from statsmodels.graphics.gofplots import qqplot

In [None]:
# Load the uniform sample into `df`; later cells read its "obs" column.

df = pd.read_csv(filepath_or_buffer="UniformData.csv")

In [None]:
# Preview the first five rows of the data.

df.head(n=5)

In [None]:
# Show the (number of rows, number of columns) of the loaded data.

df.shape

In [None]:
# Histogram of the observed values (matplotlib's default binning), with a
# title and axis labels so the figure can be read on its own.

fig, ax = plt.subplots()
ax.hist(df["obs"])
ax.set(title="Histogram Of obs (UniformData.csv)", xlabel="obs", ylabel="Frequency")
plt.show()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the data.

df.describe()

In [None]:
# Additional descriptive statistics for "obs": variance, skewness and
# kurtosis as computed by pandas, each rounded to two decimals.
# The pairs are printed one per line in label order; the previous
# set(zip(...)) had no defined order, so the output could change between runs.

stats_labels = ["Variance Observed","Skew Observed","Kurt Observed"]
stats_values = [round(df["obs"].var(),2),round(df["obs"].skew(),2),round(df["obs"].kurt(),2)]
for label, value in zip(stats_labels, stats_values):
    print(f"{label} : {value}")

In [None]:
# Q-Q plot of "obs" against a uniform distribution whose parameters are
# fitted to the data (fit=True); line="45" adds the 45-degree reference line.
# Q-Q plots amplify differences in the tails.

qqplot(data=df["obs"], dist=uniform, fit=True, line="45")
plt.show()

In [None]:
# Probability plot of "obs" against the uniform distribution, drawn on the
# current pyplot figure.
# NOTE(review): scipy.stats.probplot plots the ordered sample values against
# theoretical quantiles, so this is a quantile-based plot rather than a true
# P-P plot (which is the kind that amplifies differences in the center).

stats.probplot(x=df["obs"], dist=uniform, plot=plt)
plt.show()

*Null Hypothesis : The given data follows Uniform Distribution*<br>
*Alternate Hypothesis : The given data does not follow Uniform Distribution.*

In [None]:
# Step 1 of the chi-square goodness-of-fit test: cut "obs" into 10
# equal-width intervals and store each row's interval in a "bins" column.

df["bins"] = pd.cut(x=df["obs"], bins=10)
df

In [None]:
# Step 2 of the chi-square goodness-of-fit test: count the number of
# observations in each bin.
# The "obs" column is counted explicitly, so the result is a single Series
# no matter how many other columns `df` carries (assigning the whole
# groupby().count() frame to one column only works when exactly one other
# column exists).
# observed=False keeps bins that contain no observations (count 0).

df_2 = (
    df.groupby("bins", observed=False)["obs"]
    .count()
    .to_frame("Observed Frequency")
)
df_2

In [None]:
# Step 3 of the chi-square goodness-of-fit test: under the uniform null
# hypothesis every bin is expected to hold the same number of observations,
# i.e. the total observed count divided by the number of bins.

mean = df_2["Observed Frequency"].sum() / len(df_2)
df_2["Expected Frequency"] = mean
df_2

In [None]:
# Step 4A of the chi-square goodness-of-fit test: compute the test statistic
# and the p-value.
# ddof=2 makes the p-value use (number of bins - 1 - 2) degrees of freedom,
# the same count Step-4B uses for the tabulated value. With the default
# ddof=0 the p-value would be based on (number of bins - 1) instead.
# NOTE(review): a p-value quoted in the notebook's prose was presumably
# produced with the default ddof; re-read it from this cell's output.

chisquare(f_obs=df_2["Observed Frequency"], f_exp=df_2["Expected Frequency"], ddof=2)

*Conclusion*<br>
*Reject The Null If The P-value < Level Of Significance*<br>
*Do Not Reject The Null If The P-value >= Level Of Significance*<br>
*Say The Level Of Significance Is 5%. Then, For The Given Data Set, P-value (0.694) > 0.05 --> Do Not Reject The Null*<br>
*Note : For Any Level Of Significance Up To About 69% (The P-value), We Do Not Reject The Null For The Current Data Set.*<br>

In [None]:
# Step 4B of the chi-square goodness-of-fit test: tabulated (critical)
# chi-square value at the 95th percentile with
# (number of bins - 2 - 1) degrees of freedom.

chi2.ppf(q=0.95, df=len(df_2) - 2 - 1)

*Conclusion*<br>
*Reject The Null If The Tabulated Test Statistic Value < Computed Test Statistics Value*<br>
*Do Not Reject The Null If The Tabulated Test Statistics Value >=  Computed Test Statistics Value*

## Chi Square Goodness Of Fit Test On Poisson Distribution

In [None]:
# Load the Poisson sample into `df` (this replaces the uniform data).

df = pd.read_csv(filepath_or_buffer="PoissonData1.csv")

In [None]:
# Preview the first five rows of the data.

df.head(n=5)

In [None]:
# Show the (number of rows, number of columns) of the loaded data.

df.shape

In [None]:
# Histogram of the observed counts (matplotlib's default binning), with a
# title and axis labels so the figure can be read on its own.

fig, ax = plt.subplots()
ax.hist(df["obs"])
ax.set(title="Histogram Of obs (PoissonData1.csv)", xlabel="obs", ylabel="Frequency")
plt.show()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the data.

df.describe()

In [None]:
# Additional descriptive statistics for "obs": variance, skewness and
# kurtosis as computed by pandas, each rounded to two decimals.
# The pairs are printed one per line in label order; the previous
# set(zip(...)) had no defined order, so the output could change between runs.

stats_labels = ["Variance Observed","Skew Observed","Kurt Observed"]
stats_values = [round(df["obs"].var(),2),round(df["obs"].skew(),2),round(df["obs"].kurt(),2)]
for label, value in zip(stats_labels, stats_values):
    print(f"{label} : {value}")

In [None]:
# Q-Q plot of "obs" against a Poisson distribution whose rate is the sample
# mean; line="45" adds the 45-degree reference line.
# Q-Q plots amplify differences in the tails.

qqplot(data=df["obs"], dist=poisson(df["obs"].mean()), line="45")
plt.show()

In [None]:
# Probability plot of "obs" against a Poisson distribution whose rate is the
# sample mean, drawn on the current pyplot figure.
# NOTE(review): scipy.stats.probplot plots the ordered sample values against
# theoretical quantiles, so this is a quantile-based plot rather than a true
# P-P plot (which is the kind that amplifies differences in the center).

stats.probplot(x=df["obs"], sparams=(df["obs"].mean(),), dist="poisson", plot=plt)
plt.show()

*Null Hypothesis : The given data follows Poisson Distribution*<br>
*Alternate Hypothesis : The given data does not follow Poisson Distribution.*

In [None]:
# Step 1 of the chi-square goodness-of-fit test: cut "obs" into 10
# equal-width intervals and store each row's interval in a "bins" column.

df["bins"] = pd.cut(x=df["obs"], bins=10)
df

In [None]:
# Step 2 of the chi-square goodness-of-fit test: count how many observations
# take each distinct "obs" value (each value is its own bucket).
# size() counts rows per group directly, so the result does not depend on
# which other columns `df` happens to contain; groupby().count() returned one
# column per remaining column and only worked with exactly one of them.

grouped_df = (
    df.groupby("obs")
    .size()
    .rename("Observed Frequency")
    .reset_index()
)
grouped_df

In [None]:
# Step 3 of the chi-square goodness-of-fit test: determine the expected
# number of observations in each bucket.
# Step 3A: Poisson PMF for each bucket, with the rate estimated by the
# sample mean.
# The PMF is evaluated at the actual "obs" values. The row index (0, 1, 2, ...)
# only coincides with them when the observed values start at 0 and have no
# gaps, so using it would misalign the probabilities otherwise.

grouped_df["POISSON_PMF"] = poisson.pmf(k=grouped_df["obs"], mu=df["obs"].mean())
grouped_df

In [None]:
# Step 3B: expected frequency of each bucket = sample size * Poisson PMF.

sample_size = len(df)
grouped_df["Expected Frequency"] = grouped_df["POISSON_PMF"] * sample_size
grouped_df

In [None]:
# Compare the totals of the observed and expected frequency columns.
for column in ("Observed Frequency", "Expected Frequency"):
    print(f"{column} : ", grouped_df[column].sum())

In [None]:
# The expected frequencies do not add up to the observed total, so the
# obs == 11 row is removed and the last remaining bucket becomes the tail
# bucket that absorbs the remainder.

grouped_df = grouped_df.drop(grouped_df[grouped_df['obs']==11].index)

# Give the last bucket whatever is left of the sample size after the other
# buckets, for both columns, so observed and expected totals both equal
# len(df). The values are derived from the data instead of the hard-coded
# 100 / 4 and fixed iloc positions, and re-running the cell gives the same
# result.
n_obs = len(df)
last_row = grouped_df.index[-1]
grouped_df.loc[last_row, "Expected Frequency"] = n_obs - grouped_df["Expected Frequency"].iloc[:-1].sum()
grouped_df.loc[last_row, "Observed Frequency"] = n_obs - grouped_df["Observed Frequency"].iloc[:-1].sum()
grouped_df

In [None]:
# Totals of the observed and expected frequency columns after the adjustment.
observed_total = grouped_df["Observed Frequency"].sum()
expected_total = grouped_df["Expected Frequency"].sum()
print("Observed Frequency : ", observed_total)
print("Expected Frequency : ", expected_total)

In [None]:
# Step 4A of the chi-square goodness-of-fit test: compute the test statistic
# and the p-value.
# ddof=1 accounts for the Poisson rate estimated from the data, so the
# p-value uses (number of buckets - 1 - 1) degrees of freedom, the same count
# Step-4B uses for the tabulated value. With the default ddof=0 the p-value
# would be based on (number of buckets - 1) instead.
# NOTE(review): a p-value quoted in the notebook's prose was presumably
# produced with the default ddof; re-read it from this cell's output.

chisquare(f_obs=grouped_df["Observed Frequency"], f_exp=grouped_df["Expected Frequency"], ddof=1)

*Conclusion*<br>
*Reject The Null If The P-value < Level Of Significance*<br>
*Do Not Reject The Null If The P-value >= Level Of Significance*<br>
*Say The Level Of Significance Is 5%. Then, For The Given Data Set, P-value (0.634) > 0.05 --> Do Not Reject The Null*<br>
*Note : For Any Level Of Significance Up To About 63% (The P-value), We Do Not Reject The Null For The Current Data Set.*<br>

In [None]:
# Step 4B of the chi-square goodness-of-fit test: tabulated (critical)
# chi-square value at the 95th percentile with
# (number of buckets - 1 - 1) degrees of freedom.

chi2.ppf(q=0.95, df=len(grouped_df) - 1 - 1)

*Conclusion*<br>
*Reject The Null If The Tabulated Test Statistic Value < Computed Test Statistics Value*<br>
*Do Not Reject The Null If The Tabulated Test Statistics Value >=  Computed Test Statistics Value*

## Question On Chi Square Goodness Of Fit Test

A car maker is exploring the distribution of defects in the assembly line. Data on defective cars produced (a sample of 100 defective cars) on the assembly line are presented in the file titled (GOF.xlsx). The file contains one column - obs. The obs indicates the number of scratches found in a produced car.

You are hired as the business analyst to assist the car maker in answering the following questions

In [None]:
# 1. Based on the given sample data, the column “obs” can be assumed to have a __________.
# Discrete distribution
# Continuous distribution

# Load the assignment data into `df`; the answer printed below is stated
# directly rather than computed.
df = pd.read_csv(filepath_or_buffer="GOF.csv")
print("Column obs can be assumed to have a Discrete Distribution")

In [None]:
# 2. Given the sample data,the expected number of scratches on a car produced on the assembly line is __________.

# The expected number is taken to be the sample mean of "obs".
print("Expected Number Of Scratches : {}".format(df["obs"].mean()))

In [None]:
# 3. Based on the descriptive statistics (and visualization) of the sample data, the total scratch length can be assumed to
# have a ___
# 1. Symmetric distribution as the mean > median > mode
# 2. Symmetric distribution as the mode > median > mean
# 3. Right skewed distribution as the mean > median > mode
# 4. Left skewed distribution as the mode > median > mean
# 5. Left skewed distribution as the mean > median > mode
# 6. Right skewed distribution as the mode > median > mean
# 7. None of the above

plt.hist(df["obs"])
plt.show()

# Derive the answer from the data instead of printing a fixed sentence: the
# previous text claimed a right skew together with mean = median = mode,
# which contradicts itself. Only options 3, 4 and 7 are internally
# consistent, so those are the possible outcomes.
mean_obs = df["obs"].mean()
median_obs = df["obs"].median()
mode_obs = df["obs"].mode().iloc[0]  # smallest value when several modes tie
print(f"mean = {mean_obs:.2f}, median = {median_obs:.2f}, mode = {mode_obs}")
if mean_obs > median_obs > mode_obs:
    print("Right skewed distribution as the mean > median > mode")
elif mode_obs > median_obs > mean_obs:
    print("Left skewed distribution as the mode > median > mean")
else:
    print("None of the above")

In [None]:
# Chi-square goodness-of-fit test: cut "obs" into 10 equal-width intervals
# and store each row's interval in a "bins" column.

df["bins"] = pd.cut(x=df["obs"], bins=10)
df

In [None]:
# Step 2 of the chi-square goodness-of-fit test: count how many observations
# take each distinct "obs" value (each value is its own bucket).
# size() counts rows per group directly, so the result does not depend on
# which other columns `df` happens to contain; groupby().count() returned one
# column per remaining column and only worked with exactly one of them.

grouped_df = (
    df.groupby("obs")
    .size()
    .rename("Observed Frequency")
    .reset_index()
)
grouped_df

In [None]:
# Step 3 of the chi-square goodness-of-fit test: determine the expected
# number of observations in each bucket.
# Step 3A: Poisson PMF for each bucket, with the rate estimated by the
# sample mean.
# The PMF is evaluated at the actual "obs" values. The row index (0, 1, 2, ...)
# only coincides with them when the observed values start at 0 and have no
# gaps, so using it would misalign the probabilities otherwise.

grouped_df["POISSON_PMF"] = poisson.pmf(k=grouped_df["obs"], mu=df["obs"].mean())
grouped_df

In [None]:
# Step 3B: expected frequency of each bucket = sample size * Poisson PMF.

sample_size = len(df)
grouped_df["Expected Frequency"] = grouped_df["POISSON_PMF"] * sample_size
grouped_df

In [None]:
# Remove the obs == 9 row from the frequency table.
grouped_df = grouped_df.drop(index=grouped_df.index[grouped_df["obs"] == 9])
grouped_df

In [None]:
# The expected frequencies do not add up to the observed total, so the last
# bucket becomes the tail bucket that absorbs the remainder.
# Both columns of that bucket are set to whatever is left of the sample size
# after the other buckets, so observed and expected totals both equal
# len(df). The values are derived from the data instead of the hard-coded
# 100 / 13 and fixed iloc positions, and re-running the cell gives the same
# result.

n_obs = len(df)
last_row = grouped_df.index[-1]
grouped_df.loc[last_row, "Expected Frequency"] = n_obs - grouped_df["Expected Frequency"].iloc[:-1].sum()
grouped_df.loc[last_row, "Observed Frequency"] = n_obs - grouped_df["Observed Frequency"].iloc[:-1].sum()
grouped_df

In [None]:
# 4. If you perform a chi-square goodness of fit test for the “obs” column, what is the probability of observing exactly 4 
# defects?
# [Hint-1: Use the descriptive statistics to identify a possible distribution]
# [Hint-2: Determine the chi-square test statistic without adding any new bins.  Modifying an existing bin is allowed ]

# Evaluate the Poisson PMF at exactly 4 with the rate estimated by the sample
# mean. Computing it directly does not depend on a row with obs == 4 being
# present in the frequency table or on how that table's PMF column was built.
print("Probability Of Observing Exactly 4 Defects :", np.round(poisson.pmf(4, df["obs"].mean()), 4))

In [None]:
# 5. If you perform a chi-square goodness of fit test for the “obs” column, what is the value of the computed test statistic?
# [Hint-1: Use the descriptive statistics to identify a possible distribution]
# [Hint-2: Determine the chi-square test statistic without adding any new bins. Modifying an existing bin is allowed]
# [Hint 3: Use degrees of freedom (ddof) based on the assumed distribution]

# ddof=1 accounts for the one Poisson parameter estimated from the data
# (Hint 3). The statistic itself does not depend on ddof, but the p-value in
# the displayed result does.
statistic = chisquare(grouped_df["Observed Frequency"],grouped_df["Expected Frequency"],ddof=1)
print("Computed Test Statistic : ",np.round(statistic[0],4))
statistic

In [None]:
# 6. If you perform a chi-square goodness of fit test for the “obs” column, what is the p-value for the test?
# [Hint-1: Use the descriptive statistics to identify a possible distribution]
# [Hint-2: Determine the chi-square test statistic without adding any newbins.  Modifying an existing bin is allowed ]
# [Hint 3: Use degrees of freedom (ddof) based on the assumed distribution]

# ddof=1 accounts for the one Poisson parameter estimated from the data
# (Hint 3), so the p-value is based on (number of buckets - 1 - 1) degrees of
# freedom; the default ddof=0 would use (number of buckets - 1).
statistic = chisquare(grouped_df["Observed Frequency"],grouped_df["Expected Frequency"],ddof=1)
print("p-value : ",np.round(statistic[1],4))

In [None]:
# 7. How many degrees of freedom are present in the chi-square goodness of test to determine the distribution for the 
# “Number of Scratches” column?

# Degrees of freedom = number of buckets - estimated parameters (1) - 1.
print("Degree Of Freedom : ", len(grouped_df.index) - 2)