In [1]:
import scipy.stats as stats
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import ttest_ind, f_oneway

## Q1. Explain the assumptions required to use ANOVA and provide examples of violations that could impact the validity of the results.
Ans :

1. Normality of sampling distribution of means. The distribution of sample mean is normally distributed.
2. Absence of outliers. outliers score need to be removed from dataset.
3. Homogeneity: each and every one of the populations that (sample/column) used in level has same variance
4. Sample must be independent and ranmdom
e.g.

Factor = Medicines

Level = 5mg 10mg 15mg 20mg (dosage)

## Q2. What are the three types of ANOVA, and in what situations would each be used?
Ans :

One Way ANOVA : One factor with at least z level, these levels are independent

Repeated Measures ANOVA : One factor with at least 2 levels, levels are dependent

Factorial ANOVA : Two or more factors, each of which at least 2 level, levels can be either independent and dependent

## Q3. What is the partitioning of variance in ANOVA, and why is it important to understand this concept?
Ans :

The partitioning of variance in ANOVA refers to the process of breaking down the total variance in a dataset into different components based on the sources of variation. There are three types of variation, Between-group variance, Within-group variance, Total variance

it helps us to determine the relative importance of the different sources of variation in the dataset.

## Q4. How would you calculate the total sum of squares (SST), explained sum of squares (SSE), and residual sum of squares (SSR) in a one-way ANOVA using Python?

In [2]:
data_mean = np.array([3,4,56,3])
import scipy.stats as stats

# Create sample data
group1 = np.array([4, 5, 6, 7, 8])
group2 = np.array([2, 3, 4, 5, 6])
group3 = np.array([1, 2, 3, 4, 5])


# Combine data into one array
data =group1 + group2 + group3

data = np.array(data)

# ANOVA
f_value, p_value = stats.f_oneway(group1, group2, group3)

# total sum of squares (SST)
SST = ((data - data.mean())**2).sum()

# explained sum of squares (SSE)
SSE = ((group1.mean() - data.mean())**2) * len(group1)
SSE += ((group2.mean() - data.mean())**2) * len(group2)
SSE += ((group3.mean() - data.mean())**2) * len(group3)

# Compute the residual sum of squares (SSR)
SSR = SST - SSE

print('Total sum of squares (SST) =', SST)
print('Explained sum of squares (SSE) =', SSE)
print('Residual sum of squares (SSR) =', SSR)
print('F-value =', f_value)
print('p-value =', p_value)

Total sum of squares (SST) = 90.0
Explained sum of squares (SSE) = 1150.0
Residual sum of squares (SSR) = -1060.0
F-value = 4.666666666666667
p-value = 0.031676352024078334


## Q5. In a two-way ANOVA, how would you calculate the main effects and interaction effects using Python?

In [3]:
# dataset with two categorical variables and one continuous variable
df = pd.DataFrame({'data1': ['A', 'A', 'B', 'B'], 
                   'data2': ['X', 'Y', 'X', 'Y'],
                   'value': [10.0, 12.0, 15.0, 18.0]})

# fit a two-way ANOVA model with interaction effect
model = ols('value ~ C(data1) + C(data2) + C(data1):C(data2)', data=df).fit()

# extract the main effects and interaction effect
effects = model.params[1:3]
int_effect = model.params[3]

print("Main effects:", effects)
print("Interaction effect:", int_effect.round())


Main effects: C(data1)[T.B]    5.0
C(data2)[T.Y]    2.0
dtype: float64
Interaction effect: 1.0


## Q6. Suppose you conducted a one-way ANOVA and obtained an F-statistic of 5.23 and a p-value of 0.02. What can you conclude about the differences between the groups, and how would you interpret these results?
Ans :

The obtained F-statistic is 5.23 and the p-value is 0.02. The p-value is below the conventional threshold of 0.05 and significant at the 5% level.

This Investigation is warranted to understand the nature of the differences between the groups.

## Q7. In a repeated measures ANOVA, how would you handle missing data, and what are the potential consequences of using different methods to handle missing data?
Ans :

step 1. Complete-case analysis
step 2. Pairwise deletion
step 3. Imputation

## Q8. What are some common post-hoc tests used after ANOVA, and when would you use each one? Provide an example of a situation where a post-hoc test might be necessary.
Ans :

Tukey's Honestly Significant Difference (HSD) test

Bonferroni correction

Scheffé's test

Dunnett's test

e.g. In a study investigating the effect of different types of exercise on weight loss,

## Q9. A researcher wants to compare the mean weight loss of three diets: A, B, and C. They collect data from 50 participants who were randomly assigned to one of the diets. Conduct a one-way ANOVA using Python to determine if there are any significant differences between the mean weight loss of the three diets. Report the F-statistic and p-value, and interpret the results.

In [5]:
np.random.seed(42)
# some data for each group

group_A = np.random.randint(3,10,50)
group_B = np.random.randint(3,10,50)
group_C = np.random.randint(3,10,50)

# print(group_A)
# print(group_B)
# print(group_C)

# Conduct one-way ANOVA
f_stat, p_val = f_oneway(group_A, group_B, group_C)

# Print results
print('F-statistic:', f_stat)
print('p-value:', p_val)

F-statistic: 0.8223246115557947
p-value: 0.441419982089683


## Q10. A company wants to know if there are any significant differences in the average time it takes to complete a task using three different software programs: Program A, Program B, and Program C. They randomly assign 30 employees to one of the programs and record the time it takes each employee to complete the task. Conduct a two-way ANOVA using Python to determine if there are any main effects or interaction effects between the software programs and employee experience level (novice vs. experienced). Report the F-statistics and p-values, and interpret the results.

In [6]:
np.random.seed(10)

# Create a sample dataset
df = pd.DataFrame({'Software_name': ['A', 'B', 'C'] * 30,
                     'Experience': ['Novice'] * 45 + ['Experienced'] * 45,
                     'Time': np.random.randint(10,30,90)})

# Fit a two-way ANOVA model
model = ols('Time ~ C(Software_name) + C(Experience) + C(Software_name):C(Experience)', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

# Print the ANOVA table
print(anova_table)

# data['Time'].count()

                                     sum_sq    df         F    PR(>F)
C(Software_name)                  46.866667   2.0  0.709760  0.494687
C(Experience)                      2.500000   1.0  0.075721  0.783858
C(Software_name):C(Experience)    25.800000   2.0  0.390721  0.677792
Residual                        2773.333333  84.0       NaN       NaN


## Q11. An educational researcher is interested in whether a new teaching method improves student test scores. They randomly assign 100 students to either the control group (traditional teaching method) or the experimental group (new teaching method) and administer a test at the end of the semester. Conduct a two-sample t-test using Python to determine if there are any significant differences in test scores between the two groups. If the results are significant, follow up with a post-hoc test to determine which group(s) differ significantly from each other.

In [7]:
# Define the test score data for each group
control_group = np.random.randint(70, 90, 100)        #traditional teaching method

experimental_group = np.random.randint(70, 90, 100)   #new teaching method

# two-sample t-test
t_stat, p_val = ttest_ind(control_group, experimental_group)

# Print results
print('t-statistic:', t_stat)
print('p-value:', p_val)

# Conduct post-hoc test (Tukey's HSD)
f_stat, p_val = f_oneway(control_group, experimental_group)
tukey_results = pairwise_tukeyhsd(control_group, experimental_group)
print(tukey_results)

t-statistic: 0.5955759316302612
p-value: 0.5521387024616692
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower    upper  reject
-----------------------------------------------------
    70     71     -5.0    1.0 -26.0297 16.0297  False
    70     72      6.0 0.9791  -7.1992 19.1992  False
    70     73      7.0 0.8131  -4.8624 18.8624  False
    70     74      3.6 0.9996  -7.3824 14.5824  False
    70     75   0.6667    1.0  -9.6876  11.021  False
    70     76   1.5714    1.0  -8.3098 11.4527  False
    70     77     -0.5    1.0 -10.8543  9.8543  False
    70     78      0.5    1.0 -11.3624 12.3624  False
    70     79   2.1667    1.0  -8.1876  12.521  False
    70     80   2.7778 0.9999   -6.435 11.9906  False
    70     81     -0.5    1.0 -12.3624 11.3624  False
    70     82      4.0 0.9984  -6.9824 14.9824  False
    70     83  -2.3333    1.0 -15.5325 10.8659  False
    70     84      9.5 0.7661  -6.0315 25.0315  False
    70     85   5.3333

## Q12. A researcher wants to know if there are any significant differences in the average daily sales of three retail stores: Store A, Store B, and Store C. They randomly select 30 days and record the sales for each store on those days. Conduct a repeated measures ANOVA using Python to determine if there are any significant differences in sales between the three stores. If the results are significant, follow up with a post- hoc test to determine which store(s) differ significantly from each other.

In [8]:
np.random.seed(10)
# Create a sample dataset
data = pd.DataFrame({'Days': list(range(1, 31)) * 3,
                     'Stores': ['A'] * 30 + ['B'] * 30 + ['C'] * 30,
                     'Sales': np.random.randint(10,25,90)})

# Fit a repeated measures ANOVA model
m = ols('Sales ~ C(Stores) + C(Days)', data=data).fit()
a_table = sm.stats.anova_lm(m, typ=2)

# Print the ANOVA table
print(a_table)
# data['Sales'].count()

                sum_sq    df         F    PR(>F)
C(Stores)    53.088889   2.0  1.505094  0.230538
C(Days)     637.155556  29.0  1.245769  0.234926
Residual   1022.911111  58.0       NaN       NaN
