In [None]:
# Student name: Benjamin Rabishaw
# Student number: 1001556522
# Instructor: Shion Guha
# Course code: INF2178
# Course name: Experimental Design for Data Science
# Program: Master of Information
# Faculty of Information
# University of Toronto


# Technical Assignment 3
# March 23, 2024

In [70]:
# Initiating relevant libraries and packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import shapiro
from scipy.stats import levene
from scipy.stats import f_oneway
from scipy.stats import anderson
from scipy.stats import boxcox
from scipy.stats import yeojohnson
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import statsmodels.api as sm
import seaborn as sns
from scipy import stats
from statsmodels.stats.diagnostic import het_white

In [None]:
# Integrating Google Drive

from google.colab import drive
drive.mount('/drive', force_remount=True)

In [None]:
# Loading Dataset and surveying first rows

df = pd.read_csv('/drive/MyDrive/UofT/Colab/Files/INF2178_A3_data.csv')
df.head(25)

In [None]:
# Displaying total length of dataframe

len(df)

In [38]:
# Surveying columns

df.columns

Index(['fallreadingscore', 'fallmathscore', 'fallgeneralknowledgescore',
       'springreadingscore', 'springmathscore', 'springgeneralknowledgescore',
       'totalhouseholdincome', 'incomeinthousands', 'incomegroup',
       'readingdelta', 'mathdelta', 'generalknowledgedelta'],
      dtype='object')

In [39]:
# Surveying column data types

df.dtypes

fallreadingscore               float64
fallmathscore                  float64
fallgeneralknowledgescore      float64
springreadingscore             float64
springmathscore                float64
springgeneralknowledgescore    float64
totalhouseholdincome           float64
incomeinthousands              float64
incomegroup                      int64
readingdelta                   float64
mathdelta                      float64
generalknowledgedelta          float64
dtype: object

In [None]:
# Data Cleaning

## Creating new columns for change in student scores per subject area

df['readingdelta'] = df['springreadingscore'] - df['fallreadingscore']
df['mathdelta'] = df['springmathscore'] - df['fallmathscore']
df['generalknowledgedelta'] = df['springgeneralknowledgescore'] - df['fallgeneralknowledgescore']

df.head(25)


In [None]:
# General Exploratory Data Analysis: Summary Statistics

df.describe()

In [None]:
# General Exploratory Data Analysis: Boxplots

## Creating boxplots for test score variables
plt.figure(figsize=(12, 8))
plt.boxplot(df[['fallreadingscore', 'fallmathscore', 'fallgeneralknowledgescore',
                    'springreadingscore', 'springmathscore', 'springgeneralknowledgescore']])
plt.xticks(range(1, len(selected_vars.columns) + 1), selected_vars.columns, rotation=45)
plt.grid(True)
plt.xlabel('Subject and Time')
plt.ylabel('Scores')

## Displaying boxplots
plt.show()



In [67]:
# Assumption 1: Testing Normality of Residuals using Anderson-Darling Tests

## Ordinary Least Squares (OLS) model for Research Questions 1-3
OLSreading = 'readingdelta ~ fallreadingscore'
OLSmath = 'mathdelta ~ fallmathscore'
OLSgeneralknowledge = 'generalknowledgedelta ~ fallgeneralknowledgescore'

### For reading scores, RQ1:
model_RQ1_reading = sm.OLS.from_formula(OLSreading, data=df)
result_RQ1_reading = model_RQ1_reading.fit()
residuals_RQ1_reading = result_RQ1_reading.resid

### For math scores, RQ2:
model_RQ2_math = sm.OLS.from_formula(OLSmath, data=df)
result_RQ2_math = model_RQ2_math.fit()
residuals_RQ2_math = result_RQ2_math.resid

### For general knowledge scores, RQ3:
model_RQ3_generalknowledge = sm.OLS.from_formula(OLSgeneralknowledge, data=df)
result_RQ3_generalknowledge = model_RQ3_generalknowledge.fit()
residuals_RQ3_generalknowledge = result_RQ3_generalknowledge.resid

## Performing the Anderson-Darling test for normality of residuals for each subject
anderson_statistic_RQ1_reading, critical_values_reading, significance_levels_reading = anderson(residuals_RQ1_reading)
anderson_statistic_RQ2_math, critical_values_math, significance_levels_math = anderson(residuals_RQ2_math)
anderson_statistic_RQ3_generalknowledge, critical_values_generalknowledge, significance_levels_generalknowledge = anderson(residuals_RQ3_generalknowledge)

## Printing Anderson-Darling results for each subject
print("Reading: Anderson-Darling Test Statistic:", anderson_statistic_RQ1_reading)
print("Reading: Critical Values:", critical_values_reading)
print("Reading: Significance Levels:", significance_levels_reading)
print()
print("Math: Anderson-Darling Test Statistic:", anderson_statistic_RQ2_math)
print("Math: Critical Values:", critical_values_math)
print("Math: Significance Levels:", significance_levels_math)
print()
print("General Knowledge: Anderson-Darling Test Statistic:", anderson_statistic_RQ3_generalknowledge)
print("General Knowledge: Critical Values:", critical_values_generalknowledge)
print("General Knowledge: Significance Levels:", significance_levels_generalknowledge)
print()

## Making residual data positive in order to proceed with Box Cox transformation

### For reading scores, RQ1
min_residualRQ1 = min(residuals_RQ1_reading)
constantRQ1 = abs(min_residualRQ1) + 1e-6
residuals_positiveRQ1 = residuals_RQ1_reading + constantRQ1

### For math scores, RQ2
min_residualRQ2 = min(residuals_RQ2_math)
constantRQ2 = abs(min_residualRQ2) + 1e-6
residuals_positiveRQ2 = residuals_RQ2_math + constantRQ2

### For general knowledge scores, RQ3
min_residualRQ3 = min(residuals_RQ3_generalknowledge)
constantRQ3 = abs(min_residualRQ3) + 1e-6
residuals_positiveRQ3 = residuals_RQ3_generalknowledge + constantRQ3

## Transorming residuals using the Box Cox method

### For reading scores, RQ1
transformed_residuals_RQ1, best_lambda_RQ1 = boxcox(residuals_positiveRQ1)

### For math scores, RQ2
transformed_residuals_RQ2, best_lambda_RQ2 = boxcox(residuals_positiveRQ2)

### For general knowledge scores, RQ3
transformed_residuals_RQ3, best_lambda_RQ3 = boxcox(residuals_positiveRQ3)

## Re-running Anderson-Darling tests on transformed residuals

### For reading scores, RQ1
anderson_statistic_RQ1, critical_values_RQ1, significance_levels_RQ1 = anderson(transformed_residuals_RQ1)

### For math scores, RQ2
anderson_statistic_RQ2, critical_values_RQ2, significance_levels_RQ2 = anderson(transformed_residuals_RQ2)

### For general knowledge scores, RQ3
anderson_statistic_RQ3, critical_values_RQ3, significance_levels_RQ3 = anderson(transformed_residuals_RQ3)

## Printing Anderson-Darling results for each subject

print("Reading: Transformed Anderson-Darling Test Statistic:", anderson_statistic_RQ1)
print("Reading: Transformed Critical Values:", critical_values_RQ1)
print("Reading: Transformed Significance Levels:", significance_levels_RQ1)
print()
print("Math: Transformed Anderson-Darling Test Statistic:", anderson_statistic_RQ2)
print("Math: Transformed Critical Values:", critical_values_RQ2)
print("Math: Transformed Significance Levels:", significance_levels_RQ2)
print()
print("General Knowledge: Transformed Anderson-Darling Test Statistic:", anderson_statistic_RQ3)
print("General Knowledge: Transformed Critical Values:", critical_values_RQ3)
print("General Knowledge: Transformed Significance Levels:", significance_levels_RQ3)
print()

Reading: Anderson-Darling Test Statistic: 154.48124076225577
Reading: Critical Values: [0.576 0.656 0.787 0.918 1.092]
Reading: Significance Levels: [15.  10.   5.   2.5  1. ]

Math: Anderson-Darling Test Statistic: 74.66567741147264
Math: Critical Values: [0.576 0.656 0.787 0.918 1.092]
Math: Significance Levels: [15.  10.   5.   2.5  1. ]

General Knowledge: Anderson-Darling Test Statistic: 1.7208135790970118
General Knowledge: Critical Values: [0.576 0.656 0.787 0.918 1.092]
General Knowledge: Significance Levels: [15.  10.   5.   2.5  1. ]

Reading: Transformed Anderson-Darling Test Statistic: 117.24325101265094
Reading: Transformed Critical Values: [0.576 0.656 0.787 0.918 1.092]
Reading: Transformed Significance Levels: [15.  10.   5.   2.5  1. ]

Math: Transformed Anderson-Darling Test Statistic: 36.25975917566029
Math: Transformed Critical Values: [0.576 0.656 0.787 0.918 1.092]
Math: Transformed Significance Levels: [15.  10.   5.   2.5  1. ]

General Knowledge: Transformed An

In [69]:
# Assumption 1: Testing Normality of Residuals using Anderson-Darling Tests

## Ordinary Least Squares (OLS) model for Research Questions 1-3
OLSreading = 'readingdelta ~ fallreadingscore'
OLSmath = 'mathdelta ~ fallmathscore'
OLSgeneralknowledge = 'generalknowledgedelta ~ fallgeneralknowledgescore'

### For reading scores, RQ1:
model_RQ1_reading = sm.OLS.from_formula(OLSreading, data=df)
result_RQ1_reading = model_RQ1_reading.fit()
residuals_RQ1_reading = result_RQ1_reading.resid

### For math scores, RQ2:
model_RQ2_math = sm.OLS.from_formula(OLSmath, data=df)
result_RQ2_math = model_RQ2_math.fit()
residuals_RQ2_math = result_RQ2_math.resid

### For general knowledge scores, RQ3:
model_RQ3_generalknowledge = sm.OLS.from_formula(OLSgeneralknowledge, data=df)
result_RQ3_generalknowledge = model_RQ3_generalknowledge.fit()
residuals_RQ3_generalknowledge = result_RQ3_generalknowledge.resid

## Performing the Anderson-Darling test for normality of residuals for each subject
anderson_statistic_RQ1_reading, critical_values_reading, significance_levels_reading = anderson(residuals_RQ1_reading)
anderson_statistic_RQ2_math, critical_values_math, significance_levels_math = anderson(residuals_RQ2_math)
anderson_statistic_RQ3_generalknowledge, critical_values_generalknowledge, significance_levels_generalknowledge = anderson(residuals_RQ3_generalknowledge)

## Printing Anderson-Darling results for each subject
print("Reading: Anderson-Darling Test Statistic:", anderson_statistic_RQ1_reading)
print("Reading: Critical Values:", critical_values_reading)
print("Reading: Significance Levels:", significance_levels_reading)
print()
print("Math: Anderson-Darling Test Statistic:", anderson_statistic_RQ2_math)
print("Math: Critical Values:", critical_values_math)
print("Math: Significance Levels:", significance_levels_math)
print()
print("General Knowledge: Anderson-Darling Test Statistic:", anderson_statistic_RQ3_generalknowledge)
print("General Knowledge: Critical Values:", critical_values_generalknowledge)
print("General Knowledge: Significance Levels:", significance_levels_generalknowledge)
print()

## Making residual data positive in order to proceed with Yeo-Johnson transformation

### For reading scores, RQ1
min_residualRQ1 = min(residuals_RQ1_reading)
constantRQ1 = abs(min_residualRQ1) + 1e-6
residuals_positiveRQ1 = residuals_RQ1_reading + constantRQ1

### For math scores, RQ2
min_residualRQ2 = min(residuals_RQ2_math)
constantRQ2 = abs(min_residualRQ2) + 1e-6
residuals_positiveRQ2 = residuals_RQ2_math + constantRQ2

### For general knowledge scores, RQ3
min_residualRQ3 = min(residuals_RQ3_generalknowledge)
constantRQ3 = abs(min_residualRQ3) + 1e-6
residuals_positiveRQ3 = residuals_RQ3_generalknowledge + constantRQ3

## Transorming residuals using the Yeo-Johnson method

### For reading scores, RQ1
transformed_residuals_RQ1, lambda_RQ1 = yeojohnson(residuals_positiveRQ1)

### For math scores, RQ2
transformed_residuals_RQ2, lambda_RQ2 = yeojohnson(residuals_positiveRQ2)

### For general knowledge scores, RQ3
transformed_residuals_RQ3, lambda_RQ3 = yeojohnson(residuals_positiveRQ3)

## Re-running Anderson-Darling tests on transformed residuals

### For reading scores, RQ1
anderson_statistic_RQ1, critical_values_RQ1, significance_levels_RQ1 = anderson(transformed_residuals_RQ1)

### For math scores, RQ2
anderson_statistic_RQ2, critical_values_RQ2, significance_levels_RQ2 = anderson(transformed_residuals_RQ2)

### For general knowledge scores, RQ3
anderson_statistic_RQ3, critical_values_RQ3, significance_levels_RQ3 = anderson(transformed_residuals_RQ3)

## Printing Anderson-Darling results for each subject

print("Reading: Transformed Anderson-Darling Test Statistic:", anderson_statistic_RQ1)
print("Reading: Transformed Critical Values:", critical_values_RQ1)
print("Reading: Transformed Significance Levels:", significance_levels_RQ1)
print()
print("Math: Transformed Anderson-Darling Test Statistic:", anderson_statistic_RQ2)
print("Math: Transformed Critical Values:", critical_values_RQ2)
print("Math: Transformed Significance Levels:", significance_levels_RQ2)
print()
print("General Knowledge: Transformed Anderson-Darling Test Statistic:", anderson_statistic_RQ3)
print("General Knowledge: Transformed Critical Values:", critical_values_RQ3)
print("General Knowledge: Transformed Significance Levels:", significance_levels_RQ3)
print()

Reading: Anderson-Darling Test Statistic: 154.48124076225577
Reading: Critical Values: [0.576 0.656 0.787 0.918 1.092]
Reading: Significance Levels: [15.  10.   5.   2.5  1. ]

Math: Anderson-Darling Test Statistic: 74.66567741147264
Math: Critical Values: [0.576 0.656 0.787 0.918 1.092]
Math: Significance Levels: [15.  10.   5.   2.5  1. ]

General Knowledge: Anderson-Darling Test Statistic: 1.7208135790970118
General Knowledge: Critical Values: [0.576 0.656 0.787 0.918 1.092]
General Knowledge: Significance Levels: [15.  10.   5.   2.5  1. ]

Reading: Transformed Anderson-Darling Test Statistic: 112.14681634098633
Reading: Transformed Critical Values: [0.576 0.656 0.787 0.918 1.092]
Reading: Transformed Significance Levels: [15.  10.   5.   2.5  1. ]

Math: Transformed Anderson-Darling Test Statistic: 33.082302250664725
Math: Transformed Critical Values: [0.576 0.656 0.787 0.918 1.092]
Math: Transformed Significance Levels: [15.  10.   5.   2.5  1. ]

General Knowledge: Transformed A

In [71]:
# Assumption 2: Testing Heteroskedasticity with a White test

## Fitting a regression model in preparation for the White test
model = sm.OLS(df['fallgeneralknowledgescore'], sm.add_constant(df[['totalhouseholdincome']]))
result = model.fit()

## Performing the White test
white_test = het_white(result.resid, result.model.exog)

## Displaying test results
white_test_statistic = white_test[0]
white_p_value = white_test[1]

print("White Test Statistic:", white_test_statistic)
print("p-value:", white_p_value)


White Test Statistic: 26.79737470563559
p-value: 1.5171342676929652e-06


In [74]:
# The one-way ANCOVA test

## Defining the model
test_model = 'generalknowledgedelta ~ totalhouseholdincome + fallgeneralknowledgescore'

## Fitting the model
ANCOVA_model = sm.formula.ols(formula=test_model, data=df).fit()

## Conducting the one-way ANCOVA test
ANCOVA = sm.stats.anova_lm(ANCOVA_model, typ=2)

## Displaying the ANCOVA results
print(ANCOVA)


                                  sum_sq       df           F         PR(>F)
totalhouseholdincome         1863.445065      1.0  120.798022   5.761298e-28
fallgeneralknowledgescore   12151.352428      1.0  787.712698  6.723343e-168
Residual                   184033.639329  11930.0         NaN            NaN
