In [70]:
import pandas as pd
import numpy as np
from scipy.stats import chi2

In [71]:
df = pd.read_csv('../data/processed.csv')
df

Unnamed: 0,Institution,Course,actor.id,timestamp,verb.id,object.definition.type,result.score.scaled,result.success,result.completion,Teaching
0,UEF,Advanced Data Management Systems,0,2023-07-07 18:55:47,viewed,course,,False,False,Flipped classroom
1,UEF,Advanced Data Management Systems,0,2023-07-07 15:21:18,viewed,link,,False,False,Flipped classroom
2,UEF,Advanced Data Management Systems,0,2023-07-07 15:08:13,viewed,link,,False,False,Flipped classroom
3,UEF,Advanced Data Management Systems,0,2023-07-07 15:08:10,viewed,course,,False,False,Flipped classroom
4,UEF,Advanced Data Management Systems,0,2023-07-07 09:03:32,viewed,course,,False,False,Flipped classroom
...,...,...,...,...,...,...,...,...,...,...
299286,BMU,Computer Networks,820,2023-06-11 22:57:27,answered,cmi.interaction,1.000,True,True,Flipped classroom
299287,BMU,Computer Networks,799,2023-05-16 00:05:45,answered,cmi.interaction,0.000,False,True,Flipped classroom
299288,BMU,Computer Networks,788,2023-05-12 22:10:50,completed,assessment,0.375,True,True,Flipped classroom
299289,BMU,Computer Networks,788,2023-02-27 22:45:33,completed,module,,False,False,Flipped classroom


In [72]:
df['verb.id'].value_counts()

verb.id
viewed       229721
answered      33546
completed     18224
receive        6582
start          4400
scored         4236
submit         1481
create          931
join            136
leave            34
Name: count, dtype: int64

In [73]:
df_freq = df.groupby('verb.id')['Course'].value_counts()
df_freq

verb.id    Course                                 
answered   Human Factors of Interactive Technology     14754
           Computer Architecture                        8191
           Advanced Data Management Systems             3715
           e-Learning                                   3397
           Computer Organization                        2116
           Computer Networks                            1373
completed  Human Factors of Interactive Technology      5783
           Computer Architecture                        4454
           e-Learning                                   2309
           Computer Organization                        1616
           Web Applications                             1603
           Computer Networks                            1244
           Advanced Data Management Systems             1034
           Human-computer interaction                    181
create     Human Factors of Interactive Technology       823
           e-Learning             

In [74]:
df_count = df_freq.reset_index()
df_count

Unnamed: 0,verb.id,Course,count
0,answered,Human Factors of Interactive Technology,14754
1,answered,Computer Architecture,8191
2,answered,Advanced Data Management Systems,3715
3,answered,e-Learning,3397
4,answered,Computer Organization,2116
5,answered,Computer Networks,1373
6,completed,Human Factors of Interactive Technology,5783
7,completed,Computer Architecture,4454
8,completed,e-Learning,2309
9,completed,Computer Organization,1616


In [75]:
df_freq = pd.DataFrame(columns=['verb', 'course', 'exp_freq', 'obs_freq'])
df_freq

Unnamed: 0,verb,course,exp_freq,obs_freq


In [76]:
for verb in df_count['verb.id'].drop_duplicates().tolist():
    for course in df_count['Course'].drop_duplicates().tolist():
        if ((df_count['verb.id'] == verb) & (df_count['Course'] == course)).any():
            obs_freq = df_count.loc[(df_count['verb.id'] == verb) & (df_count['Course'] == course), 'count'].iloc[0]
        else:
            obs_freq = 0
        
        df_freq.loc[len(df_freq)] = {
            'verb': verb,
            'course': course,
            'exp_freq': 1/8 * sum(df_count.loc[(df_count['verb.id'] == verb)]['count']),
            'obs_freq': obs_freq
        }
        
df_freq

Unnamed: 0,verb,course,exp_freq,obs_freq
0,answered,Human Factors of Interactive Technology,4193.250,14754
1,answered,Computer Architecture,4193.250,8191
2,answered,Advanced Data Management Systems,4193.250,3715
3,answered,e-Learning,4193.250,3397
4,answered,Computer Organization,4193.250,2116
...,...,...,...,...
75,viewed,e-Learning,28715.125,20055
76,viewed,Computer Organization,28715.125,315
77,viewed,Computer Networks,28715.125,162
78,viewed,Web Applications,28715.125,18418


$H_0: \ The \ verbs \ are \ uniformly \ distributed \ across \ courses.$


$H_1: \ The \ verbs \ are \ not \ uniformly \ distributed \ across \ courses.$

In [77]:
alpha = 0.01 

$\chi^{2} = \sum \frac {(O - E)^{2}} {E}$

In [85]:
for course in df_count['Course'].drop_duplicates().tolist():
    course_df = df_freq.loc[(df_freq['course'] == course)]
    O_E = (course_df['obs_freq'] - course_df['exp_freq']) ** 2
    chi_squared = np.sum(O_E / course_df['exp_freq'])
    p = 1 - chi2.cdf(chi_squared, df=7)
    
    print(f'Course:{course}\nChi-squared: {chi_squared}\np-value: {p}\nReject the null hypothesis: {p < alpha}')
    print()

Course:Human Factors of Interactive Technology
Chi-squared: 261242.3575437683
p-value: 0.0
Reject the null hypothesis: True

Course:Computer Architecture
Chi-squared: 43452.12260002721
p-value: 0.0
Reject the null hypothesis: True

Course:Advanced Data Management Systems
Chi-squared: 11217.7852148754
p-value: 0.0
Reject the null hypothesis: True

Course:e-Learning
Chi-squared: 3788.285432569007
p-value: 0.0
Reject the null hypothesis: True

Course:Computer Organization
Chi-squared: 30065.370977486305
p-value: 0.0
Reject the null hypothesis: True

Course:Computer Networks
Chi-squared: 31892.145755209207
p-value: 0.0
Reject the null hypothesis: True

Course:Web Applications
Chi-squared: 9778.041290447616
p-value: 0.0
Reject the null hypothesis: True

Course:Human-computer interaction
Chi-squared: 15384.49961715622
p-value: 0.0
Reject the null hypothesis: True
