In [1]:
from forgetful_dkt.lstm import LSTM
from forgetful_dkt.preprocess import DataProcessor
import tensorflow as tf
import numpy as np

def initialize_sess():
    """Initialize only those global variables the shared session reports as uninitialized.

    Variables that already hold values are left untouched. Relies on the
    notebook-level ``sess`` being the current default session, because the
    initializer op is executed through ``.run()`` without an explicit session.
    """
    global sess
    uninitialized_names = set(sess.run(tf.report_uninitialized_variables()))
    pending = []
    for variable in tf.global_variables():
        # Drop the ":<index>" suffix and compare as ASCII bytes, which is the
        # form the reported names are matched against.
        base_name = variable.name.split(':')[0].encode('ascii')
        if base_name in uninitialized_names:
            pending.append(variable)
    tf.variables_initializer(pending).run()
    
def reset_sess():
    """Replace the shared session with a fresh one bound to a new, empty default graph.

    Rebinds the notebook-level ``sess``. Any tensors, ops or models built
    before this call belong to the discarded graph and must be rebuilt.
    """
    global sess
    # Close the session BEFORE resetting the graph: TensorFlow documents
    # tf.reset_default_graph() as undefined behavior while a Session or
    # InteractiveSession is still active.
    sess.close()
    tf.reset_default_graph()
    sess = tf.InteractiveSession()

# Notebook-wide interactive session: read by initialize_sess(), replaced by
# reset_sess(), and passed explicitly to lstm.train / lstm.test in the training cells.
sess = tf.InteractiveSession()

## Exponents 1

In [2]:
# Data processor restricted to the single 'exponents_1' exercise.
# NOTE(review): 'vivek' is presumably a dataset/source identifier and the two
# length arguments presumably bound the per-student sequence length — confirm
# against forgetful_dkt.preprocess.DataProcessor.
dp = DataProcessor('vivek', ['exponents_1'], max_seq_length=200, min_seq_length=5)

In [3]:
# Three input encodings produced from the same processor, in the order they are
# trained and reported: correctness only, delay bucketed per exercise, and delay
# bucketed in aggregate.
# NOTE(review): the meaning of ([3, 24, None], [3], False) is defined by
# DataProcessor and is not visible here — confirm before changing.
names = ["correctness_only_data", "by_exercise_bucket_data", "aggregate_bucket_data"]

datasets = [
    dp.correctness_only(verbose=False).get_data(),
    dp.bucketed_delay_by_exercise([3, 24, None], [3], False, verbose=False).get_data(),
    dp.bucketed_delay_aggregate([3, 24, None], [3], False, verbose=False).get_data(),
]

# Keep the individual names available for ad-hoc inspection.
correctness_only_data, by_exercise_bucket_data, aggregate_bucket_data = datasets

In [4]:
# Per-fold metrics for every encoding, keyed by dataset name and then by metric
# ("accuracy", "baseline", "AUC", "MAE"). Recording them here lets later analysis
# use the actual values instead of numbers copied by hand from the printed log.
exponents_fold_metrics = {}

# Train and evaluate one LSTM per input encoding, iterating data.k_fold(k).
for data, name in zip(datasets, names):
    print(name)
    # Start from an empty graph and a fresh session for each encoding, since
    # the input/output dimensions differ between encodings.
    reset_sess()
    lstm = LSTM(hidden_dim=200,
                output_dim=data.targets.shape[2],
                input_dim=data.inputs.shape[2],
                learning_rate=1e-2,
                batch_size=64,
                num_layers=1)

    lstm.build_model(tf.sigmoid)

    k = 15
    avg_acc = 0.0
    avg_auc = 0.0
    avg_mae = 0.0
    fold_metrics = {"accuracy": [], "baseline": [], "AUC": [], "MAE": []}
    # NOTE(review): train/test receive the same `data` object every fold, so
    # k_fold presumably switches the active train/test split on `data` as it
    # is iterated — confirm in DataProcessor.
    for fold in data.k_fold(k):
        print("Fold %d" % fold)
        # Re-initialize every global variable so each fold trains from scratch.
        tf.global_variables_initializer().run()
        lstm.train(sess, data, epochs=3)
        acc, baseline, auc, mae = lstm.test(sess, data)
        # Running means; this assumes k_fold yields exactly k folds.
        avg_acc += acc / k
        avg_auc += auc / k
        avg_mae += mae / k
        fold_metrics["accuracy"].append(acc)
        fold_metrics["baseline"].append(baseline)
        fold_metrics["AUC"].append(auc)
        fold_metrics["MAE"].append(mae)
    exponents_fold_metrics[name] = fold_metrics

    print("Average Accuracy: %.8f, Average AUC %.8f, Average Mean Absolute Error %.8f" % (avg_acc, avg_auc, avg_mae))

correctness_only_data
Fold 1
epoch 0, MSE: 0.1808
epoch 1, MSE: 0.1805
epoch 2, MSE: 0.1805
Accuracy: 0.76151, Baseline: 0.57417, AUC: 0.85404, MAE: 0.34134
Fold 2
epoch 0, MSE: 0.1656
epoch 1, MSE: 0.1662
epoch 2, MSE: 0.1654
Accuracy: 0.75791, Baseline: 0.57637, AUC: 0.84334, MAE: 0.33285
Fold 3
epoch 0, MSE: 0.1808
epoch 1, MSE: 0.1795
epoch 2, MSE: 0.1793
Accuracy: 0.77229, Baseline: 0.57791, AUC: 0.85060, MAE: 0.32475
Fold 4
epoch 0, MSE: 0.1647
epoch 1, MSE: 0.1643
epoch 2, MSE: 0.1630
Accuracy: 0.76377, Baseline: 0.58072, AUC: 0.84854, MAE: 0.33200
Fold 5
epoch 0, MSE: 0.1693
epoch 1, MSE: 0.1688
epoch 2, MSE: 0.1688
Accuracy: 0.77424, Baseline: 0.55916, AUC: 0.86842, MAE: 0.33536
Fold 6
epoch 0, MSE: 0.1507
epoch 1, MSE: 0.1503
epoch 2, MSE: 0.1510
Accuracy: 0.75590, Baseline: 0.56858, AUC: 0.84066, MAE: 0.33736
Fold 7
epoch 0, MSE: 0.1786
epoch 1, MSE: 0.1759
epoch 2, MSE: 0.1756
Accuracy: 0.76561, Baseline: 0.56300, AUC: 0.86403, MAE: 0.33443
Fold 8
epoch 0, MSE: 0.1688
epoch

## Set of Exercises

In [5]:
# Data processor covering a set of eight related exercises (exponents, roots and
# scientific notation), using the same sequence-length arguments as the
# single-exercise processor.
# NOTE(review): 'vivek' is presumably a dataset/source identifier — confirm
# against forgetful_dkt.preprocess.DataProcessor.
dpset = DataProcessor('vivek', ['exponents_1', 'exponent_rules', 'positive_and_zero_exponents',
                                'properties-of-integer-exponents', 'square_roots_2',
                                'equations-w-square-and-cube-roots', 'scientific_notation',
                                'computing-in-scientific-notation'], max_seq_length=200, min_seq_length=5)

In [6]:
# Same three input encodings as the single-exercise experiment, built from the
# multi-exercise processor and kept in training/reporting order.
# NOTE(review): the meaning of ([3, 24, None], [3], False) is defined by
# DataProcessor and is not visible here — confirm before changing.
names2 = ["correctness_only_dataset", "by_exercise_bucket_dataset", "aggregate_bucket_dataset"]

datasets2 = [
    dpset.correctness_only(verbose=False).get_data(),
    dpset.bucketed_delay_by_exercise([3, 24, None], [3], False, verbose=False).get_data(),
    dpset.bucketed_delay_aggregate([3, 24, None], [3], False, verbose=False).get_data(),
]

# Keep the individual names available for ad-hoc inspection.
correctness_only_dataset, by_exercise_bucket_dataset, aggregate_bucket_dataset = datasets2

In [7]:
# Per-fold metrics for every encoding, keyed by dataset name and then by metric
# ("accuracy", "baseline", "AUC", "MAE"). Recording them here lets later analysis
# use the actual values instead of numbers copied by hand from the printed log.
set_fold_metrics = {}

# Train and evaluate one LSTM per input encoding, iterating data.k_fold(k).
for data, name in zip(datasets2, names2):
    print(name)
    # Start from an empty graph and a fresh session for each encoding, since
    # the input/output dimensions differ between encodings.
    reset_sess()
    lstm = LSTM(hidden_dim=200,
                output_dim=data.targets.shape[2],
                input_dim=data.inputs.shape[2],
                learning_rate=1e-2,
                batch_size=64,
                num_layers=1)

    lstm.build_model(tf.sigmoid)

    k = 15
    avg_acc = 0.0
    avg_auc = 0.0
    avg_mae = 0.0
    fold_metrics = {"accuracy": [], "baseline": [], "AUC": [], "MAE": []}
    # NOTE(review): train/test receive the same `data` object every fold, so
    # k_fold presumably switches the active train/test split on `data` as it
    # is iterated — confirm in DataProcessor.
    for fold in data.k_fold(k):
        print("Fold %d" % fold)
        # Re-initialize every global variable so each fold trains from scratch.
        tf.global_variables_initializer().run()
        lstm.train(sess, data, epochs=3)
        acc, baseline, auc, mae = lstm.test(sess, data)
        # Running means; this assumes k_fold yields exactly k folds.
        avg_acc += acc / k
        avg_auc += auc / k
        avg_mae += mae / k
        fold_metrics["accuracy"].append(acc)
        fold_metrics["baseline"].append(baseline)
        fold_metrics["AUC"].append(auc)
        fold_metrics["MAE"].append(mae)
    set_fold_metrics[name] = fold_metrics

    print("Average Accuracy: %.8f, Average AUC %.8f, Average Mean Absolute Error %.8f" % (avg_acc, avg_auc, avg_mae))

correctness_only_dataset
Fold 1
epoch 0, MSE: 0.1562
epoch 1, MSE: 0.1547
epoch 2, MSE: 0.1543
Accuracy: 0.78256, Baseline: 0.57233, AUC: 0.86633, MAE: 0.31567
Fold 2
epoch 0, MSE: 0.1554
epoch 1, MSE: 0.1553
epoch 2, MSE: 0.1549
Accuracy: 0.78070, Baseline: 0.56992, AUC: 0.85945, MAE: 0.31160
Fold 3
epoch 0, MSE: 0.1615
epoch 1, MSE: 0.1597
epoch 2, MSE: 0.1594
Accuracy: 0.78266, Baseline: 0.55741, AUC: 0.85880, MAE: 0.31682
Fold 4
epoch 0, MSE: 0.1553
epoch 1, MSE: 0.1533
epoch 2, MSE: 0.1531
Accuracy: 0.77658, Baseline: 0.57242, AUC: 0.85835, MAE: 0.31498
Fold 5
epoch 0, MSE: 0.1485
epoch 1, MSE: 0.1469
epoch 2, MSE: 0.1455
Accuracy: 0.78912, Baseline: 0.57556, AUC: 0.85779, MAE: 0.30812
Fold 6
epoch 0, MSE: 0.1634
epoch 1, MSE: 0.1625
epoch 2, MSE: 0.1610
Accuracy: 0.78519, Baseline: 0.56443, AUC: 0.86620, MAE: 0.31015
Fold 7
epoch 0, MSE: 0.1613
epoch 1, MSE: 0.1621
epoch 2, MSE: 0.1613
Accuracy: 0.78276, Baseline: 0.54830, AUC: 0.87423, MAE: 0.30872
Fold 8
epoch 0, MSE: 0.1660
ep

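## Paired t-tests

Each test pairs, fold by fold, the 15 per-fold scores of the correctness-only model with the scores of one of the bucketed-delay models.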

In [10]:
import scipy.stats as stats

### Exponents 1

In [11]:
# Per-fold evaluation metrics (15 entries per list) for the single-exercise
# 'exponents_1' experiment, one dict per input encoding.
# The leading correctness-only entries match the "Accuracy / Baseline / AUC / MAE"
# lines printed by the training cell.
# NOTE(review): the values appear to be transcribed by hand from that log, which
# is truncated in the saved output — confirm the remaining folds and the two
# bucketed encodings against a full run.
# Position i in every list is assumed to refer to the same fold; the paired
# t-tests depend on that alignment.
exponents_correctness_only = {
    "accuracy": [0.76151, 0.75791, 0.77229, 0.76377, 0.77424, 0.75590, 0.76561, 0.75643, 0.76172,
                 0.76461, 0.76128, 0.76279, 0.77356, 0.76587, 0.77183],
    "baseline": [0.57417, 0.57637, 0.57791, 0.58072, 0.55916, 0.56858, 0.56300, 0.56220, 0.55819,
                 0.55895, 0.56213, 0.58526, 0.57605, 0.56858, 0.55594],
    "AUC": [0.85404, 0.84334, 0.85060, 0.84854, 0.86842, 0.84066, 0.86403, 0.85638, 0.85409, 0.85763,
            0.85411, 0.85080, 0.86670, 0.85546, 0.86077],
    "MAE": [0.34134, 0.33285, 0.32475, 0.33200, 0.33536, 0.33736, 0.33443, 0.33969, 0.33618, 0.33904,
            0.33537, 0.33274, 0.32989, 0.33138, 0.33747]
}
                
# Delay bucketed per exercise. The "baseline" list is identical to the
# correctness-only one.
exponents_by_exercise_buckets = {
    "accuracy": [0.76103, 0.75973, 0.77088, 0.76170, 0.77721, 0.75600, 0.76581, 0.75693, 0.76248,
                 0.76590, 0.76064, 0.76131, 0.77034, 0.76472, 0.77082],
    "baseline": [0.57417, 0.57637, 0.57791, 0.58072, 0.55916, 0.56858, 0.56300, 0.56220, 0.55819,
                 0.55895, 0.56213, 0.58526, 0.57605, 0.56858, 0.55594],
    "AUC": [0.84488, 0.84503, 0.84430, 0.84597, 0.86254, 0.83953, 0.86162, 0.85913, 0.85464, 0.85813,
            0.84724, 0.84358, 0.86618, 0.85402, 0.86211],
    "MAE": [0.34508, 0.33415, 0.32351, 0.33360, 0.33042, 0.33781, 0.33129, 0.33996, 0.33509, 0.33543,
            0.33492, 0.33361, 0.32847, 0.32991, 0.33484]
}

# Delay bucketed in aggregate. The "baseline" list is again identical.
exponents_aggregate_buckets = {
    "accuracy": [0.76103, 0.75932, 0.77135, 0.76358, 0.77641, 0.75562, 0.76531, 0.75703, 0.76191,
                 0.76481, 0.76299, 0.76200, 0.77356, 0.76415, 0.77001],
    "baseline": [0.57417, 0.57637, 0.57791, 0.58072, 0.55916, 0.56858, 0.56300, 0.56220, 0.55819,
                 0.55895, 0.56213, 0.58526, 0.57605, 0.56858, 0.55594],
    "AUC": [0.85411, 0.84036, 0.84517, 0.84598, 0.86143, 0.84116, 0.85588, 0.85783, 0.85593, 0.85266,
            0.85154, 0.84860, 0.86463, 0.85273, 0.85809],
    "MAE": [0.34435, 0.33176, 0.32459, 0.33239, 0.33496, 0.33911, 0.33231, 0.33945, 0.33648, 0.33498,
            0.33345, 0.33477, 0.33059, 0.33031, 0.33492]
}

In [12]:
print("Correctness vs. By Exercise Buckets")
# Paired (dependent-samples) t-test per metric between the correctness-only
# scores and the per-exercise bucketed scores; each result unpacks into
# (t-statistic, p-value) when formatted.
acc_result, auc_result, mae_result = (
    stats.ttest_rel(exponents_correctness_only[metric], exponents_by_exercise_buckets[metric])
    for metric in ("accuracy", "AUC", "MAE")
)
print("\n".join((
    "  Accuracy: t-statistic %.6f, p-value %.6f" % acc_result,
    "  AUC: t-statistic %.6f, p-value %.6f" % auc_result,
    "  MAE: t-statistic %.6f, p-value %.6f" % mae_result,
)))

Correctness vs. By Exercise Buckets
  Accuracy: t-statistic 0.619250, p-value 0.545697
  AUC: t-statistic 2.526285, p-value 0.024204
  MAE: t-statistic 1.344853, p-value 0.200056


In [13]:
print("Correctness vs. Aggregate Buckets")
# Paired (dependent-samples) t-test per metric between the correctness-only
# scores and the aggregate bucketed scores; each result unpacks into
# (t-statistic, p-value) when formatted.
acc_result, auc_result, mae_result = (
    stats.ttest_rel(exponents_correctness_only[metric], exponents_aggregate_buckets[metric])
    for metric in ("accuracy", "AUC", "MAE")
)
print("\n".join((
    "  Accuracy: t-statistic %.6f, p-value %.6f" % acc_result,
    "  AUC: t-statistic %.6f, p-value %.6f" % auc_result,
    "  MAE: t-statistic %.6f, p-value %.6f" % mae_result,
)))

Correctness vs. Aggregate Buckets
  Accuracy: t-statistic 0.054250, p-value 0.957503
  AUC: t-statistic 3.522987, p-value 0.003377
  MAE: t-statistic 0.751307, p-value 0.464911


### Set of Exercises

In [14]:
set_correctness_only = {
    "accuracy": [0.78256, 0.78070, 0.78266, 0.77658, 0.78912, 0.78519, 0.78276, 0.78622, 0.78125,
                 0.78230, 0.78400, 0.77940, 0.78752, 0.78870, 0.78784],
    "baseline": [0.57233, 0.56992, 0.55741, 0.57242, 0.57556, 0.56443, 0.54830, 0.56593, 0.56627,
                 0.56337, 0.55683, 0.56348, 0.55922, 0.56016, 0.56875],
    "AUC": [0.86633, 0.85945, 0.85880, 0.85835, 0.85779, 0.86620, 0.87423, 0.86977, 0.86377, 0.86515,
            0.86847, 0.86652, 0.87112, 0.87156, 0.86590],
    "MAE": [0.31567, 0.31160, 0.31682, 0.31498, 0.30812, 0.31015, 0.30872, 0.30949, 0.31219, 0.31355,
            0.31170, 0.31284, 0.30809, 0.30961, 0.30761]
}
                
set_by_exercise_buckets = {
    "accuracy": [0.78155, 0.78067, 0.78169, 0.77658, 0.78908, 0.78439, 0.78020, 0.78615, 0.78010,
                 0.78076, 0.78215, 0.77707, 0.78495, 0.78900, 0.78590],
    "baseline": [0.57233, 0.56992, 0.55741, 0.57242, 0.57556, 0.56443, 0.54830, 0.56593, 0.56627,
                 0.56337, 0.55683, 0.56348, 0.55922, 0.56016, 0.56875],
    "AUC": [0.86504, 0.86016, 0.86181, 0.85531, 0.86130, 0.86560, 0.87352, 0.86613, 0.86463, 0.86368,
            0.86744, 0.86552, 0.87095, 0.86894, 0.86265],
    "MAE": [0.31458, 0.31454, 0.31696, 0.31196, 0.30835, 0.31242, 0.30814, 0.31287, 0.31175, 0.31186,
            0.31342, 0.31385, 0.30697, 0.30944, 0.30942]
}

set_aggregate_buckets = {
    "accuracy": [0.78141, 0.78036, 0.78108, 0.77579, 0.78791, 0.78500, 0.78127, 0.78681, 0.78125,
                 0.78145, 0.78267, 0.77544, 0.78510, 0.78769, 0.78704],
    "baseline": [0.57233, 0.56992, 0.55741, 0.57242, 0.57556, 0.56443, 0.54830, 0.56593, 0.56627,
                 0.56337, 0.55683, 0.56348, 0.55922, 0.56016, 0.56875],
    "AUC": [0.86264, 0.86419, 0.85521, 0.86143, 0.85141, 0.86181, 0.87285, 0.86787, 0.86405, 0.85965,
            0.86671, 0.86303, 0.86738, 0.86847, 0.86497],
    "MAE": [0.31321, 0.31083, 0.32093, 0.31377, 0.30520, 0.31050, 0.31004, 0.31141, 0.31149, 0.31183,
            0.31221, 0.31348, 0.30487, 0.31309, 0.30817]
}

In [15]:
print("Correctness vs. By Exercise Buckets")
# Paired (dependent-samples) t-test per metric between the correctness-only
# scores and the per-exercise bucketed scores of the multi-exercise runs; each
# result unpacks into (t-statistic, p-value) when formatted.
acc_result, auc_result, mae_result = (
    stats.ttest_rel(set_correctness_only[metric], set_by_exercise_buckets[metric])
    for metric in ("accuracy", "AUC", "MAE")
)
print("\n".join((
    "  Accuracy: t-statistic %.6f, p-value %.6f" % acc_result,
    "  AUC: t-statistic %.6f, p-value %.6f" % auc_result,
    "  MAE: t-statistic %.6f, p-value %.6f" % mae_result,
)))

Correctness vs. By Exercise Buckets
  Accuracy: t-statistic 4.293635, p-value 0.000743
  AUC: t-statistic 1.320286, p-value 0.207919
  MAE: t-statistic -0.771217, p-value 0.453399


In [16]:
print("Correctness vs. Aggregate Buckets")
# Paired (dependent-samples) t-test per metric between the correctness-only
# scores and the aggregate bucketed scores of the multi-exercise runs; each
# result unpacks into (t-statistic, p-value) when formatted.
acc_result, auc_result, mae_result = (
    stats.ttest_rel(set_correctness_only[metric], set_aggregate_buckets[metric])
    for metric in ("accuracy", "AUC", "MAE")
)
print("\n".join((
    "  Accuracy: t-statistic %.6f, p-value %.6f" % acc_result,
    "  AUC: t-statistic %.6f, p-value %.6f" % auc_result,
    "  MAE: t-statistic %.6f, p-value %.6f" % mae_result,
)))

Correctness vs. Aggregate Buckets
  Accuracy: t-statistic 3.979902, p-value 0.001369
  AUC: t-statistic 2.723178, p-value 0.016490
  MAE: t-statistic 0.013082, p-value 0.989747
