In [1]:
from forgetful_dkt.lstm import LSTM
from forgetful_dkt.preprocess import DataProcessor
import tensorflow as tf
import numpy as np

def initialize_sess():
    """Initialize only the variables that are still uninitialized.

    Fetches the names reported by ``tf.report_uninitialized_variables()``
    through the global ``sess``, picks the matching entries of
    ``tf.global_variables()``, and runs an initializer op for just those.
    """
    pending_names = set(sess.run(tf.report_uninitialized_variables()))

    pending_vars = []
    for variable in tf.global_variables():
        # Compare on the ASCII bytes of the name with its ":<suffix>" removed,
        # which is the form the reported names are matched against.
        bare_name = variable.name.partition(':')[0].encode('ascii')
        if bare_name in pending_names:
            pending_vars.append(variable)

    # No session is passed to run(); this relies on a default session being
    # installed (NOTE(review): presumably the global InteractiveSession).
    tf.variables_initializer(pending_vars).run()
    
def reset_sess():
    """Replace the global ``sess`` with a fresh session on an empty default graph.

    The current session is closed *before* the default graph is reset:
    TensorFlow documents ``tf.reset_default_graph()`` as undefined behaviour
    while a ``tf.Session`` or ``tf.InteractiveSession`` is still active.
    A new ``tf.InteractiveSession`` is then created and stored in ``sess``.
    """
    global sess
    # Release the old session first so nothing is active during the reset.
    sess.close()
    tf.reset_default_graph()
    sess = tf.InteractiveSession()

# Notebook-wide session: read by initialize_sess(), replaced by reset_sess(),
# and passed to lstm.train()/lstm.test() in the training cells.
sess = tf.InteractiveSession()

## Exponents 1

In [2]:
# Data source for the single-exercise experiment (only 'exponents_1').
# NOTE(review): the meaning of 'vivek' and of the sequence-length limits is
# defined by forgetful_dkt.preprocess.DataProcessor -- confirm there.
dp = DataProcessor(
    'vivek',
    ['exponents_1'],
    max_seq_length=200,
    min_seq_length=5,
)

In [3]:
# Labels, in the same order as `datasets`; the training cell zips the two lists.
names = ["correctness_only_data", "by_exercise_bucket_data", "aggregate_bucket_data"]

# Three datasets derived from the same DataProcessor, one per input encoding.
correctness_only_data = dp.correctness_only(verbose=False).get_data()

# NOTE(review): the positional arguments ([3, 24, None], [3], False) are passed
# through unchanged; presumably delay-bucket settings -- confirm against
# DataProcessor.bucketed_delay_by_exercise / bucketed_delay_aggregate.
by_exercise_bucket_data = dp.bucketed_delay_by_exercise(
    [3, 24, None], [3], False, verbose=False
).get_data()
aggregate_bucket_data = dp.bucketed_delay_aggregate(
    [3, 24, None], [3], False, verbose=False
).get_data()

datasets = [correctness_only_data, by_exercise_bucket_data, aggregate_bucket_data]

In [4]:
# Per-fold test metrics for each input encoding, keyed by dataset name and then
# by metric ("accuracy", "baseline", "AUC", "MAE" -- the same keys the paired
# t-test cells use), so the numbers exist as data and not only in the printed
# log.
exponents_fold_metrics = {}

for data, name in zip(datasets, names):
    print(name)
    # Input/output widths are read from each dataset, so the model is rebuilt
    # on an empty graph with a fresh session for every encoding.
    reset_sess()
    lstm = LSTM(hidden_dim=200,
                output_dim=data.targets.shape[2],
                input_dim=data.inputs.shape[2],
                learning_rate=1e-2,
                batch_size=64,
                num_layers=1)

    lstm.build_model(tf.sigmoid)

    k = 15
    fold_metrics = {"accuracy": [], "baseline": [], "AUC": [], "MAE": []}
    # NOTE(review): k_fold(k) is assumed to switch `data` to the next
    # train/test split on every iteration and to yield the fold number.
    for fold in data.k_fold(k):
        print("Fold %d" % fold)
        # Re-initialise every variable so no weights carry over between folds.
        tf.global_variables_initializer().run()
        lstm.train(sess, data, epochs=3)
        acc, baseline, auc, mae = lstm.test(sess, data)
        fold_metrics["accuracy"].append(acc)
        fold_metrics["baseline"].append(baseline)
        fold_metrics["AUC"].append(auc)
        fold_metrics["MAE"].append(mae)
    exponents_fold_metrics[name] = fold_metrics

    # Average over the folds that actually ran instead of assuming exactly k
    # (the result is nan if k_fold yielded nothing).
    avg_acc = np.mean(fold_metrics["accuracy"])
    avg_auc = np.mean(fold_metrics["AUC"])
    avg_mae = np.mean(fold_metrics["MAE"])

    print("Average Accuracy: %.8f, Average AUC %.8f, Average Mean Absolute Error %.8f" % (avg_acc, avg_auc, avg_mae))

correctness_only_data
Fold 1
Accuracy: 0.76093, Baseline: 0.57417, AUC: 0.75836, MAE: 0.34303
Fold 2
Accuracy: 0.75852, Baseline: 0.57637, AUC: 0.77281, MAE: 0.33654
Fold 3
Accuracy: 0.77192, Baseline: 0.57791, AUC: 0.78637, MAE: 0.32624
Fold 4
Accuracy: 0.76358, Baseline: 0.58072, AUC: 0.76242, MAE: 0.33388
Fold 5
Accuracy: 0.77582, Baseline: 0.55916, AUC: 0.78394, MAE: 0.33391
Fold 6
Accuracy: 0.75562, Baseline: 0.56858, AUC: 0.75633, MAE: 0.33841
Fold 7
Accuracy: 0.76531, Baseline: 0.56300, AUC: 0.77325, MAE: 0.33378
Fold 8
Accuracy: 0.75653, Baseline: 0.56220, AUC: 0.75976, MAE: 0.34079
Fold 9
Accuracy: 0.76182, Baseline: 0.55819, AUC: 0.77556, MAE: 0.33463
Fold 10
Accuracy: 0.76422, Baseline: 0.55895, AUC: 0.78055, MAE: 0.33587
Fold 11
Accuracy: 0.76171, Baseline: 0.56213, AUC: 0.76662, MAE: 0.33619
Fold 12
Accuracy: 0.76279, Baseline: 0.58526, AUC: 0.76611, MAE: 0.33334
Fold 13
Accuracy: 0.77482, Baseline: 0.57605, AUC: 0.78369, MAE: 0.33193
Fold 14
Accuracy: 0.76558, Baseline: 0

## Set of Exercises

In [5]:
# Data source for the multi-exercise experiment: eight exponent / root /
# scientific-notation exercises handled together.
# NOTE(review): the meaning of 'vivek' and of the sequence-length limits is
# defined by forgetful_dkt.preprocess.DataProcessor -- confirm there.
dpset = DataProcessor(
    'vivek',
    [
        'exponents_1',
        'exponent_rules',
        'positive_and_zero_exponents',
        'properties-of-integer-exponents',
        'square_roots_2',
        'equations-w-square-and-cube-roots',
        'scientific_notation',
        'computing-in-scientific-notation',
    ],
    max_seq_length=200,
    min_seq_length=5,
)

In [6]:
# Labels, in the same order as `datasets2`; the training cell zips the two lists.
names2 = ["correctness_only_dataset", "by_exercise_bucket_dataset", "aggregate_bucket_dataset"]

# Three datasets derived from the same DataProcessor, one per input encoding.
correctness_only_dataset = dpset.correctness_only(verbose=False).get_data()

# NOTE(review): the positional arguments ([3, 24, None], [3], False) are passed
# through unchanged; presumably delay-bucket settings -- confirm against
# DataProcessor.bucketed_delay_by_exercise / bucketed_delay_aggregate.
by_exercise_bucket_dataset = dpset.bucketed_delay_by_exercise(
    [3, 24, None], [3], False, verbose=False
).get_data()
aggregate_bucket_dataset = dpset.bucketed_delay_aggregate(
    [3, 24, None], [3], False, verbose=False
).get_data()

datasets2 = [correctness_only_dataset, by_exercise_bucket_dataset, aggregate_bucket_dataset]

In [7]:
# Per-fold test metrics for each input encoding, keyed by dataset name and then
# by metric ("accuracy", "baseline", "AUC", "MAE" -- the same keys the paired
# t-test cells use), so the numbers exist as data and not only in the printed
# log.
set_fold_metrics = {}

for data, name in zip(datasets2, names2):
    print(name)
    # Input/output widths are read from each dataset, so the model is rebuilt
    # on an empty graph with a fresh session for every encoding.
    reset_sess()
    lstm = LSTM(hidden_dim=200,
                output_dim=data.targets.shape[2],
                input_dim=data.inputs.shape[2],
                learning_rate=1e-2,
                batch_size=64,
                num_layers=1)

    lstm.build_model(tf.sigmoid)

    k = 15
    fold_metrics = {"accuracy": [], "baseline": [], "AUC": [], "MAE": []}
    # NOTE(review): k_fold(k) is assumed to switch `data` to the next
    # train/test split on every iteration and to yield the fold number.
    for fold in data.k_fold(k):
        print("Fold %d" % fold)
        # Re-initialise every variable so no weights carry over between folds.
        tf.global_variables_initializer().run()
        lstm.train(sess, data, epochs=3)
        acc, baseline, auc, mae = lstm.test(sess, data)
        fold_metrics["accuracy"].append(acc)
        fold_metrics["baseline"].append(baseline)
        fold_metrics["AUC"].append(auc)
        fold_metrics["MAE"].append(mae)
    set_fold_metrics[name] = fold_metrics

    # Average over the folds that actually ran instead of assuming exactly k
    # (the result is nan if k_fold yielded nothing).
    avg_acc = np.mean(fold_metrics["accuracy"])
    avg_auc = np.mean(fold_metrics["AUC"])
    avg_mae = np.mean(fold_metrics["MAE"])

    print("Average Accuracy: %.8f, Average AUC %.8f, Average Mean Absolute Error %.8f" % (avg_acc, avg_auc, avg_mae))

correctness_only_dataset
Fold 1
Accuracy: 0.78245, Baseline: 0.57233, AUC: 0.69419, MAE: 0.31473
Fold 2
Accuracy: 0.78136, Baseline: 0.56992, AUC: 0.72698, MAE: 0.31153
Fold 3
Accuracy: 0.78298, Baseline: 0.55741, AUC: 0.72605, MAE: 0.31529
Fold 4
Accuracy: 0.77683, Baseline: 0.57242, AUC: 0.70708, MAE: 0.31386
Fold 5
Accuracy: 0.78885, Baseline: 0.57556, AUC: 0.72499, MAE: 0.30894
Fold 6
Accuracy: 0.78542, Baseline: 0.56443, AUC: 0.71268, MAE: 0.30986
Fold 7
Accuracy: 0.78150, Baseline: 0.54830, AUC: 0.74004, MAE: 0.30959
Fold 8
Accuracy: 0.78709, Baseline: 0.56593, AUC: 0.71163, MAE: 0.30982
Fold 9
Accuracy: 0.78221, Baseline: 0.56627, AUC: 0.73574, MAE: 0.31262
Fold 10
Accuracy: 0.78188, Baseline: 0.56337, AUC: 0.73386, MAE: 0.31541
Fold 11
Accuracy: 0.78336, Baseline: 0.55683, AUC: 0.73884, MAE: 0.31113
Fold 12
Accuracy: 0.77961, Baseline: 0.56348, AUC: 0.71624, MAE: 0.31256
Fold 13
Accuracy: 0.78714, Baseline: 0.55922, AUC: 0.73611, MAE: 0.30679
Fold 14
Accuracy: 0.78919, Baseline

## Paired T-tests

In [8]:
import scipy.stats as stats

### Exponents 1

In [9]:
# Per-fold results (15 folds) of the Exponents 1 runs, one dict per input
# encoding with the keys "accuracy", "baseline", "AUC" and "MAE".

# The recorded baseline values are the same for all three encodings, so they
# are written once and copied into each dict as an independent list.
_exponents_baseline = (
    0.57417, 0.57637, 0.57791, 0.58072, 0.55916,
    0.56858, 0.56300, 0.56220, 0.55819, 0.55895,
    0.56213, 0.58526, 0.57605, 0.56858, 0.55594,
)

exponents_correctness_only = dict(
    accuracy=[
        0.76093, 0.75852, 0.77192, 0.76358, 0.77582,
        0.75562, 0.76531, 0.75653, 0.76182, 0.76422,
        0.76171, 0.76279, 0.77482, 0.76558, 0.77162,
    ],
    baseline=list(_exponents_baseline),
    AUC=[
        0.75836, 0.77281, 0.78637, 0.76242, 0.78394,
        0.75633, 0.77325, 0.75976, 0.77556, 0.78055,
        0.76662, 0.76611, 0.78369, 0.76289, 0.78314,
    ],
    MAE=[
        0.34303, 0.33654, 0.32624, 0.33388, 0.33391,
        0.33841, 0.33378, 0.34079, 0.33463, 0.33587,
        0.33619, 0.33334, 0.33193, 0.33068, 0.33813,
    ],
)

exponents_by_exercise_buckets = dict(
    accuracy=[
        0.76025, 0.75912, 0.77107, 0.76236, 0.77671,
        0.75496, 0.76541, 0.75734, 0.76267, 0.76669,
        0.76107, 0.76141, 0.77141, 0.76444, 0.77102,
    ],
    baseline=list(_exponents_baseline),
    AUC=[
        0.76411, 0.77316, 0.77929, 0.76838, 0.77813,
        0.74481, 0.76638, 0.75166, 0.77272, 0.77525,
        0.74442, 0.74210, 0.77540, 0.76663, 0.78213,
    ],
    MAE=[
        0.34056, 0.33498, 0.32448, 0.33330, 0.33317,
        0.33784, 0.32922, 0.33994, 0.33439, 0.33601,
        0.33525, 0.33295, 0.33019, 0.33281, 0.33490,
    ],
)

exponents_aggregate_buckets = dict(
    accuracy=[
        0.76122, 0.75953, 0.77126, 0.76348, 0.77661,
        0.75666, 0.76510, 0.75643, 0.76286, 0.76698,
        0.76213, 0.76250, 0.77326, 0.76463, 0.77021,
    ],
    baseline=list(_exponents_baseline),
    AUC=[
        0.76910, 0.76191, 0.78289, 0.76930, 0.78026,
        0.76252, 0.77653, 0.75838, 0.77471, 0.77942,
        0.76769, 0.76159, 0.77610, 0.77427, 0.78214,
    ],
    MAE=[
        0.34069, 0.33182, 0.32272, 0.33236, 0.33428,
        0.33870, 0.33150, 0.33856, 0.32987, 0.33491,
        0.33428, 0.33403, 0.33017, 0.32985, 0.33684,
    ],
)

In [10]:
# Paired t-tests (scipy.stats.ttest_rel) on the per-fold values of each metric:
# correctness-only input vs. by-exercise delay buckets, Exponents 1.
print("Correctness vs. By Exercise Buckets")
acc_result, auc_result, mae_result = [
    stats.ttest_rel(exponents_correctness_only[metric], exponents_by_exercise_buckets[metric])
    for metric in ("accuracy", "AUC", "MAE")
]
for line_format, test_result in (
    ("  Accuracy: t-statistic %.6f, p-value %.6f", acc_result),
    ("  AUC: t-statistic %.6f, p-value %.6f", auc_result),
    ("  MAE: t-statistic %.6f, p-value %.6f", mae_result),
):
    # Each result is a (t-statistic, p-value) pair that fills both placeholders.
    print(line_format % test_result)

Correctness vs. By Exercise Buckets
  Accuracy: t-statistic 0.924770, p-value 0.370752
  AUC: t-statistic 2.564271, p-value 0.022486
  MAE: t-statistic 2.903022, p-value 0.011573


In [11]:
print("Correctness vs. Aggregate Buckets")
acc_result = stats.ttest_rel(exponents_correctness_only["accuracy"], exponents_aggregate_buckets["accuracy"])
auc_result = stats.ttest_rel(exponents_correctness_only["AUC"], exponents_aggregate_buckets["AUC"])
mae_result = stats.ttest_rel(exponents_correctness_only["MAE"], exponents_aggregate_buckets["MAE"])
print("  Accuracy: t-statistic %.6f, p-value %.6f" % acc_result)
print("  AUC: t-statistic %.6f, p-value %.6f" % auc_result)
print("  MAE: t-statistic %.6f, p-value %.6f" % mae_result)

Correctness vs. Aggregate Buckets
  Accuracy: t-statistic -0.480692, p-value 0.638160
  AUC: t-statistic -0.203062, p-value 0.842008
  MAE: t-statistic 4.183018, p-value 0.000920


### Set of Exercises

In [12]:
# Per-fold results (15 folds) of the exercise-set runs, one dict per input
# encoding with the keys "accuracy", "baseline", "AUC" and "MAE".

# The recorded baseline values are the same for all three encodings, so they
# are written once and copied into each dict as an independent list.
_set_baseline = (
    0.57233, 0.56992, 0.55741, 0.57242, 0.57556,
    0.56443, 0.54830, 0.56593, 0.56627, 0.56337,
    0.55683, 0.56348, 0.55922, 0.56016, 0.56875,
)

set_correctness_only = dict(
    accuracy=[
        0.78245, 0.78136, 0.78298, 0.77683, 0.78885,
        0.78542, 0.78150, 0.78709, 0.78221, 0.78188,
        0.78336, 0.77961, 0.78714, 0.78919, 0.78765,
    ],
    baseline=list(_set_baseline),
    AUC=[
        0.69419, 0.72698, 0.72605, 0.70708, 0.72499,
        0.71268, 0.74004, 0.71163, 0.73574, 0.73386,
        0.73884, 0.71624, 0.73611, 0.71763, 0.73092,
    ],
    MAE=[
        0.31473, 0.31153, 0.31529, 0.31386, 0.30894,
        0.30986, 0.30959, 0.30982, 0.31262, 0.31541,
        0.31113, 0.31256, 0.30679, 0.30951, 0.30757,
    ],
)

set_by_exercise_buckets = dict(
    accuracy=[
        0.78204, 0.77955, 0.78059, 0.77588, 0.78881,
        0.78328, 0.78012, 0.78579, 0.77984, 0.78149,
        0.78239, 0.77718, 0.78480, 0.78837, 0.78495,
    ],
    baseline=list(_set_baseline),
    AUC=[
        0.72447, 0.73438, 0.74069, 0.71960, 0.73405,
        0.72007, 0.75334, 0.73242, 0.75408, 0.74004,
        0.74585, 0.73217, 0.75147, 0.72926, 0.74194,
    ],
    MAE=[
        0.31343, 0.31382, 0.31762, 0.31365, 0.30609,
        0.31093, 0.30833, 0.31124, 0.31001, 0.31271,
        0.31256, 0.31331, 0.30791, 0.31008, 0.30825,
    ],
)

set_aggregate_buckets = dict(
    accuracy=[
        0.78234, 0.78036, 0.78120, 0.77567, 0.78806,
        0.78615, 0.78127, 0.78548, 0.78121, 0.78265,
        0.78311, 0.77758, 0.78650, 0.78826, 0.78750,
    ],
    baseline=list(_set_baseline),
    AUC=[
        0.73766, 0.74924, 0.74195, 0.73659, 0.74667,
        0.74016, 0.76195, 0.73968, 0.75824, 0.75088,
        0.75867, 0.74941, 0.75404, 0.74520, 0.75279,
    ],
    MAE=[
        0.31355, 0.31021, 0.31867, 0.31465, 0.30893,
        0.31032, 0.30988, 0.30879, 0.31059, 0.31202,
        0.31184, 0.31538, 0.30507, 0.31159, 0.30754,
    ],
)


In [13]:
# Paired t-tests (scipy.stats.ttest_rel) on the per-fold values of each metric:
# correctness-only input vs. by-exercise delay buckets, exercise set.
print("Correctness vs. By Exercise Buckets")
acc_result, auc_result, mae_result = [
    stats.ttest_rel(set_correctness_only[metric], set_by_exercise_buckets[metric])
    for metric in ("accuracy", "AUC", "MAE")
]
for line_format, test_result in (
    ("  Accuracy: t-statistic %.6f, p-value %.6f", acc_result),
    ("  AUC: t-statistic %.6f, p-value %.6f", auc_result),
    ("  MAE: t-statistic %.6f, p-value %.6f", mae_result),
):
    # Each result is a (t-statistic, p-value) pair that fills both placeholders.
    print(line_format % test_result)

Correctness vs. By Exercise Buckets
  Accuracy: t-statistic 6.622248, p-value 0.000011
  AUC: t-statistic -8.132941, p-value 0.000001
  MAE: t-statistic -0.106327, p-value 0.916831


In [14]:
print("Correctness vs. Aggregate Buckets")
acc_result = stats.ttest_rel(set_correctness_only["accuracy"], set_aggregate_buckets["accuracy"])
auc_result = stats.ttest_rel(set_correctness_only["AUC"], set_aggregate_buckets["AUC"])
mae_result = stats.ttest_rel(set_correctness_only["MAE"], set_aggregate_buckets["MAE"])
print("  Accuracy: t-statistic %.6f, p-value %.6f" % acc_result)
print("  AUC: t-statistic %.6f, p-value %.6f" % auc_result)
print("  MAE: t-statistic %.6f, p-value %.6f" % mae_result)

Correctness vs. Aggregate Buckets
  Accuracy: t-statistic 3.192083, p-value 0.006522
  AUC: t-statistic -13.363643, p-value 0.000000
  MAE: t-statistic 0.025128, p-value 0.980308
