In [1]:
from forgetful_dkt.lstm import LSTM
from forgetful_dkt.preprocess import DataProcessor
import tensorflow as tf
import numpy as np

def initialize_sess():
    global sess
    ruv = set(sess.run(tf.report_uninitialized_variables()))
    uv = [v for v in tf.global_variables() if v.name.split(':')[0].encode('ascii') in ruv]
    tf.variables_initializer(uv).run()
    
def reset_sess():
    global sess
    tf.reset_default_graph()
    sess.close()
    sess = tf.InteractiveSession()    

sess = tf.InteractiveSession()

## Exponents 1

In [2]:
dp = DataProcessor('vivek', ['exponents_1'], max_seq_length=200, min_seq_length=5)

In [3]:
correctness_only_data = dp.correctness_only(verbose=False).get_data()
by_exercise_bucket_data = dp.bucketed_delay_by_exercise([3, 24, None], [3], False, verbose=False).get_data()
aggregate_bucket_data = dp.bucketed_delay_aggregate([3, 24, None], [3], False, verbose=False).get_data()

datasets = [correctness_only_data, by_exercise_bucket_data, aggregate_bucket_data]
names = ["correctness_only_data", "by_exercise_bucket_data", "aggregate_bucket_data"]

In [4]:
scores = {}
for data, name in zip(datasets, names):
    print(name)
    reset_sess()
    lstm = LSTM(hidden_dim=200,
                output_dim=data.targets.shape[2],
                input_dim=data.inputs.shape[2],
                learning_rate=1e-2,
                batch_size=64,
                num_layers=1)

    lstm.build_model(tf.sigmoid)

    k = 15
    scores[name] = {
        "accuracy": [],
        "baseline": [], # should be identical for all groups
        "AUC": [],
        "MAE": [],
        "CE": []
    }
    for fold in data.k_fold(k):
        print("Fold %d" % fold)
        tf.global_variables_initializer().run()
        lstm.train(sess, data, epochs=3)
        acc, baseline, auc, mae, ce = lstm.test(sess, data)
        scores[name]["accuracy"].append(acc)
        scores[name]["baseline"].append(baseline)
        scores[name]["AUC"].append(auc)
        scores[name]["MAE"].append(mae)
        scores[name]["CE"].append(ce)
        
    
    print("Average Accuracy: %.8f, Average AUC %.8f, Average MAE %.8f, Average CE: %.8f" % (np.mean(scores[name]["accuracy"]),
                                                                                            np.mean(scores[name]["AUC"]),
                                                                                            np.mean(scores[name]["MAE"]),
                                                                                            np.mean(scores[name]["CE"])))


correctness_only_data
Fold 1
Accuracy: 0.761896, Baseline: 0.574172, AUC: 0.815890, MAE: 0.345147, CE: 5.785039
Fold 2
Accuracy: 0.757606, Baseline: 0.576367, AUC: 0.820316, MAE: 0.334874, CE: 6.062336
Fold 3
Accuracy: 0.773418, Baseline: 0.577905, AUC: 0.834413, MAE: 0.324727, CE: 5.600109
Fold 4
Accuracy: 0.763578, Baseline: 0.580718, AUC: 0.821398, MAE: 0.333564, CE: 5.622003
Fold 5
Accuracy: 0.777008, Baseline: 0.559161, AUC: 0.839170, MAE: 0.335534, CE: 5.911434
Fold 6
Accuracy: 0.756187, Baseline: 0.568581, AUC: 0.814510, MAE: 0.339062, CE: 5.660040
Fold 7
Accuracy: 0.764700, Baseline: 0.563000, AUC: 0.831110, MAE: 0.330790, CE: 6.060083
Fold 8
Accuracy: 0.757235, Baseline: 0.562199, AUC: 0.818641, MAE: 0.338212, CE: 6.030465
Fold 9
Accuracy: 0.761724, Baseline: 0.558193, AUC: 0.826868, MAE: 0.334995, CE: 5.677088
Fold 10
Accuracy: 0.764909, Baseline: 0.558945, AUC: 0.824199, MAE: 0.336777, CE: 5.918892
Fold 11
Accuracy: 0.761600, Baseline: 0.562133, AUC: 0.821619, MAE: 0.336787,

## Set of Exercises

In [5]:
dpset = DataProcessor('vivek', ['exponents_1', 'exponent_rules', 'positive_and_zero_exponents',
                                'properties-of-integer-exponents', 'square_roots_2',
                                'equations-w-square-and-cube-roots', 'scientific_notation',
                                'computing-in-scientific-notation'], max_seq_length=200, min_seq_length=5)

In [6]:
correctness_only_dataset = dpset.correctness_only(verbose=False).get_data()
by_exercise_bucket_dataset = dpset.bucketed_delay_by_exercise([3, 24, None], [3], False, verbose=False).get_data()
aggregate_bucket_dataset = dpset.bucketed_delay_aggregate([3, 24, None], [3], False, verbose=False).get_data()

datasets2 = [correctness_only_dataset, by_exercise_bucket_dataset, aggregate_bucket_dataset]
names2 = ["correctness_only_dataset", "by_exercise_bucket_dataset", "aggregate_bucket_dataset"]

In [None]:
scores2 = {}
for data, name in zip(datasets2, names2):
    print(name)
    reset_sess()
    lstm = LSTM(hidden_dim=200,
                output_dim=data.targets.shape[2],
                input_dim=data.inputs.shape[2],
                learning_rate=1e-2,
                batch_size=64,
                num_layers=1)

    lstm.build_model(tf.sigmoid)

    k = 15
    scores2[name] = {
        "accuracy": [],
        "baseline": [], # should be identical for all groups
        "AUC": [],
        "MAE": [],
        "CE": []
    }
    for fold in data.k_fold(k):
        print("Fold %d" % fold)
        tf.global_variables_initializer().run()
        lstm.train(sess, data, epochs=3)
        acc, baseline, auc, mae, ce = lstm.test(sess, data)
        scores2[name]["accuracy"].append(acc)
        scores2[name]["baseline"].append(baseline)
        scores2[name]["AUC"].append(auc)
        scores2[name]["MAE"].append(mae)
        scores2[name]["CE"].append(ce)
        
    
    print("Average Accuracy: %.8f, Average AUC %.8f, Average MAE %.8f, Average CE: %.8f" % (np.mean(scores2[name]["accuracy"]),
                                                                                            np.mean(scores2[name]["AUC"]),
                                                                                            np.mean(scores2[name]["MAE"]),
                                                                                            np.mean(scores2[name]["CE"])))



correctness_only_dataset
Fold 1
Accuracy: 0.783637, Baseline: 0.572332, AUC: 0.845052, MAE: 0.315468, CE: 4.383525
Fold 2
Accuracy: 0.781705, Baseline: 0.569916, AUC: 0.845828, MAE: 0.313077, CE: 4.545111
Fold 3
Accuracy: 0.783796, Baseline: 0.557408, AUC: 0.847991, MAE: 0.316302, CE: 4.800210
Fold 4
Accuracy: 0.777034, Baseline: 0.572421, AUC: 0.839978, MAE: 0.314568, CE: 4.902726
Fold 5
Accuracy: 0.789529, Baseline: 0.575560, AUC: 0.852526, MAE: 0.306917, CE: 4.424143
Fold 6
Accuracy: 0.784389, Baseline: 0.564427, AUC: 0.847234, MAE: 0.309965, CE: 4.501824
Fold 7
Accuracy: 0.781574, Baseline: 0.548303, AUC: 0.847316, MAE: 0.309996, CE: 4.516862
Fold 8
Accuracy: 0.785989, Baseline: 0.565934, AUC: 0.848407, MAE: 0.310019, CE: 4.634341
Fold 9
Accuracy: 0.781063, Baseline: 0.566270, AUC: 0.846575, MAE: 0.316068, CE: 4.359683
Fold 10
Accuracy: 0.781143, Baseline: 0.563366, AUC: 0.843649, MAE: 0.316516, CE: 4.558907
Fold 11
Accuracy: 0.783114, Baseline: 0.556832, AUC: 0.848705, MAE: 0.3153

Fold 14  
Accuracy: 0.786946, Baseline: 0.560165, AUC: 0.850685, MAE: 0.314490, CE: 4.405485  
Fold 15  
Accuracy: 0.787081, Baseline: 0.568750, AUC: 0.850862, MAE: 0.309178, CE: 4.489941  

###### Error prevented all output from printing properly

## Paired T-tests

In [8]:
import scipy.stats as stats

### Exponents 1

In [9]:
exponents_correctness_only = scores["correctness_only_data"]                
exponents_by_exercise_buckets = scores["by_exercise_bucket_data"]
exponents_aggregate_buckets = scores["aggregate_bucket_data"]

In [10]:
print("Correctness vs. By Exercise Buckets")
acc_result = stats.ttest_rel(exponents_correctness_only["accuracy"], exponents_by_exercise_buckets["accuracy"])
auc_result = stats.ttest_rel(exponents_correctness_only["AUC"], exponents_by_exercise_buckets["AUC"])
mae_result = stats.ttest_rel(exponents_correctness_only["MAE"], exponents_by_exercise_buckets["MAE"])
print("  Accuracy: t-statistic %.6f, p-value %.6f" % acc_result)
print("  AUC: t-statistic %.6f, p-value %.6f" % auc_result)
print("  MAE: t-statistic %.6f, p-value %.6f" % mae_result)

Correctness vs. By Exercise Buckets
  Accuracy: t-statistic 1.362737, p-value 0.194485
  AUC: t-statistic -4.503897, p-value 0.000495
  MAE: t-statistic 1.650350, p-value 0.121118


In [11]:
print("Correctness vs. Aggregate Buckets")
acc_result = stats.ttest_rel(exponents_correctness_only["accuracy"], exponents_aggregate_buckets["accuracy"])
auc_result = stats.ttest_rel(exponents_correctness_only["AUC"], exponents_aggregate_buckets["AUC"])
mae_result = stats.ttest_rel(exponents_correctness_only["MAE"], exponents_aggregate_buckets["MAE"])
print("  Accuracy: t-statistic %.6f, p-value %.6f" % acc_result)
print("  AUC: t-statistic %.6f, p-value %.6f" % auc_result)
print("  MAE: t-statistic %.6f, p-value %.6f" % mae_result)

Correctness vs. Aggregate Buckets
  Accuracy: t-statistic 1.398514, p-value 0.183719
  AUC: t-statistic -11.543629, p-value 0.000000
  MAE: t-statistic 0.802218, p-value 0.435834


### Set of Exercises

In [None]:
names2 = ["correctness_only_dataset", "by_exercise_bucket_dataset", "aggregate_bucket_dataset"]
set_correctness_only = scores2["correctness_only_dataset"]
set_by_exercise_buckets = scores2["by_exercise_bucket_dataset"]
set_aggregate_buckets = scores2["aggregate_bucket_dataset"]

In [21]:
print("Correctness vs. By Exercise Buckets")
acc_result = stats.ttest_rel(set_correctness_only["accuracy"], set_by_exercise_buckets["accuracy"])
auc_result = stats.ttest_rel(set_correctness_only["AUC"], set_by_exercise_buckets["AUC"])
mae_result = stats.ttest_rel(set_correctness_only["MAE"], set_by_exercise_buckets["MAE"])
print("  Accuracy: t-statistic %.6f, p-value %.6f" % acc_result)
print("  AUC: t-statistic %.6f, p-value %.6f" % auc_result)
print("  MAE: t-statistic %.6f, p-value %.6f" % mae_result)

Correctness vs. By Exercise Buckets
  Accuracy: t-statistic 4.721367, p-value 0.000328
  AUC: t-statistic 1.961256, p-value 0.070051
  MAE: t-statistic -0.777902, p-value 0.449574


In [22]:
print("Correctness vs. Aggregate Buckets")
acc_result = stats.ttest_rel(set_correctness_only["accuracy"], set_aggregate_buckets["accuracy"])
auc_result = stats.ttest_rel(set_correctness_only["AUC"], set_aggregate_buckets["AUC"])
mae_result = stats.ttest_rel(set_correctness_only["MAE"], set_aggregate_buckets["MAE"])
print("  Accuracy: t-statistic %.6f, p-value %.6f" % acc_result)
print("  AUC: t-statistic %.6f, p-value %.6f" % auc_result)
print("  MAE: t-statistic %.6f, p-value %.6f" % mae_result)

Correctness vs. Aggregate Buckets
  Accuracy: t-statistic 2.565743, p-value 0.022422
  AUC: t-statistic 1.076089, p-value 0.300082
  MAE: t-statistic -0.599214, p-value 0.558603


## Training Data Analysis

In [51]:
import pandas as pd

In [52]:
dpsetdf = dpset.raw_data.copy()
dpdf = dp.raw_data.copy()

In [53]:
dpdf['datetime'] = pd.to_datetime(dpdf['date'] * 10**9)
dpdf['_count'] = 1

dpsetdf['datetime'] = pd.to_datetime(dpdf['date'] * 10**9)
dpsetdf['_count'] = 1

In [54]:
dpdf.groupby(['uid']).sum()['_count'].describe()

count    6917.000000
mean       23.736736
std        24.754204
min         5.000000
25%         8.000000
50%        15.000000
75%        29.000000
max       200.000000
Name: _count, dtype: float64

In [55]:
sorted_df = dpdf.sort_values('datetime', ascending=True)
sorted_df.groupby('uid')['datetime'].diff().describe()

count                     157270
mean      2 days 00:19:29.261219
std      24 days 04:48:20.601453
min              0 days 00:00:00
25%              0 days 00:00:00
50%              0 days 00:00:08
75%              0 days 00:00:33
max           1368 days 19:00:16
Name: datetime, dtype: object

In [56]:
dpsetdf.groupby(['uid']).sum()['_count'].describe()

count    13442.000000
mean        30.066136
std         32.034656
min          5.000000
25%          9.000000
50%         19.000000
75%         37.000000
max        200.000000
Name: _count, dtype: float64

In [57]:
sorteddf = dpsetdf.sort_values('datetime', ascending=True)
# sorteddf.groupby('uid')['datetime'].diff().describe()
sorteddf['datetime'].diff().describe()

count                    256763
mean     0 days 00:08:48.275035
std      0 days 01:29:35.419536
min             0 days 00:00:00
25%             0 days 00:00:00
50%             0 days 00:00:00
75%             0 days 00:00:13
max             3 days 17:34:09
Name: datetime, dtype: object