In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from data import InMemoryExerciseData
from lstm import LSTM

def initialize_sess():
    """
    Initialize only the global variables that TensorFlow still reports as uninitialized.

    Uses the module-level `sess` to query the uninitialized names, then runs a single
    initializer op (in the default session) for the matching variables.
    """
    global sess
    pending_names = set(sess.run(tf.report_uninitialized_variables()))

    pending_vars = []
    for variable in tf.global_variables():
        # Variable names look like 'scope/name:0'; the reported names are matched against
        # the part before ':' encoded as ASCII bytes.
        bare_name = variable.name.split(':')[0].encode('ascii')
        if bare_name in pending_names:
            pending_vars.append(variable)

    tf.variables_initializer(pending_vars).run()
    
def reset_sess():
    """
    Discard the current graph and session and start over with a fresh interactive session.

    Rebinds the module-level `sess`, so callers must read `sess` again after calling this.
    Any tensors, ops or models built before the call belong to the old graph and must be
    rebuilt.
    """
    global sess
    # Close the session *before* resetting the graph: TensorFlow documents calling
    # reset_default_graph() while a Session/InteractiveSession is active as undefined behavior.
    sess.close()
    tf.reset_default_graph()
    sess = tf.InteractiveSession()

# Module-level session shared by initialize_sess/reset_sess and the training cells.
sess = tf.InteractiveSession()

# Preprocessing thresholds used by the filtering cells and the encoders below.
min_seq_length = 5  # students with fewer retained responses than this are dropped
max_seq_length = 3000  # students with more responses are dropped; also the time dimension of the encoded arrays
min_correct = 2  # minimum summed 'Corrects' a student needs to be kept
min_responses_for_skill = 16800  # skills (KCs) with fewer responses than this are dropped

In [2]:
# Load only the columns used by the rest of the notebook from the tab-separated training file.
# NOTE(review): rows are split on '\r' only - confirm this matches the file's actual line endings.
df = pd.read_csv('data/bridge_to_algebra_2006_2007_train.txt', sep='\t', lineterminator='\r',
                 usecols=['Anon Student Id', 'KC(SubSkills)', 'Correct First Attempt', 'Corrects',
                          'Incorrects', 'Step Start Time','Problem Name', 'Step Name'])


In [3]:
# The parsed datetimes are cast to int64 nanoseconds; floor-dividing by 10**9 converts them
# to whole seconds, which is the unit the new-day encoders compare against.
df['date'] = pd.to_datetime(df['Step Start Time']).astype(np.int64) // 10 ** 9
# The raw timestamp column is no longer needed once 'date' exists.
# Note: this makes the cell non-idempotent - re-running it needs a fresh `df`.
df.drop(['Step Start Time'], axis=1, inplace=True)

In [4]:
# Helper column: summing it per group yields the number of responses in that group.
df['_count'] = 1
# numeric_only=True keeps the aggregation to numeric columns explicitly. Without it, recent
# pandas versions try to "sum" the text columns (student id, problem/step names), which is
# slow and can raise when those columns contain missing values.
grouped_df = df.groupby('KC(SubSkills)').sum(numeric_only=True)

In [5]:
# Keep every skill with at least `min_responses_for_skill` responses, most-practised first.
# A boolean filter avoids the off-by-one of the former "scan until the first small count"
# loop, which dropped the last skill when every skill met the threshold and left the cut-off
# index undefined for an empty frame.
skill_counts = grouped_df['_count']
chosen_skill_names = skill_counts[skill_counts >= min_responses_for_skill].sort_values(ascending=False).index
print("Num KCs: %d" % (len(chosen_skill_names)))
df = df[df['KC(SubSkills)'].isin(chosen_skill_names)]

Num KCs: 12


In [6]:
# Per-student totals. numeric_only=True restricts the sum to numeric columns explicitly;
# without it, recent pandas versions try to "sum" the text columns, which is slow and can
# raise when they contain missing values.
grouped_df = df.groupby('Anon Student Id').sum(numeric_only=True)
# Keep students whose number of retained responses lies in [min_seq_length, max_seq_length]
# and whose summed 'Corrects' reaches min_correct.
filtered_uids = grouped_df[(grouped_df['_count'] >= min_seq_length) &
                           (grouped_df['_count'] <= max_seq_length) &
                           (grouped_df['Corrects'] >= min_correct)].reset_index()['Anon Student Id']
print("Num Students: %d" % (len(filtered_uids)))
df = df[df['Anon Student Id'].isin(filtered_uids)]

Num Students: 1100


In [7]:
def new_day(df, new_day_threshold=5, verbose=True):
    """
    Processes data to include correctness of the previous question and whether it has been a day since
    the last response to any skill.

    Every skill owns three consecutive input slots: [is current skill, correct on first attempt,
    new day]. The new-day slot of *every* skill is raised together when more than
    `new_day_threshold` hours separate a response from the student's previous response.
    Ex. For exercises A and B:
        [0, 0, 1, 1, 0, 1]
        Means that the current response is for exercise B, is incorrect and it has been a day since the
        previous response to an exercise.

    Args:
        df: Frame with 'Anon Student Id', 'KC(SubSkills)', 'Correct First Attempt' and 'date'
            columns; 'date' is compared in seconds. Rows are consumed in frame order, nothing
            is sorted here.
        new_day_threshold: Gap, in hours, after which a response counts as a new day.
        verbose: If true, print a progress line every 100 students.

    Returns:
        InMemoryExerciseData built from (sequences, mask, corrects), where sequences has shape
        (students, max_seq_length, 3 * skills) and mask/corrects have shape
        (students, max_seq_length, skills).

    Note:
        Uses the module-level `max_seq_length`; a student with more rows than that raises
        IndexError.
    """
    num_students = df['Anon Student Id'].nunique()
    num_skills = df['KC(SubSkills)'].nunique()
    # Honor the caller's threshold (hours) instead of overwriting it with a fixed 5 hours.
    threshold_seconds = new_day_threshold * 60 * 60

    corrects = np.zeros((num_students, max_seq_length, num_skills))
    mask = np.zeros((num_students, max_seq_length, num_skills))
    sequences = np.zeros((num_students, max_seq_length, 3 * num_skills))

    # Skill k starts at input slot 3k.
    exercise_index_map = {}
    for idx, exercise in enumerate(df['KC(SubSkills)'].unique()):
        exercise_index_map[exercise] = idx * 3

    # Loop-invariant vector that raises the third slot of every skill at once.
    new_day_flags = np.array([0, 0, 1] * num_skills)

    for row_idx, uid in enumerate(df['Anon Student Id'].unique()):
        uid_df = df[df['Anon Student Id'] == uid]
        col_idx = 0
        # Starting from 0 means a student's first response is flagged as a new day whenever
        # its timestamp exceeds the threshold.
        prev_date = 0.0

        for _, event in uid_df.iterrows():
            idx = exercise_index_map[event['KC(SubSkills)']]
            sequences[row_idx, col_idx, idx] = 1
            sequences[row_idx, col_idx, idx + 1] = event['Correct First Attempt']
            if event['date'] - prev_date > threshold_seconds:
                sequences[row_idx, col_idx] += new_day_flags

            prev_date = event['date']
            # Targets and mask are indexed per skill, hence idx // 3.
            corrects[row_idx, col_idx, idx // 3] = event['Correct First Attempt']
            mask[row_idx, col_idx, idx // 3] = 1
            col_idx += 1

        if verbose and row_idx % 100 == 0:
            print("Processed %d" % row_idx)

    return InMemoryExerciseData(sequences, mask, corrects)

def correctness_only(df, verbose=True):
    """
    Encode each response as a per-skill (attempted, correct) pair and nothing else.

    Skill k owns input slots 2k and 2k + 1: the first marks the skill of the current response,
    the second holds its 'Correct First Attempt' value.
    Ex. For exercises A and B:
        [0, 0, 1, 0]
        Means that the current response is for exercise B and is incorrect.

    Returns InMemoryExerciseData(sequences, mask, corrects). The time dimension of every array
    is the module-level `max_seq_length`.
    """
    student_ids = df['Anon Student Id']
    skill_names = df['KC(SubSkills)']
    n_students = student_ids.nunique()
    n_skills = skill_names.nunique()

    targets = np.zeros((n_students, max_seq_length, n_skills))
    target_mask = np.zeros((n_students, max_seq_length, n_skills))
    inputs = np.zeros((n_students, max_seq_length, 2 * n_skills))

    # First input slot of each skill, in order of first appearance in the frame.
    slot_of_skill = {skill: 2 * position for position, skill in enumerate(skill_names.unique())}

    for student_row, student in enumerate(student_ids.unique()):
        responses = df[student_ids == student]

        for step, (_, response) in enumerate(responses.iterrows()):
            slot = slot_of_skill[response['KC(SubSkills)']]
            was_correct = response['Correct First Attempt']

            inputs[student_row, step, slot] = 1
            inputs[student_row, step, slot + 1] = was_correct

            # Targets and mask have one entry per skill, hence slot // 2.
            targets[student_row, step, slot // 2] = was_correct
            target_mask[student_row, step, slot // 2] = 1

        if verbose and student_row % 100 == 0:
            print("Processed %d" % student_row)

    return InMemoryExerciseData(inputs, target_mask, targets)

def new_day_by_exercise(df, new_day_threshold=5, verbose=True):
    """
    Processes data to include correctness of the previous question and whether it has been a day since
    the last response to this skill.

    Every skill owns three consecutive input slots: [is current skill, correct on first attempt,
    new day]. Unlike `new_day`, the new-day slot is tracked per skill: it is raised for a skill
    when more than `new_day_threshold` hours have passed since the student last responded to
    that skill. A skill the student has not responded to yet is never flagged.
    Ex. For exercises A and B:
        [0, 0, 1, 1, 0, 0]
        Means that the current response is for exercise B, is incorrect and it is not a new day for B. It
        is a new day for A.

    Args:
        df: Frame with 'Anon Student Id', 'KC(SubSkills)', 'Correct First Attempt' and 'date'
            columns; 'date' is compared in seconds. Rows are consumed in frame order, nothing
            is sorted here.
        new_day_threshold: Gap, in hours, after which a response counts as a new day.
        verbose: If true, print a progress line every 100 students.

    Returns:
        InMemoryExerciseData built from (sequences, mask, corrects), where sequences has shape
        (students, max_seq_length, 3 * skills) and mask/corrects have shape
        (students, max_seq_length, skills).

    Note:
        Uses the module-level `max_seq_length`; a student with more rows than that raises
        IndexError.
    """
    num_students = df['Anon Student Id'].nunique()
    num_skills = df['KC(SubSkills)'].nunique()
    # Honor the caller's threshold (hours) instead of overwriting it with a fixed 5 hours.
    threshold_seconds = new_day_threshold * 60 * 60

    corrects = np.zeros((num_students, max_seq_length, num_skills))
    mask = np.zeros((num_students, max_seq_length, num_skills))
    sequences = np.zeros((num_students, max_seq_length, 3 * num_skills))

    # Skill k starts at input slot 3k.
    exercise_index_map = {}
    for idx, exercise in enumerate(df['KC(SubSkills)'].unique()):
        exercise_index_map[exercise] = idx * 3

    for row_idx, uid in enumerate(df['Anon Student Id'].unique()):
        uid_df = df[df['Anon Student Id'] == uid]
        col_idx = 0
        # Last response time per skill, stored at that skill's new-day slot (3k + 2); every
        # other position stays 0 and therefore never raises a flag.
        prev_dates = np.zeros((3 * num_skills))

        for _, event in uid_df.iterrows():
            idx = exercise_index_map[event['KC(SubSkills)']]
            sequences[row_idx, col_idx, idx] = 1
            sequences[row_idx, col_idx, idx + 1] = event['Correct First Attempt']
            # Flags are computed before prev_dates is updated, so the current skill is judged
            # against its previous response rather than this one.
            seen_before = prev_dates != 0
            elapsed = event['date'] - prev_dates
            sequences[row_idx, col_idx] += seen_before & (elapsed > threshold_seconds)

            prev_dates[idx + 2] = event['date']
            # Targets and mask are indexed per skill, hence idx // 3.
            corrects[row_idx, col_idx, idx // 3] = event['Correct First Attempt']
            mask[row_idx, col_idx, idx // 3] = 1
            col_idx += 1

        if verbose and row_idx % 100 == 0:
            print("Processed %d" % row_idx)

    return InMemoryExerciseData(sequences, mask, corrects)

In [8]:
# Build one dataset per input encoding from the same filtered frame.
correctness_only_data = correctness_only(df)
new_day_data = new_day(df)
new_day_by_exercise_data = new_day_by_exercise(df)

# Order matters: the training cells zip this list with their display names
# ("Correctness Only", "New Day Aggregate", "New Day by Exercise").
datasets = [correctness_only_data, new_day_data, new_day_by_exercise_data]

Processed 0
Processed 100
Processed 200
Processed 300
Processed 400
Processed 500
Processed 600
Processed 700
Processed 800
Processed 900
Processed 1000
Processed 0
Processed 100
Processed 200
Processed 300
Processed 400
Processed 500
Processed 600
Processed 700
Processed 800
Processed 900
Processed 1000
Processed 0
Processed 100
Processed 200
Processed 300
Processed 400
Processed 500
Processed 600
Processed 700
Processed 800
Processed 900
Processed 1000


In [9]:
# Train and evaluate one LSTM per input encoding with k-fold cross-validation.
print("--UNWEIGHTED--")
for data, name in zip(datasets, ["Correctness Only", "New Day Aggregate", "New Day by Exercise"]):
    print(name)
    # Fresh graph and session per encoding; reset_sess() rebinds the global `sess` used below.
    reset_sess()
    # Input/output widths come from the last axis of the encoded arrays.
    lstm = LSTM(hidden_dim=200,
                output_dim=data.targets.shape[2],
                input_dim=data.inputs.shape[2],
                learning_rate=1e-2,
                batch_size=64,
                num_layers=1)

    lstm.build_model(tf.sigmoid)

    k = 15
    avg_acc = 0.0
    avg_auc = 0.0
    avg_mae = 0.0
    # presumably k_fold(k) selects the train/test split inside `data` and yields the fold
    # number - TODO confirm against data.InMemoryExerciseData
    for fold in data.k_fold(k):
        print("Fold %d" % fold)
        # Re-run every variable initializer so each fold starts from untrained weights.
        tf.global_variables_initializer().run()
        lstm.train(sess, data, epochs=3)
        acc, baseline, auc, mae = lstm.test(sess, data)
        # Accumulate the mean over folds; `baseline` is returned but not averaged.
        avg_acc += acc / k
        avg_auc += auc / k
        avg_mae += mae / k
    
    print("Average Accuracy: %.8f, Average AUC %.8f, Average Mean Absolute Error %.8f" % (avg_acc, avg_auc, avg_mae))

--UNWEIGHTED--
Correctness Only
Fold 1
Accuracy: 0.88614, Baseline: 0.88614, AUC: 0.98451, MAE: 0.14899
Fold 2
Accuracy: 0.90523, Baseline: 0.90535, AUC: 0.98750, MAE: 0.17492
Fold 3
Accuracy: 0.88632, Baseline: 0.88632, AUC: 0.98004, MAE: 0.16075
Fold 4
Accuracy: 0.89472, Baseline: 0.89472, AUC: 0.97538, MAE: 0.14934
Fold 5
Accuracy: 0.87740, Baseline: 0.87740, AUC: 0.98139, MAE: 0.17583
Fold 6
Accuracy: 0.90195, Baseline: 0.90195, AUC: 0.97973, MAE: 0.16864
Fold 7
Accuracy: 0.87234, Baseline: 0.87234, AUC: 0.98545, MAE: 0.22059
Fold 8
Accuracy: 0.87243, Baseline: 0.87243, AUC: 0.98630, MAE: 0.19669
Fold 9
Accuracy: 0.89203, Baseline: 0.89203, AUC: 0.98387, MAE: 0.22111
Fold 10
Accuracy: 0.91574, Baseline: 0.91574, AUC: 0.99001, MAE: 0.19883
Fold 11
Accuracy: 0.86302, Baseline: 0.86302, AUC: 0.98428, MAE: 0.22652
Fold 12
Accuracy: 0.87025, Baseline: 0.87025, AUC: 0.98743, MAE: 0.19593
Fold 13
Accuracy: 0.87208, Baseline: 0.87208, AUC: 0.97799, MAE: 0.19591
Fold 14
Accuracy: 0.90591, B

In [10]:
# Same experiment as the unweighted run, but incorrect responses weigh 1.5x in the target mask.
print("--WEIGHTED 1.5x--")
incorrect_weight = 1.5
for data, name in zip(datasets, ["Correctness Only", "New Day Aggregate", "New Day by Exercise"]):
    print(name)
    # Re-weight the mask: answered-and-incorrect entries become 1.5, answered-and-correct
    # entries stay 1.0, unanswered entries stay 0. The binary mask is recovered with np.sign
    # first so re-running this cell does not compound the weights, and float literals keep the
    # arithmetic correct even under Python 2 integer division.
    # assumes target_masks/targets are non-negative numpy arrays of equal shape - TODO confirm
    response_mask = np.sign(data.target_masks)
    data.target_masks = response_mask * incorrect_weight - data.targets * (incorrect_weight - 1.0)
    # Fresh graph and session per encoding; reset_sess() rebinds the global `sess` used below.
    reset_sess()
    lstm = LSTM(hidden_dim=200,
                output_dim=data.targets.shape[2],
                input_dim=data.inputs.shape[2],
                learning_rate=1e-2,
                batch_size=64,
                num_layers=1)

    lstm.build_model(tf.sigmoid)

    k = 15
    avg_acc = 0.0
    avg_auc = 0.0
    avg_mae = 0.0
    for fold in data.k_fold(k):
        print("Fold %d" % fold)
        # Re-run every variable initializer so each fold starts from untrained weights.
        tf.global_variables_initializer().run()
        lstm.train(sess, data, epochs=3)
        acc, baseline, auc, mae = lstm.test(sess, data)
        # Accumulate the mean over folds; `baseline` is returned but not averaged.
        avg_acc += acc / k
        avg_auc += auc / k
        avg_mae += mae / k

    print("Average Accuracy: %.8f, Average AUC %.8f, Average Mean Absolute Error %.8f" % (avg_acc, avg_auc, avg_mae))

--WEIGHTED 1.5x--
Correctness Only
Fold 1
Accuracy: 0.88808, Baseline: 0.88826, AUC: 0.98387, MAE: 0.21906
Fold 2
Accuracy: 0.90399, Baseline: 0.90404, AUC: 0.98733, MAE: 0.18826
Fold 3
Accuracy: 0.88078, Baseline: 0.88078, AUC: 0.97666, MAE: 0.22670
Fold 4
Accuracy: 0.89136, Baseline: 0.89136, AUC: 0.97496, MAE: 0.20171
Fold 5
Accuracy: 0.88830, Baseline: 0.88731, AUC: 0.98122, MAE: 0.22500
Fold 6
Accuracy: 0.90122, Baseline: 0.90122, AUC: 0.98119, MAE: 0.22881
Fold 7
Accuracy: 0.87024, Baseline: 0.87024, AUC: 0.98475, MAE: 0.23530
Fold 8
Accuracy: 0.87680, Baseline: 0.87680, AUC: 0.98508, MAE: 0.26000
Fold 9
Accuracy: 0.89189, Baseline: 0.89189, AUC: 0.98295, MAE: 0.27784
Fold 10
Accuracy: 0.90663, Baseline: 0.90663, AUC: 0.98955, MAE: 0.22358
Fold 11
Accuracy: 0.86592, Baseline: 0.86592, AUC: 0.98470, MAE: 0.24329
Fold 12
Accuracy: 0.87474, Baseline: 0.87474, AUC: 0.98872, MAE: 0.18619
Fold 13
Accuracy: 0.86853, Baseline: 0.86853, AUC: 0.97378, MAE: 0.21723
Fold 14
Accuracy: 0.90631

## Paired T-tests

In [11]:
import scipy.stats as stats

### Unweighted Data

In [12]:
# Per-fold metrics (one list entry per fold) for the three unweighted runs, hard-coded so the
# paired t-tests below can run without retraining.
# NOTE(review): presumably copied from the "Fold N" lines printed by the unweighted training
# cell - re-copy these values if the models are retrained.
unweighted_correctness_only = {
    "accuracy": [0.88614, 0.90523, 0.88632, 0.89472, 0.87740, 0.90195, 0.87234, 0.87243, 0.89203, 
                 0.91574, 0.86302, 0.87025, 0.87208, 0.90591, 0.89518],
    "baseline": [0.88614, 0.90535, 0.88632, 0.89472, 0.87740, 0.90195, 0.87234, 0.87243, 0.89203,
                 0.91574, 0.86302, 0.87025, 0.87208, 0.90591, 0.89518], # should be identical for all groups
    "AUC": [0.98451, 0.98750, 0.98004, 0.97538, 0.98139, 0.97973, 0.98545, 0.98630, 0.98387, 0.99001,
            0.98428, 0.98743, 0.97799, 0.98618, 0.98267],
    "MAE": [0.14899, 0.17492, 0.16075, 0.14934, 0.17583, 0.16864, 0.22059, 0.19669, 0.22111, 0.19883,
            0.22652, 0.19593, 0.19591, 0.17195, 0.17555]
}

unweighted_new_day_aggregate = {
    "accuracy": [0.88614, 0.90535, 0.88640, 0.89472, 0.87740, 0.90195, 0.87234, 0.87243, 0.89203,
                 0.91574, 0.86302, 0.87025, 0.87208, 0.90591, 0.89518],
    "baseline": [0.88614, 0.90535, 0.88632, 0.89472, 0.87740, 0.90195, 0.87234, 0.87243, 0.89203,
                 0.91574, 0.86302, 0.87025, 0.87208, 0.90591, 0.89518], # should be identical for all groups
    "AUC": [0.98451, 0.98757, 0.97986, 0.97538, 0.98139, 0.97973, 0.98545, 0.98630, 0.98387, 0.99001,
            0.98428, 0.98743, 0.97799, 0.98618, 0.98267],
    "MAE": [0.18491, 0.17399, 0.18364, 0.16964, 0.17477, 0.17046, 0.21987, 0.20073, 0.22822, 0.18383,
            0.19392, 0.18240, 0.19969, 0.13626, 0.18090]
}
    
unweighted_new_day_by_exercise = {
    "accuracy": [0.88614, 0.90535, 0.88632, 0.89472, 0.87740, 0.90195, 0.87234, 0.87243, 0.89203,
                 0.91574, 0.86302, 0.87025, 0.87208, 0.90591, 0.89518],
    "baseline": [0.88614, 0.90535, 0.88632, 0.89472, 0.87740, 0.90195, 0.87234, 0.87243, 0.89203,
                 0.91574, 0.86302, 0.87025, 0.87208, 0.90591, 0.89518], # should be identical for all groups
    "AUC": [0.98451, 0.98757, 0.98004, 0.97538, 0.98139, 0.97973, 0.98545, 0.98630, 0.98387, 0.99001,
            0.98428, 0.98743, 0.97799, 0.98618, 0.98267],
    "MAE": [0.19186, 0.17538, 0.19184, 0.17108, 0.17415, 0.18111, 0.21031, 0.21641, 0.16568, 0.19981,
            0.22191, 0.17620, 0.16105, 0.15052, 0.16958]
}

In [17]:
print("Correctness vs. New Day Aggregate")
# Paired t-test per metric between the two unweighted runs; entries at the same list position
# (same fold) form one pair.
acc_result, auc_result, mae_result = (
    stats.ttest_rel(unweighted_correctness_only[metric], unweighted_new_day_aggregate[metric])
    for metric in ("accuracy", "AUC", "MAE")
)
# Each result unpacks as (t-statistic, p-value), matching the two format placeholders.
print("  Accuracy: t-statistic %.6f, p-value %.6f" % acc_result)
print("  AUC: t-statistic %.6f, p-value %.6f" % auc_result)
print("  MAE: t-statistic %.6f, p-value %.6f" % mae_result)

Correctness vs. New Day Aggregate
  Accuracy: t-statistic -1.434860, p-value 0.173288
  AUC: t-statistic 0.556294, p-value 0.586792
  MAE: t-statistic -0.022758, p-value 0.982165


In [18]:
print("Correctness vs. New Day by Exercise")
# Paired t-test per metric between the two unweighted runs; entries at the same list position
# (same fold) form one pair.
acc_result, auc_result, mae_result = (
    stats.ttest_rel(unweighted_correctness_only[metric], unweighted_new_day_by_exercise[metric])
    for metric in ("accuracy", "AUC", "MAE")
)
# Each result unpacks as (t-statistic, p-value), matching the two format placeholders.
print("  Accuracy: t-statistic %.6f, p-value %.6f" % acc_result)
print("  AUC: t-statistic %.6f, p-value %.6f" % auc_result)
print("  MAE: t-statistic %.6f, p-value %.6f" % mae_result)

Correctness vs. New Day by Exercise
  Accuracy: t-statistic -1.000000, p-value 0.334282
  AUC: t-statistic -1.000000, p-value 0.334282
  MAE: t-statistic 0.249966, p-value 0.806244


### Weighted Data

In [19]:
# Per-fold metrics (one list entry per fold) for the three weighted (1.5x) runs, hard-coded so
# the paired t-tests below can run without retraining.
# NOTE(review): presumably copied from the "Fold N" lines printed by the weighted training
# cell - re-copy these values if the models are retrained.
weighted_correctness_only = {
    "accuracy": [0.88808, 0.90399, 0.88078, 0.89136, 0.88830, 0.90122, 0.87024, 0.87680, 0.89189,
                 0.90663, 0.86592, 0.87474, 0.86853, 0.90631, 0.89714],
    "baseline": [0.88826, 0.90404, 0.88078, 0.89136, 0.88731, 0.90122, 0.87024, 0.87680, 0.89189,
                 0.90663, 0.86592, 0.87474, 0.86853, 0.90631, 0.89714], # should be identical for all groups
    "AUC": [0.98387, 0.98733, 0.97666, 0.97496, 0.98122, 0.98119, 0.98475, 0.98508, 0.98295, 0.98955,
            0.98470, 0.98872, 0.97378, 0.98417, 0.98414],
    "MAE": [0.21906, 0.18826, 0.22670, 0.20171, 0.22500, 0.22881, 0.23530, 0.26000, 0.27784, 0.22358,
            0.24329, 0.18619, 0.21723, 0.18149, 0.19596]
}

weighted_new_day_aggregate = {
    "accuracy": [0.88829, 0.90402, 0.88078, 0.89118, 0.88731, 0.90100, 0.87024, 0.87680, 0.89189,
                 0.90661, 0.86442, 0.87474, 0.86853, 0.90615, 0.89714,],
    "baseline": [0.88826, 0.90404, 0.88078, 0.89136, 0.88731, 0.90122, 0.87024, 0.87680, 0.89189,
                 0.90663, 0.86592, 0.87474, 0.86853, 0.90631, 0.89714], # should be identical for all groups
    "AUC": [0.98403, 0.98735, 0.97670, 0.97432, 0.98169, 0.98082, 0.98475, 0.98508, 0.98295, 0.98953,
            0.98373, 0.98872, 0.97378, 0.98404, 0.98414],
    "MAE": [0.21494, 0.22114, 0.20791, 0.22562, 0.16655, 0.21079, 0.21383, 0.21498, 0.26525, 0.20560,
            0.26368, 0.22800, 0.23209, 0.17780, 0.20999]
}
    
weighted_new_day_by_exercise = {
    "accuracy": [0.88826, 0.90404, 0.88078, 0.89126, 0.88622, 0.90104, 0.87024, 0.87680, 0.89198,
                 0.90616, 0.86592, 0.87474, 0.87041, 0.90631, 0.89714],
    "baseline": [0.88826, 0.90404, 0.88078, 0.89136, 0.88731, 0.90122, 0.87024, 0.87680, 0.89189,
                 0.90663, 0.86592, 0.87474, 0.86853, 0.90631, 0.89714], # should be identical for all groups
    "AUC": [0.98403, 0.98736, 0.97670, 0.97449, 0.97939, 0.98084, 0.98475, 0.98508, 0.98285, 0.98867,
            0.98470, 0.98872, 0.97382, 0.98417, 0.98414],
    "MAE": [0.21515, 0.20394, 0.21869, 0.20421, 0.22478, 0.20887, 0.25685, 0.25547, 0.24504, 0.23428,
            0.25436, 0.21499, 0.22189, 0.19909, 0.20803]
}

In [20]:
print("Correctness vs. New Day Aggregate")
# Paired t-test per metric between the two weighted runs; entries at the same list position
# (same fold) form one pair.
acc_result, auc_result, mae_result = (
    stats.ttest_rel(weighted_correctness_only[metric], weighted_new_day_aggregate[metric])
    for metric in ("accuracy", "AUC", "MAE")
)
# Each result unpacks as (t-statistic, p-value), matching the two format placeholders.
print("  Accuracy: t-statistic %.6f, p-value %.6f" % acc_result)
print("  AUC: t-statistic %.6f, p-value %.6f" % auc_result)
print("  MAE: t-statistic %.6f, p-value %.6f" % mae_result)

Correctness vs. New Day Aggregate
  Accuracy: t-statistic 1.620082, p-value 0.127513
  AUC: t-statistic 1.094712, p-value 0.292120
  MAE: t-statistic 0.478116, p-value 0.639947


In [21]:
print("Correctness vs. New Day by Exercise")
# Paired t-test per metric between the two weighted runs; entries at the same list position
# (same fold) form one pair.
acc_result, auc_result, mae_result = (
    stats.ttest_rel(weighted_correctness_only[metric], weighted_new_day_by_exercise[metric])
    for metric in ("accuracy", "AUC", "MAE")
)
# Each result unpacks as (t-statistic, p-value), matching the two format placeholders.
print("  Accuracy: t-statistic %.6f, p-value %.6f" % acc_result)
print("  AUC: t-statistic %.6f, p-value %.6f" % auc_result)
print("  MAE: t-statistic %.6f, p-value %.6f" % mae_result)

Correctness vs. New Day by Exercise
  Accuracy: t-statistic 0.213319, p-value 0.834153
  AUC: t-statistic 1.675423, p-value 0.116033
  MAE: t-statistic -0.885405, p-value 0.390893
