In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from data import InMemoryExerciseData
from lstm import LSTM

def initialize_sess():
    """Run the initializer for only those graph variables the session reports as uninitialized."""
    global sess
    uninitialized_names = set(sess.run(tf.report_uninitialized_variables()))

    pending = []
    for variable in tf.global_variables():
        # Drop the ':<index>' suffix and ASCII-encode the name before the membership
        # test against the names returned by the session.
        op_name = variable.name.split(':')[0]
        if op_name.encode('ascii') in uninitialized_names:
            pending.append(variable)

    tf.variables_initializer(pending).run()
    
def reset_sess():
    """Replace the global interactive session with a fresh one on an empty default graph.

    The old session is closed *before* the default graph is reset: TensorFlow
    documents calling tf.reset_default_graph() while a Session or
    InteractiveSession is still active as undefined behavior.
    """
    global sess
    sess.close()
    tf.reset_default_graph()
    sess = tf.InteractiveSession()

# Global interactive session; reset_sess() replaces it before each model is built.
sess = tf.InteractiveSession()

# Filtering thresholds used by the cells below.
min_seq_length = 5  # a student needs at least this many responses to be kept
max_seq_length = 3000  # most responses a kept student may have; also the padded sequence length
min_correct = 2  # a student's summed 'Corrects' must reach this value to be kept
min_responses_for_skill = 16000  # a skill (KC) needs at least this many responses to be kept

In [2]:
# Load only the columns used below from the tab-separated, carriage-return-terminated file.
df = pd.read_csv(
    'data/bridge_to_algebra_2006_2007_train.txt',
    sep='\t',
    lineterminator='\r',
    usecols=[
        'Anon Student Id',
        'KC(SubSkills)',
        'Correct First Attempt',
        'Corrects',
        'Incorrects',
        'Step Start Time',
        'Problem Name',
        'Step Name',
    ],
)


In [3]:
# skillRules = [skill for skill in df['KC(SubSkills)'].unique() if str(skill)[0:10] == '[SkillRule']

In [4]:
# The parsed datetimes are stored in nanoseconds; integer-dividing by 10 ** 9 keeps
# 'date' in whole seconds, the unit the new-day thresholds below are expressed in.
df['date'] = pd.to_datetime(df['Step Start Time']).astype(np.int64) // 10 ** 9
# NOTE: the drop is in place, so re-running this cell on the same `df` fails because
# 'Step Start Time' is no longer there to parse.
df.drop(['Step Start Time'], axis=1, inplace=True)

In [5]:
# A helper column of ones lets a grouped sum act as a row count. `_count` stays on `df`
# because the per-student filter further down sums it again.
df['_count'] = 1
# Aggregate only `_count`: summing every column would also try to "sum" the string
# columns (student id, problem/step names), which newer pandas versions either
# concatenate at great cost or reject with a TypeError.
grouped_df = df.groupby('KC(SubSkills)')[['_count']].sum()

In [6]:
# Keep every skill (KC) with at least `min_responses_for_skill` responses.
# A boolean filter replaces the earlier for/break scan, which dropped the last skill
# when no count fell below the threshold and left its loop index undefined on empty input.
skill_counts = grouped_df['_count'].sort_values(ascending=False)
chosen_skill_names_plus = skill_counts[skill_counts >= min_responses_for_skill].index

# Skills excluded by name even when they pass the response-count threshold.
excluded_skill_names = ("Enter answer digit -- DON'T TRACK ME",
                        'Enter quantity from diagram by reading',
                        'Entering a given')
chosen_skill_names = [skill for skill in chosen_skill_names_plus if skill not in excluded_skill_names]
# chosen_skill_names = skillRules[0:12]
print("Num KCs: %d" % (len(chosen_skill_names)))
df = df[df['KC(SubSkills)'].isin(chosen_skill_names)]

Num KCs: 12


In [7]:
# Per-student totals. Only the two columns used by the filter are aggregated; summing
# every column would also try to "sum" the string columns, which newer pandas versions
# either concatenate or reject with a TypeError.
grouped_df = df.groupby('Anon Student Id')[['_count', 'Corrects']].sum()

# Keep students whose response count is within [min_seq_length, max_seq_length] and
# whose summed 'Corrects' reaches min_correct.
filtered_uids = grouped_df[(grouped_df['_count'] >= min_seq_length) &
                           (grouped_df['_count'] <= max_seq_length) &
                           (grouped_df['Corrects'] >= min_correct)].reset_index()['Anon Student Id']
print("Num Students: %d" % (len(filtered_uids)))
df = df[df['Anon Student Id'].isin(filtered_uids)]

Num Students: 1023


In [8]:
def new_day(df, new_day_threshold=5, verbose=True):
    """
    Processes data to include correctness of the previous question and whether it has been a day since
    the last response to any skill.
    Ex. For exercises A and B:
        [0, 0, 1, 1, 0, 1]
        Means that the current response is for exercise B, is incorrect and it has been a day since the
        previous response to an exercise.

    Each skill owns three consecutive input slots: [answered, correct on first attempt, new day].
    When the gap to the student's previous response exceeds the threshold, the new-day slot of
    every skill is raised. A student's first response is compared against a previous date of 0.

    Args:
        df: DataFrame with 'Anon Student Id', 'KC(SubSkills)', 'Correct First Attempt' and 'date'
            columns; 'date' is compared in seconds. Rows are consumed in their existing order.
        new_day_threshold: Gap in hours beyond which a response counts as a new day.
        verbose: If true, print a progress line every 100 students.

    Returns:
        InMemoryExerciseData built from (sequences, mask, corrects), where sequences has shape
        (num_students, max_seq_length, 3 * num_skills) and mask/corrects have shape
        (num_students, max_seq_length, num_skills). `max_seq_length` is the module-level constant.
    """
    num_students = df['Anon Student Id'].nunique()
    num_skills = df['KC(SubSkills)'].nunique()
    # Convert hours to seconds. The argument used to be overwritten by a hard-coded
    # 5 * 60 * 60, which silently ignored whatever the caller passed.
    threshold_seconds = new_day_threshold * 60 * 60

    corrects = np.zeros((num_students, max_seq_length, num_skills))
    mask = np.zeros((num_students, max_seq_length, num_skills))
    sequences = np.zeros((num_students, max_seq_length, 3 * num_skills))

    # Map each skill to the offset of its first slot.
    exercise_index_map = {}
    for idx, exercise in enumerate(df['KC(SubSkills)'].unique()):
        exercise_index_map[exercise] = idx * 3

    # Loop-invariant pattern that raises the new-day slot of every skill at once.
    new_day_flags = np.array([0, 0, 1] * num_skills)

    for row_idx, uid in enumerate(df['Anon Student Id'].unique()):
        uid_df = df[df['Anon Student Id'] == uid]
        col_idx = 0
        prev_date = 0.0

        for _, event in uid_df.iterrows():
            idx = exercise_index_map[event['KC(SubSkills)']]
            sequences[row_idx, col_idx, idx] = 1
            sequences[row_idx, col_idx, idx + 1] = event['Correct First Attempt']
            if event['date'] - prev_date > threshold_seconds:
                sequences[row_idx, col_idx] += new_day_flags

            prev_date = event['date']
            # Targets and mask are indexed per skill, hence the division by the slot width.
            corrects[row_idx, col_idx, idx // 3] = event['Correct First Attempt']
            mask[row_idx, col_idx, idx // 3] = 1
            col_idx += 1

        if verbose and row_idx % 100 == 0:
            print("Processed %d" % row_idx)

    return InMemoryExerciseData(sequences, mask, corrects)

def correctness_only(df, verbose=True):
    """
    Encodes each response using only which exercise was answered and whether it was correct.
    Ex. For exercises A and B:
        [0, 0, 1, 0]
        Means that the current response is for exercise B and is incorrect.

    Every skill owns two consecutive input slots: [answered, correct on first attempt].
    Sequences are padded to the module-level `max_seq_length`; a progress line is printed
    every 100 students when `verbose` is true.
    """
    student_ids = df['Anon Student Id']
    skill_names = df['KC(SubSkills)']
    num_students = student_ids.nunique()
    num_skills = skill_names.nunique()

    sequences = np.zeros((num_students, max_seq_length, 2 * num_skills))
    mask = np.zeros((num_students, max_seq_length, num_skills))
    corrects = np.zeros((num_students, max_seq_length, num_skills))

    # Offset of each skill's first slot, in order of first appearance.
    slot_offset = {skill: 2 * position for position, skill in enumerate(skill_names.unique())}

    for student_row, uid in enumerate(student_ids.unique()):
        events = df[student_ids == uid]

        for step, (_, event) in enumerate(events.iterrows()):
            offset = slot_offset[event['KC(SubSkills)']]
            skill_pos = offset // 2
            was_correct = event['Correct First Attempt']

            sequences[student_row, step, offset] = 1
            sequences[student_row, step, offset + 1] = was_correct
            corrects[student_row, step, skill_pos] = was_correct
            mask[student_row, step, skill_pos] = 1

        if verbose and student_row % 100 == 0:
            print("Processed %d" % student_row)

    return InMemoryExerciseData(sequences, mask, corrects)

def new_day_by_exercise(df, new_day_threshold=5, verbose=True):
    """
    Processes data to include correctness of the previous question and whether it has been a day since
    the last response to this skill.
    Ex. For exercises A and B:
        [0, 0, 1, 1, 0, 0]
        Means that the current response is for exercise B, is incorrect and it is not a new day for B. It
        is a new day for A.

    Each skill owns three consecutive input slots: [answered, correct on first attempt, new day].
    On every response, the new-day slot is raised for each skill the student has answered before
    whose last response is more than the threshold ago. Skills not yet answered are never flagged.

    Args:
        df: DataFrame with 'Anon Student Id', 'KC(SubSkills)', 'Correct First Attempt' and 'date'
            columns; 'date' is compared in seconds. Rows are consumed in their existing order.
        new_day_threshold: Gap in hours beyond which a skill counts as being on a new day.
        verbose: If true, print a progress line every 100 students.

    Returns:
        InMemoryExerciseData built from (sequences, mask, corrects), where sequences has shape
        (num_students, max_seq_length, 3 * num_skills) and mask/corrects have shape
        (num_students, max_seq_length, num_skills). `max_seq_length` is the module-level constant.
    """
    num_students = df['Anon Student Id'].nunique()
    num_skills = df['KC(SubSkills)'].nunique()
    # Convert hours to seconds. The argument used to be overwritten by a hard-coded
    # 5 * 60 * 60, which silently ignored whatever the caller passed.
    threshold_seconds = new_day_threshold * 60 * 60

    corrects = np.zeros((num_students, max_seq_length, num_skills))
    mask = np.zeros((num_students, max_seq_length, num_skills))
    sequences = np.zeros((num_students, max_seq_length, 3 * num_skills))

    # Map each skill to the offset of its first slot.
    exercise_index_map = {}
    for idx, exercise in enumerate(df['KC(SubSkills)'].unique()):
        exercise_index_map[exercise] = idx * 3

    for row_idx, uid in enumerate(df['Anon Student Id'].unique()):
        uid_df = df[df['Anon Student Id'] == uid]
        col_idx = 0
        # Last response date per skill, stored at that skill's new-day slot; 0 means "never".
        prev_dates = np.zeros((3 * num_skills))

        for _, event in uid_df.iterrows():
            idx = exercise_index_map[event['KC(SubSkills)']]
            sequences[row_idx, col_idx, idx] = 1
            sequences[row_idx, col_idx, idx + 1] = event['Correct First Attempt']
            # Only slots holding a recorded date can exceed the threshold, so this
            # touches nothing but the new-day slots of previously answered skills.
            elapsed = (event['date'] - prev_dates) * (prev_dates != 0)
            sequences[row_idx, col_idx] += elapsed > threshold_seconds

            prev_dates[idx + 2] = event['date']
            # Targets and mask are indexed per skill, hence the division by the slot width.
            corrects[row_idx, col_idx, idx // 3] = event['Correct First Attempt']
            mask[row_idx, col_idx, idx // 3] = 1
            col_idx += 1

        if verbose and row_idx % 100 == 0:
            print("Processed %d" % row_idx)

    return InMemoryExerciseData(sequences, mask, corrects)

In [9]:
# Build the three input encodings from the same filtered DataFrame.
correctness_only_data = correctness_only(df)
new_day_data = new_day(df)
new_day_by_exercise_data = new_day_by_exercise(df)

# Order must match the name lists zipped against `datasets` in the training cells.
datasets = [correctness_only_data, new_day_data, new_day_by_exercise_data]

Processed 0
Processed 100
Processed 200
Processed 300
Processed 400
Processed 500
Processed 600
Processed 700
Processed 800
Processed 900
Processed 1000
Processed 0
Processed 100
Processed 200
Processed 300
Processed 400
Processed 500
Processed 600
Processed 700
Processed 800
Processed 900
Processed 1000
Processed 0
Processed 100
Processed 200
Processed 300
Processed 400
Processed 500
Processed 600
Processed 700
Processed 800
Processed 900
Processed 1000


In [10]:
print("--UNWEIGHTED--")
# Per-dataset dict of per-fold metric lists, consumed by the paired t-tests further down.
unweighted_scores = {}
metric_names = ("accuracy", "baseline", "AUC", "MAE", "CE")  # baseline should be identical for all groups
for data, name in zip(datasets, ["Correctness Only", "New Day Aggregate", "New Day by Exercise"]):
    print(name)
    # Start from an empty graph and a new session for every dataset.
    reset_sess()
    lstm = LSTM(hidden_dim=200,
                output_dim=data.targets.shape[2],
                input_dim=data.inputs.shape[2],
                learning_rate=1e-2,
                batch_size=64,
                num_layers=1)
    lstm.build_model(tf.sigmoid)

    k = 15
    fold_scores = {metric: [] for metric in metric_names}
    unweighted_scores[name] = fold_scores

    for fold in data.k_fold(k):
        print("Fold %d" % fold)
        # Re-initialize all variables so each fold trains from scratch.
        tf.global_variables_initializer().run()
        lstm.train(sess, data, epochs=3)
        acc, baseline, auc, mae, ce = lstm.test(sess, data)
        for metric, value in zip(metric_names, (acc, baseline, auc, mae, ce)):
            fold_scores[metric].append(value)

    averages = tuple(np.mean(fold_scores[metric]) for metric in ("accuracy", "AUC", "MAE", "CE"))
    print("Average Accuracy: %.8f, Average AUC %.8f, Average MAE %.8f, Average CE: %.8f" % averages)
    

--UNWEIGHTED--
Correctness Only
Fold 1
Accuracy: 0.885806, Baseline: 0.885806, AUC: 0.612333, MAE: 0.196806, CE: 2.632014
Fold 2
Accuracy: 0.901120, Baseline: 0.901120, AUC: 0.590038, MAE: 0.195088, CE: 3.013435
Fold 3
Accuracy: 0.888383, Baseline: 0.888383, AUC: 0.597144, MAE: 0.206588, CE: 2.404195
Fold 4
Accuracy: 0.885806, Baseline: 0.885806, AUC: 0.606203, MAE: 0.205402, CE: 1.603371
Fold 5
Accuracy: 0.872028, Baseline: 0.872028, AUC: 0.637995, MAE: 0.220470, CE: 2.091223
Fold 6
Accuracy: 0.895073, Baseline: 0.894812, AUC: 0.663101, MAE: 0.200829, CE: 1.945569
Fold 7
Accuracy: 0.876557, Baseline: 0.876557, AUC: 0.576807, MAE: 0.226991, CE: 3.242107
Fold 8
Accuracy: 0.852551, Baseline: 0.852551, AUC: 0.634040, MAE: 0.229741, CE: 3.721224
Fold 9
Accuracy: 0.891087, Baseline: 0.891087, AUC: 0.636713, MAE: 0.199534, CE: 2.573182
Fold 10
Accuracy: 0.916132, Baseline: 0.916132, AUC: 0.626838, MAE: 0.170637, CE: 2.614731
Fold 11
Accuracy: 0.858977, Baseline: 0.859735, AUC: 0.659039, MAE:

In [None]:
print("--WEIGHTED 1.5x--")
# Explicit float weight: the earlier `3/2 - 1` evaluates to 0 under Python 2 integer division.
incorrect_weight = 1.5
for data, name in zip(datasets, ["Correctness Only", "New Day Aggregate", "New Day by Exercise"]):
    print(name)
    # Where mask == 1 this yields 1.0 for targets of 1 and `incorrect_weight` for targets of 0
    # (assumes both arrays hold only 0/1 values -- TODO confirm against InMemoryExerciseData).
    # NOTE: this rebinds `target_masks` on the shared dataset objects, so re-running the cell
    # compounds the weighting and later cells see the weighted masks.
    data.target_masks = data.target_masks * incorrect_weight - data.targets * (incorrect_weight - 1)
    reset_sess()
    lstm = LSTM(hidden_dim=200,
                output_dim=data.targets.shape[2],
                input_dim=data.inputs.shape[2],
                learning_rate=1e-2,
                batch_size=64,
                num_layers=1)

    lstm.build_model(tf.sigmoid)

    k = 15
    avg_acc = 0.0
    avg_auc = 0.0
    avg_mae = 0.0
    for fold in data.k_fold(k):
        print("Fold %d" % fold)
        tf.global_variables_initializer().run()
        lstm.train(sess, data, epochs=3)
        # lstm.test returns five values (the unweighted cell unpacks them the same way);
        # unpacking only four raised ValueError here.
        acc, baseline, auc, mae, ce = lstm.test(sess, data)
        avg_acc += acc / k
        avg_auc += auc / k
        avg_mae += mae / k

    print("Average Accuracy: %.8f, Average AUC %.8f, Average Mean Absolute Error %.8f" % (avg_acc, avg_auc, avg_mae))

## Paired t-tests

In [11]:
import scipy.stats as stats

### Unweighted Data

In [12]:
# Pull out each encoding's per-fold score dict for the paired comparisons below.
unweighted_correctness_only = unweighted_scores["Correctness Only"]
unweighted_new_day_aggregate = unweighted_scores["New Day Aggregate"]
unweighted_new_day_by_exercise = unweighted_scores["New Day by Exercise"]

In [13]:
print("Correctness vs. New Day Aggregate")
# One paired t-test per metric; ttest_rel treats the two score lists as related samples.
acc_result, auc_result, mae_result = [
    stats.ttest_rel(unweighted_correctness_only[metric], unweighted_new_day_aggregate[metric])
    for metric in ("accuracy", "AUC", "MAE")
]
print("  Accuracy: t-statistic %.6f, p-value %.6f" % acc_result)
print("  AUC: t-statistic %.6f, p-value %.6f" % auc_result)
print("  MAE: t-statistic %.6f, p-value %.6f" % mae_result)

Correctness vs. New Day Aggregate
  Accuracy: t-statistic -0.549320, p-value 0.591441
  AUC: t-statistic -0.415951, p-value 0.683752
  MAE: t-statistic 3.139861, p-value 0.007236


In [14]:
print("Correctness vs. New Day by Exercise")
# One paired t-test per metric; ttest_rel treats the two score lists as related samples.
acc_result, auc_result, mae_result = [
    stats.ttest_rel(unweighted_correctness_only[metric], unweighted_new_day_by_exercise[metric])
    for metric in ("accuracy", "AUC", "MAE")
]
print("  Accuracy: t-statistic %.6f, p-value %.6f" % acc_result)
print("  AUC: t-statistic %.6f, p-value %.6f" % auc_result)
print("  MAE: t-statistic %.6f, p-value %.6f" % mae_result)

Correctness vs. New Day by Exercise
  Accuracy: t-statistic -0.606701, p-value 0.553762
  AUC: t-statistic -2.171335, p-value 0.047585
  MAE: t-statistic 2.297483, p-value 0.037529


### Weighted Data

In [18]:
# Hard-coded per-fold metrics (15 values per list) for the weighted experiment, one dict per
# input encoding. Presumably copied from an earlier run of the weighted training cell, which
# does not store its per-fold results -- TODO confirm provenance. There is no "CE" entry here.
weighted_correctness_only = {
    "accuracy": [0.88568, 0.90380, 0.88392, 0.89487, 0.87396, 0.89272, 0.87284, 0.86231, 0.88940,
                 0.91799, 0.85268, 0.87077, 0.85659, 0.87699, 0.88499],
    "baseline": [0.88568, 0.90380, 0.88392, 0.89487, 0.87396, 0.89294, 0.87284, 0.86231, 0.88935,
                 0.91847, 0.85268, 0.87077, 0.85659, 0.88001, 0.88499], # should be identical for all groups
    "AUC": [0.50000, 0.49999, 0.50870, 0.50000, 0.50000, 0.53163, 0.49955, 0.49996, 0.50317, 0.49008,
            0.50015, 0.50038, 0.49990, 0.46609, 0.50007],
    "MAE": [0.20711, 0.19805, 0.20756, 0.21623, 0.25571, 0.21628, 0.27396, 0.27459, 0.23941, 0.22065,
            0.23682, 0.23068, 0.23206, 0.22245, 0.21569]
}

weighted_new_day_aggregate = {
    "accuracy": [0.88568, 0.90380, 0.88392, 0.89487, 0.87392, 0.89294, 0.87284, 0.86228, 0.88935,
                 0.91842, 0.85268, 0.87077, 0.85659, 0.88001, 0.88660],
    "baseline": [0.88568, 0.90380, 0.88392, 0.89487, 0.87396, 0.89294, 0.87284, 0.86231, 0.88935,
                 0.91847, 0.85268, 0.87077, 0.85659, 0.88001, 0.88499], # should be identical for all groups
    "AUC": [0.50000, 0.50000, 0.49944, 0.50000, 0.50066, 0.49996, 0.50000, 0.50317, 0.50000, 0.51416,
            0.50042, 0.50014, 0.50000, 0.50001, 0.54129],
    "MAE": [0.20800, 0.21037, 0.21817, 0.19948, 0.25157, 0.23541, 0.27970, 0.27187, 0.21337, 0.21905,
            0.24697, 0.20455, 0.24985, 0.22025, 0.21653]
}

weighted_new_day_by_exercise = {
    "accuracy": [0.88568, 0.90380, 0.88392, 0.89478, 0.87396, 0.89294, 0.87284, 0.86231, 0.88935,
                 0.91847, 0.85190, 0.87077, 0.85589, 0.87961, 0.88499],
    "baseline": [0.88568, 0.90380, 0.88392, 0.89487, 0.87396, 0.89294, 0.87284, 0.86231, 0.88935,
                 0.91847, 0.85268, 0.87077, 0.85659, 0.88001, 0.88499], # should be identical for all groups
    "AUC": [0.49999, 0.49999, 0.50000, 0.49775, 0.49838, 0.50000, 0.49974, 0.50011, 0.49999, 0.50000,
            0.45629, 0.50011, 0.49970, 0.50098, 0.50014],
    "MAE": [0.21873, 0.20113, 0.23784, 0.23223, 0.25360, 0.24602, 0.28396, 0.27601, 0.24146, 0.24013,
            0.24360, 0.23676, 0.21917, 0.22045, 0.23518]
}


In [None]:
print("Correctness vs. New Day Aggregate")
# One paired t-test per metric; ttest_rel treats the two score lists as related samples.
acc_result, auc_result, mae_result = [
    stats.ttest_rel(weighted_correctness_only[metric], weighted_new_day_aggregate[metric])
    for metric in ("accuracy", "AUC", "MAE")
]
print("  Accuracy: t-statistic %.6f, p-value %.6f" % acc_result)
print("  AUC: t-statistic %.6f, p-value %.6f" % auc_result)
print("  MAE: t-statistic %.6f, p-value %.6f" % mae_result)

In [None]:
print("Correctness vs. New Day by Exercise")
# One paired t-test per metric; ttest_rel treats the two score lists as related samples.
acc_result, auc_result, mae_result = [
    stats.ttest_rel(weighted_correctness_only[metric], weighted_new_day_by_exercise[metric])
    for metric in ("accuracy", "AUC", "MAE")
]
print("  Accuracy: t-statistic %.6f, p-value %.6f" % acc_result)
print("  AUC: t-statistic %.6f, p-value %.6f" % auc_result)
print("  MAE: t-statistic %.6f, p-value %.6f" % mae_result)

In [21]:
# Distinct skills (KCs) present in `df` at this point in the notebook.
df['KC(SubSkills)'].unique()

array(['List consecutive multiples of a number',
       'Identify number as common multiple',
       'Identify number as common factor', 'List factor of large number',
       'Identify Fraction using fraction shape',
       'Identify number of items', 'Identify number of recipients',
       'Identify proper fraction from option 1',
       'Identify proper fraction from option 2',
       'Compare Options - operation',
       'Label equivalent fraction in equivalence statement',
       'Rewrite fraction with common denominator'], dtype=object)