In [41]:
import numpy as np
import pandas as pd
import tensorflow as tf
from data import InMemoryExerciseData
from lstm import LSTM
from encoder import Encoder

def initialize_sess():
    """Initialize only the variables the current session reports as uninitialized.

    Variables that are not reported are left as they are. Uses the module-level
    ``sess`` to query the report and the default session to run the initializer.
    """
    global sess
    # The reported names are compared as ASCII byte strings without the ':<n>' suffix.
    pending_names = set(sess.run(tf.report_uninitialized_variables()))
    pending_vars = []
    for var in tf.global_variables():
        base_name = var.name.split(':')[0]
        if base_name.encode('ascii') in pending_names:
            pending_vars.append(var)
    tf.variables_initializer(pending_vars).run()
    
def reset_sess():
    """Discard the current session and default graph, then open a fresh session.

    Rebinds the module-level ``sess`` to a new ``tf.InteractiveSession``.
    """
    global sess
    # Close first: TensorFlow documents resetting the default graph while a
    # session is still active as undefined behavior.
    sess.close()
    tf.reset_default_graph()
    sess = tf.InteractiveSession()

# Module-level session read by initialize_sess() and replaced by reset_sess().
sess = tf.InteractiveSession()

# Filtering thresholds used by the preprocessing cells below.
# min_seq_length / max_seq_length bound the number of responses per student;
# max_seq_length is also the padded sequence length used by the encoder functions.
min_seq_length = 5
max_seq_length = 3000
# Minimum summed 'Corrects' a student needs to be kept.
min_correct = 2
# Minimum number of rows a KC needs to be kept.
min_responses_for_skill = 16800


In [42]:
# Load only the columns used below from the tab-separated training file.
# NOTE(review): rows are split on '\r' — presumably the file uses bare carriage returns; confirm.
df = pd.read_csv('data/bridge_to_algebra_2006_2007_train.txt', sep='\t', lineterminator='\r',
                 usecols=['Anon Student Id', 'KC(SubSkills)', 'Correct First Attempt', 'Corrects',
                          'Incorrects', 'Step Start Time','Problem Name', 'Step Name'])


In [43]:
# Casting a pandas datetime column to int64 yields nanoseconds since the epoch,
# so floor-divide by 10**9 to store whole seconds in 'date'.
df['date'] = pd.to_datetime(df['Step Start Time']).astype(np.int64) // 10 ** 9
# The raw timestamp column is no longer needed once 'date' exists.
df.drop(['Step Start Time'], axis=1, inplace=True)

In [44]:
# Tag every row with 1 so that summing '_count' per group gives the number of responses.
df['_count'] = 1
# Only numeric totals are used downstream. Restricting the sum to numeric columns avoids
# aggregating the string columns (student id, problem/step names), which is wasteful and
# can raise or concatenate text on pandas versions where numeric_only defaults to False.
grouped_df = df.groupby('KC(SubSkills)').sum(numeric_only=True)

In [45]:
# Keep every KC with at least `min_responses_for_skill` responses, most frequent first.
# Filtering the sorted counts directly (instead of scanning for the first count below the
# threshold and slicing up to it) also keeps the last KC when every KC clears the threshold,
# and works when there are no KCs at all.
skill_counts = grouped_df['_count'].sort_values(ascending=False)
chosen_skill_names = skill_counts[skill_counts >= min_responses_for_skill].index
print("Num KCs: %d" % (len(chosen_skill_names)))
df = df[df['KC(SubSkills)'].isin(chosen_skill_names)]

Num KCs: 12


In [46]:
# Per-student totals; '_count' is the number of remaining responses for that student.
# numeric_only=True keeps the aggregation to the numeric columns that are actually used
# and avoids summing the string columns on pandas versions that would otherwise try to.
grouped_df = df.groupby('Anon Student Id').sum(numeric_only=True)
# Keep students whose response count lies within [min_seq_length, max_seq_length] and
# whose summed 'Corrects' reaches min_correct.
filtered_uids = grouped_df[(grouped_df['_count'] >= min_seq_length) &
                           (grouped_df['_count'] <= max_seq_length) &
                           (grouped_df['Corrects'] >= min_correct)].reset_index()['Anon Student Id']
print("Num Students: %d" % (len(filtered_uids)))
df = df[df['Anon Student Id'].isin(filtered_uids)]

Num Students: 1100


In [7]:
def correctness_only(df, verbose=True):
    """
    Encode each student's responses as a padded sequence of skill/correctness vectors.

    Every skill owns two adjacent input slots: the first marks the skill the response
    belongs to, the second holds that response's 'Correct First Attempt' value.
    Ex. For exercises A and B:
        [0, 0, 1, 0]
        Means that the current response is for exercise B and is incorrect.

    The padded length is read from the module-level `max_seq_length`.

    Args:
        df: frame with 'Anon Student Id', 'KC(SubSkills)' and 'Correct First Attempt' columns.
        verbose: when True, print a progress line every 100 students.

    Returns:
        InMemoryExerciseData built from (sequences, mask, corrects); mask and corrects
        carry a single channel per time step.
    """
    slots_per_skill = 2
    student_col = df['Anon Student Id']
    skill_col = df['KC(SubSkills)']
    num_students = student_col.nunique()
    num_skills = skill_col.nunique()

    sequences = np.zeros((num_students, max_seq_length, slots_per_skill * num_skills))
    mask = np.zeros((num_students, max_seq_length, 1))
    corrects = np.zeros((num_students, max_seq_length, 1))

    # Skills receive their slot offset in order of first appearance.
    exercise_index_map = {skill: position * slots_per_skill
                          for position, skill in enumerate(skill_col.unique())}

    for row_idx, uid in enumerate(student_col.unique()):
        uid_df = df[student_col == uid]

        for col_idx, (_, event) in enumerate(uid_df.iterrows()):
            was_correct = event['Correct First Attempt']
            slot = exercise_index_map[event['KC(SubSkills)']]

            sequences[row_idx, col_idx, slot] = 1
            sequences[row_idx, col_idx, slot + 1] = was_correct
            corrects[row_idx, col_idx, 0] = was_correct
            mask[row_idx, col_idx, 0] = 1

        if verbose and row_idx % 100 == 0:
            print("Processed %d" % row_idx)

    return InMemoryExerciseData(sequences, mask, corrects)

def new_day_by_exercise(df, new_day_threshold=5, verbose=True):
    """
    Encode responses with skill, correctness and a per-skill "new day" flag.

    Every skill owns three adjacent input slots: skill indicator, correctness of the
    response, and a flag that is set when more than `new_day_threshold` hours have passed
    since the student last responded to that skill. The flag is evaluated for every skill
    the student has already seen, not only for the skill of the current response.
    Ex. For exercises A and B:
        [0, 0, 1, 1, 0, 0]
        Means that the current response is for exercise B, is incorrect and it is not a new day for B. It
        is a new day for A.

    The padded length is read from the module-level `max_seq_length`.

    Args:
        df: frame with 'Anon Student Id', 'KC(SubSkills)', 'Correct First Attempt' and
            'date' columns; 'date' is compared against the threshold in seconds.
        new_day_threshold: gap, in hours, after which a skill counts as a new day.
        verbose: when True, print a progress line every 100 students.

    Returns:
        InMemoryExerciseData built from (sequences, mask, corrects); mask and corrects
        carry a single channel per time step.
    """
    num_students = df['Anon Student Id'].nunique()
    num_skills = df['KC(SubSkills)'].nunique()
    # Honour the caller's threshold (previously overwritten with a fixed 5 hours).
    threshold_seconds = new_day_threshold * 60 * 60

    corrects = np.zeros((num_students, max_seq_length, 1))
    mask = np.zeros((num_students, max_seq_length, 1))
    sequences = np.zeros((num_students, max_seq_length, 3 * num_skills))

    exercise_index_map = {}
    for idx, exercise in enumerate(df['KC(SubSkills)'].unique()):
        exercise_index_map[exercise] = idx * 3

    for row_idx, uid in enumerate(df['Anon Student Id'].unique()):
        uid_df = df[df['Anon Student Id'] == uid]
        # Time of the student's latest response per skill, kept in that skill's third slot;
        # 0 marks a skill that has not been seen yet.
        prev_dates = np.zeros((3 * num_skills))

        for col_idx, (_, event) in enumerate(uid_df.iterrows()):
            idx = exercise_index_map[event['KC(SubSkills)']]
            sequences[row_idx, col_idx, idx] = 1
            sequences[row_idx, col_idx, idx + 1] = event['Correct First Attempt']

            # Only slots of already-seen skills can be flagged; all other slots add 0.
            seen = prev_dates != 0
            stale = (event['date'] - prev_dates) > threshold_seconds
            sequences[row_idx, col_idx] += seen & stale

            prev_dates[idx + 2] = event['date']
            corrects[row_idx, col_idx, 0] = event['Correct First Attempt']
            mask[row_idx, col_idx, 0] = 1

        if verbose and row_idx % 100 == 0:
            print("Processed %d" % row_idx)

    return InMemoryExerciseData(sequences, mask, corrects)

def new_day(df, new_day_threshold=5, verbose=True):
    """
    Encode responses with skill, correctness and a global "new day" flag.

    Every skill owns three adjacent input slots: skill indicator, correctness of the
    response, and a new-day flag. When more than `new_day_threshold` hours separate a
    response from the student's previous response (to any skill), the flag slot of every
    skill is set for that time step.
    Ex. For exercises A and B:
        [0, 0, 1, 1, 0, 1]
        Means that the current response is for exercise B, is incorrect and it has been a day since the
        previous response to an exercise.

    The padded length is read from the module-level `max_seq_length`.

    Args:
        df: frame with 'Anon Student Id', 'KC(SubSkills)', 'Correct First Attempt' and
            'date' columns; 'date' is compared against the threshold in seconds.
        new_day_threshold: gap, in hours, after which a response starts a new day.
        verbose: when True, print a progress line every 100 students.

    Returns:
        InMemoryExerciseData built from (sequences, mask, corrects); mask and corrects
        carry a single channel per time step.
    """
    num_students = df['Anon Student Id'].nunique()
    num_skills = df['KC(SubSkills)'].nunique()
    # Honour the caller's threshold (previously overwritten with a fixed 5 hours).
    threshold_seconds = new_day_threshold * 60 * 60

    corrects = np.zeros((num_students, max_seq_length, 1))
    mask = np.zeros((num_students, max_seq_length, 1))
    sequences = np.zeros((num_students, max_seq_length, 3 * num_skills))

    exercise_index_map = {}
    for idx, exercise in enumerate(df['KC(SubSkills)'].unique()):
        exercise_index_map[exercise] = idx * 3

    # One flag in the third slot of every skill; built once because it never changes.
    new_day_flags = np.array([0, 0, 1] * num_skills)

    for row_idx, uid in enumerate(df['Anon Student Id'].unique()):
        uid_df = df[df['Anon Student Id'] == uid]
        # Starting at 0 means a student's first response is flagged as a new day
        # whenever its timestamp exceeds the threshold.
        prev_date = 0.0

        for col_idx, (_, event) in enumerate(uid_df.iterrows()):
            idx = exercise_index_map[event['KC(SubSkills)']]
            sequences[row_idx, col_idx, idx] = 1
            sequences[row_idx, col_idx, idx + 1] = event['Correct First Attempt']
            if event['date'] - prev_date > threshold_seconds:
                sequences[row_idx, col_idx] += new_day_flags

            prev_date = event['date']
            corrects[row_idx, col_idx, 0] = event['Correct First Attempt']
            mask[row_idx, col_idx, 0] = 1

        if verbose and row_idx % 100 == 0:
            print("Processed %d" % row_idx)

    return InMemoryExerciseData(sequences, mask, corrects)

def new_day_correct_by_skill(df, new_day_threshold=5, verbose=True):
    """
    Encode responses with a global "new day" flag and per-skill targets.

    Inputs use three adjacent slots per skill: skill indicator, correctness of the
    response, and a new-day flag. When more than `new_day_threshold` hours separate a
    response from the student's previous response (to any skill), the flag slot of every
    skill is set for that time step.
    Ex. For exercises A and B:
        [0, 0, 1, 1, 0, 1]
        Means that the current response is for exercise B, is incorrect and it has been a day since the
        previous response to an exercise.

    Targets and masks have one channel per skill: at each time step only the channel of
    the answered skill is masked in, and it holds that response's correctness.

    The padded length is read from the module-level `max_seq_length`.

    Args:
        df: frame with 'Anon Student Id', 'KC(SubSkills)', 'Correct First Attempt' and
            'date' columns; 'date' is compared against the threshold in seconds.
        new_day_threshold: gap, in hours, after which a response starts a new day.
        verbose: when True, print a progress line every 100 students.

    Returns:
        InMemoryExerciseData built from (sequences, mask, corrects).
    """
    num_students = df['Anon Student Id'].nunique()
    num_skills = df['KC(SubSkills)'].nunique()
    # Honour the caller's threshold (previously overwritten with a fixed 5 hours).
    threshold_seconds = new_day_threshold * 60 * 60

    corrects = np.zeros((num_students, max_seq_length, num_skills))
    mask = np.zeros((num_students, max_seq_length, num_skills))
    sequences = np.zeros((num_students, max_seq_length, 3 * num_skills))

    exercise_index_map = {}
    for idx, exercise in enumerate(df['KC(SubSkills)'].unique()):
        exercise_index_map[exercise] = idx * 3

    # One flag in the third slot of every skill; built once because it never changes.
    new_day_flags = np.array([0, 0, 1] * num_skills)

    for row_idx, uid in enumerate(df['Anon Student Id'].unique()):
        uid_df = df[df['Anon Student Id'] == uid]
        # Starting at 0 means a student's first response is flagged as a new day
        # whenever its timestamp exceeds the threshold.
        prev_date = 0.0

        for col_idx, (_, event) in enumerate(uid_df.iterrows()):
            idx = exercise_index_map[event['KC(SubSkills)']]
            sequences[row_idx, col_idx, idx] = 1
            sequences[row_idx, col_idx, idx + 1] = event['Correct First Attempt']
            if event['date'] - prev_date > threshold_seconds:
                sequences[row_idx, col_idx] += new_day_flags

            prev_date = event['date']
            # idx is a multiple of 3, so idx // 3 recovers the skill's position.
            corrects[row_idx, col_idx, idx // 3] = event['Correct First Attempt']
            mask[row_idx, col_idx, idx // 3] = 1

        if verbose and row_idx % 100 == 0:
            print("Processed %d" % row_idx)

    return InMemoryExerciseData(sequences, mask, corrects)

def correctness_only_by_skill(df, verbose=True):
    """
    Encode skill/correctness inputs together with per-skill targets.

    Inputs use two adjacent slots per skill: the first marks the skill the response
    belongs to, the second holds that response's 'Correct First Attempt' value.
    Ex. For exercises A and B:
        [0, 0, 1, 0]
        Means that the current response is for exercise B and is incorrect.

    Targets and masks have one channel per skill: at each time step only the channel of
    the answered skill is masked in, and it holds that response's correctness.

    The padded length is read from the module-level `max_seq_length`.

    Args:
        df: frame with 'Anon Student Id', 'KC(SubSkills)' and 'Correct First Attempt' columns.
        verbose: when True, print a progress line every 100 students.

    Returns:
        InMemoryExerciseData built from (sequences, mask, corrects).
    """
    slots_per_skill = 2
    student_col = df['Anon Student Id']
    skill_col = df['KC(SubSkills)']
    num_students = student_col.nunique()
    num_skills = skill_col.nunique()

    sequences = np.zeros((num_students, max_seq_length, slots_per_skill * num_skills))
    mask = np.zeros((num_students, max_seq_length, num_skills))
    corrects = np.zeros((num_students, max_seq_length, num_skills))

    # Skills receive their slot offset in order of first appearance.
    exercise_index_map = {skill: position * slots_per_skill
                          for position, skill in enumerate(skill_col.unique())}

    for row_idx, uid in enumerate(student_col.unique()):
        uid_df = df[student_col == uid]

        for col_idx, (_, event) in enumerate(uid_df.iterrows()):
            was_correct = event['Correct First Attempt']
            slot = exercise_index_map[event['KC(SubSkills)']]
            # The slot offset is a multiple of 2, so halving it recovers the skill's position.
            skill_channel = slot // 2

            sequences[row_idx, col_idx, slot] = 1
            sequences[row_idx, col_idx, slot + 1] = was_correct
            corrects[row_idx, col_idx, skill_channel] = was_correct
            mask[row_idx, col_idx, skill_channel] = 1

        if verbose and row_idx % 100 == 0:
            print("Processed %d" % row_idx)

    return InMemoryExerciseData(sequences, mask, corrects)


In [47]:
# Build two encodings of the filtered frame; with the default verbose=True each call
# prints a progress line every 100 students.
correctness_only_by_skill_data = correctness_only_by_skill(df)
# new_day_data = new_day(df)
new_day_correct_by_skill_data = new_day_correct_by_skill(df)

Processed 0
Processed 100
Processed 200
Processed 300
Processed 400
Processed 500
Processed 600
Processed 700
Processed 800
Processed 900
Processed 1000
Processed 0
Processed 100
Processed 200
Processed 300
Processed 400
Processed 500
Processed 600
Processed 700
Processed 800
Processed 900
Processed 1000


In [48]:
# Select the dataset used by the model and training cells below.
data = new_day_correct_by_skill_data
# Keep a copy of the unweighted target masks so the re-weighting cell always
# starts from the originals, however often it is re-run.
orig_target_masks = np.array(data.target_masks)

In [49]:
# Weight predicting incorrect higher
# For a step with mask 1 the expression below yields 1 where the target is 1 and
# `weight` where the target is 0; steps with mask 0 and target 0 stay 0.
# (Assumes targets are 1 for a correct response and 0 otherwise.)
# NOTE(review): 3/2 equals 1.5 only under Python 3 true division — confirm the kernel version.
weight = 3/2
data.target_masks = orig_target_masks * weight - data.targets * (weight - 1)

In [19]:
# Configure the model; input and output widths are taken from the selected dataset
# so the network matches whichever encoding `data` holds.
lstm = LSTM(hidden_dim=200,
            output_dim=data.targets.shape[2],
            input_dim=data.inputs.shape[2],
            learning_rate=5e-3,
            batch_size=64,
            num_layers=1)

# NOTE(review): tf.sigmoid is presumably the output activation — confirm in lstm.py.
lstm.build_model(tf.sigmoid)

In [None]:
# Cross-validation: train and test once per fold and average the accuracies.
# The fold count lives in one place so the split and the average cannot drift apart.
num_folds = 5
avg_acc = 0.0
for fold in data.k_fold(num_folds):
    print("Fold %d" % fold)
    # Re-run the initializer so every fold starts from re-initialized variables.
    tf.global_variables_initializer().run()
    lstm.train(sess, data, epochs=8)
    acc, baseline = lstm.test(sess, data)
    avg_acc += acc / float(num_folds)

print("Average Accuracy: %.4f" % avg_acc)

# Results noted from earlier runs:
# Correctness Only, 1 Epoch: 87.7% accuracy, 89.12% accuracy
# New Day by exercise, 1 Epoch: 87.7% accuracy
# New Day, 2 Epoch: 87.7% accuracy min responses = 16800, ~91% accuracy

Fold 1
epoch 0, MSE: 0.1465
epoch 1, MSE: 0.1427
epoch 2, MSE: 0.1414
epoch 3, MSE: 0.1403
epoch 4, MSE: 0.1388
epoch 5, MSE: 0.1378
epoch 6, MSE: 0.1378
epoch 7, MSE: 0.1364
Accuracy: 0.8960, Baseline: 0.8958, AUC: 0.98336
Fold 2
epoch 0, MSE: 0.1494
epoch 1, MSE: 0.1485
epoch 2, MSE: 0.1479
epoch 3, MSE: 0.1462
epoch 4, MSE: 0.1449
epoch 5, MSE: 0.1436
epoch 6, MSE: 0.1426
epoch 7, MSE: 0.1414
Accuracy: 0.8959, Baseline: 0.8958, AUC: 0.98409
Fold 3
epoch 0, MSE: 0.1442
epoch 1, MSE: 0.1406
epoch 2, MSE: 0.1392
epoch 3, MSE: 0.1381
epoch 4, MSE: 0.1374
epoch 5, MSE: 0.1356
epoch 6, MSE: 0.1345


In [25]:
# Close the current session, clear the default graph and open a fresh session.
reset_sess()

  | Data Version  | Weighting | Learning Rate | Epochs Trained | Accuracy  |  AUC  | Folds | Baseline  |
  |---------------|:---------:|:-------------:|:--------------:|:---------:|:-----:|:-----:|:---------:|
  | Correctness   | None      | 1e-2          | 8              | 89.1      |~0.98  |3      | 89.1      |
  | Correctness   | 2x        | 1e-2          | 8              | 89.08     | 0.977 |5      | 89.1      |
  | Correctness   | 1.5x      | 1e-2          | 8              | 89.11     | 0.979 |4      | 89.1      |
  |~~New Day~~    | 4x        | 1e-2          | 8              | 85.95     | 0.953 |2      | 89.1      |
  |~~New Day~~    | 5x        | 1e-2          | 8              | 83.82     | 0.936 |1      | 89.1      |
  |~~New Day~~    | 3x        | 1e-2          | 8              | 88.97     | 0.976 |5      | 89.1      |
  |~~New Day~~    | 2x        | 1e-2          | 8              | 89.07     | 0.979 |2      | 89.1      |
  |~~New Day~~    | 1.5x      | 1e-2          | 8              | 89.10     | 0.979 |4      | 89.1      |
  |~~New Day~~    | 6x        | 1e-2          | 8              | 87.59     | 0.903 |5      | 89.1      |
  |  New Day      | 2x        | 5e-3          | 8              | 89.11     | 0.978 |2      | 89.1      |
  |  New Day      | 3x        | 5e-3          | 8              | 87.91     | 0.965 |3      | 89.1      |
  |  New Day      | None      | 5e-3          | 8              | 89.09     | 0.979 |2      | 89.1      |
  |  New Day      | 7.5x      | 5e-3          | 10             | 71.53     | 0.857 |1      | 89.1      |
  |  New Day      | 1.5x      | 5e-3          | 9              | 89.20     | 0.979 |2      | 89.1      |

**Note:** All versions predict per knowledge component (KC). Settings used for the runs above:
* min_seq_length = 500
* max_seq_length = 3000
* min_correct = 2
* min_responses_for_skill = 16800

In [38]:
# Spot check: show one target-mask vector (presumably first sequence, time step 39).
data.target_masks[0][39]

array([ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [13]:
# Tally time steps by (day type, previous outcome, next outcome).
#
# The previous step is recognised from the summed input vector: the skill indicator adds 1,
# a correct answer adds 1 more, and a new-day step adds one flag per skill. The skill count
# comes from the input width (three slots per skill in the new-day encodings) instead of
# being hard-coded for 12 skills.
skills_in_input = data.inputs.shape[2] // 3
input_totals = np.sum(data.inputs, axis=2)

# The next outcome is read from the targets rather than by comparing the mask with one
# particular weight, so the counts stay valid whatever re-weighting was applied to
# data.target_masks (any positive weight leaves answered steps with a positive mask).
# Assumes targets hold 1 for a correct and 0 for an incorrect response — TODO confirm
# against InMemoryExerciseData.
answered = np.sum(data.target_masks, axis=2) > 0
target_totals = np.sum(data.targets, axis=2)
next_correct = answered * (target_totals == 1)
next_incorrect = answered * (target_totals == 0)


def _count_steps(input_total, next_outcome):
    """Count time steps whose summed input equals `input_total` and where `next_outcome` is set."""
    return np.sum((input_totals == input_total) * next_outcome)


old_day_prev_incorrect_next_correct = _count_steps(1, next_correct)
old_day_prev_correct_next_correct = _count_steps(2, next_correct)
old_day_prev_incorrect_next_incorrect = _count_steps(1, next_incorrect)
old_day_prev_correct_next_incorrect = _count_steps(2, next_incorrect)
new_day_prev_incorrect_next_correct = _count_steps(1 + skills_in_input, next_correct)
new_day_prev_correct_next_correct = _count_steps(2 + skills_in_input, next_correct)
new_day_prev_incorrect_next_incorrect = _count_steps(1 + skills_in_input, next_incorrect)
new_day_prev_correct_next_incorrect = _count_steps(2 + skills_in_input, next_incorrect)


print("Old Day, Prev Incorrect, Next Correct: %d" %old_day_prev_incorrect_next_correct)
print("Old Day, Prev Correct, Next Correct: %d" %old_day_prev_correct_next_correct)
print("Old Day, Prev Incorrect, Next Incorrect: %d" %old_day_prev_incorrect_next_incorrect)
print("Old Day, Prev Correct, Next Incorrect: %d" %old_day_prev_correct_next_incorrect)
print("New Day, Prev Incorrect, Next Correct: %d" %new_day_prev_incorrect_next_correct)
print("New Day, Prev Correct, Next Correct: %d" %new_day_prev_correct_next_correct)
print("New Day, Prev Incorrect, Next Incorrect: %d" %new_day_prev_incorrect_next_incorrect)
print("New Day, Prev Correct, Next Incorrect: %d" %new_day_prev_correct_next_incorrect)

Old Day, Prev Incorrect, Next Correct: 53026
Old Day, Prev Correct, Next Correct: 528875
Old Day, Prev Incorrect, Next Incorrect: 22101
Old Day, Prev Correct, Next Incorrect: 53364
New Day, Prev Incorrect, Next Correct: 1155
New Day, Prev Correct, Next Correct: 7960
New Day, Prev Incorrect, Next Incorrect: 505
New Day, Prev Correct, Next Incorrect: 825
