In [None]:
from EduData import get_data
import os
# Directory that holds the raw Junyi files; generated files are written here too.
# The dataset itself is published at:
# https://pslcdatashop.web.cmu.edu/DatasetInfo?datasetId=1198
data_path = './'
file_name = f'{data_path}junyi_ProblemLog_original.csv'

In [None]:
import pandas as pd
import tqdm

# Read the complete problem log into memory, then list its columns so the
# fields used by the later cells can be checked at a glance.
data = pd.read_csv(file_name)
print(data.columns)


In [None]:
# Thresholds used to trim the raw log.
N_MOST_ACTIVE_STUDENTS = 1000   # keep only the students with the most log rows
MIN_ANSWERS_PER_EXERCISE = 10   # drop exercises answered fewer times than this

# Rows missing any of the fields needed below are unusable.
data = data.dropna(subset=["problem_type", "exercise", "time_taken_attempts"])

# Keep the most active students (activity = number of log rows).
student_counts = data['user_id'].value_counts()
top_1000_students = student_counts.nlargest(N_MOST_ACTIVE_STUDENTS).index
filtered_df = data[data['user_id'].isin(top_1000_students)]

# Among those students, drop exercises that were answered too rarely.
question_counts = filtered_df['exercise'].value_counts()
less_10_answered = question_counts[question_counts < MIN_ANSWERS_PER_EXERCISE].index

# Build the working frame as an independent, time-ordered copy. Sorting without
# `inplace` and copying explicitly means the column assignment below never
# writes into a slice of `data` (no SettingWithCopyWarning).
filtered_df = (
    filtered_df[~filtered_df['exercise'].isin(less_10_answered)]
    .sort_values(by='time_done')
    .copy()
)

# `time_taken_attempts` is '&'-separated; the first field is taken as the time
# of the first response (presumably one entry per attempt — verify against the
# dataset description).
filtered_df['time_first_res'] = (
    filtered_df['time_taken_attempts'].str.split('&').str[0].astype(int)
)

# Non-positive times cannot be log-transformed later on, so they are removed.
filtered_df = filtered_df[filtered_df["time_first_res"] > 0]

In [None]:
# Exercise metadata table; only its "name" and "topic" columns are used here.
exercise_relation = pd.read_csv(data_path + "junyi_Exercise_table.csv")

# Entities that survived the filtering step.
users = filtered_df['user_id'].unique()
problems = filtered_df['exercise'].unique().tolist()

# Restrict the metadata to the surviving exercises and collect their topics.
is_kept_exercise = exercise_relation["name"].isin(problems)
exercise_topic_relation = exercise_relation[is_kept_exercise]
skills = exercise_topic_relation["topic"].unique().tolist()

# Both skill ids and problem ids start at 1, in order of first appearance.
skill2id = dict(zip(skills, range(1, len(skills) + 1)))
problem2id = dict(zip(problems, range(1, len(problems) + 1)))


print("number of users: %d" % len(users))
print("number of skills: %d" % len(skills))
print("number of problems: %d" % len(problems))

In [None]:
import numpy as np
# Map every problem id to the id of its topic, following the row order of the
# exercise table (a later row for the same exercise overrides an earlier one).
exercise_names = exercise_topic_relation["name"].tolist()
topic_names = exercise_topic_relation["topic"].tolist()
problem2skill = {
    problem2id[exercise]: skill2id[topic]
    for exercise, topic in zip(exercise_names, topic_names)
}

# Persist the mapping as the textual form of the dict.
with open(data_path + 'problem2skill', 'w', encoding='utf-8') as mapping_file:
    mapping_file.write(repr(problem2skill))

In [None]:
from sklearn.model_selection import train_test_split, KFold
from scipy.stats import norm
from scipy.stats import poisson

# Split by student id (not by interaction): 80% of the ids go to train, 20% to test.
train_student_ids, test_student_ids = train_test_split(users, test_size=0.2, random_state=42)

# Statistics are estimated on the training students' rows only.
train_data = filtered_df[filtered_df['user_id'].isin(train_student_ids)]


# Per-exercise mean and standard deviation of the first-response time.
question_time_stats = train_data.groupby('exercise')['time_first_res'].agg(['mean', 'std']).reset_index()

# Attach the statistics to every row. This is an inner merge, so rows whose
# exercise never occurs in the training split are dropped here.
filtered_df = pd.merge(filtered_df, question_time_stats, on='exercise')
# `std` is NaN when an exercise has a single training row; treat it as zero spread.
filtered_df['std'] = filtered_df['std'].fillna(0)
print("finish merging")

# Time factor = Normal(mean, std).cdf(log(first-response time)), or 1 when the
# exercise has zero spread. Computed in one vectorised call instead of a
# row-wise apply; the values are the same, without building one frozen
# distribution per row.
# NOTE(review): mean/std are computed on raw times while the CDF is evaluated at
# log(time) — confirm whether the statistics were meant to be taken on log-time.
zero_spread = (filtered_df['std'] == 0).to_numpy()
# Substitute a dummy scale where std == 0; those entries are overwritten with 1 below.
safe_std = np.where(zero_spread, 1.0, filtered_df['std'].to_numpy())
log_time = np.log(filtered_df['time_first_res'].to_numpy())
time_cdf = norm.cdf(log_time, loc=filtered_df['mean'].to_numpy(), scale=safe_std)
filtered_df['time_factor'] = np.where(zero_spread, 1.0, time_cdf)
filtered_df = filtered_df.dropna(subset=['time_factor'])
print("Finish processing time features ")

In [None]:
# Average number of attempts per exercise, estimated on the training rows.
question_attempt_stats = (
    train_data.groupby('exercise')['count_attempts']
    .mean()
    .reset_index()
    .rename(columns={'count_attempts': 'mean_attempt'})
)
# Attach the per-exercise average to every interaction (inner merge on exercise).
filtered_df = filtered_df.merge(question_attempt_stats, on='exercise', suffixes=('', '_attempt'))

# Attempt factor: 1 - CDF(count_attempts - 1) of a Poisson distribution whose
# rate is the exercise's mean attempt count.
attempt_rate = filtered_df['mean_attempt']
attempts_minus_one = filtered_df['count_attempts'] - 1
filtered_df['attempt_factor'] = 1 - poisson(attempt_rate).cdf(attempts_minus_one)
print("Finish processing attempt features ")

In [None]:
# Average number of hints per exercise, estimated on the training rows.
hint_means = train_data.groupby('exercise')['count_hints'].mean()
question_hint_stats = hint_means.reset_index().rename(columns={'count_hints': 'mean_hint'})
# Attach the per-exercise average to every interaction (inner merge on exercise).
filtered_df = filtered_df.merge(question_hint_stats, on='exercise', suffixes=('', '_hint'))

# Hint factor: 1 - CDF(count_hints - 1) of a Poisson distribution whose rate is
# the exercise's mean hint count.
hint_rate = filtered_df['mean_hint']
hints_minus_one = filtered_df['count_hints'] - 1
filtered_df['hint_factor'] = 1 - poisson(hint_rate).cdf(hints_minus_one)

print("Finish processing hint features ")

In [None]:
def parse_all_seq(students):
    """Build one feature sequence per student.

    Parameters
    ----------
    students : iterable
        User ids; each one is matched against ``filtered_df.user_id``.

    Returns
    -------
    list
        One ``parse_student_seq`` result per id, in the order the ids were
        given. An id with no rows still contributes an entry.
    """
    all_sequences = []
    for student_id in tqdm.tqdm(students, 'parse student sequence:\t'):
        student_rows = filtered_df[filtered_df.user_id == student_id]
        all_sequences.append(parse_student_seq(student_rows))
    # The result is intentionally not printed: it holds every interaction of
    # every student and would flood the cell output.
    return all_sequences


def parse_student_seq(student):
    """Turn one student's rows into six parallel feature lists.

    Parameters
    ----------
    student : pandas.DataFrame
        Rows of a single student. Must provide the columns ``exercise``,
        ``correct``, ``time_factor``, ``attempt_factor`` and ``hint_factor``.
        When a ``time_done`` column is present the rows are put in ascending
        ``time_done`` order first.

    Returns
    -------
    tuple
        ``(skill ids, correctness, problem ids, time factors,
        attempt factors, hint factors)`` — six lists of equal length.

    Raises
    ------
    KeyError
        If an exercise is missing from ``problem2id`` or ``problem2skill``.
    """
    seq = student
    # The frame was time-sorted before the inner merges on 'exercise', but those
    # merges did not consistently keep left-row order before pandas 2.2.
    # A stable sort restores chronological order and is a no-op when the rows
    # are already ordered.
    if 'time_done' in seq.columns:
        seq = seq.sort_values('time_done', kind='mergesort')

    exercises = seq.exercise.tolist()
    p = [problem2id[name] for name in exercises]
    s = [problem2skill[problem_id] for problem_id in p]
    a = seq.correct.tolist()
    time_factor = seq.time_factor.tolist()
    attempt_factor = seq.attempt_factor.tolist()
    hint_factor = seq.hint_factor.tolist()

    return s, a, p, time_factor, attempt_factor, hint_factor


# Each student contributes six lists whose length differs from student to
# student, so the nested result is ragged. NumPy >= 1.24 raises ValueError for
# ragged input unless dtype=object is requested explicitly.
train_data = np.array(parse_all_seq(train_student_ids), dtype=object)
test_data = np.array(parse_all_seq(test_student_ids), dtype=object)

In [None]:
def sequences2l(sequences, trg_path):
    """Write student sequences to ``trg_path`` as plain text.

    Every sequence is a 6-tuple ``(skills, answers, problems, time factors,
    attempt factors, hint factors)`` and produces seven lines: the sequence
    length, then one comma-separated line per field. The three factor lines
    are formatted with six decimals; the other fields use ``str``.
    """
    def six_decimals(value):
        return format(value, '.6f')

    progress_label = 'write data into file: %s' % trg_path
    with open(trg_path, 'w', encoding='utf8') as out:
        for record in tqdm.tqdm(sequences, progress_label):
            s_seq, a_seq, p_seq, time_seq, attempt_seq, hint_seq = record
            out.write(f'{len(s_seq)}\n')
            # (formatter, values) pairs, in the order the lines are written.
            fields = (
                (str, s_seq),
                (str, a_seq),
                (str, p_seq),
                (six_decimals, time_seq),
                (six_decimals, attempt_seq),
                (six_decimals, hint_seq),
            )
            for render, values in fields:
                out.write(','.join([render(item) for item in values]) + '\n')

# Write each split to its own text file next to the raw data.
for split_file, split_sequences in (('train.txt', train_data), ('test.txt', test_data)):
    sequences2l(split_sequences, data_path + split_file)