In [60]:
%matplotlib inline
from sensible_raw.loaders import loader
from collections import defaultdict
import calendar
import pandas as pd
import numpy as np
#import pystan
import matplotlib.pylab as plt
import json

# Get data

In [14]:
def default_to_regular(d):
    """Recursively convert nested defaultdicts to nested dicts.

    Only ``defaultdict`` instances are descended into; any other object
    (including a plain ``dict``) is returned unchanged.

    Source: http://stackoverflow.com/questions/26496831/how-to-convert-defaultdict-of-defaultdicts-of-defaultdicts-to-dict-of-dicts-o

    Parameters
    ----------
    d : object
        A (possibly nested) ``defaultdict``, or any other value.

    Returns
    -------
    object
        A plain ``dict`` with every directly nested ``defaultdict`` value
        converted as well, or ``d`` itself when it is not a ``defaultdict``.
    """
    if isinstance(d, defaultdict):
        # items() is available on both Python 2 and Python 3 dicts, whereas
        # iteritems() only exists on Python 2.
        d = {k: default_to_regular(v) for k, v in d.items()}
    return d

def invert_partition(partition):
    """Invert a partition mapping, in whichever direction it is given.

    The direction is decided by inspecting the first value of ``partition``:

    * if it is a list, ``partition`` is treated as ``{group: [node, ...]}``
      and the result is ``{node: group}``;
    * otherwise ``partition`` is treated as ``{node: group}`` and the result
      is ``{group: [node, ...]}``.

    Parameters
    ----------
    partition : dict
        Mapping to invert. All values are assumed to be of the same kind as
        the first one.

    Returns
    -------
    dict
        The inverted mapping as a plain ``dict``. An empty ``partition``
        yields an empty dict.
    """
    if not partition:
        # Nothing to inspect, so there is no first value to look at.
        return {}

    # next(iter(...)) peeks at the first value without indexing into
    # items(), which is not subscriptable on Python 3.
    first_value = next(iter(partition.values()))

    if isinstance(first_value, list):
        node_to_group = {}
        for group, nodes in partition.items():
            for node in nodes:
                node_to_group[node] = group
        return node_to_group

    group_to_nodes = defaultdict(list)
    for node, group in partition.items():
        group_to_nodes[group].append(node)
    # The values are plain lists, so a shallow conversion is enough to hand
    # back a regular dict instead of a defaultdict.
    return dict(group_to_nodes)

def load_data(kind, period):
    """Load a dataset through ``loader.load_data`` and return it as a DataFrame.

    ``loader.load_data(kind, period)`` is unpacked into two parallel
    sequences, one of column names and one of column values, which are paired
    up positionally to build the frame.

    Parameters
    ----------
    kind, period
        Passed straight through to ``loader.load_data``.

    Returns
    -------
    pandas.DataFrame
        One column per (name, values) pair returned by the loader.
    """
    names, arrays = loader.load_data(kind, period)
    column_map = dict(zip(names, arrays))
    return pd.DataFrame(column_map)

**Get vector groups**

In [42]:
# Load the (user, vector) assignments.
dataframe_vectors = load_data("vectors", "vectors")

# Keep only rows with a positive vector id and a user id below 10000.
student_vector_group = {
    user: vector
    for user, vector in zip(dataframe_vectors['user'], dataframe_vectors['vector'])
    if vector > 0 and user < 10000
}

# Flip the student -> group mapping into group -> list of students.
vector_group_students = invert_partition(student_vector_group)

# Drop groups with fewer than 3 members.
vector_group_students = {
    group: members
    for group, members in vector_group_students.items()
    if len(members) >= 3
}

**Get grades**

In [43]:
dataframe_grades = load_data("grades", "january_2014")

# Collect every integer grade per student; rows whose grade is one of the
# listed non-numeric codes are left out.
student_grades = defaultdict(list)
for student, grade in zip(dataframe_grades['user'], dataframe_grades['grade']):
    if grade not in ["EM", "BE", "IB", "V", "SN", "S"]:
        student_grades[student].append(int(grade))

**Get vector group grades**

In [46]:
# For every vector group, collect the mean grade of each member student.
vector_group_grades = defaultdict(list)
for g, s_arr in vector_group_students.items():
    for s in s_arr:
        # .get() avoids the defaultdict side effect of inserting an empty
        # list for a student that has no recorded grades.
        grades_of_s = student_grades.get(s)
        if not grades_of_s:
            # np.mean([]) evaluates to nan (and emits a RuntimeWarning); a nan
            # here would end up in y and be written out as the non-standard
            # JSON token NaN, so students without grades are skipped.
            continue
        vector_group_grades[g].append(np.mean(grades_of_s))
        # Alternative: pool the raw grades instead of per-student means.
        #vector_group_grades[g].extend(student_grades[s])

**Map group ids to new group ids starting from 0**

In [54]:
# Number the groups 0..n-1 following the dict's own iteration order, then
# re-key the grades dict with those new ids.
idmap = {old_id: new_id for new_id, old_id in enumerate(vector_group_grades.keys())}
vector_group_grades = {
    idmap[old_id]: group_grades
    for old_id, group_grades in vector_group_grades.items()
}

**Convert to Stan-friendly data**

In [55]:
# Flatten the per-group grade lists into two parallel lists:
# x[i] is the group id that the grade y[i] belongs to.
x = []
y = []
for group_id, group_grades in vector_group_grades.items():
    for grade in group_grades:
        x.append(group_id)
        y.append(grade)

**Save (only possible on the remote server)**

In [65]:
# Write the group ids (x) and the matching grades (y) to one JSON file each.
for path, payload in (("data/x.json", x), ("data/y.json", y)):
    with open(path, 'w') as fp:
        json.dump(payload, fp)

# Analysis