In [1]:
import json
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from nltk.metrics import ConfusionMatrix
from collections import *

In [2]:
# Annotation table; its Session and DiffTarget columns feed the label lookup below.
annotes = pd.read_csv(filepath_or_buffer='text-difficulty-avg-annotes.csv')

In [3]:
# Lookup from annotated session id to its difficulty label
# (if a session appears more than once, the last row wins).
session_to_difficulty = {
    session: difficulty
    for session, difficulty in zip(annotes['Session'], annotes['DiffTarget'])
}

In [4]:
# Per-session feature table that the baseline predictor below reads from.
table = pd.read_csv(filepath_or_buffer='table_for_prediction_updated.csv')

In [5]:
# Rows whose Grade column equals 'Grade1'.
is_grade1 = table['Grade'] == 'Grade1'
grade1 = table[is_grade1]

In [6]:
# Keep only the sessions that have a difficulty annotation, then attach the label.
# The explicit .copy() makes the filtered frame independent of the frame it was
# sliced from, so adding a column does not raise pandas' SettingWithCopyWarning.
table = table.loc[table.Session.astype(str).isin(session_to_difficulty)].copy()
# Every remaining session id is a key of the lookup (guaranteed by the filter
# above), so .map produces the same labels as a per-row dictionary lookup.
table['DiffTarget'] = table.Session.astype(str).map(session_to_difficulty)

In [7]:
# Show the last five rows, including the newly attached DiffTarget column.
table.tail(n=5)

Unnamed: 0,Session,Form,Grade,WCPM,Acc,Comp,Exp,level,WCPM_zscore,Acc_zscore,...,gyorP,gyorQ,mlevel_numeric,form_numeric,mlevel_form_diff,text_is_above_level,Acc4,AvgSpan,AccOverSpanNonSW,DiffTarget
1210,53803,Grade4Spring,Grade4,122.0,1.0,4.4,3.7,16,0.404977,0.676739,...,3,3.0,16,17,-1,0,1.0,1.0,0.967213,Right
1211,53945,Grade1Winter,Grade1,64.0,1.0,7.0,2.2,9,0.007175,0.877031,...,3,3.0,9,6,3,1,1.0,1.0,0.909091,Right
1212,54096,Grade4Winter,Grade4,124.0,0.997,5.7,3.6,17,1.127903,0.572681,...,3,3.0,17,16,1,1,0.988054,1.0,0.921875,Right
1213,54166,Grade4Spring,Grade4,124.0,0.996,6.5,3.6,16,0.461105,0.624682,...,3,3.0,16,17,-1,0,0.984096,1.0,0.95082,Right
1214,54195,Grade4Fall,Grade4,82.0,0.695,2.0,2.3,12,-0.572132,-3.891709,...,3,3.0,12,15,-3,0,0.233313,0.970224,0.806818,Easy


In [11]:
# Numeric codes for the difficulty labels: Hard (1) < Right (2) < Easy (3).
diff_to_num = dict(Right=2, Easy=3, Hard=1)


def get_diff_to_num(lst):
    """Translate a sequence of difficulty labels into their numeric codes.

    The output list has the same length and order as ``lst``. A label that
    is not a key of ``diff_to_num`` raises ``KeyError``.
    """
    codes = []
    for label in lst:
        codes.append(diff_to_num[label])
    return codes

In [42]:
def naive_prediction(category=('Form', 'Grade2Spring'), acc_thres=.95, wcpm_low_pct=25, wcpm_high_pct=75):
    """Score a rule-based difficulty baseline on one slice of the global ``table``.

    Rows with ``table[category[0]] == category[1]`` are selected and each one
    is labelled by the following rule:

    * ``'Hard'``  -- ``AccOverSpanNonSW`` is below ``acc_thres``;
    * ``'Easy'``  -- ``AccOverSpanNonSW`` is at least ``acc_thres`` and ``WCPM``
      is at or above the slice's ``wcpm_high_pct``-th WCPM percentile;
    * ``'Right'`` -- everything else (this includes rows whose accuracy is
      NaN, because both comparisons above are then false).

    The predictions are compared with the ``DiffTarget`` column, and the slice
    name, accuracy and confusion matrix are printed.

    Args:
        category: ``(column_name, value)`` pair selecting the rows to score.
        acc_thres: accuracy cutoff separating 'Hard' from the other labels.
        wcpm_low_pct: accepted for backward compatibility only; the labelling
            rule has no low-WCPM branch, so this value is not used.
        wcpm_high_pct: percentile of the selected rows' WCPM used as the
            'Easy' cutoff. Previously this was hard-coded to 75 regardless of
            the argument; the default keeps that behaviour.

    Returns:
        ``(acc, conf_mat, conf_mat_str)``: the fraction of correct predictions,
        a 3x3 count matrix from ``sklearn.metrics.confusion_matrix`` with rows
        and columns ordered by numeric code 1, 2, 3 (Hard, Right, Easy), and
        the ``nltk`` ``ConfusionMatrix`` object that was printed.

    Raises:
        ValueError: if no row of ``table`` matches ``category``.
        KeyError: if ``DiffTarget`` holds a label missing from ``diff_to_num``.
    """
    table_selected = table.loc[table[category[0]] == category[1]]
    if len(table_selected) == 0:
        # Fail early with a clear message instead of a percentile or
        # division-by-zero error further down.
        raise ValueError('no rows in table where {} == {!r}'.format(category[0], category[1]))

    wcpm_high = np.percentile(table_selected.WCPM, wcpm_high_pct)

    accuracy = table_selected['AccOverSpanNonSW']
    is_hard = (accuracy < acc_thres).to_numpy()
    # The explicit >= test is kept so that NaN accuracy falls through to 'Right'.
    is_easy = ((accuracy >= acc_thres) & (table_selected['WCPM'] >= wcpm_high)).to_numpy()
    # np.select uses the first matching condition, mirroring an if/elif/else chain.
    prediction_column = np.select([is_hard, is_easy], ['Hard', 'Easy'], default='Right').tolist()

    reference = get_diff_to_num(table_selected.DiffTarget)
    predicted = get_diff_to_num(prediction_column)
    conf_mat_str = ConfusionMatrix(reference, predicted)
    # Fixing the label set guarantees a 3x3 matrix even when a class is absent
    # from this slice, so callers can safely sum matrices across slices.
    conf_mat = confusion_matrix(reference, predicted, labels=[1, 2, 3])
    acc = sum(1 for x, y in zip(table_selected.DiffTarget, prediction_column) if x == y) / len(prediction_column)
    print(' = '.join(category))
    print('acc = {:.2f}%'.format(acc*100))
    print(conf_mat_str)

    return acc, conf_mat, conf_mat_str

In [45]:
# Run the baseline on each of the 12 forms (4 grades x 3 terms), collecting the
# per-form accuracy and summing the per-form confusion matrices.
accs = []
cm_total = np.zeros((3, 3))
for g in ('1', '2', '3', '4'):
    for t in ('Fall', 'Winter', 'Spring'):
        form_category = ('Form', 'Grade' + g + t)
        acc, cm, cm_str = naive_prediction(form_category, acc_thres=.85)
        accs.append(acc)
        cm_total += cm

print('median accuracy:', np.median(accs))
print('all sessions:')
print(cm_total)
# Pooled accuracy: diagonal (correct) count over the total count.
print('raw: {} / {} = {:.3f}'.format(
    cm_total.trace(),
    cm_total.sum(),
    cm_total.trace() / cm_total.sum(),
))

Form = Grade1Fall
acc = 70.87%
  |  1  2  3 |
--+----------+
1 |<18> 7  . |
2 | 12<29> . |
3 |  . 11<26>|
--+----------+
(row = reference; col = test)

Form = Grade1Winter
acc = 65.96%
  |  1  2  3 |
--+----------+
1 |<12>10  . |
2 | 11<33> 3 |
3 |  2  6<17>|
--+----------+
(row = reference; col = test)

Form = Grade1Spring
acc = 75.00%
  |  1  2  3 |
--+----------+
1 |<25> 4  . |
2 |  9<29> 4 |
3 |  .  8<21>|
--+----------+
(row = reference; col = test)

Form = Grade2Fall
acc = 70.87%
  |  1  2  3 |
--+----------+
1 |<14> 5  . |
2 | 12<38> 3 |
3 |  1  9<21>|
--+----------+
(row = reference; col = test)

Form = Grade2Winter
acc = 77.67%
  |  1  2  3 |
--+----------+
1 |<22> 1  . |
2 |  7<39> 5 |
3 |  . 10<19>|
--+----------+
(row = reference; col = test)

Form = Grade2Spring
acc = 75.26%
  |  1  2  3 |
--+----------+
1 | <8> 4  . |
2 |  8<40> . |
3 |  . 12<25>|
--+----------+
(row = reference; col = test)

Form = Grade3Fall
acc = 79.41%
  |  1  2  3 |
--+----------+
1 |<16> 1  . |
2 | 