In [62]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from nltk.metrics import ConfusionMatrix
from sklearn.metrics import r2_score, cohen_kappa_score

In [63]:
# Load data

df = pd.read_csv('../output/teacher-report/teacher_report_gyor.csv')
with open('../data/moby/jsons/session_to_difficulty_metrics.json') as f:
    session_to_difficulty_metrics = json.load(f)

In [64]:
design = pd.read_csv('data/all_sessions_only_session_scores.csv')

In [65]:
full_matrix = pd.concat([design, df], axis=1).T.drop_duplicates().T

In [66]:
mlevel_numeric_column = []
form_numeric_column = []
mlevel_form_diff_column = []
text_is_above_level_column = []

for idx, row in full_matrix.iterrows():
    session = str(int(row.Session))
    difficulty_metrics = session_to_difficulty_metrics[session]
    
    mlevel_numeric_column.append(difficulty_metrics['mlevel_numeric'])
    form_numeric_column.append(difficulty_metrics['form_numeric'])
    mlevel_form_diff_column.append(difficulty_metrics['mlevel_form_diff'])
    text_is_above_level_column.append(difficulty_metrics['text_is_above_level'])

full_matrix['mlevel_numeric'] = mlevel_numeric_column
full_matrix['form_numeric'] = form_numeric_column
full_matrix['mlevel_form_diff'] = mlevel_form_diff_column
full_matrix['text_is_above_level'] = text_is_above_level_column

In [67]:
# full_matrix.to_csv('data/all_sessions_with_session_and_difficulty_scores.csv', index=None)

In [68]:
full_matrix.columns

Index(['Session', 'WCPM', 'Acc', 'Comp', 'Exp', 'level', 'B_new', 'C_new',
       'E_new', 'F_new', 'G_new', 'H_new', 'J_new', 'K_new', 'L_new', 'M_new',
       'N_new', 'O_new', 'P_new', 'Q_new', 'mlevel_numeric', 'form_numeric',
       'mlevel_form_diff', 'text_is_above_level'],
      dtype='object')

In [82]:
def discretize_predictions(predictions,
                           cutoffs=['value < .5',
                                    '.5 <= value < 1.5',
                                    '1.5 <= value < 2.5',
                                    '2.5 <= value']):
    discretized = []
    for value in predictions:
        for idx, formula in enumerate(cutoffs):
            if eval(formula):
                discretized.append(idx)
    assert len(discretized) == len(predictions)
    return discretized
    

def run_regressor(matrix, feature_cols=[], col_to_predict=''):
    X = matrix[feature_cols]
    y = matrix[col_to_predict]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
    
    reg = LinearRegression()
    reg.fit(X_train, y_train)
    
    print('Weights: ' + ', '.join(['{:.2f}'.format(x) for x in reg.coef_]))
    
    y_pred = reg.predict(X_test)    
    y_pred_discretized = discretize_predictions(y_pred)
    
    conf_mat = ConfusionMatrix(y_test, y_pred_discretized)
    
    print('R2 (discretized): {:.3f}'.format(r2_score(y_test, y_pred_discretized)))
    print('R2 (continuous):  {:.3f}'.format(r2_score(y_test, y_pred)))
    
    print('QWK: {:.3f}'.format(cohen_kappa_score(y_test, y_pred_discretized, weights="quadratic")))
    
    print(str(conf_mat))
    
    accuracy = sum(1 for x, y in zip(y_test, y_pred_discretized) if x == y) / len(y_test)
    print('Raw accuracy: {:.3f}'.format(accuracy))

In [83]:
feature_cols=[
    'WCPM',
    'Acc',
    'Comp',
    'Exp',
    'level',
    'mlevel_numeric',
    'form_numeric',
    'mlevel_form_diff',
    'text_is_above_level'
]

run_regressor(full_matrix, feature_cols=feature_cols, col_to_predict='J_new')

Weights: 0.02, 0.24, 0.01, 0.23, -0.08, 0.02, -0.02, 0.03, -0.07
R2 (discretized): 0.615
R2 (continuous):  0.737
QWK: 0.806
    |   0   1   2   3   4   5   6 |
    |   .   .   .   .   .   .   . |
    |   0   0   0   0   0   0   0 |
----+-----------------------------+
0.0 | <68> 17   .   .   .   .   . |
1.0 |  10<101> 23   .   .   .   . |
2.0 |   .  27 <85> 13   .   .   . |
3.0 |   .   2   8 <11>  .   .   . |
4.0 |   .   .   .   .  <.>  .   . |
5.0 |   .   .   .   .   .  <.>  . |
6.0 |   .   .   .   .   .   .  <.>|
----+-----------------------------+
(row = reference; col = test)

Raw accuracy: 0.726


In [85]:
run_regressor(full_matrix, feature_cols=feature_cols, col_to_predict='L_new')

Weights: 0.01, 1.94, 0.03, 0.55, -0.09, 0.02, 0.02, 0.00, -0.03
R2 (discretized): 0.621
R2 (continuous):  0.692
QWK: 0.770
    |   0   1   2   3   4   5   6 |
    |   .   .   .   .   .   .   . |
    |   0   0   0   0   0   0   0 |
----+-----------------------------+
0.0 | <11> 27   1   .   .   .   . |
1.0 |   . <35> 19   .   .   .   . |
2.0 |   .  15 <62>  6   .   .   . |
3.0 |   .   3  65<121>  .   .   . |
4.0 |   .   .   .   .  <.>  .   . |
5.0 |   .   .   .   .   .  <.>  . |
6.0 |   .   .   .   .   .   .  <.>|
----+-----------------------------+
(row = reference; col = test)

Raw accuracy: 0.627


In [98]:
grade_2_matrix = full_matrix.loc[(full_matrix.form_numeric <= 11) & (full_matrix.form_numeric >= 9)]
run_regressor(grade_2_matrix, feature_cols=feature_cols, col_to_predict='J_new')

Weights: 0.03, 0.16, 0.03, 0.08, -0.21, 0.11, 0.07, 0.04, -0.09
R2 (discretized): 0.583
R2 (continuous):  0.713
QWK: 0.774
    |  0  1  2  3  4  5 |
    |  .  .  .  .  .  . |
    |  0  0  0  0  0  0 |
----+-------------------+
0.0 |<24> 7  .  .  .  . |
1.0 |  4<22> 5  .  .  . |
2.0 |  .  8<15> 3  .  . |
3.0 |  .  .  5 <.> .  . |
4.0 |  .  .  .  . <.> . |
5.0 |  .  .  .  .  . <.>|
----+-------------------+
(row = reference; col = test)

Raw accuracy: 0.656


In [99]:
run_regressor(grade_2_matrix, feature_cols=feature_cols, col_to_predict='L_new')

Weights: 0.02, 1.47, 0.05, 0.12, -0.30, 0.19, 0.09, 0.10, 0.15
R2 (discretized): 0.708
R2 (continuous):  0.710
QWK: 0.811
    |  0  1  2  3  4  5  6 |
    |  .  .  .  .  .  .  . |
    |  0  0  0  0  0  0  0 |
----+----------------------+
0.0 | <2>12  .  .  .  .  . |
1.0 |  . <7> 6  .  .  .  . |
2.0 |  .  2<15> 1  .  .  . |
3.0 |  .  . 13<35> .  .  . |
4.0 |  .  .  .  . <.> .  . |
5.0 |  .  .  .  .  . <.> . |
6.0 |  .  .  .  .  .  . <.>|
----+----------------------+
(row = reference; col = test)

Raw accuracy: 0.634
