In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from nltk.metrics import ConfusionMatrix
from sklearn.metrics import r2_score, cohen_kappa_score

In [2]:
# Load data

# df = pd.read_csv('../output/teacher-report/teacher_report_gyor.csv')
df = pd.read_csv('../generate-tr/final_df_gyor.csv')
with open('../data/moby/jsons/session_to_difficulty_metrics.json') as f:
    session_to_difficulty_metrics = json.load(f)
with open('../data/moby/jsons/session_to_acc_zscore.json') as f:
    session_to_acc_zscore = json.load(f)
with open('../data/moby/jsons/session_to_wcpm_zscore.json') as f:
    session_to_wcpm_zscore = json.load(f)
with open('../data/moby/jsons/session_to_acc_zscore_by_grade.json') as f:
    session_to_acc_zscore_by_grade = json.load(f)
with open('../data/moby/jsons/session_to_wcpm_zscore_by_grade.json') as f:
    session_to_wcpm_zscore_by_grade = json.load(f)

In [3]:
design = pd.read_csv('data/all_sessions_only_session_scores.csv')

wcpm_zscore_column = [session_to_wcpm_zscore[str(session)] for session in design.Session]
acc_zscore_column = [session_to_acc_zscore[str(session)] for session in design.Session]
wcpm_zscore_by_grade_column = [session_to_wcpm_zscore_by_grade[str(session)] for session in design.Session]
acc_zscore_by_grade_column = [session_to_acc_zscore_by_grade[str(session)] for session in design.Session]

design['WCPM_zscore'] = wcpm_zscore_column
design['Acc_zscore'] = acc_zscore_column
design['WCPM_zscore_by_grade'] = wcpm_zscore_by_grade_column
design['Acc_zscore_by_grade'] = acc_zscore_by_grade_column

design = design.sort_values(by='Session').set_index('Session')
df = df.sort_values(by='Session').set_index('Session')

In [4]:
full_matrix = design.join(df, on='Session').reset_index()

In [5]:
mlevel_numeric_column = []
form_numeric_column = []
mlevel_form_diff_column = []
text_is_above_level_column = []

for idx, row in full_matrix.iterrows():
    session = str(int(row.Session))
    difficulty_metrics = session_to_difficulty_metrics[session]
    
    mlevel_numeric_column.append(difficulty_metrics['mlevel_numeric'])
    form_numeric_column.append(difficulty_metrics['form_numeric'])
    mlevel_form_diff_column.append(difficulty_metrics['mlevel_form_diff'])
    text_is_above_level_column.append(difficulty_metrics['text_is_above_level'])

full_matrix['mlevel_numeric'] = mlevel_numeric_column
full_matrix['form_numeric'] = form_numeric_column
full_matrix['mlevel_form_diff'] = mlevel_form_diff_column
full_matrix['text_is_above_level'] = text_is_above_level_column

In [6]:
# full_matrix.to_csv('data/all_sessions_with_session_and_difficulty_scores.csv', index=None)

In [7]:
full_matrix.columns

Index(['Session', 'WCPM', 'Acc', 'Comp', 'Exp', 'level', 'WCPM_zscore',
       'Acc_zscore', 'WCPM_zscore_by_grade', 'Acc_zscore_by_grade', 'gyorB',
       'gyorC', 'gyorE', 'gyorF', 'gyorG', 'gyorH', 'gyorJ', 'gyorK', 'gyorL',
       'gyorM', 'gyorN', 'gyorO', 'gyorP', 'gyorQ', 'mlevel_numeric',
       'form_numeric', 'mlevel_form_diff', 'text_is_above_level'],
      dtype='object')

In [8]:
def discretize_predictions(predictions,
                           cutoffs=['value < .5',
                                    '.5 <= value < 1.5',
                                    '1.5 <= value < 2.5',
                                    '2.5 <= value']):
    discretized = []
    for value in predictions:
        for idx, formula in enumerate(cutoffs):
            if eval(formula):
                discretized.append(idx)
    assert len(discretized) == len(predictions)
    return discretized
    

def run_regressor(matrix, feature_cols=[], col_to_predict=''):
    X = matrix[feature_cols]
    y = matrix[col_to_predict]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
    
    reg = LinearRegression()
    reg.fit(X_train, y_train)
    
    print('Weights: ' + ', '.join(['{:.2f}'.format(x) for x in reg.coef_]))
    
    y_pred = reg.predict(X_test)    
    y_pred_discretized = discretize_predictions(y_pred)
    
    conf_mat = ConfusionMatrix(y_test, y_pred_discretized)
    
    print('R2 (discretized): {:.3f}'.format(r2_score(y_test, y_pred_discretized)))
    print('R2 (continuous):  {:.3f}'.format(r2_score(y_test, y_pred)))
    
    print('QWK: {:.3f}'.format(cohen_kappa_score(y_test, y_pred_discretized, weights="quadratic")))
    
    print(str(conf_mat))
    
    accuracy = sum(1 for x, y in zip(y_test, y_pred_discretized) if x == y) / len(y_test)
    print('Raw accuracy: {:.3f}'.format(accuracy))

In [9]:
feature_cols=[
    'WCPM',
    'Acc',
    'Comp',
    'Exp',
    'level',
    'WCPM_zscore',
    'Acc_zscore',
    'WCPM_zscore_by_grade',
    'Acc_zscore_by_grade',
    'mlevel_numeric',
    'form_numeric',
    'mlevel_form_diff',
    'text_is_above_level',
]

run_regressor(full_matrix, feature_cols=feature_cols, col_to_predict='gyorJ')

Weights: 0.00, 8.79, 0.05, 0.51, -0.18, 0.76, -0.48, -0.45, 0.09, 0.05, 0.01, 0.04, -0.24
R2 (discretized): 0.509
R2 (continuous):  0.503
QWK: 0.687
  |   0   1   2   3   4   5   6 |
--+-----------------------------+
0 | <11> 31  12   .   .   .   . |
1 |   2  <9> 18   4   .   .   . |
2 |   .   7 <39> 15   .   .   . |
3 |   .   7  56<154>  .   .   . |
4 |   .   .   .   .  <.>  .   . |
5 |   .   .   .   .   .  <.>  . |
6 |   .   .   .   .   .   .  <.>|
--+-----------------------------+
(row = reference; col = test)

Raw accuracy: 0.584


In [10]:
run_regressor(full_matrix, feature_cols=feature_cols, col_to_predict='gyorL')

Weights: 0.01, 7.45, 0.03, 0.59, -0.14, 0.40, -0.33, -0.11, -0.16, 0.03, 0.02, 0.01, -0.15
R2 (discretized): 0.661
R2 (continuous):  0.707
QWK: 0.804
  |   0   1   2   3   4   5   6 |
--+-----------------------------+
0 | <20> 25   1   .   .   .   . |
1 |   . <24> 20   1   .   .   . |
2 |   2  12 <68>  8   .   .   . |
3 |   .   4  41<139>  .   .   . |
4 |   .   .   .   .  <.>  .   . |
5 |   .   .   .   .   .  <.>  . |
6 |   .   .   .   .   .   .  <.>|
--+-----------------------------+
(row = reference; col = test)

Raw accuracy: 0.688


In [11]:
grade_2_matrix = full_matrix.loc[(full_matrix.form_numeric <= 11) & (full_matrix.form_numeric >= 9)]

In [12]:
with open('../data/moby/jsons/session_to_difficulty_metrics.json') as f:
    session_to_difficulty_metrics = json.load(f)

difficulty_metrics_names = list(session_to_difficulty_metrics[list(session_to_difficulty_metrics.keys())[0]].keys())

In [13]:
difficulty_metrics_names

['mlevel_numeric', 'form_numeric', 'mlevel_form_diff', 'text_is_above_level']

In [14]:
new_difficulty_metrics_columns = [[] for metric in difficulty_metrics_names]
for idx, row in grade_2_matrix.iterrows():
    session = str(int(row.Session))
    difficulty_metrics = session_to_difficulty_metrics[session]
    print(difficulty_metrics)
    break

{'mlevel_numeric': 6, 'form_numeric': 9, 'mlevel_form_diff': -3, 'text_is_above_level': 0}


In [15]:
run_regressor(grade_2_matrix, feature_cols=feature_cols, col_to_predict='gyorJ')

Weights: -0.03, 0.19, 0.07, 0.55, -0.60, 2.48, -1.80, -0.00, 2.16, 0.26, 0.21, 0.05, 0.03
R2 (discretized): 0.537
R2 (continuous):  0.596
QWK: 0.711
  |  0  1  2  3  4  5  6 |
--+----------------------+
0 | <2>11  2  .  .  .  . |
1 |  1 <3> 2  .  .  .  . |
2 |  1  2<11> 4  .  .  . |
3 |  .  1 18<35> .  .  . |
4 |  .  .  .  . <.> .  . |
5 |  .  .  .  .  . <.> . |
6 |  .  .  .  .  .  . <.>|
--+----------------------+
(row = reference; col = test)

Raw accuracy: 0.548


In [16]:
run_regressor(grade_2_matrix, feature_cols=feature_cols, col_to_predict='gyorL')

Weights: -0.03, 0.06, 0.03, 0.37, -0.33, 2.02, -0.58, -0.00, 0.66, 0.19, 0.15, 0.04, 0.04
R2 (discretized): 0.679
R2 (continuous):  0.709
QWK: 0.828
  |  0  1  2  3  4  5  6 |
--+----------------------+
0 | <4> 2  .  .  .  .  . |
1 |  .<13> 4  .  .  .  . |
2 |  .  5<11> 2  .  .  . |
3 |  .  . 15<37> .  .  . |
4 |  .  .  .  . <.> .  . |
5 |  .  .  .  .  . <.> . |
6 |  .  .  .  .  .  . <.>|
--+----------------------+
(row = reference; col = test)

Raw accuracy: 0.699
