In [95]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from nltk.metrics import ConfusionMatrix

In [56]:
# Load data

df = pd.read_csv('../output/teacher-report/teacher_report_gyor.csv')

In [57]:
design = pd.read_csv('data/all_sessions_only_session_scores.csv')

In [61]:
full_matrix = pd.concat([design, df], axis=1).T.drop_duplicates().T

In [79]:
full_matrix.columns

Index(['Session', 'WCPM', 'Acc', 'Comp', 'Exp', 'level', 'B_new', 'C_new',
       'E_new', 'F_new', 'G_new', 'H_new', 'J_new', 'K_new', 'L_new', 'M_new',
       'N_new', 'O_new', 'P_new', 'Q_new'],
      dtype='object')

In [109]:
def discretize_predictions(predictions,
                           cutoffs=['value < .5',
                                    '.5 <= value < 1.5',
                                    '1.5 <= value < 2.5',
                                    '2.5 <= value']):
    discretized = []
    for value in predictions:
        for idx, formula in enumerate(cutoffs):
            if eval(formula):
                discretized.append(idx)
    assert len(discretized) == len(predictions)
    return discretized
    

def run_regressor(matrix, feature_cols=[], col_to_predict=''):
    X = matrix[feature_cols]
    y = matrix[col_to_predict]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)
    
    reg = LinearRegression()
    reg.fit(X_train, y_train)
    
    y_pred = reg.predict(X_test)    
    y_pred_discretized = discretize_predictions(y_pred)
    
    conf_mat = ConfusionMatrix(y_test, y_pred_discretized)
    print(str(conf_mat))
    
    accuracy = sum(1 for x, y in zip(y_test, y_pred_discretized) if x == y) / len(y_test)
    print('Raw accuracy:', accuracy)

In [110]:
run_regressor(full_matrix, feature_cols=['WCPM', 'Acc', 'Comp', 'Exp', 'level'], col_to_predict='J_new')

    |   0   1   2   3   4   5   6 |
    |   .   .   .   .   .   .   . |
    |   0   0   0   0   0   0   0 |
----+-----------------------------+
0.0 | <55> 23   .   .   .   .   . |
1.0 |   3<116> 22   .   .   .   . |
2.0 |   2  33 <85>  7   .   .   . |
3.0 |   .   1   7 <11>  .   .   . |
4.0 |   .   .   .   .  <.>  .   . |
5.0 |   .   .   .   .   .  <.>  . |
6.0 |   .   .   .   .   .   .  <.>|
----+-----------------------------+
(row = reference; col = test)

Raw accuracy: 0.7315068493150685


In [111]:
run_regressor(full_matrix, feature_cols=['WCPM', 'Acc', 'Comp', 'Exp', 'level'], col_to_predict='L_new')

    |   0   1   2   3   4   5   6 |
    |   .   .   .   .   .   .   . |
    |   0   0   0   0   0   0   0 |
----+-----------------------------+
0.0 | <13> 24   1   .   .   .   . |
1.0 |   1 <32> 18   .   .   .   . |
2.0 |   .  14 <61>  4   .   .   . |
3.0 |   .   6  52<139>  .   .   . |
4.0 |   .   .   .   .  <.>  .   . |
5.0 |   .   .   .   .   .  <.>  . |
6.0 |   .   .   .   .   .   .  <.>|
----+-----------------------------+
(row = reference; col = test)

Raw accuracy: 0.6712328767123288
