In [70]:
import numpy as np
import pandas as pd

from collections import defaultdict

In [71]:
# Load the annotation sheet and normalise it in a single pipeline:
#   * the first four lines of the TSV are skipped before the header row;
#   * columns whose name contains 'Unnamed' (typically pandas' auto-generated
#     names for header-less columns) and the 'Stack' column are discarded;
#   * rows lacking a Session or Response value are removed;
#   * Session/Response/Item are cast through int and then to str, which
#     yields keys like '12' instead of '12.0' when the column was parsed as
#     float;
#   * response-level columns get descriptive names, exact duplicate rows are
#     dropped, and the rows are ordered by Form, Response, Session.
df = (
    pd.read_csv('predicting-annotation-fields/annotations-complete.tsv', sep='\t', skiprows=4)
    .loc[:, lambda frame: [name for name in frame.columns if 'Unnamed' not in name]]
    .drop(columns=['Stack'])
    .dropna(subset=['Session', 'Response'], how='any')
    .astype({'Session': 'int', 'level': 'int32', 'Response': 'int', 'Item': 'int'})
    .astype({'Session': 'str', 'Response': 'str', 'Item': 'str'})
    .rename(columns={
        'WCPM.1': 'WCPM_response',
        'Comp.1': 'Comp_response',
        'Expression': 'Exp_response',
        'Name A': 'Annotator_name',
    })
    .drop_duplicates()
    .sort_values(by=['Form', 'Response', 'Session'])
)

In [72]:
# df.to_csv('AnnotationTable.csv', index=False)

In [73]:
# Read the response-level Profiler scores and group them per session.
df2 = pd.read_csv('docs/Profiler Response-level Scores.csv')
session_to_saq_scores = defaultdict(list)
session_to_retell_scores = defaultdict(list)

# (source column, destination mapping): both SAQ columns feed the same list,
# the Retell column feeds its own.
score_targets = (
    ('SAQ1', session_to_saq_scores),
    ('SAQ2', session_to_saq_scores),
    ('Retell', session_to_retell_scores),
)

for _, row in df2.iterrows():
    # Same key format as the annotation table: integer rendered as a string.
    sess = str(int(row.Session))
    for column, bucket in score_targets:
        raw = row[column]
        # NaN is the only value that is not equal to itself; it is stored as 0.
        bucket[sess].append(raw if raw == raw else 0)

In [83]:
# Per-session mean of the collected SAQ scores, halved.
# NOTE(review): the halving presumably rescales a 0-2 score to 0-1 - confirm.
session_to_normalized_saq_score = dict(
    (session, sum(saq_scores) / len(saq_scores) / 2)
    for session, saq_scores in session_to_saq_scores.items()
)

# Per-session mean of the collected retell scores, halved as well (despite the
# "average" in the name, the stored value is mean / 2).
session_to_average_retell_score = dict(
    (session, sum(retell_scores) / len(retell_scores) / 2)
    for session, retell_scores in session_to_retell_scores.items()
)

In [84]:
# Group the annotation rows by session: each row becomes a plain dict whose
# rating columns '1'..'20' are shifted down by one.
# NOTE(review): presumably this turns 1-based ratings into 0-based ones - confirm.
session_to_info_list = defaultdict(list)
rating_columns = [str(number) for number in range(1, 21)]

for _, annotation in df.iterrows():
    record = annotation.to_dict()
    for rating_column in rating_columns:
        record[rating_column] = record[rating_column] - 1
    session_to_info_list[record['Session']].append(record)

In [85]:
def GET_CATEGORY_SCORE(session, session_to_info_list, category='1'):
    """Average one rating column over all records of a session.

    Args:
        session: key into ``session_to_info_list``.
        session_to_info_list: mapping from session to a list of record dicts.
        category: key of the rating to read from every record.

    Returns:
        The mean of the non-NaN values found under ``category``, or ``0``
        when the session has no such value.
    """
    observed = [
        record[category]
        for record in session_to_info_list[session]
        if record[category] == record[category]  # NaN != NaN, so NaNs are skipped
    ]
    return sum(observed) / len(observed) if observed else 0

In [77]:
def UPDATE_WITH_SCORE(session_to_tr_values,
                      session_to_info_list,
                      categories=['1', '2', '3'],
                      category_name='G',
                      info='add.info',
                      divide_by_2=True):
    """Store, for every session, the mean of several per-category scores.

    For each session in ``session_to_info_list`` the score of every entry in
    ``categories`` is obtained from ``GET_CATEGORY_SCORE`` and the scores are
    averaged with equal weight.

    Args:
        session_to_tr_values: mapping session -> dict of report values;
            updated in place.
        session_to_info_list: mapping session -> list of record dicts.
        categories: rating keys to combine; each is converted with ``str``.
        category_name: key under which the result is stored (``str`` applied).
        info: accepted but not used by this function.
        divide_by_2: when true, the averaged score is halved before storing.

    Returns:
        ``session_to_tr_values`` (the same object that was passed in).
    """
    target_key = str(category_name)
    for session in session_to_info_list:
        per_category = []
        for category in categories:
            per_category.append(
                GET_CATEGORY_SCORE(session, session_to_info_list, category=str(category))
            )
        mean_score = sum(per_category) / len(per_category)
        session_to_tr_values[session][target_key] = (
            mean_score / 2 if divide_by_2 else mean_score
        )
    return session_to_tr_values

In [80]:
def UPDATE_WITH_SCORE_FOR_F(session_to_tr_values,
                            session_to_info_list):
    """Store the 'F' value of every session.

    Each record contributes ``(2 * (r7 + r8 + r9) + r10) / 7`` where ``rN`` is
    the record's value under key ``'N'`` (NaN counted as 0); the session's
    'F' is the mean of these contributions over its records.

    Args:
        session_to_tr_values: mapping session -> dict of report values;
            updated in place.
        session_to_info_list: mapping session -> list of record dicts.

    Returns:
        ``session_to_tr_values`` (the same object that was passed in).
    """
    def _zero_if_nan(rating):
        # NaN is the only value that differs from itself.
        return rating if rating == rating else 0

    for session, records in session_to_info_list.items():
        contributions = []
        for record in records:
            r7, r8, r9, r10 = (
                _zero_if_nan(record[key]) for key in ('7', '8', '9', '10')
            )
            # Ratings 7-9 carry double weight, rating 10 single weight.
            contributions.append((2 * (r7 + r8 + r9) + r10) / 7)

        session_to_tr_values[session]['F'] = sum(contributions) / len(contributions)
    return session_to_tr_values

def UPDATE_WITH_SCORE_FOR_Q(session_to_tr_values,
                           session_to_info_list):
    """Store the 'Q' value (relative WCPM spread across items) per session.

    For every session the first ``'WCPM_response'`` seen for each distinct
    ``'Item'`` is used, in order of first appearance:

    * exactly three items: the largest pairwise absolute difference divided
      by the median of the three values;
    * any other count: the session key is printed as a diagnostic, then the
      first two items are compared as ``|a - b| / mean(a, b) / 2``;
    * fewer than two items: there is nothing to compare, so 'Q' is ``0.0``
      (previously this raised ``IndexError``);
    * a spread of exactly zero yields ``0.0`` without dividing, so identical
      values of 0 no longer trigger a division by zero.

    Args:
        session_to_tr_values: mapping session -> dict of report values;
            updated in place.
        session_to_info_list: mapping session -> list of record dicts with
            ``'Item'`` and ``'WCPM_response'`` keys.

    Returns:
        ``session_to_tr_values`` (the same object that was passed in).
    """
    for session, info_list in session_to_info_list.items():
        # A dict keeps insertion order, so the choice of items is deterministic
        # (iterating a set of strings is not stable between interpreter runs).
        wcpm_by_item = {}
        for d in info_list:
            wcpm_by_item.setdefault(d['Item'], d['WCPM_response'])
        wcpms = list(wcpm_by_item.values())

        if len(wcpms) == 3:
            wcpm0, wcpm1, wcpm2 = wcpms
            spread = max(abs(wcpm0 - wcpm1), abs(wcpm1 - wcpm2), abs(wcpm2 - wcpm0))
            # sorted(...)[1] is the median of the three values.
            score = spread / sorted(wcpms)[1] if spread != 0 else 0.0
        else:
            # Diagnostic kept from the original: flag sessions without
            # exactly three items.
            print(session)
            if len(wcpms) < 2:
                score = 0.0
            else:
                wcpm0, wcpm1 = wcpms[:2]
                spread = abs(wcpm0 - wcpm1)
                score = spread / ((wcpm0 + wcpm1) / 2) / 2 if spread != 0 else 0.0

        session_to_tr_values[session]['Q'] = score

    return session_to_tr_values

def UPDATE_FROM_DICT(session_to_tr_values,
                    dictionary,
                    category_name='N',
                    sessions=None):
    """Copy a precomputed per-session value into the report values.

    Args:
        session_to_tr_values: mapping session -> dict of report values;
            updated in place.
        dictionary: mapping session -> value to store.
        category_name: key under which the value is stored.
        sessions: iterable of session keys to update (a mapping works too,
            its keys are used). When omitted, the notebook-level
            ``session_to_info_list`` is used, which is what this function
            always iterated before the parameter existed.

    Returns:
        ``session_to_tr_values`` (the same object that was passed in).

    Raises:
        KeyError: if a session to update is missing from ``dictionary``.
    """
    if sessions is None:
        # Backward-compatible fallback to the module-level global.
        sessions = session_to_info_list

    for session in sessions:
        session_to_tr_values[session][category_name] = dictionary[session]

    return session_to_tr_values

In [81]:
# Assemble the per-session teacher-report values. The call order is kept
# because it fixes the order in which the keys are inserted.
session_to_tr_values = defaultdict(dict)

# Report columns that are plain averages of rating columns (halved).
for report_column, rating_ids in (('B', [1, 2]),
                                  ('C', [3, 4, 5, 6]),
                                  ('E', [16])):
    session_to_tr_values = UPDATE_WITH_SCORE(session_to_tr_values,
                                            session_to_info_list,
                                            categories=rating_ids,
                                            category_name=report_column)

# 'F' uses its own weighted formula.
session_to_tr_values = UPDATE_WITH_SCORE_FOR_F(session_to_tr_values,
                                              session_to_info_list)

# NOTE(review): 'G' (rating 15) is computed twice, exactly as in the original
# call sequence. The repeat is redundant; it may have been meant for another
# column (there is no 'I') - confirm.
for report_column, rating_ids in (('G', [15]),
                                  ('H', [17]),
                                  ('G', [15]),
                                  ('J', [11, 18]),
                                  ('K', [14]),
                                  ('L', [12]),
                                  ('M', [13])):
    session_to_tr_values = UPDATE_WITH_SCORE(session_to_tr_values,
                                            session_to_info_list,
                                            categories=rating_ids,
                                            category_name=report_column)

# Columns copied from the precomputed Profiler aggregates.
for report_column, per_session_value in (('N', session_to_normalized_saq_score),
                                         ('O', session_to_average_retell_score)):
    session_to_tr_values = UPDATE_FROM_DICT(session_to_tr_values,
                                            per_session_value,
                                            category_name=report_column)

# 'P' is the only averaged column that is not halved.
session_to_tr_values = UPDATE_WITH_SCORE(session_to_tr_values,
                                        session_to_info_list,
                                        categories=[19],
                                        category_name='P',
                                        divide_by_2=False)

# 'Q' is derived from the WCPM spread across items.
session_to_tr_values = UPDATE_WITH_SCORE_FOR_Q(session_to_tr_values,
                                              session_to_info_list)

26604


In [97]:
def GET_discrete_column(table,
                        column_name,
                        cutoff=[
                            'value < .33',
                            '.33 <= value <= .58',
                            '.58 < value <= .83',
                            '.83 < value']):
    """Add ``<column_name>_new`` holding the index of the matching cutoff.

    Every entry of ``table[column_name]`` is tested against each formula in
    ``cutoff``; the position of the formula that holds becomes the entry's
    discrete value.

    Args:
        table: container indexable by column name (e.g. a DataFrame);
            modified in place.
        column_name: name of the column to discretise.
        cutoff: Python expressions over the name ``value``. They are run
            with ``eval``, so they must come from trusted code only.

    Returns:
        ``table``, with the new ``<column_name>_new`` column added.

    Raises:
        AssertionError: if a value satisfies no formula (e.g. NaN) or more
            than one. The check is made per value: a single length check at
            the end can pass while the result is misaligned, when one value
            matches nothing and another matches twice.
    """
    column = table[column_name]
    new_column = []
    for value in column:
        # A plain loop (not a comprehension) keeps `value` visible to eval().
        matches = []
        for idx, formula in enumerate(cutoff):
            if eval(formula):
                matches.append(idx)
        if len(matches) != 1:
            raise AssertionError(
                'value %r in column %r matched %d cutoff formulas '
                '(expected exactly 1)' % (value, column_name, len(matches)))
        new_column.append(matches[0])

    table[column_name + '_new'] = new_column
    return table

In [100]:
# One row per session, one column per report value; saved before discretising.
table = pd.DataFrame(session_to_tr_values).transpose()

table.to_csv('output/teacher-report/teacher_report.csv')

# Columns discretised with GET_discrete_column's default cutoffs.
for report_column in ('B', 'C', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M'):
    table = GET_discrete_column(table, report_column)

# Columns with their own cutoffs. For 'Q' the formulas run from the largest
# values to the smallest, so a larger value gets a lower index.
custom_cutoffs = (
    ('N', ['value < .5',
           '.5 <= value <= .58',
           '.58 < value <= .66',
           '.66 < value']),
    ('O', ['value < 1',
           '1 <= value <= 1.75',
           '1.75 < value <= 2.5',
           '2.5 < value']),
    ('P', ['value < .33',
           '.33 <= value <= .5',
           '.5 < value <= .66',
           '.66 < value']),
    ('Q', ['value > .66',
           '.46 <= value <= .66',
           '.25 <= value < .46',
           'value < .25']),
)
for report_column, formulas in custom_cutoffs:
    table = GET_discrete_column(table, report_column, cutoff=formulas)

# Keep only the discretised columns (those whose name contains 'new').
discrete_columns = [col for col in table.columns if 'new' in col]
table = table[discrete_columns]

In [102]:
# Write the current `table` (reduced by the preceding cell to the columns
# whose name contains 'new') to the second teacher-report CSV.
table.to_csv('output/teacher-report/teacher_report_gyor.csv')