# Case Study 1: Essay Grading with Large Language Models

## 1. Load Data 

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the ASAP-AES training release (contains every essay set) into a frame.
df = pd.read_excel(io='./asap-aes/training_set_rel3.xlsx')

In [3]:
# Which essay sets (prompts) are present in the training data?
df.essay_set.unique()

array([1, 2, 3, 4, 5, 6, 7, 8])

We only use essay set 1 (`essay_set == 1`).

In [6]:
# Keep the rows of essay set 1, then drop every column that is completely
# empty for that set (columns with at least one value are retained).
set_1 = (
    df[df['essay_set'] == 1]
    .dropna(axis='columns', how='all')
)
set_1

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",4.0,4.0,8.0
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5.0,4.0,9.0
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.0,3.0,7.0
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,5.0,10.0
4,5,1,"Dear @LOCATION1, I know having computers has a...",4.0,4.0,8.0
...,...,...,...,...,...,...
1778,1783,1,"Dear @CAPS1, @CAPS2 several reasons on way I t...",4.0,4.0,8.0
1779,1784,1,Do a adults and kids spend to much time on the...,3.0,4.0,7.0
1780,1785,1,My opinion is that people should have computer...,4.0,4.0,8.0
1781,1786,1,"Dear readers, I think that its good and bad to...",1.0,1.0,2.0


Load the essay scores graded by ChatGPT 4

In [8]:
# NOTE: this rebinds `df` (until now the ASAP training set) to the GPT grades;
# the merge cell further down reads the grades from this name.
df = pd.read_csv(filepath_or_buffer='./graded_essays.csv')
df

Unnamed: 0,essay_id,score
0,1,6
1,2,6
2,3,6
3,4,6
4,5,6
...,...,...
1778,1783,6
1779,1784,6
1780,1785,6
1781,1786,2


In [22]:
# Attach the GPT score to every set-1 essay (matched on essay_id), then build
# a combined score in which GPT takes the place of the second human rater:
# GPT_rater_1_score = score_GPT + rater1_domain1.
result_df = (
    pd.merge(set_1, df, on='essay_id', how='inner')
    .rename(columns={'score': 'score_GPT'})
    .assign(
        GPT_rater_1_score=lambda merged: merged['score_GPT'] + merged['rater1_domain1']
    )
)
result_df

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,domain1_score,score_GPT,GPT_rater_1_score
0,1,1,"Dear local newspaper, I think effects computer...",4.0,4.0,8.0,6,10.0
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5.0,4.0,9.0,6,11.0
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.0,3.0,7.0,6,10.0
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,5.0,10.0,6,11.0
4,5,1,"Dear @LOCATION1, I know having computers has a...",4.0,4.0,8.0,6,10.0
...,...,...,...,...,...,...,...,...
1778,1783,1,"Dear @CAPS1, @CAPS2 several reasons on way I t...",4.0,4.0,8.0,6,10.0
1779,1784,1,Do a adults and kids spend to much time on the...,3.0,4.0,7.0,6,9.0
1780,1785,1,My opinion is that people should have computer...,4.0,4.0,8.0,6,10.0
1781,1786,1,"Dear readers, I think that its good and bad to...",1.0,1.0,2.0,2,3.0


## 2. Evaluate essay grading using quadratic weighted kappa

In [26]:
def Cmatrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between two raters' ratings.

    Parameters
    ----------
    rater_a, rater_b : sequence of int
        Ratings given by each rater to the same items, in the same order.
        Lists, tuples, NumPy arrays and pandas Series are all accepted.
    min_rating, max_rating : int, optional
        Lowest / highest possible rating. When omitted, the smallest /
        largest rating observed across both raters is used.

    Returns
    -------
    list of list of int
        Square matrix ``m`` in which ``m[i][j]`` counts the items rated
        ``min_rating + i`` by ``rater_a`` and ``min_rating + j`` by
        ``rater_b``.

    Raises
    ------
    AssertionError
        If the two rating sequences differ in length.
    ValueError
        If a bound has to be inferred from empty input.
    """
    assert len(rater_a) == len(rater_b)
    # Take the extreme of each rater separately. ``rater_a + rater_b`` only
    # concatenates for lists/tuples; for NumPy arrays or pandas Series it adds
    # element-wise and would yield wrong bounds.
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0] * num_ratings for _ in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        # Shift by min_rating so the lowest rating maps to row/column 0.
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Count how many times a rater used each rating.

    ``min_rating`` / ``max_rating`` default to the smallest / largest value
    found in ``ratings``. The returned list has one slot per rating in
    ``min_rating..max_rating``; slot ``k`` holds the count of the rating
    ``min_rating + k``.
    """
    lowest = min(ratings) if min_rating is None else min_rating
    highest = max(ratings) if max_rating is None else max_rating
    counts = [0] * int(highest - lowest + 1)
    for rating in ratings:
        # Offset so that the lowest rating lands in slot 0.
        counts[rating - lowest] += 1
    return counts


def quadratic_weighted_kappa(y, y_pred, min_rating=None, max_rating=None):
    """
    Calculates the quadratic weighted kappa.

    The quadratic weighted kappa is a measure of inter-rater agreement
    between two raters that provide discrete numeric ratings. Potential
    values range from -1 (representing complete disagreement) to 1
    (representing complete agreement). A kappa value of 0 is expected if all
    agreement is due to chance.

    Parameters
    ----------
    y, y_pred : array-like of numbers
        Ratings of the two raters for the same items, in the same order.
        Both are converted to integer NumPy arrays before scoring and must
        have the same length.
    min_rating, max_rating : int, optional
        Minimum / maximum possible rating. When omitted, the observed
        minimum / maximum across both raters is used, i.e. the data is
        assumed to contain the complete range of possible ratings.

    Returns
    -------
    float
        The quadratic weighted kappa. By convention 1.0 is returned when
        both raters assigned one and the same rating to every item, where
        the statistic is otherwise undefined (0 / 0).

    Raises
    ------
    AssertionError
        If ``y`` and ``y_pred`` differ in length.
    ValueError
        If the inputs are empty, or if a rating lies outside the explicitly
        given ``[min_rating, max_rating]`` range.
    """
    rater_a = np.array(y, dtype=int)
    rater_b = np.array(y_pred, dtype=int)
    assert len(rater_a) == len(rater_b)
    if len(rater_a) == 0:
        raise ValueError("quadratic_weighted_kappa requires at least one rating")

    observed_min = int(min(rater_a.min(), rater_b.min()))
    observed_max = int(max(rater_a.max(), rater_b.max()))
    if min_rating is None:
        min_rating = observed_min
    if max_rating is None:
        max_rating = observed_max
    # An out-of-range rating would otherwise index the matrix with a negative
    # or too-large offset and silently corrupt the counts (or crash later).
    if observed_min < min_rating or observed_max > max_rating:
        raise ValueError(
            "ratings span [{}, {}] but the allowed range is [{}, {}]".format(
                observed_min, observed_max, min_rating, max_rating))

    conf_mat = Cmatrix(rater_a, rater_b, min_rating, max_rating)
    num_ratings = len(conf_mat)
    if num_ratings == 1:
        # Only one possible rating: the raters agree on everything and the
        # distance normaliser below would be zero.
        return 1.0
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0
    # Largest possible squared distance; scales every weight into [0, 1].
    max_sq_distance = pow(num_ratings - 1, 2.0)

    for i in range(num_ratings):
        for j in range(num_ratings):
            # Count expected in cell (i, j) if the raters were independent.
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / max_sq_distance
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    if denominator == 0.0:
        # Both raters used a single identical rating inside a wider allowed
        # range: no disagreement is possible, report perfect agreement.
        return 1.0
    return 1.0 - numerator / denominator

In [27]:
# Agreement between the human resolved score (rater 1 + rater 2) and the
# combined score in which GPT stands in for the second rater.
quadratic_weighted_kappa(
    y=result_df['domain1_score'],
    y_pred=result_df['GPT_rater_1_score'],
)

0.4777812186453426