In [1]:
#connect to drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import json
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#create df for responses calculating the diff btwn llm and ground truth
def depreciated_calculate_response_categories(llm_grade, ground_truth, max_difference=4):
    """
    Convert diff btwn llm grade and ground truth

    Take:
        llm_grade: Grade given by llm
        ground_truth: Correct grade from ground truth
        max_difference: Maximum allowed diff for scaling

    Returns:
        accuracy, int range: 1 to max_difference + 1
        Higher numbers == better performance (closer to ground truth)
    """

    difference = abs(llm_grade - ground_truth)
    difference = min(difference, max_difference)
    # Convert to a score where higher is better
    # If difference is 0, score will be max_difference + 1
    # If difference is max_difference, score will be 1
    score = (max_difference + 1) - difference
    return score

In [4]:
#create df for responses calculating the diff btwn llm and ground truth
def dep_2_calculate_response_categories(llm_grade, ground_truth, max_difference=4):
    """
    Convert diff btwn llm grade and ground truth

    Take:
        llm_grade: Grade given by llm
        ground_truth: Correct grade from ground truth
        max_difference: Maximum allowed diff for scaling

    Returns:
        accuracy, int range: 1 to max_difference + 1
        Higher numbers == better performance (closer to ground truth)
    """

    difference = abs(llm_grade - ground_truth)
    # difference = min(difference, max_difference)
    # Convert to a score where higher is better
    # If difference is 0, score will be max_difference + 1
    # If difference is max_difference, score will be 1
    # score = (max_difference + 1) - difference
    score = 1/(1+difference**2)
    ordinal_bins = np.quantile(score, [0, 0.2, 0.4, 0.6, 0.8, 1])
    ordinal_labels = [1, 2, 3, 4, 5]  # Low to high performance

    # Convert continuous scores to ordinal categories
    ordinal_scores = np.digitize(score, bins=ordinal_bins, right=True)

    return score

In [5]:
#create df for responses calculating the diff btwn llm and ground truth
def calculate_response_categories(llm_grade, ground_truth, max_difference=4):
    """
    Convert diff btwn llm grade and ground truth

    Take:
        llm_grade: Grade given by llm
        ground_truth: Correct grade from ground truth
        max_difference: Maximum allowed diff for scaling

    Returns:
        accuracy, int range: 1 to max_difference + 1
        Higher numbers == better performance (closer to ground truth)
    """

    difference = abs(llm_grade - ground_truth)
    if difference <= 0.5:
      return 1

    return 0
    # # difference = min(difference, max_difference)
    # # Convert to a score where higher is better
    # # If difference is 0, score will be max_difference + 1
    # # If difference is max_difference, score will be 1
    # # score = (max_difference + 1) - difference
    # score = 1/(1+difference**2)
    # ordinal_bins = np.quantile(score, [0, 0.2, 0.4, 0.6, 0.8, 1])
    # ordinal_labels = [1, 2, 3, 4, 5]  # Low to high performance

    # # Convert continuous scores to ordinal categories
    # ordinal_scores = np.digitize(score, bins=ordinal_bins, right=True)

    # return score

In [6]:
def prepare_grm_data(llm_dataframes, ground_truth_df):
    """
    Prepares data for graded response IRT analysis by comparing student
    grades to ground truth across multiple rubric categories.
    """
    categories = ['overall', 'cohesion', 'syntax', 'vocabulary', 'phraseology',
                 'grammar', 'conventions']

    category_dataframes = {}

    for category in categories:
        category_data = pd.DataFrame(index=ground_truth_df.index)
        category_data['essay_id'] = ground_truth_df['text_id_kaggle']

        for name, df in llm_dataframes.items():
            aligned_responses = []

            for id in ground_truth_df['text_id_kaggle']:
                try:
                    # Get ground truth grade
                    true_grade = ground_truth_df.loc[ground_truth_df['text_id_kaggle'] == id, category].values[0]

                    # Get LLM grade with better numeric handling
                    llm_grade = df.loc[df['essay_id'] == id, category].values[0]

                    # Convert to float first to handle decimals
                    try:
                        llm_grade = float(llm_grade)
                        # # If you need integers, round it
                        # if category in ['overall']:  # Add categories that should be integers
                        #     llm_grade = round(llm_grade)
                    except (ValueError, TypeError):
                        print(f"Warning: Invalid grade value '{llm_grade}' for {id}, defaulting to 3")
                        llm_grade = 3

                    response = calculate_response_categories(llm_grade, true_grade)
                    aligned_responses.append(response)

                except IndexError:
                    print(f"Warning: Missing data for essay_id {id}, defaulting to middle response")
                    aligned_responses.append(3)  # or whatever your middle response value should be
                except Exception as e:
                    print(f"Error processing essay {id}: {e}")
                    aligned_responses.append(3)

            category_data[name] = aligned_responses

        category_dataframes[category] = category_data

    return category_dataframes

In [10]:
import glob
path = '/content/drive/MyDrive/senior_thesis/essay_grading/final_outputs'
csv_files = [file for file in glob.glob(f'{path}/*.csv')]


## we also want to get a list of common ids in case of nans, etc etc-- and to take it out of the ground truth set

common_ids = pd.DataFrame(columns=['essay_id'])
llm_dfs ={}
for file in csv_files:
    df = pd.read_csv(file)
    df.dropna(inplace=True)
    if common_ids.empty:
        common_ids['essay_id'] = df['essay_id']
    else:
      common_ids = pd.merge(common_ids, df['essay_id'], on=['essay_id'], how='inner')
    file_name = file.split('/')[-1]
    file_name = file_name[:-4]
    llm_dfs[file_name] = df.copy()

In [11]:
llm_dfs.keys()

dict_keys(['llama-3.2-3b', 'llama-3.2-1b', 'gemma-2-2b', 'gemma-2b', 'llama-3-8b-it', 'gemma-1.1-2b-it', 'gemma-2-2b-it', 'llama-3.2-1b-it', 'llama-3.2-3b-it', 'gemma-7b', 'flan-t5-large', 'qwen-2.5-1.5b-it', 'qwen-2.5-0.5b-it', 'qwen-2.5-3b-it', 'qwen-2.5-7b-it-1m', 'phi-3-mini-4k-it', 'phi-3.5-mini-instruct'])

In [12]:
llm_dfs['flan-t5-large']

Unnamed: 0,essay_id,overall,grammar,cohesion,conventions,vocabulary,syntax,phraseology
0,9CDD8FC77D5A,4,4,4,4,4,4,4
1,C19EDCF7BDD6,3,3,3,3,3,3,3
2,99D067A7FA9C,4,4,3,4,4,4,4
3,8E40E03B113A,3,3,3,3,4,3,4
4,9A96A89DF959,4,3,4,3,3,3,3
...,...,...,...,...,...,...,...,...
295,1F8441914809,3,4,3,4,3,3,4
296,8650DDFD974B,4,4,3,4,4,4,4
297,991748F8ED61,3,3,3,3,3,3,3
298,01D8970208A9,4,4,4,4,4,4,4


In [13]:
common_ids = common_ids['essay_id']

In [15]:
ground_truth_full = pd.read_csv("/content/drive/MyDrive/senior_thesis/essay_grading/ellipse_dataset.csv")
ground_truth = ground_truth_full.merge(common_ids, left_on='text_id_kaggle', right_on='essay_id')
ground_truth = ground_truth[['text_id_kaggle','Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology',
       'Grammar', 'Conventions']]
ground_truth = ground_truth.rename(columns={'Overall': 'overall', 'Cohesion': 'cohesion', 'Syntax': 'syntax', 'Vocabulary':'vocabulary', 'Phraseology':'phraseology', 'Grammar':'grammar', 'Conventions':'conventions'})

ground_truth.head()

Unnamed: 0,text_id_kaggle,overall,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,871097AE113C,3.0,2.5,2.5,3.0,3.5,3.0,2.5
1,C5813A0D6AF5,2.5,2.0,3.0,2.5,3.0,2.5,2.0
2,663149D85F9C,3.5,4.0,3.5,3.5,3.5,3.0,3.5
3,90978A2F8599,4.0,4.0,3.5,4.0,3.0,4.0,4.0
4,9F979C691167,3.0,3.5,3.0,3.5,3.0,3.5,3.5


In [16]:
llm_dfs['gemma-2b']

Unnamed: 0,essay_id,conventions,cohesion,grammar,vocabulary,overall,phraseology,syntax
0,9CDD8FC77D5A,4,4,4,4,4,4,4
1,C19EDCF7BDD6,4,4,4,4,4,4,4
2,99D067A7FA9C,5,4,4,4,4,4,4
3,8E40E03B113A,4,4,5,4,4,4,4
4,9A96A89DF959,4,4,5,4,4,4,4
...,...,...,...,...,...,...,...,...
295,1F8441914809,5,5,5,4,4,4,5
296,8650DDFD974B,4,4,5,4,4,4,4
297,991748F8ED61,4,4,4,4,4,4,4
298,01D8970208A9,4,4,4,4,4,4,4


In [17]:
category_data = prepare_grm_data(llm_dfs, ground_truth)

' for 8EE0540E163D, defaulting to 3


In [18]:
category_data['overall']

Unnamed: 0,essay_id,llama-3.2-3b,llama-3.2-1b,gemma-2-2b,gemma-2b,llama-3-8b-it,gemma-1.1-2b-it,gemma-2-2b-it,llama-3.2-1b-it,llama-3.2-3b-it,gemma-7b,flan-t5-large,qwen-2.5-1.5b-it,qwen-2.5-0.5b-it,qwen-2.5-3b-it,qwen-2.5-7b-it-1m,phi-3-mini-4k-it,phi-3.5-mini-instruct
0,871097AE113C,1,0,1,0,0,1,1,0,0,1,0,1,0,0,0,0,0
1,C5813A0D6AF5,1,0,1,0,0,1,1,1,0,1,0,1,0,0,0,0,1
2,663149D85F9C,1,1,1,1,0,1,0,0,0,1,1,1,0,0,0,0,0
3,90978A2F8599,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,9F979C691167,0,1,1,0,0,0,0,0,0,1,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,591B7E3E2B40,1,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0
285,0757FE18DBAA,0,0,1,0,0,1,1,1,0,1,0,0,0,1,0,1,1
286,759E0659FF69,0,0,1,0,0,0,0,0,0,1,1,1,0,0,0,0,0
287,9E4F43374DD5,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [19]:
category_data['overall'].to_csv('overall_irt.csv')

In [23]:
# prompt: make a dataframe that has the columns Model and Accuracy with the accuracy being the accuracy for the model in category_data['overall']

import pandas as pd
model_accuracy = []
for model in category_data['overall'].columns:
  if model != 'essay_id':
    accuracy = category_data['overall'][model].mean()
    model_accuracy.append([model, accuracy])

df_accuracy = pd.DataFrame(model_accuracy, columns=['Model', 'Accuracy'])
df_accuracy.to_csv('grm_model_accuracy.csv')

In [20]:
category_data['overall']['gemma-1.1-2b-it'].equals(category_data['overall']['gemma-2-2b-it'])

False

In [21]:
!pip install girth --upgrade
import girth
from girth import grm_mml as grm

type(category_data['overall'])
category_data['overall'].set_index('essay_id')
category_data['overall']


#drop rows summing to 0
category_data['overall'] = category_data['overall'][category_data['overall'].iloc[:, 1:].sum(axis=1) != 0]

overall = category_data['overall'].iloc[:, 1:].to_numpy().astype(int)

from girth import twopl_mml as twopl
output = twopl(overall)

output

Collecting girth
  Downloading girth-0.8.0-py3-none-any.whl.metadata (9.7 kB)
Downloading girth-0.8.0-py3-none-any.whl (67 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.7/67.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: girth
Successfully installed girth-0.8.0


KeyboardInterrupt: 

##actual grm analysis here

In [None]:
!pip install py-irt

In [None]:
overall

In [None]:
#, options= {"max_iteration":500, "quadrature_bounds":(-6, 6)}

In [None]:
output = onepl_mml(overall)

In [None]:
type(output)

In [None]:
output

In [None]:
output["Difficulty"]

In [None]:
# graph of distribution of discrimination and difficulty across items
fig, ax1 = plt.subplots(figsize=(10, 5))

#disc
ax1.hist(output['Discrimination'], bins=20)
ax1.set_title('Distribution of Discrimination Parameters')
ax1.set_xlabel('Discrimination')
ax1.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
output_df = pd.DataFrame({"essay_id": common_ids, "Discrimination": output['Discrimination']})
for i in range(output['Difficulty'].shape[1]):
    output_df[f'Difficulty_Threshold_{i+1}'] = output['Difficulty'][:, i]
output_df.head()

In [None]:
# #drop anything with non filled in thresholds -- should I do this? idk lol
# output_df = output_df.dropna()
# output_df.head()

In [None]:
most_discriminative = output_df.nlargest(5,'Discrimination')
most_discriminative.head()

In [None]:
least_discriminative = output_df.nsmallest(5,'Discrimination')
least_discriminative.head()

In [None]:
most_disc_ex = '''
As we have two different people, we have two types of parents that one of them, they believe and they force their children that should take some classes like music, drama, or an art class instead of elective classes. But another parents they do not believe and force them to take those classes. On the other hand we have some students that they like to take elective classes instead of those classes; and in opposite, we have students that they do not like to take elective classes.

In general, as we know the place of schools, we can easily discover that students need have some classes that they want to be and they like to enjoy it and has fun and exciting for them. Here are some reasons that is showed why they have to take those classes; like music, a drama, or an art class. First reason is if they took those classes, they can decrease their stress that they have in their life and their school. Second reason is, they can easily figure out what skills are they have in their life and out of the main classes in school. Third reason is, if they really learned from those classes, they can teach to another persons and get money for that. Fourth reason is, it is good for their health from all aspects. Fifth reason is, it gives them energy; so they can always be active in their life specially in their school.

First, on the other hand, some students is special because maybe they came from another country or have special condition that they not required to take those classes because they have to graduate or because of their English language; so they have to study and practice more; and they can not require to take those classes. They have to take main and elective classes that their counselor suggested to them. for example, I came from another country so I have to practice more in my English language and other main classes that I have in school, for these reasons I could not take an art, music or a drama class.

Second, another students just take those classes because their parents want to, so these students can not learn and enjoy from those classes and even they will hate from those classes. for example, I saw my friends that he took a music class but he did not like that, but their parents force and required him to take that class, in summary he fell from all classes that he had in his school, and finally he did not graduate from school because of that.

Third, some students that know English language include writing, speaking and listening, and has all A's in their main classes like Physics, Math and Chemistry and another main classes, they can take music, an art or a drama class. As I said some benefit of those classes on above, they can improve their skills and they can go in forward to their life that they want to be.

Finally, those classes can be help students a lot, but it has hinge on their status, and no one should not force students to take those classes unless they want to and they like to take those classes.

In conclusion, those classes should not be compulsory, because in general, each work should has be with love, until we can learn and help others and improve ourselves.
'''

least_disc_ex = '''
I am sure my heart is raising that I saw a lot of people behavior is going bazaar! Then it is said that your own behavior is the best way to influence other people because it creates trust, shows that your helping others, and creates a positive big impact on others.

Your own behavior creates trust is to show that other can respect you and care about you. Friends behavior creates trust because they will sometimes help others. For example, if i have a huge injury and I can't walk, then probably they'll know that I'm not joking around and I'll trust them to help me. Another person behavior that creates trust is teachers, their behavior is mostly positive and always influence others. They'll always care about helping their student and makes the student trust the teacher what they need help on. Even though students think that teacher dont trust them, but others think teachers have faith on others to try hard to help and let them work hard for the future. Also, families creates trust so they will always have influence on me. It like i made their personality more positive than before. Your own behavior creates trust because of helping others, having positive personality, and increase better relationship on others.

Showing that you help others is another way to influence other people from your own behavior. Sometimes my families help others and they will never forget about us. Like all of our family went to Generic_City and went on a mission to help others, like we build houses or feed a lot of poor people with food that we made. Another person that help others is friends, they always help me if i'm in a stress zone. They always comfort me feel better when I'm upset or mad. Also, teachers help other because they want the students to past the school. Although some of them might fail, yet teachers try their best to let them past. They want them to try hard so teachers need to make them work hard and help them learn. Like my teachers will try their best to help me past my quizzes or test. Helping others is the best way to influence people like friends, teachers, and families that they will support you and others what they need.

The reason why your own behavior creates a big positive impact of others. So more and more people will show respect and will mostly love you. This will create a big positive impact on friends because I show love for them and they will probably love me back. All of us should always show love and care so that they wont hate you. For families, they always have a positive impact because of others positive attitude. Showing a good attitude can be like walking the bridge to their heart. This can help their feeling be better and better. Another person that have a big impact on others is firefighters. They always care about others and their lives so they always protect them when they're in trouble. There was one story that these firefighters that they need equipment because their tools is rusty and broken, but one person called them for help so they came the house was on fire and the wife is still trapped. So they got her out and she is save. So the husband so so proud, and send them 2,000 dollars a month so the firefighter can bye the equipment. This it to show how much the husband care about them. So this can create a big positive impact because of showing love, fixing feelings, and having risk to protect.

Other might say that your own behavior can affect others. It has said that you own be behavior can influence other by showing that you help others create a positive big impact, and creates trust. We should see if others behavior can influence others and change the world.
'''


In [None]:
most_difficult = output_df.nlargest(5,'Difficulty_Threshold_1')
most_difficult.head()

In [None]:
least_difficult = output_df.nsmallest(5,'Difficulty_Threshold_1')
least_difficult.head()

In [None]:
most_diff_ex = '''
Every day all the people in the world make a lot decisions and each one have different way to decided. Sometimes you have you have to ask for help, becasue there is a lot of people with experienced for example, parents,teachers and experts. I think decisions can affect your future and also can affect other people. Adults, specially old people have a lot advices for you to be a better person.

First of all, decisions are the key of the one person and can affect the future for the rest of the life. For example,education is one of the most important thing in the life, and that's why your parents always keep saying go to school, and finish the high school that advice is given to you because they want to see a better future for you and you have to listen because is the experienced that is talk is not your parents.

Second, adults have a lot of experience specially old people (grandmother and grandparent).For example, every time when your parents talk for your future, they know about the life because all they have experienced and maybe when they were young, make a bad decisions and that's why the given to you a lot advices because they worried about you.

Finally, most of the time specially (young girls and boys) when they decided something always is wrong because they only care they self but that decisions can affect other people. For expample I from Generic_City and in my country all the time you see young people make the decision to drop out of school and get to the gangs. As a result, they get to the jail or they died and the only people who cry is your family bacuse they want all the best for you.

In conclusion make decisions is not easy for each one because some are easily and some are hard, but I think you shoot ask all the time first for help with your parents and with experts because they have a lot experienced.
'''


least_diff_ex = '''
Dear principal

The students and I have been thinking to ask you if you can change the schools policy to a grade C .The reason is that the students will be able to participate in sports or school activities.

Students want to be able be in the schools teams and the students want to have fun in school .

If we need to get B average and many of the students are C average

it would not be fair to the C average students. So can the policy change .

by students
'''


In [None]:
#generate item response category characteristic curves for a given item

def plot_icc(item_index):
    """
    Generate item response category characteristic curves
    thetas: array with thetas
    Discrimination: single number for disc
    Difficulty: array with thresholds
    """
    theta = output['Ability']
    theta = np.array(theta)
    theta = np.sort(theta)

    discrimination = output_df['Discrimination'][item_index]
    discrimination = np.array(discrimination)

    difficulty = output['Difficulty'][item_index]
    difficulty = np.array(difficulty)


    #drop na threshold vals
    difficulty = difficulty[np.logical_not(np.isnan(difficulty))]

        # Calculate cumulative category probabilities
    cum_probs = []
    for diff in difficulty:
        logit = discrimination * (theta - diff)
        p = 1 / (1 + np.exp(-logit))
        cum_probs.append(p)
    cum_probs = np.array(cum_probs)

    # Calculate category probabilities
    k = len(difficulty) + 1  # number of categories
    cat_probs = np.zeros((k, len(theta)))

    # First category (0)
    cat_probs[0] = 1 - cum_probs[0]

    # Middle categories
    for i in range(1, k-1):
        cat_probs[i] = cum_probs[i-1] - cum_probs[i]

    # Last category
    cat_probs[-1] = cum_probs[-1]

    # Plotting
    plt.figure(figsize=(10, 6))
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']

    for i in range(k):
        plt.plot(theta, cat_probs[i], color=colors[i],
                label=f'Category {i}', linewidth=2)

    plt.xlabel('Theta (Ability)')
    plt.ylabel('Probability')
    plt.title(f'Item {item_index}: Category Response Curves')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.xlim(min(theta), max(theta))
    plt.ylim(0, 1)
    plt.show()


In [None]:
plot_icc(30)

In [None]:
category_data['overall'].columns

## Comparing to benchmarks?

In [None]:
#df for abilities from output
thetas = pd.DataFrame({"Model": category_data['overall'].columns, "Theta": output['Ability']})
thetas.head()

In [None]:
benchmarks = pd.read_csv("https://docs.google.com/spreadsheets/d/1wIkHngYOwSewOlVM7PquMqpH338uBEKKwPQSv6tEuzs/export?format=csv")
benchmarks.head()


In [None]:
stats = pd.merge(benchmarks, thetas)
stats.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

scaler_minmax = MinMaxScaler()
stats_minmax = stats.copy()
stats_minmax[["IFEval", "Big Bench Hard", "MuSR", "Theta"]] = scaler_minmax.fit_transform(stats[["IFEval", "Big Bench Hard", "MuSR", "Theta"]])

# # should z score instead maybe??? Idk girl
# scaler_std = StandardScaler()
# stats_std = stats.copy()
# stats_std[["IFEval", "Big Bench Hard", "MuSR", "Theta"]] = scaler_std.fit_transform(stats[["IFEval", "Big Bench Hard", "MuSR", "Theta"]])

# print(stats_std.head())


In [None]:
print(stats)


In [None]:
def get_theta_info(stats):

  from scipy.stats import spearmanr

  correlations = {col: spearmanr(stats["Theta"], stats[col])[0] for col in ["IFEval", "Big Bench Hard", "MuSR"]}
  print("Spearman Correlations with Theta:", correlations)

    #explained variance
  from sklearn.linear_model import LinearRegression
  X = stats[["IFEval", "Big Bench Hard", "MuSR"]]
  y = stats["Theta"]
  reg = LinearRegression().fit(X, y)
  r_squared = reg.score(X, y)
  print("R² of Theta predicting benchmarks:", r_squared)

In [None]:
get_theta_info(stats_minmax)

In [None]:

output_cohesion = grm(category_data['cohesion'].to_numpy().astype(int))
theta_cohesion = pd.DataFrame({"Model": category_data['overall'].columns, "Theta": output_cohesion['Ability']})

In [None]:

stats2 = pd.merge(benchmarks, theta_cohesion)
stats2.head()

get_theta_info(stats2)

In [None]:

output_vocab = grm(category_data['vocabulary'].to_numpy().astype(int))
theta_vocab = pd.DataFrame({"Model": category_data['overall'].columns, "Theta": output_vocab['Ability']})

stats_vocab = pd.merge(benchmarks, theta_vocab)
stats_vocab.head()

get_theta_info(stats_vocab)

In [None]:

output_syntax = grm(category_data['syntax'].to_numpy().astype(int))
theta_syntax = pd.DataFrame({"Model": category_data['overall'].columns, "Theta": output_syntax['Ability']})

stats_syntax = pd.merge(benchmarks, theta_syntax)
stats_syntax.head()

get_theta_info(stats_syntax)