#Requirements

In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [3]:
from google.colab import drive
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch import cuda, tensor, no_grad
from tqdm.auto import tqdm
import json

In [4]:
# {
#   "overall": ,
#   "cohesion": ,
#   "syntax": ,
#   "vocabulary": ,
#   "phraseology": ,
#   "grammar": ,
#   "conventions": ,
# }


In [5]:
rubric_overall = '''A high-scoring essay has native-like facility in the use of language with syntactic variety, appropriate word choice and phrases; well-controlled text organization; precise use of grammar and conventions; rare language inaccuracies that do not impede communication.
A low scoring essay has a limited range of familiar words or phrases loosely strung together; frequent errors in grammar (including syntax) and usage. Communication impeded in most cases by language inaccuracies.'''

rubric_cohesion = '''A high-scoring essay has text organization consistently well-controlled using a variety of effective linguistic features such as reference and transitional words and phrases to connect ideas across sentences and paragraphs; appropriate overlap of ideas.
A low scoring essay has no clear control of organization; cohesive devices not present or unsuccessfully used; presentation of ideas unclear'''

rubric_syntax = '''A high-scoring essay has flexible and effective use of a full range of syntactic structures including simple, compound, and complex sentences; there may be rare minor and negligible errors in sentence formation.
A low-scoring essay has pervasive and basic errors in sentence structure and word order that cause confusion; basic sentence errors are common.'''


rubric_vocab = '''A high-scoring essay has a wide range of vocabulary flexibly and effectively used to convey precise meanings; skillful use of topic-related terms and less common words; rare negligible inaccuracies in word use.
A low scoring essay has limited vocabulary often inappropriately used; limited control of word choice and word forms; little attempt to use topic-related terms.'''


rubric_phrase = '''A high scoring essay has flexible and effective use of a variety of phrases, such as idioms, collocations, and lexical bundles, to convey precise and subtle meanings; rare minor inaccuracies that are negligible.
A low scoring essay hsa memorized chunks of language, or simple phrasal patterns, predominate; many repetitions and misuses of phrases. '''


rubric_grammar = '''A high scoring essay has command of grammar and usage with few or no errors. A low scoring essay has errors in grammar and usage throughout.'''


rubric_conv = '''A high scoring essay has consistent use of appropriate conventions to convey meaning; spelling, capitalization, and punctuation errors nonexistent or negligible. A low scoring essay has minimal use of conventions; spelling, capitalization, and punctuation errors throughout.'''


#Model Initialization

In [6]:
device = 'cuda' if cuda.is_available() else 'cpu'
print('loading tokenizer...\n')
tokenizer = T5Tokenizer.from_pretrained("t5-small")
print('loading model...\n')
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small", low_cpu_mem_usage=True)
print('load to device...\n')
model = model.to(device)

loading tokenizer...



tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


loading model...



config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

load to device...



In [7]:
import pandas as pd

#get data file
!wget -q -O data.csv "https://docs.google.com/uc?export=download&id=13NuKWse5ZLbE7-3WtmdzYSeOOaJ3MydG"

data = pd.read_csv("data.csv")
#print(data.head())
print(data.columns)



Index(['id_1', 'id_2', 'text_1', 'text_2', '1_better', 'prompt'], dtype='object')


In [8]:

df = data.sample(n=100, random_state=42)
print(df.columns)

Index(['id_1', 'id_2', 'text_1', 'text_2', '1_better', 'prompt'], dtype='object')


In [9]:
def preprocess(data, rubric_text):

    #print(f"Processing with rubric: {rubric_text[50:]}...")
    # inputs = [f" '{rubric_text}'{full_text}" for full_text in data['full_text'] ]
    inputs = [f"You will compare two essays. based on the following rubric:\n'{rubric_text}'\n Essay A:\n'{text_1}'.\n Essay B:\n'{text_2}' (Give me a essay LETTER) The higher scoring essay is Essay "
      for text_1, text_2 in zip(data['text_1'], data['text_2'])]

    batch = tokenizer(inputs,
                      padding=True,
                      truncation=False,
                      max_length=2048,
                      return_tensors='pt',
                      return_attention_mask=True)
    batch['id_1'] = data['id_1']
    batch['id_2'] = data['id_2']
    batch['prompt'] = data['prompt' ]
    return batch

def compute_metric():
    return

def collate(data):
    return {'input_ids': tensor([ex['input_ids'] for ex in data]),
          'id_1': [ex['id_1'] for ex in data],
          'id_2': [ex['id_2'] for ex in data],
          'prompt': [ex['prompt'] for ex in data]}

# # def preprocess(data, rubric_text):
# #     batch = {}
# #     batch['messages'] = [f"You will compare two essays based on the following rubric:\n'{rubric_text}'\n Essay 1:\n'{text_1}'.\n Essay 2:\n' {text_2} Which essay scores higher? Provide ONLY the number."
# #     for text_1, text_2 in data['text_1', 'text_2']]
# #     batch['id_1'] = data['id_1']
# #     batch['id_2'] = data['id_2']
# #     batch['prompt'] = data['prompt' ]

#     return batch

# def compute_metric():
#     return

# def collate(data):
#     return {'messages': [ex['messages'] for ex in data],
#             'essay_ids': [ex['essay_ids'] for ex in data]}

In [10]:
from datasets import Dataset

batch_size = 1
# print('loading dataset...\n')
# train_data = Dataset.from_pandas(df)
# train_data = train_data.select_columns(['full_text', 'text_id_kaggle'])

In [11]:
results = {}
original_data = Dataset.from_pandas(df)

# original_data = original_data.select_columns(['full_text', 'text_id_kaggle'])
# rubrics = [rubric_overall, rubric_cohesion, rubric_syntax, rubric_vocab, rubric_phrase, rubric_grammar, rubric_conv]
# rubric_names = ['overall', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
rubrics = [rubric_overall]
rubric_names = ['overall']
for i, (rubric_name, rubric) in enumerate(zip(rubric_names, rubrics)):
    current_data = original_data.map(
        lambda x: preprocess(x, rubric),
        batched=True,
        batch_size=batch_size
    )

    train_dataloader = DataLoader(current_data,
                                batch_size=batch_size,
                                shuffle=False,
                                collate_fn=collate)

    with open(f'{rubric_name}.csv', 'w') as out_file:
        out_file.write('id_1,id_2,prompt,pred\n')
        with no_grad():
            for batch in train_dataloader:
                id_1 = batch['id_1']
                id_2 = batch['id_2']
                prompt = batch['prompt']
                del batch['id_1']
                del batch['id_2']
                del batch['prompt']

                batch = {k: v.to(device) for k, v in batch.items()}
                generated_output = model.generate(input_ids=batch['input_ids'], max_length=512)
                decoded_preds = tokenizer.batch_decode(generated_output, skip_special_tokens=True)
                # print(decoded_preds)
                for j in range(len(id_1)):
                    out_file.write(f'{id_1[j]},{id_2[j]},{prompt[j]},{decoded_preds[j][-1]}\n')

Map:   0%|          | 0/100 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [15]:
df = pd.read_csv('overall.csv')


ParserError: Error tokenizing data. C error: EOF inside string starting at row 85

In [12]:
import glob
path = '/content'
csv_files = [file for file in glob.glob(f'{path}/*.csv') if 'data.csv' not in file]

output_df = None

for file in csv_files:
    df = pd.read_csv(file)
    # print(df)

    # Extract the filename (without path and extension) for column naming
    file_name = file.split('/')[-1].split('.')[0]

    df = df.rename(columns={'pred': file_name})

    if output_df is None:
        output_df = df  # Initialize with the first file
    else:
        print(file_name)
        print(df)
        print(df['prompt'])
        output_df = pd.merge(output_df, df, on=['id_1', 'id_2', 'prompt'], how='inner')  # Inner join to match 'ids'



ParserError: Error tokenizing data. C error: EOF inside string starting at row 68

In [None]:
drive_path = '/content/drive/My Drive/senior_thesis/comparison_essay_grading/outputs/flant5-small.csv'
output_df.to_csv(drive_path, index=False)

print(f"saved at {drive_path}.")