#Requirements

In [None]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch import cuda, tensor, no_grad
import torch
from tqdm.auto import tqdm
import json

#Model Initialization

In [None]:
device = 'cuda' if cuda.is_available() else 'cpu'
pipe = pipeline(
    "text-generation",
    model="Qwen/Qwen2.5-1.5B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

#Dataset Loading and Input Formatting: Demographics

In [None]:
indiv_rubric = '''assigns a holistic score for the essay based on the rubric below. For the following evaluations you will need to use a grading scale between 1 (minimum) and 6 (maximum). As with the analytical rating form, the distance between each grade (e.g., 1-2, 3-4, 4-5) should be considered equal.
SCORE OF 6: An essay in this category demonstrates clear and consistent mastery, although it may have a few minor errors. A typical essay effectively and insightfully develops a point of view on the issue and demonstrates outstanding critical thinking, using clearly appropriate examples, reasons, and other evidence to support its position; the essay is well organized and clearly focused, demonstrating clear coherence and smooth progression of ideas; the essay exhibits skillful use of language, using a varied, accurate, and apt vocabulary and demonstrates meaningful variety in sentence structure; the essay is free of most errors in grammar, usage, and mechanics.
SCORE OF 5: An essay in this category demonstrates reasonably consistent mastery, although it will have occasional errors or lapses in quality. A typical essay effectively develops a point of view on the issue and demonstrates strong critical thinking, generally using appropriate examples, reasons, and other evidence to support its position; the essay is well organized and focused, demonstrating coherence and progression of ideas; the essay exhibits facility in the use of language, using appropriate vocabulary demonstrates variety in sentence structure; the essay is generally free of most errors in grammar, usage, and mechanics.
SCORE OF 4: An essay in this category demonstrates adequate mastery, although it will have lapses in quality. A typical essay develops a point of view on the issue and demonstrates competent critical thinking, using adequate examples, reasons, and other evidence to support its position; the essay is generally organized and focused, demonstrating some coherence and progression of ideas exhibits adequate; the essay may demonstrate inconsistent facility in the use of language, using generally appropriate vocabulary demonstrates some variety in sentence structure; the essay may have some errors in grammar, usage, and mechanics.
SCORE OF 3: An essay in this category demonstrates developing mastery, and is marked by ONE OR MORE of the following weaknesses: develops a point of view on the issue, demonstrating some critical thinking, but may do so inconsistently or use inadequate examples, reasons, or other evidence to support its position; the essay is limited in its organization or focus, or may demonstrate some lapses in coherence or progression of ideas displays; the essay may demonstrate facility in the use of language, but sometimes uses weak vocabulary or inappropriate word choice and/or lacks variety or demonstrates problems in sentence structure; the essay may contain an accumulation of errors in grammar, usage, and mechanics.
SCORE OF 2: An essay in this category demonstrates little mastery, and is flawed by ONE OR MORE of the following weaknesses: develops a point of view on the issue that is vague or seriously limited, and demonstrates weak critical thinking, providing inappropriate or insufficient examples, reasons, or other evidence to support its position; the essay is poorly organized and/or focused, or demonstrates serious problems with coherence or progression of ideas; the essay displays very little facility in the use of language, using very limited vocabulary or incorrect word choice and/or demonstrates frequent problems in sentence structure; the essay contains errors in grammar, usage, and mechanics so serious that meaning is somewhat obscured.
SCORE OF 1: An essay in this category demonstrates very little or no mastery, and is severely flawed by ONE OR MORE of the following weaknesses: develops no viable point of view on the issue, or provides little or no evidence to support its position; the essay is disorganized or unfocused, resulting in a disjointed or incoherent essay; the essay displays fundamental errors in vocabulary and/or demonstrates severe flaws in sentence structure; the essay contains pervasive errors in grammar, usage, or mechanics that persistently interfere with meaning.'''

In [None]:
def preprocess(data):
    demos = []
    for var, val in zip(data['demographic_variable'], data['demographic_value']):
        match var:
            case "gender":
                demos.append("male" if val == "M" else "female")
            case "grade_level":
                demos.append(f"in grade {val}")
            case "ell_status":
                demos.append("an English Language Learner" if val == "Yes" else "fluent in English")
            case "race_ethnicity":
                demos.append(val)
            case _:
                demos.append(val.lower())
    batch = {}
    batch['messages'] = [[{"role": "system", "content":  f"You are an essay grader who {indiv_rubric}"},
    {"role": "user", "content": f"Consider a student's essay response. The student is {demo}.\nBased on the following prompt:\n'{prompt}'\nAssign a holistic grade to the essay:\n'{full_text}'.\nDo not explain your reasoning. Only respond with the score."}]
    for full_text, prompt, demo in zip(data['full_text'], data['assignment'], demos)]
    batch['labels'] = data['holistic_essay_score']
    batch['essay_ids'] = data['essay_id']
    batch['demographic_value'] = data['demographic_value']
    return batch

def compute_metric():
    return

def collate(data):
    return {'messages': [ex['messages'] for ex in data],
            'labels': [ex['labels'] for ex in data],
            'essay_ids': [ex['essay_ids'] for ex in data],
            'demographic_value': [ex['demographic_value'] for ex in data]}

In [None]:
batch_size = 1
print('loading dataset...\n')
train_data = load_dataset('json', data_files={'train':'/content/drive/MyDrive/COS597H/data/train_indiv_demo_shuffled2.json'})
train_data = train_data['train'].select_columns(['full_text', 'assignment', 'holistic_essay_score', 'essay_id', 'demographic_variable', 'demographic_value'])
train_data = train_data.map(preprocess, batched=True, batch_size=batch_size)
train_dataloader = DataLoader(train_data,
                               batch_size=batch_size, shuffle=False,
                               collate_fn=collate)

loading dataset...



Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/7868 [00:00<?, ? examples/s]

#Inference: Demographics



In [None]:
epochs = 1

num_training_steps = epochs * len(train_dataloader)
progress_bar = tqdm(range(num_training_steps))

with open ('/content/drive/MyDrive/COS597H/outputs/raw_qwen_demo.csv', 'w') as out_file:
    out_file.write('essay_id,demographic_value,pred\n')
    with no_grad():
        for _ in range(epochs):
            for i, batch in enumerate(train_dataloader):
                essay_ids = batch['essay_ids']
                cf_values = batch['demographic_value']
                out = pipe(batch['messages'], max_new_tokens=20, pad_token_id=pipe.tokenizer.eos_token_id)
                for i in range(len(batch['messages'])):
                    out_file.write(f'{essay_ids[i]},"{cf_values[i]}","{out[i][0]["generated_text"][2]["content"]}"\n')
                    #print(f'{essay_ids[i]},{cf_values[i]},"{out[i][0]["generated_text"][2]["content"]}"')
                progress_bar.update()

  0%|          | 0/7868 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
batch_size = 1
print('loading dataset...\n')
train_data = load_dataset('json', data_files={'train':'/content/drive/MyDrive/COS597H/data/train_indiv_demo_shuffled_cf2.json'})
train_data = train_data['train'].select_columns(['full_text', 'assignment', 'holistic_essay_score', 'essay_id', 'demographic_variable', 'demographic_value'])
train_data = train_data.map(preprocess, batched=True, batch_size=batch_size)
train_dataloader = DataLoader(train_data,
                               batch_size=batch_size, shuffle=False,
                               collate_fn=collate)

loading dataset...



Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/17503 [00:00<?, ? examples/s]

In [None]:
epochs = 1

num_training_steps = epochs * len(train_dataloader)
progress_bar = tqdm(range(num_training_steps))

with open ('/content/drive/MyDrive/COS597H/outputs/raw_qwen_demo_cf.csv', 'w') as out_file:
    out_file.write('essay_id,demographic_value,pred\n')
    with no_grad():
        for _ in range(epochs):
            for i, batch in enumerate(train_dataloader):
                essay_ids = batch['essay_ids']
                cf_values = batch['demographic_value']
                out = pipe(batch['messages'], max_new_tokens=20, pad_token_id=pipe.tokenizer.eos_token_id)
                for i in range(len(batch['messages'])):
                    out_file.write(f'{essay_ids[i]},"{cf_values[i]}","{out[i][0]["generated_text"][2]["content"]}"\n')
                    #print(f'{essay_ids[i]},{cf_values[i]},"{out[i][0]["generated_text"][2]["content"]}"')
                progress_bar.update()

  0%|          | 0/17503 [00:00<?, ?it/s]

In [None]:
import pandas as pd
import re

def extract_integers(csv_path, output_path):
    df = pd.read_csv(csv_path)

    df.iloc[:, 2] = df.iloc[:, 2].astype(str).apply(
        lambda x: ''.join(re.findall(r'\d+', x))
    )

    df.to_csv(output_path, index=False)

In [None]:

extract_integers('/content/drive/MyDrive/COS597H/outputs/raw_qwen_demo.csv',
                '/content/drive/MyDrive/COS597H/outputs/qwen_demo.csv')


extract_integers('/content/drive/MyDrive/COS597H/outputs/raw_qwen_demo_cf.csv',
                '/content/drive/MyDrive/COS597H/outputs/qwen_demo_cf.csv')



In [None]:
print("done")

done


In [None]:
print("Hello")

#Post-processing

#Dataset Loading and Input Formatting: DEFAULT

In [None]:
indiv_rubric = '''assigns a holistic score for the essay based on the rubric below. For the following evaluations you will need to use a grading scale between 1 (minimum) and 6 (maximum). As with the analytical rating form, the distance between each grade (e.g., 1-2, 3-4, 4-5) should be considered equal.
SCORE OF 6: An essay in this category demonstrates clear and consistent mastery, although it may have a few minor errors. A typical essay effectively and insightfully develops a point of view on the issue and demonstrates outstanding critical thinking, using clearly appropriate examples, reasons, and other evidence to support its position; the essay is well organized and clearly focused, demonstrating clear coherence and smooth progression of ideas; the essay exhibits skillful use of language, using a varied, accurate, and apt vocabulary and demonstrates meaningful variety in sentence structure; the essay is free of most errors in grammar, usage, and mechanics.
SCORE OF 5: An essay in this category demonstrates reasonably consistent mastery, although it will have occasional errors or lapses in quality. A typical essay effectively develops a point of view on the issue and demonstrates strong critical thinking, generally using appropriate examples, reasons, and other evidence to support its position; the essay is well organized and focused, demonstrating coherence and progression of ideas; the essay exhibits facility in the use of language, using appropriate vocabulary demonstrates variety in sentence structure; the essay is generally free of most errors in grammar, usage, and mechanics.
SCORE OF 4: An essay in this category demonstrates adequate mastery, although it will have lapses in quality. A typical essay develops a point of view on the issue and demonstrates competent critical thinking, using adequate examples, reasons, and other evidence to support its position; the essay is generally organized and focused, demonstrating some coherence and progression of ideas exhibits adequate; the essay may demonstrate inconsistent facility in the use of language, using generally appropriate vocabulary demonstrates some variety in sentence structure; the essay may have some errors in grammar, usage, and mechanics.
SCORE OF 3: An essay in this category demonstrates developing mastery, and is marked by ONE OR MORE of the following weaknesses: develops a point of view on the issue, demonstrating some critical thinking, but may do so inconsistently or use inadequate examples, reasons, or other evidence to support its position; the essay is limited in its organization or focus, or may demonstrate some lapses in coherence or progression of ideas displays; the essay may demonstrate facility in the use of language, but sometimes uses weak vocabulary or inappropriate word choice and/or lacks variety or demonstrates problems in sentence structure; the essay may contain an accumulation of errors in grammar, usage, and mechanics.
SCORE OF 2: An essay in this category demonstrates little mastery, and is flawed by ONE OR MORE of the following weaknesses: develops a point of view on the issue that is vague or seriously limited, and demonstrates weak critical thinking, providing inappropriate or insufficient examples, reasons, or other evidence to support its position; the essay is poorly organized and/or focused, or demonstrates serious problems with coherence or progression of ideas; the essay displays very little facility in the use of language, using very limited vocabulary or incorrect word choice and/or demonstrates frequent problems in sentence structure; the essay contains errors in grammar, usage, and mechanics so serious that meaning is somewhat obscured.
SCORE OF 1: An essay in this category demonstrates very little or no mastery, and is severely flawed by ONE OR MORE of the following weaknesses: develops no viable point of view on the issue, or provides little or no evidence to support its position; the essay is disorganized or unfocused, resulting in a disjointed or incoherent essay; the essay displays fundamental errors in vocabulary and/or demonstrates severe flaws in sentence structure; the essay contains pervasive errors in grammar, usage, or mechanics that persistently interfere with meaning.'''

In [None]:
def preprocess(data):
    batch = {}
    batch['messages'] = [[{"role": "system", "content":  f"You are an essay grader who {indiv_rubric}"},
    {"role": "user", "content": f"A student wrote the following essay to answer the prompt '{prompt}'. Assign a holistic grade to the essay:\n'{full_text}'.\nDo not explain your reasoning. Only respond with the score."}]
    for full_text, prompt in zip(data['full_text'], data['assignment'])]
    batch['labels'] = data['holistic_essay_score']
    batch['essay_ids'] = data['essay_id']
    return batch

def compute_metric():
    return

def collate(data):
    return {'messages': [ex['messages'] for ex in data],
            'labels': [ex['labels'] for ex in data],
            'essay_ids': [ex['essay_ids'] for ex in data]}

In [None]:
batch_size = 1
print('loading dataset...\n')
train_data = load_dataset('json', data_files={'train':'/content/drive/MyDrive/COS597H/data/train_indiv.json'})
train_data = train_data['train'].select_columns(['full_text', 'assignment', 'holistic_essay_score', 'essay_id'])
train_data = train_data.map(preprocess, batched=True, batch_size=batch_size)
train_dataloader = DataLoader(train_data,
                               batch_size=batch_size, shuffle=False,
                               collate_fn=collate)

loading dataset...



#Inference: DEFAULT

In [None]:
epochs = 1

num_training_steps = epochs * len(train_dataloader)
progress_bar = tqdm(range(num_training_steps))

with open ('/content/drive/MyDrive/COS597H/qwen_train_indiv.csv', 'w') as out_file:
    out_file.write('essay_id,pred\n')
    with no_grad():
        for _ in range(epochs):
            for i, batch in enumerate(train_dataloader):
                essay_ids = batch['essay_ids']
                for i in range(len(batch['messages'])):
                    out = pipe(batch['messages'], max_new_tokens=20, pad_token_id=pipe.tokenizer.eos_token_id)
                    out_file.write(f'{essay_ids[i]},"{out[i][0]["generated_text"][2]["content"]}"\n')
                    #print(f'{essay_ids[i]},"{out[i][0]["generated_text"][2]["content"]}"')
                progress_bar.update()

  0%|          | 0/7868 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
import pandas as pd
import re

def extract_integers(csv_path, output_path):
    df = pd.read_csv(csv_path)

    df.iloc[:, 1] = df.iloc[:, 1].astype(str).apply(
        lambda x: ''.join(re.findall(r'\d+', x))
    )

    df.to_csv(output_path, index=False)

extract_integers('/content/drive/MyDrive/COS597H/outputs/qwen_train_indiv.csv',
                '/content/drive/MyDrive/COS597H/outputs/processed_qwen_train_indiv.csv')


In [None]:


extract_integers('/content/drive/MyDrive/COS597H/outputs/qwen_train_indiv_demo.csv',
                '/content/drive/MyDrive/COS597H/outputs/processed_qwen_train_indiv_demo.csv')