#Requirements

In [None]:
from google.colab import drive

# Mount Google Drive; a later cell saves the merged predictions under this path.
mount_point = '/content/drive'
drive.mount(mount_point)

In [None]:
!pip install datasets

In [None]:
from google.colab import drive
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch import cuda, tensor, no_grad
from tqdm.auto import tqdm
import json

In [64]:
# {
#   "overall": ,
#   "cohesion": ,
#   "syntax": ,
#   "vocabulary": ,
#   "phraseology": ,
#   "grammar": ,
#   "conventions": ,
# }


In [65]:
# Rubric descriptions, one per scoring criterion. Each string is interpolated
# verbatim into the comparison prompt built by preprocess(), so any edit to the
# wording changes what the model is asked.

# Overall language proficiency.
rubric_overall = '''A high-scoring essay has native-like facility in the use of language with syntactic variety, appropriate word choice and phrases; well-controlled text organization; precise use of grammar and conventions; rare language inaccuracies that do not impede communication.
A low scoring essay has a limited range of familiar words or phrases loosely strung together; frequent errors in grammar (including syntax) and usage. Communication impeded in most cases by language inaccuracies.'''

# Cohesion: organization and linking of ideas.
rubric_cohesion = '''A high-scoring essay has text organization consistently well-controlled using a variety of effective linguistic features such as reference and transitional words and phrases to connect ideas across sentences and paragraphs; appropriate overlap of ideas.
A low scoring essay has no clear control of organization; cohesive devices not present or unsuccessfully used; presentation of ideas unclear'''

# Syntax: range and correctness of sentence structures.
rubric_syntax = '''A high-scoring essay has flexible and effective use of a full range of syntactic structures including simple, compound, and complex sentences; there may be rare minor and negligible errors in sentence formation.
A low-scoring essay has pervasive and basic errors in sentence structure and word order that cause confusion; basic sentence errors are common.'''


# Vocabulary: range and precision of word choice.
rubric_vocab = '''A high-scoring essay has a wide range of vocabulary flexibly and effectively used to convey precise meanings; skillful use of topic-related terms and less common words; rare negligible inaccuracies in word use.
A low scoring essay has limited vocabulary often inappropriately used; limited control of word choice and word forms; little attempt to use topic-related terms.'''


# Phraseology: use of idioms, collocations and lexical bundles. This text is
# interpolated verbatim into the model prompt, so the "hsa" typo in the
# low-score description is corrected to "has".
rubric_phrase = '''A high scoring essay has flexible and effective use of a variety of phrases, such as idioms, collocations, and lexical bundles, to convey precise and subtle meanings; rare minor inaccuracies that are negligible.
A low scoring essay has memorized chunks of language, or simple phrasal patterns, predominate; many repetitions and misuses of phrases. '''


# Grammar: command of grammar and usage. Interpolated verbatim into the prompt.
rubric_grammar = '''A high scoring essay has command of grammar and usage with few or no errors. A low scoring essay has errors in grammar and usage throughout.'''


# Conventions: spelling, capitalization and punctuation. Interpolated verbatim
# into the prompt.
rubric_conv = '''A high scoring essay has consistent use of appropriate conventions to convey meaning; spelling, capitalization, and punctuation errors nonexistent or negligible. A low scoring essay has minimal use of conventions; spelling, capitalization, and punctuation errors throughout.'''


#Model Initialization

In [87]:

from google.colab import drive
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch import cuda, tensor, no_grad
import torch
from tqdm.auto import tqdm
import json
import os

# SECURITY: never hard-code a Hugging Face access token in the notebook. A token
# that has been committed or shared must be treated as leaked and revoked.
# The token is read from the HF_TOKEN environment variable instead; when it is
# unset, None is passed through to the pipeline.
# NOTE(review): with token=None the download presumably falls back to
# credentials cached by a prior Hugging Face login - confirm for this gated model.
access_token = os.environ.get('HF_TOKEN')

# Informational only: `device` is not passed to the pipeline below, which uses
# device_map="auto" for placement.
device = 'cuda' if cuda.is_available() else 'cpu'

# Text-generation pipeline for google/gemma-7b, loaded in bfloat16.
pipe = pipeline(
    "text-generation",
    model="google/gemma-7b",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=access_token,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cpu


In [88]:
import pandas as pd

# Download the essay-pair data file from a Google Docs/Drive download link and
# save it locally as data.csv (-q: quiet, -O: output file name).
!wget -q -O data.csv "https://docs.google.com/uc?export=download&id=13NuKWse5ZLbE7-3WtmdzYSeOOaJ3MydG"


# Load the downloaded file into a DataFrame.
data = pd.read_csv("data.csv")
#print(data.head())
# Print the column names as a quick schema check.
print(data.columns)

# essays = df[['text_id_kaggle','full_text']]
# essays.rename(columns={'text_id_kaggle': 'essay_id'}, inplace=True)
# essays

Index(['id_1', 'id_2', 'text_1', 'text_2', '1_better', 'prompt'], dtype='object')


In [89]:

# Work on a reproducible random subset of 100 rows (fixed seed 42).
df = data.sample(random_state=42, n=100)
print(df.columns)


Index(['id_1', 'id_2', 'text_1', 'text_2', '1_better', 'prompt'], dtype='object')


In [90]:

def preprocess(data, rubric_text):
    """Build one comparison prompt per essay pair in a batch.

    The original prompt left Essay B with an opening quote, a stray leading
    space, no closing quote and no separator before the question. Essay B is
    now delimited exactly like Essay A.

    Args:
        data: Column-oriented batch (mapping of column name to a sequence of
            values) containing 'text_1', 'text_2', 'id_1', 'id_2' and 'prompt'.
        rubric_text: Rubric description inserted into every prompt.

    Returns:
        A dict with 'messages' (one prompt string per essay pair) plus the
        'id_1', 'id_2' and 'prompt' columns passed through unchanged.
    """
    messages = [
        (
            f"You will compare two essays based on the following rubric:\n'{rubric_text}'\n"
            f" Essay A:\n'{text_1}'.\n and Essay B:\n'{text_2}'.\n"
            " Does Essay A or B score higher? The higher scoring essay is Essay"
        )
        for text_1, text_2 in zip(data['text_1'], data['text_2'])
    ]
    return {
        'messages': messages,
        'id_1': data['id_1'],
        'id_2': data['id_2'],
        'prompt': data['prompt'],
    }

def compute_metric():
    """Unimplemented stub: takes no arguments and returns None."""
    return None

def collate(data):
    """Convert a list of example dicts into a dict of per-field lists.

    Args:
        data: Sequence of examples, each a mapping with the keys 'messages',
            'id_1', 'id_2' and 'prompt'.

    Returns:
        A dict with those four keys, each mapped to the list of that field's
        values in example order.
    """
    fields = ('messages', 'id_1', 'id_2', 'prompt')
    return {field: [example[field] for example in data] for field in fields}


In [91]:
from datasets import Dataset

# batch_size = 1
# print('loading dataset...\n')
# train_data = Dataset.from_pandas(df)
# train_data = train_data.select_columns(['full_text', 'text_id_kaggle'])

In [None]:
import csv

results = {}
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cpu':
    print("WARNING: Using CPU. This will be slow. Try switching to a GPU runtime.")

# Number of examples per DataLoader batch.
batch_size = 8  # Increase based on available GPU memory


original_data = Dataset.from_pandas(df)
# original_data = original_data.select_columns(['full_text', 'text_id_kaggle'])
rubrics = [rubric_overall, rubric_cohesion, rubric_syntax, rubric_vocab, rubric_phrase, rubric_grammar, rubric_conv]
rubric_names = ['overall', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

# Override: this run only scores the 'overall' rubric. Remove these two lines
# to process every rubric listed above.
rubrics = [rubric_overall]
rubric_names = ['overall']


# For each rubric: build the prompts, run the model on every essay pair, and
# write one row per pair to '<rubric_name>.csv'.
for rubric_name, rubric in zip(rubric_names, rubrics):
    print(f"Processing rubric: {rubric_name}")

    # Adds a 'messages' column holding the rubric-specific prompt.
    current_data = original_data.map(
        lambda x: preprocess(x, rubric),
        batched=True,
        batch_size=batch_size*4  # Larger batches for preprocessing
    )

    train_dataloader = DataLoader(
        current_data,
        batch_size=batch_size,
        shuffle=False,  # keep rows in dataset order
        num_workers=4,  # Use multiple workers for data loading
        collate_fn=collate
    )

    # newline='' lets the csv module control line endings; lineterminator='\n'
    # keeps the same line endings the file had before.
    with open(f'{rubric_name}.csv', 'w', newline='', encoding='utf-8') as out_file:
        # csv.writer quotes and escapes fields as needed. Essay prompts can
        # contain commas, quotes or newlines, which corrupted the rows when
        # they were written by plain string interpolation.
        writer = csv.writer(out_file, lineterminator='\n')
        writer.writerow(['id_1', 'id_2', 'prompt', 'pred'])
        with torch.no_grad():  # Disable gradient calculation for inference
            for batch in tqdm(train_dataloader):
                out = pipe(
                    batch['messages'],
                    max_new_tokens=1,
                    pad_token_id=pipe.tokenizer.eos_token_id,
                    do_sample=False  # Greedy decoding
                )

                # Only the final character of each returned text is kept as the
                # prediction (presumably 'A' or 'B' - verify against the output).
                preds = [generation[0]["generated_text"][-1] for generation in out]
                writer.writerows(
                    zip(batch['id_1'], batch['id_2'], batch['prompt'], preds)
                )

Processing rubric: overall


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  0%|          | 0/13 [00:00<?, ?it/s]

The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


In [None]:
# Preview the predictions written to overall.csv by the inference cell.
pd.read_csv('overall.csv')

In [None]:
import glob
path = '/content'
# Every CSV directly under `path` except the downloaded input (data.csv).
# Sorted so the column order of the merged frame is deterministic.
csv_files = sorted(file for file in glob.glob(f'{path}/*.csv') if 'data.csv' not in file)

# Columns shared by every per-rubric CSV (header: id_1,id_2,prompt,pred).
# The original merged on 'essay_id', which these files do not contain and
# which raised a KeyError as soon as a second file was merged.
key_columns = ['id_1', 'id_2', 'prompt']

# Stays None when no prediction CSV is found.
output_df = None

for file in csv_files:
    # A dedicated name is used so the global `df` (the sampled input rows that
    # the inference cell reads) is not overwritten.
    rubric_df = pd.read_csv(file)

    # Extract the filename (without path and extension) for column naming
    file_name = file.split('/')[-1].split('.')[0]

    # Each file's 'pred' column becomes a column named after its rubric.
    rubric_df = rubric_df.rename(columns={'pred': file_name})

    if output_df is None:
        output_df = rubric_df  # Initialize with the first file
    else:
        # Inner join keeps only the essay pairs present in every file.
        output_df = pd.merge(output_df, rubric_df, on=key_columns, how='inner')



In [None]:
import os

# Destination of the merged predictions on the mounted Google Drive.
drive_path = '/content/drive/My Drive/senior_thesis/part_2/outputs/gemma-7b_raw.csv'

# output_df is still None when the merge cell found no prediction CSVs; fail
# with a clear message instead of an AttributeError on None.
if output_df is None:
    raise RuntimeError("No prediction CSVs were merged; nothing to save.")

# Create the output folder if needed so to_csv does not fail on a missing directory.
os.makedirs(os.path.dirname(drive_path), exist_ok=True)
output_df.to_csv(drive_path, index=False)

print(f"saved at {drive_path}.")

##Output post processing
