In [7]:
import sys
sys.path.append('..')

from utils import read_env
from PyPDF2 import PdfReader
import os
import docx
import shutil

### proxy service

In [8]:
import subprocess

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

### Data preprocessing

In [9]:
# Functions to read different file types
def read_pdf(file_path):
    with open(file_path, "rb") as file:
        pdf_reader = PdfReader(file)
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            text += pdf_reader.pages[page_num].extract_text()
    return text

def read_word(file_path):
    doc = docx.Document(file_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

def read_txt(file_path):
    with open(file_path, "r") as file:
        text = file.read()
    return text

def read_documents_from_directory(directory):
    combined_text = []
    for i, filename in enumerate(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        if filename.endswith(".pdf"):
            new_text = read_pdf(file_path)
        elif filename.endswith(".docx"):
            new_text = read_word(file_path)
        elif filename.endswith(".txt"):
            new_text = read_txt(file_path)
        # new_text = re.sub(r'\n+', '\n', new_text).strip()
        combined_text.append(new_text)
        if i == 9:
            break
    return combined_text


In [11]:
# Load documents from the directory
data_path = os.getenv("data_path")
all_text = read_documents_from_directory(os.path.join(data_path, "ori_papers"))

In [None]:
# Save the training and validation data as text files
train_vs_test = [7, 3]
train_id = int(len(all_text)/10*7)

train_text = "".join(all_text[:train_id])
test_text = "".join(all_text[train_id:])

if os.path.exists(os.path.join(data_path, "dataset")):
   shutil.rmtree(os.path.join(data_path, "dataset"))
os.makedirs(os.path.join(data_path, "dataset"))

with open(os.path.join(data_path, "dataset/train.txt"), "w") as f:
   f.write(train_text)
with open(os.path.join(data_path, "dataset/test.txt"), "w") as f:
   f.write(test_text)

### Model training

In [17]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
import torch

In [2]:
def load_dataset(file_path, tokenizer, block_size = 128):
    dataset = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size,
    )
    return dataset

In [3]:
def load_data_collator(tokenizer, mlm = False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, 
        mlm=mlm,
    )
    return data_collator

In [4]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    
    return {"accuracy": accuracy}

In [19]:
def train(train_file_path, model_name, 
          output_dir, 
          overwrite_output_dir, 
          per_device_train_batch_size, 
          num_train_epochs,
          test_file_path=None,
          save_strategy="no",
          save_steps=0,
          logging_steps=500):
    
    if save_steps != None: save_strategy = "steps"
    
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    os.makedirs(os.path.join(output_dir, "tokenizer"))

    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    
    test_dataset = None
    if test_file_path != None:
        test_dataset = load_dataset(test_file_path, tokenizer)
    
    data_collator = load_data_collator(tokenizer)
    tokenizer.save_pretrained(os.path.join(output_dir, 'tokenizer'))

    model = GPT2LMHeadModel.from_pretrained(model_name)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=overwrite_output_dir,
            per_device_train_batch_size=per_device_train_batch_size,
            num_train_epochs=num_train_epochs,
            save_strategy=save_strategy,
            save_steps=save_steps,
            logging_steps=logging_steps,
            include_inputs_for_metrics=True,
    )

    trainer = Trainer(
            model=model,
            args=training_args,
            compute_metrics=compute_metrics,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
    )

    trainer.train()
    trainer.save_model(os.path.join(output_dir, "final"))
    return trainer

In [12]:
train_file_path = os.path.join(data_path, "dataset/train.txt")
model_name = 'gpt2'
output_dir = os.getenv("output_dir")
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 50.0

In [20]:
# Train
trainer = train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    test_file_path=os.path.join(data_path, "dataset/test.txt")
)



Step,Training Loss
500,2.5387
1000,1.9032
1500,1.5406
2000,1.2552
2500,1.0255
3000,0.8404
3500,0.6964
4000,0.5847
4500,0.5064
5000,0.4439


In [21]:
trainer.evaluate()


ValueError: multiclass-multioutput is not supported

Inference

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GenerationConfig
import numpy as np

In [13]:
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model


def load_tokenizer(model_path):
    tokenizer = GPT2Tokenizer.from_pretrained(os.path.join(model_path, "tokenizer"))
    return tokenizer

def evaluate_perplexity(model, tokenizer, generated_sequence):
    # Tokenize the generated sequence
    input_ids = tokenizer.encode(generated_sequence, return_tensors='pt')

    # Get model logits for the generated sequence
    with torch.no_grad():
        logits = model(input_ids)[0]

    # Calculate perplexity
    perplexity = torch.exp(torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), input_ids.view(-1)))

    return perplexity.item()

def generate_text(model_path, sequence, max_length, model=None, verbose=False):
    if (model == None):
        model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    generation_config = GenerationConfig(
        max_new_tokens=max_length,
        early_stopping=True,
        num_beams = 3,
        top_k=10,
        top_p=0.80,
        remove_invalid_values = True,
        pad_token_id = model.config.eos_token_id,
        eos_token_id = model.config.eos_token_id,
    )

    final_outputs = model.generate(
        inputs=ids,
        generation_config = generation_config,
        return_dict_in_generate=True, 
        output_scores=True
    )

    transition_scores = model.compute_transition_scores(
        final_outputs.sequences, final_outputs.scores, normalize_logits=True
    )[0]

    generated_tokens = final_outputs.sequences[0]
    if verbose:
        for tok, score in zip(generated_tokens, transition_scores):
            # | token | token string | log probability | probability
            print(f"| {tok:5d} | {tokenizer.decode(tok):15s} | {score.numpy():6.2f} | {np.exp(score.numpy()):.2%}")

    return tokenizer.decode(generated_tokens, skip_special_tokens=True)[len(sequence)+1:]

This model got trained on the entire text and took much longer to train, and yet it fails to give meaningful results. 

In [18]:
model = load_model(output_dir)
model.evaluate()

AttributeError: 'GPT2LMHeadModel' object has no attribute 'evaluate'

In [None]:
model1_path = output_dir
sequence1 = "[Q] What is the Monosaccharide?"
max_len = 100
print(generate_text(model1_path, sequence1, max_len))

In [None]:
model1_path = output_dir
sequence1 = "[Q] Please generate 100 questions about Monosaccharide"
max_len = 100
print(generate_text(model1_path, sequence1, max_len)[len(sequence1)+1:])

In [15]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
import evaluate
rouge = evaluate.load('rouge')

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Load reference and candidate responses from text files
def load_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        file = [line.strip() for line in file]
    return file

def generate_test_answer_candidate(question_path, answer_path):
    questions = load_file(question_path)
    answers = []
    for q in questions:
        answer = generate_text(model1_path, sequence1, max_len)
        answers.append(answer)
    with open(answer_path, "w") as f:
        f.write("\n".join(answers))

In [None]:
generate_test_answer_candidate(os.getenv("question_path"), os.getenv("candidate_answers_path"))

In [None]:
reference_responses = load_file(os.getenv("reference_answers_path"))
candidate_responses = load_file(os.getenv("candidate_answers_path"))

# Tokenization (assuming responses are already tokenized)
reference_tokenized = [response.split() for response in reference_responses]
candidate_tokenized = [response.split() for response in candidate_responses]

# Calculate BLEU score
bleu_score = corpus_bleu(reference_tokenized, candidate_tokenized, smoothing_function=SmoothingFunction().method1)

print("BLEU Score:", bleu_score)


In [None]:
# Calculate ROUGE score
result = rouge.compute(predictions=candidate_responses, references=reference_responses)

print("Rouge\n    score1: {}, score2: {}\n    rougeL: {}, rougeLsum: {}".format(result["rouge1"], result["rouge2"], result["rougeL"], result["rougeLsum"]))