In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install transformers

In [None]:
# Document parsers used by read_pdf / read_word below.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -U PyPDF2
%pip install python-docx

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install pandas

In [None]:
# PyTorch wheels from the CUDA 11.8 index.
# %pip (rather than !pip3) installs into the environment of the running kernel.
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [16]:
import pandas as pd
import numpy as np
import re
from PyPDF2 import PdfReader
import os
import docx
import shutil

### proxy service

In [17]:
import subprocess

# Source /etc/network_turbo in a bash subshell, list the environment variables
# whose line contains "proxy", and copy them into this process's environment.
result = subprocess.run(
    'bash -c "source /etc/network_turbo && env | grep proxy"',
    shell=True,
    capture_output=True,
    text=True,
)
output = result.stdout
for line in output.splitlines():
    # Only NAME=value lines can be turned into an environment variable.
    if '=' not in line:
        continue
    var, value = line.split('=', 1)  # split on the first '=' only; values may contain '='
    os.environ[var] = value

### Data preprocessing

In [18]:
# Functions to read different file types
def read_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``.

    The per-page strings produced by ``extract_text()`` are concatenated in
    page order with no separator between pages.
    """
    with open(file_path, "rb") as handle:
        reader = PdfReader(handle)
        # Pages are extracted while the file is still open.
        page_texts = [page.extract_text() for page in reader.pages]
        return "".join(page_texts)

def read_word(file_path):
    """Return the paragraph text of the .docx file at ``file_path``.

    Every paragraph (including empty ones) contributes its text followed by
    a newline, so the result ends with ``"\\n"`` unless there are no paragraphs.
    """
    document = docx.Document(file_path)
    return "".join(para.text + "\n" for para in document.paragraphs)

def read_txt(file_path, encoding="utf-8"):
    """Return the full contents of a plain-text file.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the locale of the machine running
            the notebook.

    Returns:
        The decoded file contents as a single string.

    Raises:
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.read()

def read_documents_from_directory(directory, max_files=10):
    """Read the supported documents found directly inside ``directory``.

    Files are visited in sorted filename order so that the result (and any
    train/test split derived from it) is reproducible across runs. Files whose
    extension is not ``.pdf``, ``.docx`` or ``.txt`` (case-insensitive) are
    skipped instead of re-appending the previous document's text.

    Args:
        directory: Folder containing the documents.
        max_files: Maximum number of documents to read; ``None`` reads all of
            them. Defaults to 10.

    Returns:
        A list with one extracted-text string per document that was read.
    """
    # Extension -> reader. Looked up at call time so the helpers only need to
    # exist when this function actually runs.
    readers = {
        ".pdf": read_pdf,
        ".docx": read_word,
        ".txt": read_txt,
    }

    combined_text = []
    for filename in sorted(os.listdir(directory)):
        if max_files is not None and len(combined_text) >= max_files:
            break
        extension = os.path.splitext(filename)[1].lower()
        reader = readers.get(extension)
        if reader is None:
            # Unsupported file type: nothing to extract.
            continue
        combined_text.append(reader(os.path.join(directory, filename)))
    return combined_text


In [19]:
# Load documents from the directory
# data_path is the project's data folder (absolute path; adjust when running elsewhere).
data_path = '/root/autodl-tmp/Fine-Tuned-GPT-2-with-articles-ground-truth/data'
# all_text is the list of extracted-text strings returned for <data_path>/ori_papers.
all_text = read_documents_from_directory(os.path.join(data_path, "ori_papers"))



In [20]:
# Save the training and validation data as text files
# Relative sizes of the train / test splits, counted in documents.
train_vs_test = [7, 3]
# Number of leading documents that go to the training split (integer arithmetic,
# derived from train_vs_test instead of a second hard-coded 7/10).
train_id = len(all_text) * train_vs_test[0] // sum(train_vs_test)

train_text = "".join(all_text[:train_id])
test_text = "".join(all_text[train_id:])

# Recreate the dataset folder from scratch so stale files never survive a re-run.
dataset_dir = os.path.join(data_path, "dataset")
if os.path.exists(dataset_dir):
    shutil.rmtree(dataset_dir)
os.makedirs(dataset_dir)

# Write as UTF-8 explicitly: transformers' TextDataset opens its input as UTF-8,
# so the files must not be written in a locale-dependent encoding.
with open(os.path.join(dataset_dir, "train.txt"), "w", encoding="utf-8") as f:
    f.write(train_text)
with open(os.path.join(dataset_dir, "test.txt"), "w", encoding="utf-8") as f:
    f.write(test_text)

### Model training

In [21]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
import torch

In [22]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` over ``file_path``.

    The tokenizer, file path and block size are forwarded unchanged to the
    ``TextDataset`` constructor; the constructed dataset is returned.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)

In [23]:
def load_data_collator(tokenizer, mlm = False):
    """Build a ``DataCollatorForLanguageModeling`` for ``tokenizer``.

    ``mlm`` is forwarded to the collator unchanged and defaults to ``False``.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [47]:
def train(train_file_path, model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps,
          logging_steps=500):
    """Fine-tune a GPT-2 language model on a plain-text file.

    Layout written under ``output_dir``:
      * ``tokenizer/`` - the tokenizer loaded from ``model_name``;
      * ``final/``     - the fine-tuned model;
      * ``output_dir`` itself - the same fine-tuned model, so that
        ``GPT2LMHeadModel.from_pretrained(output_dir)`` returns the trained
        weights rather than a snapshot taken before training.

    Args:
        train_file_path: Text file passed to ``load_dataset``.
        model_name: Name or path given to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory for all outputs. NOTE: it is deleted and
            recreated on every call, independently of ``overwrite_output_dir``.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``.
        logging_steps: Forwarded to ``TrainingArguments`` (default 500).
    """
    # Start from an empty output directory; makedirs on the tokenizer folder
    # also creates output_dir itself.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    tokenizer_dir = os.path.join(output_dir, "tokenizer")
    os.makedirs(tokenizer_dir)

    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)
    tokenizer.save_pretrained(tokenizer_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    # The model is deliberately NOT saved here: a pre-training snapshot in
    # output_dir would be what inference loads from that directory.

    training_args = TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=overwrite_output_dir,
            per_device_train_batch_size=per_device_train_batch_size,
            num_train_epochs=num_train_epochs,
            save_steps=save_steps,
            logging_steps=logging_steps
    )

    trainer = Trainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_dataset,
    )

    trainer.train()
    # Keep the original "final" location and also publish the trained weights
    # at the root of output_dir, which is the path the inference cells load.
    trainer.save_model(os.path.join(output_dir, "final"))
    trainer.save_model(output_dir)

In [48]:
# Arguments for train(); each name matches the train() parameter it is passed to.
train_file_path = os.path.join(data_path, "dataset/train.txt")  # training split written earlier in the notebook
model_name = 'gpt2'  # passed to from_pretrained for both tokenizer and model
output_dir = '/root/autodl-tmp/Fine-Tuned-GPT-2-with-articles-ground-truth/results'
# Forwarded to TrainingArguments; note that train() deletes output_dir itself
# before training regardless of this flag.
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 50.0
save_steps = 5000  # forwarded to TrainingArguments(save_steps=...)

In [49]:
# Train
# Runs the fine-tuning with the settings defined in the previous cell;
# logging_steps is not passed, so train() uses its default of 500.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)



Step,Training Loss
500,2.5387
1000,1.9032
1500,1.5406
2000,1.2552
2500,1.0255
3000,0.8404
3500,0.6964
4000,0.5847
4500,0.5064
5000,0.4439


### Inference

In [30]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [52]:
def load_model(model_path):
    """Load a ``GPT2LMHeadModel`` from the directory ``model_path``."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(model_path):
    """Load the ``GPT2Tokenizer`` stored in the ``tokenizer`` subfolder of ``model_path``."""
    tokenizer_dir = os.path.join(model_path, "tokenizer")
    return GPT2Tokenizer.from_pretrained(tokenizer_dir)

def generate_text(model_path, sequence, max_length):
    """Sample a continuation of ``sequence`` and print it.

    The model and tokenizer are loaded from ``model_path`` on every call.
    Generation uses sampling with ``top_k=50`` and ``top_p=0.95``, with the
    model's EOS token id as the pad token. Only the first returned sequence is
    decoded (special tokens skipped) and printed; nothing is returned.
    """
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    prompt_ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    sampling_options = dict(
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    generated = model.generate(prompt_ids, **sampling_options)
    completion = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(completion)

This model was trained on the entire text and took much longer to train, yet it still fails to give meaningful results.

In [55]:
# Load the model stored under output_dir and print a sampled continuation
# of a question-style prompt.
model1_path = output_dir
sequence1 = "[Q] What is the Monosaccharide?"
max_len = 50  # passed to generate_text as max_length
generate_text(model1_path, sequence1, max_len)

[Q] What is the Monosaccharide?
(a) Glucose
(b) Sucrose
(c) Fructose
(d) Glucose
Answer.  (b)
480. Which of
