In [25]:
! pip install -U transformers
! pip install -U torch
! pip install -U accelerate
! pip install -U datasets
! pip install huggingface_hub
! pip install fastapi uvicorn


0.Install Required Libraries:
Install Hugging Face libraries, along with either PyTorch or TensorFlow, depending on your preference.

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

> Step 1: Load the dataset

In [4]:
# Load a dataset from Hugging Face. Here, we use "wikitext" as an example.
# dataset = load_dataset("wikitext", "LAW_benchmark")
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]  # Use a validation split as the eval dataset
print(dataset)

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})


> Step 2: Load the tokenizer and model

In [5]:
model_name = "gpt2"  # Change to your model's name if different
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set padding token ID if it's missing (especially for models like GPT-2)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as padding if no pad token exists
    model.config.pad_token_id = tokenizer.eos_token_id  # Update model config accordingly

> Step 3: Define the tokenization function

In [None]:
def tokenize_function(examples):
    # Tokenize and set 'labels' equal to 'input_ids' for language modeling
    tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
    tokens["labels"] = tokens["input_ids"].copy()  # Setting labels for causal language modeling
    return tokens

# Apply tokenization to the training and evaluation datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)


> Step 4: Define the data collator (automatically handles padding)

In [7]:
# Step 4: Define the data collator (automatically handles padding)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Set to False for causal language modeling (like GPT-2)
)


> Step 5: Define training arguments

In [8]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    weight_decay=0.01,
)



> Step 6: Set up the Trainer

In [9]:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,  # Add the evaluation dataset here
    data_collator=data_collator,
)


> Step 7: Train the model

In [None]:
trainer.train()

> Step 8:  Evaluate the Model

In [11]:
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

  0%|          | 0/1880 [00:00<?, ?it/s]

{'eval_loss': 3.635511636734009, 'eval_runtime': 67.9022, 'eval_samples_per_second': 55.374, 'eval_steps_per_second': 27.687, 'epoch': 0.01}
Evaluation results: {'eval_loss': 3.635511636734009, 'eval_runtime': 67.9022, 'eval_samples_per_second': 55.374, 'eval_steps_per_second': 27.687, 'epoch': 0.005337981371534397}


> Step 9: Save the model and Tokenizer

In [12]:
model.save_pretrained("./fine_tuned_llama2")
tokenizer.save_pretrained("./fine_tuned_llama2")

('./fine_tuned_llama2/tokenizer_config.json',
 './fine_tuned_llama2/special_tokens_map.json',
 './fine_tuned_llama2/vocab.json',
 './fine_tuned_llama2/merges.txt',
 './fine_tuned_llama2/added_tokens.json',
 './fine_tuned_llama2/tokenizer.json')

> Step 10: Test Inference

In [13]:
from transformers import pipeline

text_generator = pipeline("text-generation", model="./fine_tuned_llama2", tokenizer=tokenizer)
prompt = "In a legal context, it is essential to"
generated_text = text_generator(prompt, max_length=100, num_return_sequences=1)
print(generated_text)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'In a legal context, it is essential to consider the circumstances under which an individual was convicted in Australia at the time of the offence. We refer to these as the first sentence for each prisoner being charged with an offence under section 5 of the Crimes Act 1989. The following are a few of the more relevant sections of this legislation from the beginning as well as an important statement from the Victorian Government concerning the nature of sentencing.\n\n"As a matter of law a conviction of an offence shall not exceed'}]


> Step 11: Deploy the Model to a Production Environment, upload this mode to huggingface hub

In [None]:
from huggingface_hub import HfApi
api = HfApi()
api.token='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'
api.upload_folder(folder_path="./fine_tuned_llama2", repo_id="xju3/learning-wikitext-2-raw-1")

> Step 12: Accessing the Model via API

In [None]:
from transformers import pipeline

# Load model directly from Hugging Face Hub
model_name = "xju3/learning-wikitext-2-raw-1"
text_generator = pipeline("text-generation", model=model_name, token="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")

# Generate text
prompt = "In a legal context, it is essential to"
result = text_generator(prompt, max_length=100, num_return_sequences=1)
print(result)

> Step 13: Test The Api Running on Local Server

In [33]:
import requests

url = "http://127.0.0.1:8000/generate"
# url = "http://127.0.0.1:5000/generate"
data = {"prompt": "In a legal context, it is essential to", "max_length": 100}

response = requests.post(url, json=data)
print(response.json())

{'generated_text': "In a legal context, it is essential to remember both the legal system's important role in enabling its construction, and its role as a source of knowledge to facilitate its continued growth.\n\nThe history of legal frameworks\n\nThe early forms of juridical authority have often varied. Some have been loosely based on Greek law; many have been incorporated into modern Greek law.\n\nLaw at the beginning of the eighteenth century was an integral part of Greek law and of politics generally (see: Herod"}
