# Using LLM(GPT-2)

## Import packages, define model and tokenizer

In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

model = GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id)

Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:06<00:00, 171kB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:01<00:00, 424kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 131kB/s]
Downloading pytorch_model.bin: 100%|██████████| 548M/548M [00:32<00:00, 16.9MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 124/124 [00:00<00:00, 26.9kB/s]


In [3]:
text = "The quick brown fox jumps over the lazy dog"

input_ids = tokenizer.encode(text, return_tensors='pt')

## Modeling

### Greedy Search

In [4]:
greedy_output = model.generate(input_ids, max_length=50)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
The quick brown fox jumps over the lazy dog and runs off.

"I'm sorry, I'm sorry, I'm sorry," the fox says.

"I'm sorry, I'm sorry, I'm sorry," the fox says


### Beam Search

In [5]:
beam_outputs = model.generate(
    input_ids,
    max_length=50,
    num_beams=5,
    early_stopping=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_outputs[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
The quick brown fox jumps over the lazy dog.

"What's wrong with you?"

"I don't know."

"What's wrong with you?"

"I don't know."

"What's wrong


In [6]:
beam_outputs2 = model.generate(
    input_ids, 
    max_length=50, 
    num_beams=5, 
    no_repeat_ngram_size=2, 
    num_return_sequences=5, # 다섯 개의 문장을 리턴
    early_stopping=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_outputs2[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
The quick brown fox jumps over the lazy dog.

"What's wrong with you?" he asks. "You're not going to be able to do anything about it. You're just a fox. I don't know what you're doing


### Sampling

In [7]:
torch.random.manual_seed(0)

sample_outputs = model.generate(
    input_ids, 
    do_sample=True, # 샘플링 사용
    max_length=50, 
    top_k=0 #top_k=0으로 설정하면 타임스텝별로 하나의 토큰만 샘플링
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_outputs[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
The quick brown fox jumps over the lazy dog along the pilot's body and... is "tree legs forth the bunny." Echo, the new co-worker, attempts to break she so cagey. Then it's a doozy when Lumpy and


### Top-p Sampling

In [8]:
topp_sample_outputs = model.generate(
    input_ids,
    do_sample=True, #샘플링 전략 사용
    max_length=50, # 최대 디코딩 길이는 50
    top_k=50, # 확률 순위가 50위 밖인 토큰은 샘플링에서 제외
    top_p=0.95, # 누적 확률이 95%인 후보집합에서만 생성
    num_return_sequences=3 #3개의 결과를 디코딩해낸다
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(topp_sample_outputs[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
The quick brown fox jumps over the lazy dog, who slithers along in the dog's lap, as the dog bites the red fox into the ground, and then he goes over to the dog's rear. The dog bites the front paw into


## Data Crawling

In [12]:
from datasets import load_dataset

dataset = load_dataset("lawcompany/KLAID", 'ljp')

Downloading builder script: 100%|██████████| 3.38k/3.38k [00:00<00:00, 1.13MB/s]
Downloading readme: 100%|██████████| 3.96k/3.96k [00:00<00:00, 1.18MB/s]


Downloading and preparing dataset klaid/ljp to /Users/yunho/.cache/huggingface/datasets/lawcompany___klaid/ljp/1.0.0/170cd5bb1a0d9f3e383773bc69b51eb6a717918f91b682fe094492d865feaf4c...


Downloading data: 100%|██████████| 137M/137M [00:15<00:00, 8.93MB/s] 
                                                                    

Dataset klaid downloaded and prepared to /Users/yunho/.cache/huggingface/datasets/lawcompany___klaid/ljp/1.0.0/170cd5bb1a0d9f3e383773bc69b51eb6a717918f91b682fe094492d865feaf4c. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 75.65it/s]


In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['laws_service_id', 'fact', 'laws_service'],
        num_rows: 161192
    })
})

In [15]:
import pandas as pd
df = pd.DataFrame(dataset['train'])
df

Unnamed: 0,laws_service_id,fact,laws_service
0,32,피고인은 2018. 8. 9. 23:33경 술을 마신 상태로 경산시 사동에 있는 상...,"도로교통법 제148조의2 제2항,도로교통법 제44조 제2항"
1,0,피고인은 2016. 3. 19. 10:16경 경북 칠곡군 왜관읍 왜관대교 앞 도로에...,"도로교통법 제152조 제1호,도로교통법 제43조"
2,0,피고인은 2016. 10. 10 16:55경 평택시 오성면 복합화력발전소 앞 도로에...,"도로교통법 제152조 제1호,도로교통법 제43조"
3,34,피고인 A은 노동일에 종사 중이다. 피고인은 2017. 2. 2. 20:00경 부산...,형법 제260조 제1항
4,11,피고인은 2015. 7. 2. 06:35경 부산 부산진구 B에 있는 C슈퍼 앞길에서...,형법 제314조 제1항
...,...,...,...
161187,46,피고인은 구리시 C에 있는 D부동산을 운영하면서 ‘E상가조합’을 결성하여 위 상가조...,"형법 제356조,형법 제355조 제1항"
161188,34,"피고인은 2015. 4. 16. 15:16경 대구 달서구 C에 있는 D 주유소에서,...",형법 제260조 제1항
161189,8,피고인은 2020. 7. 18. 06:20경 수원시 영통구 B에 있는 ‘C’ 식당 ...,"도로교통법 제148조의2 제1항,도로교통법 제44조 제1항"
161190,1,피고인은 2011. 3. 15. 18:30경 서울 서초구 B에 있는 전 배우자인 C...,형법 제136조 제1항


In [None]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, TextClassificationPipeline, Trainer, TrainingArguments

# Load the pre-trained GPT-3.5 model and tokenizer
model = GPT2ForSequenceClassification.from_pretrained("EleutherAI/gpt-neo-2.7B")
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")

# Load the legal text classification dataset
df = pd.read_csv("legal_dataset.csv")

# Convert the text labels to numerical values
label2id = {"employment law": 0, "contract law": 1, "intellectual property law": 2}
df["label"] = df["label"].apply(lambda x: label2id[x])

# Tokenize the input text and encode the labels
inputs = tokenizer(df["text"].tolist(), padding=True, truncation=True, return_tensors="pt")
labels = df["label"].tolist()

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Instantiate the Trainer class
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=inputs,
    train_labels=labels,
    data_collator=lambda data: {"input_ids": torch.stack([x["input_ids"] for x in data]),
                                "attention_mask": torch.stack([x["attention_mask"] for x in data]),
                                "labels": torch.tensor([x["labels"] for x in data])},
)

# Train the model
trainer.train()

# Use the trained model to make predictions on new text
pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer)
text = "This is a legal document related to a contract dispute."
prediction = pipeline(text)[0]
predicted_label = list(label2id.keys())[list(label2id.values()).index(prediction["label"])]
print("Predicted label:", predicted_label)


### using LLM

In [None]:
import openai
openai.api_key = "YOUR_API_KEY"

def analyze_legal_text(text):
    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=text,
        max_tokens=1024,
        n=1,
        stop=None,
        temperature=0.5,
    )

    return response.choices[0].text

legal_text = "The defendant has been found guilty of the charge of theft. He is hereby sentenced to 2 years in prison and a fine of $10,000."

analysis = analyze_legal_text(legal_text)
print(analysis)


fine-tuning

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Load the legal text data
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='train.txt',
    block_size=128
)
eval_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='eval.txt',
    block_size=128
)

# Define the data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=50,
    save_total_limit=1,
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Evaluate the model on the test set
test_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='test.txt',
    block_size=128
)

trainer.evaluate(test_dataset)
