In [None]:
# Install all necessary libraries
!pip install transformers datasets torch pytesseract pillow spacy python-docx jinja2 requests apscheduler

# Download SpaCy's small English model for NER
!python -m spacy download en_core_web_sm


In [None]:
# Import necessary modules for BERT document classification
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

# Load the tokenizer and BERT model (pre-trained).
# The classification head needs one output per label in the training data.
# The ag_news demo dataset used in this notebook has 4 classes (labels 0-3),
# so a 3-way head would fail as soon as a sample with label 3 is seen.
# Update NUM_LABELS when swapping in your own dataset.
NUM_LABELS = 4
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=NUM_LABELS)

# Demo corpus: the first 1000 rows of the ag_news training split.
# Swap 'ag_news' for your legal dataset when moving beyond the demonstration.
dataset = load_dataset('ag_news', split='train[:1000]')


def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are truncated and padded with the ``max_length`` strategy, so
    every encoded example in the batch has the same length.

    Args:
        examples: A batch from the dataset; its ``'text'`` entry is passed to
            the tokenizer.

    Returns:
        The tokenizer output for the batch.
    """
    encoded_batch = tokenizer(examples['text'], truncation=True, padding='max_length')
    return encoded_batch


# Run the tokenizer over the whole dataset, one batch at a time
tokenized_dataset = dataset.map(preprocess_function, batched=True)

import dataclasses

# Per-epoch evaluation needs an evaluation set; without one the Trainer raises
# an error. Hold out a fixed, seeded fraction of the tokenized data for it.
EVAL_FRACTION = 0.1
SPLIT_SEED = 42
dataset_splits = tokenized_dataset.train_test_split(test_size=EVAL_FRACTION, seed=SPLIT_SEED)

# The evaluation-schedule option is called `eval_strategy` in newer transformers
# releases and `evaluation_strategy` in older ones. Use whichever name the
# installed version actually defines so the cell works with an unpinned install.
training_arg_names = {field.name for field in dataclasses.fields(TrainingArguments)}
eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in training_arg_names else 'evaluation_strategy'
)

# Training arguments for fine-tuning the BERT model
training_args = TrainingArguments(
    output_dir='./results',           # Output directory for the fine-tuned model
    learning_rate=2e-5,               # Set learning rate
    per_device_train_batch_size=16,   # Batch size for training
    num_train_epochs=3,               # Number of epochs
    **{eval_strategy_key: "epoch"},   # Evaluate the model after every epoch
)

# Use the Trainer class to train the model
trainer = Trainer(
    model=model,                            # The fine-tunable BERT model
    args=training_args,                     # Training arguments defined above
    train_dataset=dataset_splits['train'],  # Tokenized training portion
    eval_dataset=dataset_splits['test'],    # Held-out portion evaluated each epoch
)

# Train the model
trainer.train()
