In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Mount Google Drive at /content/gdrive/ so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [3]:
# Import the data here (adjust the path to where your copy of the CSV lives).

data = pd.read_csv('/content/gdrive/MyDrive/Projet/legal_text_classification.csv')

In [4]:
# Preview the first five rows of the raw frame.
data.head(5)

Unnamed: 0,case_id,case_outcome,case_title,case_text
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...


In [5]:
# (rows, columns) of the raw frame.
data.shape

(24985, 4)

In [6]:
# Summarise the object-dtype columns (count / unique / top / freq).
data.describe(include=['O'])

Unnamed: 0,case_id,case_outcome,case_title,case_text
count,24985,24985,24985,24809
unique,24985,10,18581,17920
top,Case1,cited,Minister for Immigration and Ethnic Affairs v ...,submitted that this Court should hold that the...
freq,1,12219,70,42


In [7]:
# Count missing values per column.
data.isnull().sum()

case_id           0
case_outcome      0
case_title        0
case_text       176
dtype: int64

In [8]:
# List the distinct outcome strings; these are the classes to predict.
data['case_outcome'].unique()

array(['cited', 'applied', 'followed', 'referred to', 'related',
       'considered', 'discussed', 'distinguished', 'affirmed', 'approved'],
      dtype=object)

In [8]:
# Integer class id for each outcome string.
outcome_to_label = {
    "cited": 0,
    "applied": 1,
    "followed": 2,
    "referred to": 3,
    "related": 4,
    "considered": 5,
    "discussed": 6,
    "distinguished": 7,
    "affirmed": 8,
    "approved": 9,
}

# Any value not listed above also gets 9, matching the final `else` branch of
# the conditional chain this table replaces.
data['label'] = [outcome_to_label.get(outcome, 9) for outcome in data['case_outcome']]

In [10]:
# Check which integer codes ended up in the new label column.
data['label'].unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [11]:
# Display the original outcome strings for comparison with the integer labels.
data['case_outcome']

0                cited
1                cited
2                cited
3                cited
4                cited
             ...      
24980            cited
24981            cited
24982            cited
24983    distinguished
24984    distinguished
Name: case_outcome, Length: 24985, dtype: object

In [9]:
# case_text has missing values (NaN). Calling str() on NaN yields the literal
# word "nan", which would then be tokenized as if it were real case text.
# Replace missing entries with an empty string before casting to str so the
# row count is unchanged and no fake content is introduced.
data['text'] = data['case_text'].fillna('').astype(str)

In [13]:
# Re-check missing values now that the label and text columns have been added.
data.isnull().sum()

case_id           0
case_outcome      0
case_title        0
case_text       176
label             0
text              0
dtype: int64

In [10]:
# Keep only the two columns the model consumes. Selecting them by name, rather
# than dropping the other four, makes this cell safe to re-run: a second
# drop() of already-removed columns would raise KeyError.
data = data[['label', 'text']]

In [15]:
# Confirm that only two columns remain.
data.shape

(24985, 2)

In [16]:
# Preview the first ten (label, text) pairs.
data.head(10)

Unnamed: 0,label,text
0,0,Ordinarily that discretion will be exercised s...
1,0,The general principles governing the exercise ...
2,0,Ordinarily that discretion will be exercised s...
3,0,The general principles governing the exercise ...
4,0,The preceding general principles inform the ex...
5,0,I accept that the making of a rolled up offer ...
6,0,The preceding general principles inform the ex...
7,0,On the question of the level of unreasonablene...
8,1,recent decision of the High Court in Australia...
9,2,Hexal Australia Pty Ltd v Roche Therapeutics I...


In [11]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m49.4 MB/s[0m eta [36m0:00:0

In [12]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.15.10-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.36-py3-none-any.whl (189 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.5/189.5 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.31.0-py2.py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.8/224.8 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools (from wandb)
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.2-cp310-cp310-manylinux_2_5_x86_64.manyli

In [13]:
!pip install huggingface_hub



In [25]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertForSequenceClassification
from transformers.optimization import AdamW
from transformers import BertPreTrainedModel
from transformers import AutoConfig
from huggingface_hub import PyTorchModelHubMixin
import wandb

import pandas as pd
from sklearn.model_selection import train_test_split

from tqdm import tqdm

from sklearn.model_selection import train_test_split

#import wandb  # monitoring

# Settings and shared objects for the fine-tuning run, gathered in one place.
config = dict(
    model_name="bert-base-uncased",  # checkpoint id given to from_pretrained
    max_length=80,                   # forwarded to the tokenizer as max_length
    hidden_state=768,
    data=data,                       # the `data` object built earlier in the notebook
    batch_size=2,
    learing_rate=2e-5,               # spelling is the key the training cell looks up
    n_epochs=1,
    n_classes=10,
    # Use the GPU when torch reports one, otherwise the CPU.
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)


# Load the tokenizer for the checkpoint named in config['model_name'].
tokenizer = BertTokenizer.from_pretrained(config['model_name'])

class MyDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per lookup.

    Args:
        data: DataFrame with a "text" column and a "label" column.
        tokenizer: Callable invoked as ``tokenizer(text=..., max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")``.
            Its result must provide "input_ids" and "attention_mask".
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, data, tokenizer, max_length):
        self.df = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize the row at position ``index``.

        Returns:
            dict with "input_ids" and "attention_mask" exactly as produced by
            the tokenizer, plus "label" as a tensor built from the row's label.
        """
        # Positional lookup: `df["col"][index]` is label-based and raises
        # KeyError as soon as the frame's index is not 0..n-1 (for example
        # after filtering or shuffling rows without reset_index).
        text = self.df["text"].iloc[index]
        label = self.df["label"].iloc[index]

        inputs = self.tokenizer(
            text=text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "label": torch.tensor(label),
        }


def dataloader(dataset, batch_size, shuffle):
    """Wrap ``dataset`` in a ``DataLoader`` using the given batch size and shuffle flag."""
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return loader




In [26]:
class CustomModel(nn.Module, PyTorchModelHubMixin):
    """BERT sequence classifier wrapped in a module that can use the Hub mixin.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        n_classes: Number of target classes; sets the size of the
            classification head via ``num_labels``.
    """

    def __init__(self, model_name, n_classes):
        super(CustomModel, self).__init__()
        # Size the head from the argument; it was previously fixed at 10
        # regardless of what the caller asked for.
        self.pretrained_model = BertForSequenceClassification.from_pretrained(
            model_name, num_labels=n_classes
        )

    def forward(self, input_ids, attention_mask):
        """Return the raw (un-normalised) class logits for a batch."""
        output = self.pretrained_model(
            input_ids=input_ids, attention_mask=attention_mask
        )

        # The pretrained model already contains the classification head, so
        # its logits are returned as-is with no extra linear layer.
        return output.logits



In [27]:

def train_step(model, train_loader, optimizer, loss_fn, device):
    """Run one pass over ``train_loader`` and return the mean per-batch loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning logits accepted by ``loss_fn``.
        train_loader: Iterable of dict batches with "input_ids",
            "attention_mask" and "label" tensors.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable ``loss_fn(logits, label)`` returning a scalar tensor.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.train()

    total_loss = 0

    for batch in tqdm(train_loader, total=len(train_loader)):
        # The original code squeezed dim 1 of input_ids only, which implies
        # batches shaped (batch, 1, seq_len). The mask must get the same
        # treatment so both tensors reach the model as (batch, seq_len).
        input_ids = batch["input_ids"].squeeze(1).to(device)
        attention_mask = batch["attention_mask"].squeeze(1).to(device)
        label = batch["label"].to(device)

        optimizer.zero_grad()

        output = model(input_ids=input_ids, attention_mask=attention_mask)

        loss = loss_fn(output, label)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(train_loader)


def validation_step(model, validation_loader, loss_fn, device):
    """Evaluate ``model`` on ``validation_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class logits.
        validation_loader: Iterable of dict batches with "input_ids",
            "attention_mask" and "label" tensors.
        loss_fn: Callable ``loss_fn(logits, label)`` returning a scalar tensor.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the mean per-batch loss and the
        percentage (0-100) of samples whose arg-max prediction equals the label.
    """
    # Switch off dropout and other train-only behaviour; without this the
    # validation metrics are computed with the model still in training mode.
    model.eval()

    total_loss = 0
    correct_prediction = 0
    n_samples = 0

    with torch.no_grad():
        for batch in tqdm(validation_loader, total=len(validation_loader)):
            # Squeeze the mask the same way as the ids so both are (batch, seq_len).
            input_ids = batch["input_ids"].squeeze(1).to(device)
            attention_mask = batch["attention_mask"].squeeze(1).to(device)
            label = batch["label"].to(device)

            output = model(input_ids=input_ids, attention_mask=attention_mask)

            loss = loss_fn(output, label)
            total_loss += loss.item()

            # Softmax is monotonic, so the arg-max of the logits is the prediction.
            pred = torch.argmax(output, dim=1)
            correct_prediction += (pred == label).sum().item()
            n_samples += label.size(0)

    # Accuracy is per sample. Dividing by the number of batches (as before)
    # inflates the figure by the batch size and can exceed 100.
    accuracy = 100 * correct_prediction / n_samples if n_samples else 0.0

    return total_loss / len(validation_loader), accuracy





In [34]:
# Do not keep an API key in the notebook. With no `key` argument, wandb uses
# the WANDB_API_KEY environment variable or stored credentials, and otherwise
# prompts for the key.
wandb.login()

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [35]:

# Start a W&B run in the "bert_classification" project; the wandb.log calls in the training cell report to it.
wandb.init(project="bert_classification")

[34m[1mwandb[0m: Currently logged in as: [33mtatchum-ulrich[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.15.10
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/wandb/run-20230918_091857-1k8g03nr[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mmagic-wildflower-1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/tatchum-ulrich/bert_classification[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/tatchum-ulrich/bert_classification/runs/1k8g03nr[0m


In [36]:



dataset = MyDataset(data=config['data'], tokenizer=tokenizer, max_length=config['max_length'])

# Split row positions rather than the dataset object itself. Passing a Dataset
# to train_test_split makes sklearn index every item up front, which tokenizes
# the whole corpus into Python lists before training starts. Subset keeps the
# tokenization lazy (done per item as the loader asks for it).
# random_state pins the split so the validation set is the same on every run.
train_indices, validation_indices = train_test_split(
    list(range(len(dataset))), test_size=0.2, random_state=42
)
train_dataset = torch.utils.data.Subset(dataset, train_indices)
validation_dataset = torch.utils.data.Subset(dataset, validation_indices)

train_loader = dataloader(train_dataset, batch_size=config['batch_size'], shuffle=True)
validation_loader = dataloader(validation_dataset, batch_size=config['batch_size'], shuffle=False)

# Take the number of classes from config instead of repeating the literal.
model = CustomModel(model_name=config['model_name'], n_classes=config['n_classes'])
model.to(config['device'])

loss_fn = nn.CrossEntropyLoss()

# NOTE: 'learing_rate' (sic) is the key actually defined in `config`.
optimizer = AdamW(model.parameters(), lr=config['learing_rate'])

for epoch in range(config['n_epochs']):

    loss_train = train_step(model, train_loader, optimizer, loss_fn, config['device'])
    loss_validation, accuracy = validation_step(model, validation_loader, loss_fn, config['device'])

    # One log entry per epoch.
    wandb.log({'loss_train': loss_train,
               'loss_validation': loss_validation,
               'accuracy': accuracy
               })

# Save the trained model object once training has finished.
torch.save(model, 'bert-model_10.pth')

# NOTE(review): despite its name, this file holds the pickled tokenizer, not model weights.
torch.save(tokenizer, 'bert-model.pth')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 9994/9994 [12:17<00:00, 13.55it/s]
100%|██████████| 2499/2499 [00:40<00:00, 61.47it/s]


In [38]:
# Authenticate with the Hugging Face Hub through the notebook login widget (needed before pushing).
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [39]:
# Upload the fine-tuned model and its tokenizer to the same Hub repository.
model.push_to_hub('hot_bert_classification')
tokenizer.push_to_hub('hot_bert_classification')

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ulrichING/hot_bert_classification/commit/8c7dd09ac9f6d7bb32253b351aacac4fe41a3afe', commit_message='Upload tokenizer', commit_description='', oid='8c7dd09ac9f6d7bb32253b351aacac4fe41a3afe', pr_url=None, pr_revision=None, pr_num=None)