In [None]:
from google.colab import drive
drive.mount("/content/drive/")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
!nvidia-smi

In [None]:
!pip install transformers

In [174]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import shutil
import sys
import json
import zipfile
import itertools
import re
import tensorflow as tf

In [None]:
# train_df = pd.read_csv(train_path)
# test_df = pd.read_csv(test_path)

In [None]:
full_text=[]
for line in open('/content/drive/My Drive/TR_TechChallenge/TRDataChallenge2023.txt','r', encoding='utf-8', errors='replace'):
    full_text.append(json.loads(line))

In [None]:
full_text_pd = pd.json_normalize(full_text)

In [None]:
full_text_pd['number_of_postures'] = full_text_pd['postures'].str.len()
full_text_pd_postures_count = full_text_pd[['documentId','postures','number_of_postures']]
total_number_of_postures = np.sum(full_text_pd_postures_count['number_of_postures'])
all_postures_in_df = list(full_text_pd_postures_count['postures'].values)
list_of_unique_postures = set(list(itertools.chain.from_iterable(all_postures_in_df)))
count_of_postures = len(list_of_unique_postures)


In [None]:
print("Total number of documents = ",full_text_pd.shape[0])

In [None]:
print("Total number of distinct postures in the documents = ",count_of_postures)

In [None]:
def count_of_paragraph_occurrance(text):

    return text.count("'paragraphs'")

In [None]:
full_text_pd['count_of_paragraphs'] = full_text_pd['sections'].apply(lambda x: count_of_paragraph_occurrance(str(x)))
total_number_of_paragraphs = np.sum(full_text_pd['count_of_paragraphs'])

In [None]:
print("Total number paragraphs in the documents = ",total_number_of_paragraphs)

In [None]:
train_df = full_text_pd

In [None]:
train_df.head()

In [None]:
def cleanse_text(text):
    return re.sub(r'[^a-zA-Z0-9,.\s]', '', text)

In [17]:
train_df['sections'] = train_df['sections'].apply(lambda x: cleanse_text(str(x)))

In [None]:
# Use pd.get_dummies() to one-hot encode the 'classes' column
train_df_expanded = train_df['postures'].str.join('|').str.get_dummies().add_prefix('class_')

# Concatenate the expanded columns with the original DataFrame
train_df_with_classes = pd.concat([train_df, train_df_expanded], axis=1)

# Drop the original 'classes' column if needed
# df.drop('classes', axis=1, inplace=True)

print(train_df_with_classes)

In [19]:
target_list = [col_name for col_name in train_df_with_classes.columns if  "class_" in col_name]

In [67]:
from transformers import BertForSequenceClassification,BertModel,BertTokenizer,TFBertForSequenceClassification

In [74]:
# hyperparameters
MAX_LEN = 256
BATCH_SIZE = 32
EPOCHS = 5
LEARNING_RATE = 1e-05

In [21]:
# from transformers import RobertaConfig,RobertaModel,RobertaTokenizer

In [66]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [23]:
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [217]:
class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['sections']
        self.targets = self.df[target_list].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = str(self.title[index])
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index])
        }

In [218]:
train_val_size = 0.9
train_val_df = train_df_with_classes.sample(frac=train_val_size, random_state=200).reset_index(drop=True)
test_df = train_df_with_classes.drop(train_val_df.index).reset_index(drop=True)


train_size = 0.8
train_df = train_val_df.sample(frac=train_size, random_state=200).reset_index(drop=True)
val_df = train_val_df.drop(train_df.index).reset_index(drop=True)

In [219]:
train_df.shape

(12960, 229)

In [220]:
val_df.shape

(3240, 229)

In [221]:
test_df.shape

(1800, 229)

In [222]:
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)

In [223]:
train_data_loader = torch.utils.data.DataLoader(train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

test_data_loader = torch.utils.data.DataLoader(test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

In [224]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [225]:
device

device(type='cuda')

In [226]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss
    return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

In [236]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
        for param in self.bert_model.parameters():
            param.requires_grad = False
        # self.bert_model.layers[0].trainable = False
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, 224)

    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [228]:
# for param in model.parameters():
#     param.requires_grad = False

In [229]:
# model.parameters()

In [237]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [238]:
val_targets=[]
val_outputs=[]

In [240]:
def train_model(n_epochs, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path):

  # initialize tracker for minimum validation loss
  valid_loss_min = np.Inf


  for epoch in range(1, n_epochs+1):
    train_loss = 0
    valid_loss = 0

    model.train()
    print('############# Epoch {}: Training Start   #############'.format(epoch))
    for batch_idx, data in enumerate(training_loader):
        #print('yyy epoch', batch_idx)
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        #if batch_idx%5000==0:
         #   print(f'Epoch: {epoch}, Training Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #print('before loss data in training', loss.item(), train_loss)
        train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
        #print('after loss data in training', loss.item(), train_loss)

    print('############# Epoch {}: Training End     #############'.format(epoch))

    print('############# Epoch {}: Validation Start   #############'.format(epoch))
    ######################
    # validate the model #
    ######################

    model.eval()

    with torch.no_grad():
      for batch_idx, data in enumerate(validation_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
            val_targets.extend(targets.cpu().detach().numpy().tolist())
            val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

      print('############# Epoch {}: Validation End     #############'.format(epoch))
      # calculate average losses
      #print('before cal avg train loss', train_loss)
      train_loss = train_loss/len(training_loader)
      valid_loss = valid_loss/len(validation_loader)
      # print training/validation statistics
      print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
            ))

      # create checkpoint variable and add important data
      checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
      }

        # save checkpoint
      save_ckp(checkpoint, False, checkpoint_path, best_model_path)

      ## TODO: save the model if validation loss has decreased
      if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
        # save checkpoint as best model
        save_ckp(checkpoint, True, checkpoint_path, best_model_path)
        valid_loss_min = valid_loss

    print('############# Epoch {}  Done   #############\n'.format(epoch))

  return model

In [241]:
ckpt_path = "/content/drive/MyDrive/TR_TechChallenge/curr_ckpt"
best_model_path = "/content/drive/MyDrive/TR_TechChallenge/best_model.pt"

In [242]:
# val_data_loader

In [243]:
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path)

############# Epoch 1: Training Start   #############


KeyboardInterrupt: ignored

In [186]:
# # testing
# example = test_df['sections'][0]
# encodings = tokenizer.encode_plus(
#     example,
#     None,
#     add_special_tokens=True,
#     max_length=MAX_LEN,
#     padding='max_length',
#     return_token_type_ids=True,
#     truncation=True,
#     return_attention_mask=True,
#     return_tensors='pt'
# )
# model.eval()
# with torch.no_grad():
#     input_ids = encodings['input_ids'].to(device, dtype=torch.long)
#     attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
#     token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)
#     output = model(input_ids, attention_mask, token_type_ids)
#     final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()
#     print(train_df.columns[1:].to_list()[int(np.argmax(final_output, axis=1))])

In [42]:
# test_df['postures'][0]

['Motion to Dismiss']

In [44]:
# testing
example = test_df['sections'][0]
encodings = tokenizer.encode_plus(
    example,
    None,
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    return_token_type_ids=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)
model.eval()
with torch.no_grad():
    input_ids = encodings['input_ids'].to(device, dtype=torch.long)
    attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
    token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)
    output = model(input_ids, attention_mask, token_type_ids)
    final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()
    print(train_df.columns[5:].to_list()[int(np.argmax(final_output, axis=1))])

class_On Appeal


In [45]:
test_df['postures'][0]

['Motion to Dismiss']

In [46]:
size_of_test = test_df.shape[0]

In [50]:
list_of_predicted_classes = []
list_of_actual_classes = []

for i in range(size_of_test):
  # testing
  example = test_df['sections'][i]
  encodings = tokenizer.encode_plus(
    example,
    None,
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    return_token_type_ids=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
  )
  model.eval()
  with torch.no_grad():
    input_ids = encodings['input_ids'].to(device, dtype=torch.long)
    attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
    token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)
    output = model(input_ids, attention_mask, token_type_ids)
    final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()
    predicted_classes = train_df.columns[5:].to_list()[int(np.argmax(final_output, axis=1))]
    actual_classes = test_df['postures'][i]

    list_of_predicted_classes.append(predicted_classes)
    list_of_actual_classes.append(actual_classes)



In [56]:
test_results_df = pd.DataFrame(list(zip(list_of_predicted_classes,list_of_actual_classes)),columns=['predicted','actual'])

In [57]:
test_results_df

Unnamed: 0,predicted,actual
0,class_On Appeal,[Motion to Dismiss]
1,class_On Appeal,"[On Appeal, Review of Administrative Decision]"
2,class_On Appeal,"[Motion for Attorney's Fees, Motion for Costs,..."
3,class_On Appeal,[Review of Administrative Decision]
4,class_On Appeal,[Motion to Dismiss]
...,...,...
1795,class_On Appeal,[Appellate Review]
1796,class_On Appeal,[Objection to Proof of Claim]
1797,class_On Appeal,"[Appellate Review, Trial or Guilt Phase Motion..."
1798,class_On Appeal,"[Appellate Review, Jury Selection Challenge or..."


In [62]:
final_output

[[0.24487321078777313,
  0.03271285071969032,
  0.03280995786190033,
  0.05372561886906624,
  0.036099810153245926,
  0.03034413978457451,
  0.038850221782922745,
  0.029800895601511,
  0.025841087102890015,
  0.08112870156764984,
  0.053456228226423264,
  0.041251521557569504,
  0.04167282208800316,
  0.032878030091524124,
  0.03415391966700554,
  0.038218941539525986,
  0.03711304813623428,
  0.027805417776107788,
  0.06777795404195786,
  0.041697338223457336,
  0.045272987335920334,
  0.04136497527360916,
  0.03954328969120979,
  0.06146438419818878,
  0.06272262334823608,
  0.03941253200173378,
  0.03943419083952904,
  0.04245886951684952,
  0.051515042781829834,
  0.06798746436834335,
  0.05389263853430748,
  0.04625324159860611,
  0.06446653604507446,
  0.0477258525788784,
  0.0403229184448719,
  0.036600105464458466,
  0.04241889342665672,
  0.026125088334083557,
  0.050363361835479736,
  0.04066731035709381,
  0.03505120426416397,
  0.03799719735980034,
  0.03058026172220707,
 

In [59]:
[int(np.argmax(final_output, axis=1))]

[190]

In [61]:
final_output[0][190]

0.518435537815094