# Import libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import models, layers, optimizers, losses, metrics
from tensorflow.keras.preprocessing import text, sequence

# Preprocessing

In [2]:
data_path = '/kaggle/input/legal-citation-text-classification/legal_text_classification.csv'
df_data = pd.read_csv(data_path)
df_data

Unnamed: 0,case_id,case_outcome,case_title,case_text
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...
...,...,...,...,...
24980,Case25203,cited,Reches Pty Ltd v Tadiran Pty Ltd (1998) 85 FCR...,That is not confined to persons who control th...
24981,Case25204,cited,Sir Lindsay Parkinson &amp; Co Ltd v Triplan L...,Once the threshold prescribed by s 1335 is sat...
24982,Case25205,cited,Spiel v Commodity Brokers Australia Pty Ltd (I...,Once the threshold prescribed by s 1335 is sat...
24983,Case25206,distinguished,"Tullock Ltd v Walker (Unreported, Supreme Cour...",Given the extent to which Deumer stands to gai...


In [3]:
df_data['case_outcome'].value_counts()

case_outcome
cited            12219
referred to       4384
applied           2448
followed          2256
considered        1712
discussed         1024
distinguished      608
related            113
affirmed           113
approved           108
Name: count, dtype: int64

In [4]:
df_data['input'] = df_data['case_text']
df_data

Unnamed: 0,case_id,case_outcome,case_title,case_text,input
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...,Ordinarily that discretion will be exercised s...
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...,The general principles governing the exercise ...
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...,Ordinarily that discretion will be exercised s...
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...,The general principles governing the exercise ...
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...,The preceding general principles inform the ex...
...,...,...,...,...,...
24980,Case25203,cited,Reches Pty Ltd v Tadiran Pty Ltd (1998) 85 FCR...,That is not confined to persons who control th...,That is not confined to persons who control th...
24981,Case25204,cited,Sir Lindsay Parkinson &amp; Co Ltd v Triplan L...,Once the threshold prescribed by s 1335 is sat...,Once the threshold prescribed by s 1335 is sat...
24982,Case25205,cited,Spiel v Commodity Brokers Australia Pty Ltd (I...,Once the threshold prescribed by s 1335 is sat...,Once the threshold prescribed by s 1335 is sat...
24983,Case25206,distinguished,"Tullock Ltd v Walker (Unreported, Supreme Cour...",Given the extent to which Deumer stands to gai...,Given the extent to which Deumer stands to gai...


In [5]:
unique_values = set(df_data['case_outcome'])
output_dims = len(unique_values)
index_to_value = {index:value for index, value in enumerate(unique_values)}
value_to_index = {value:index for index, value in index_to_value.items()}
print(f'Unique_values : {unique_values}')
print(f'index_to_value :{index_to_value}')
print(f'value_to_index : {value_to_index}')
print(f'Output dimension : {output_dims}')

Unique_values : {'applied', 'referred to', 'distinguished', 'affirmed', 'related', 'considered', 'followed', 'approved', 'discussed', 'cited'}
index_to_value :{0: 'applied', 1: 'referred to', 2: 'distinguished', 3: 'affirmed', 4: 'related', 5: 'considered', 6: 'followed', 7: 'approved', 8: 'discussed', 9: 'cited'}
value_to_index : {'applied': 0, 'referred to': 1, 'distinguished': 2, 'affirmed': 3, 'related': 4, 'considered': 5, 'followed': 6, 'approved': 7, 'discussed': 8, 'cited': 9}
Output dimension : 10


In [6]:
df_train = df_data[['input', 'case_outcome']]
one_hot_encoded = [value_to_index[v] for v in df_train['case_outcome']]
df_train['output'] = one_hot_encoded
df_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['output'] = one_hot_encoded


Unnamed: 0,input,case_outcome,output
0,Ordinarily that discretion will be exercised s...,cited,9
1,The general principles governing the exercise ...,cited,9
2,Ordinarily that discretion will be exercised s...,cited,9
3,The general principles governing the exercise ...,cited,9
4,The preceding general principles inform the ex...,cited,9
...,...,...,...
24980,That is not confined to persons who control th...,cited,9
24981,Once the threshold prescribed by s 1335 is sat...,cited,9
24982,Once the threshold prescribed by s 1335 is sat...,cited,9
24983,Given the extent to which Deumer stands to gai...,distinguished,2


In [7]:
np.random.seed(24985)
data = np.array(df_train[['input', 'output']])
print(data.shape)
np.random.shuffle(data)

(24985, 2)


In [8]:
train_data_len = round(len(data)  * (90 / 100))
y = tf.one_hot(data[:, 1], output_dims).numpy()
train_data = data[:train_data_len]
test_data = data[train_data_len:]
train_x = list(map(str, train_data[:, 0]))
train_y = y[:train_data_len]
test_x = list(map(str, test_data[:, 0]))
test_y = y[train_data_len:]

maxlen = max(len(x) for x in train_x)
dashline = '\n' + '-' * 50  + '\n'
print(f'Length of train data : {len(train_data)}')
print(f'Length of test data : {len(test_data)}')
print(dashline)
print(f'Train_x [0] : \n {train_x[0]}')
print(dashline)
print(f'Test_x [0]] : \n {test_x[0]}')
print(dashline)
print(f'Train_y [0] : {train_y[0]}')
print(f'Test_y [0] : {test_y[0]}')

Length of train data : 22486
Length of test data : 2499

--------------------------------------------------

Train_x [0] : 
 In the law of torts, the law has created the 'reasonable person'. In the law of defamation, from which the notion of republication may have been sourced, the question as to whether a subsequent publication was intended by the original publisher or was a natural and probable result of providing the information in the first place is assessed objectively ( John Fairfax &amp; Sons Ltd v Cojuangco [1988] HCA 54 ; (1988) 165 CLR 346 at 350: Webb v Bloch [1928] HCA 47 ; (1928) 41 CLR 331 at 363-366 and see as to the objective nature of the act of publication E Hulton &amp; Co v Jones [1910] AC 20).

--------------------------------------------------

Test_x [0]] : 
 In this case, it is not possible for the Court to make orders concerning a hypothetical question which was not the subject of a claim by the applicant, nor decision by the first respondent, nor contention be

In [9]:
train_y

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

In [10]:
lengths = [len(x) for x in train_x]
maxlen = round(np.percentile(lengths, 75))
maxlen

2522

In [11]:
tokenizer = text.Tokenizer(num_words = 10000, oov_token = '<oov>')
tokenizer.fit_on_texts(train_x)
train_sequences = tokenizer.texts_to_sequences(train_x)
test_sequences = tokenizer.texts_to_sequences(test_x)
pad_train_x = sequence.pad_sequences(train_sequences, maxlen = maxlen, padding = 'post', truncating = 'post')
pad_test_x = sequence.pad_sequences(test_sequences, maxlen = maxlen, padding = 'post', truncating = 'post')

In [12]:
max_index = len(np.max(pad_train_x, axis = -2))
print(f"Maximum index in input data: {max_index}")

Maximum index in input data: 2522


# Model development and evaluation

In [26]:
def classify_model(vocab_size, output_dims, embedding_dims = 64):
    inputs = layers.Input(shape = (None,))
    x = layers.Embedding(vocab_size, embedding_dims)(inputs)
    x, fh, fc, bh, bc = layers.Bidirectional(layers.LSTM(units = 64, return_state = True, return_sequences = True))(x)
    h = layers.Concatenate()([fh, bh])    
    c = layers.Concatenate()([fc, bc])
    x, h1, c1 = layers.LSTM(units = 128, return_state = True, return_sequences = True)(x, initial_state = [h, c])
    h = layers.Concatenate()([h, h1])
    c = layers.Concatenate()([c, c1])
    x = layers.LSTM(units = 256)(x, initial_state = [h, c])
    x = layers.Dense(units = 128, activation = 'softmax')(x)
    outputs = layers.Dense(units = output_dims)(x)
    model = models.Model(inputs = inputs, outputs = outputs)
    return model

In [27]:
vocab_size = len(tokenizer.word_index) + 1
model = classify_model(vocab_size = vocab_size, output_dims = output_dims)

In [28]:
model.compile(
    optimizer = optimizers.Adam(),
    loss = losses.CategoricalCrossentropy(),
    metrics = [metrics.CategoricalCrossentropy(from_logits = True),'accuracy']
)

In [29]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='accuracy',  
    patience=0,          
    verbose=1,
    restore_best_weights=True
)

In [30]:
model.fit(pad_train_x, train_y, validation_split = 0.1, epochs = 3, callbacks = early_stopping)

Epoch 1/3
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m282s[0m 441ms/step - accuracy: 0.4649 - categorical_crossentropy: 2.2879 - loss: 6.8078 - val_accuracy: 0.4900 - val_categorical_crossentropy: 2.2865 - val_loss: 5.6403
Epoch 2/3
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m279s[0m 441ms/step - accuracy: 0.4941 - categorical_crossentropy: 2.2862 - loss: 5.6291 - val_accuracy: 0.4900 - val_categorical_crossentropy: 2.2865 - val_loss: 5.6403
Epoch 3/3
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m279s[0m 441ms/step - accuracy: 0.4895 - categorical_crossentropy: 2.2864 - loss: 5.6614 - val_accuracy: 0.4900 - val_categorical_crossentropy: 2.2865 - val_loss: 5.6403
Epoch 3: early stopping
Restoring model weights from the end of the best epoch: 2.


<keras.src.callbacks.history.History at 0x7fa504724340>

In [31]:
model.evaluate(pad_test_x, test_y)

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 207ms/step - accuracy: 0.4966 - categorical_crossentropy: 2.2863 - loss: 5.6997


[5.611339569091797, 2.2863645553588867, 0.48499399423599243]