In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV


In [3]:
data = pd.read_csv("legal_text_classification.csv")

In [4]:
data.head()

Unnamed: 0,case_id,case_outcome,case_title,case_text
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24985 entries, 0 to 24984
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   case_id       24985 non-null  object
 1   case_outcome  24985 non-null  object
 2   case_title    24985 non-null  object
 3   case_text     24809 non-null  object
dtypes: object(4)
memory usage: 780.9+ KB


In [6]:
data_cleaned = data.dropna(subset=['case_text'])

In [7]:
from sklearn.preprocessing import LabelEncoder
X = data_cleaned['case_text']
y = data_cleaned['case_outcome']

y_encoder = LabelEncoder()
y_encoded = y_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [8]:
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

In [9]:
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [10]:
xgb.fit(X_train_tfidf, y_train)
xgb_preds_encoded = xgb.predict(X_test_tfidf)

Parameters: { "use_label_encoder" } are not used.



In [11]:
xgb_preds = y_encoder.inverse_transform(xgb_preds_encoded)
y_test_decoded = y_encoder.inverse_transform(y_test)

In [12]:

xgb_report = classification_report(y_test_decoded, xgb_preds)

print("XGBoost Results:")
print(xgb_report)

XGBoost Results:
               precision    recall  f1-score   support

     affirmed       0.59      0.43      0.50        23
      applied       0.37      0.16      0.22       496
     approved       0.30      0.14      0.19        21
        cited       0.58      0.91      0.71      2440
   considered       0.36      0.11      0.17       353
    discussed       0.43      0.12      0.18       206
distinguished       0.64      0.26      0.37       110
     followed       0.59      0.24      0.34       437
  referred to       0.59      0.36      0.45       855
      related       0.60      0.14      0.23        21

     accuracy                           0.57      4962
    macro avg       0.51      0.29      0.34      4962
 weighted avg       0.54      0.57      0.51      4962



In [13]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators = 100) 
clf.fit(X_train_tfidf, y_train)
clf_preds_encoded = clf.predict(X_test_tfidf)


In [14]:
clf_report = classification_report(y_test, clf_preds_encoded)

print("random forest Results:")
print(clf_report)

random forest Results:
              precision    recall  f1-score   support

           0       1.00      0.09      0.16        23
           1       0.51      0.16      0.24       496
           2       0.33      0.14      0.20        21
           3       0.56      0.95      0.71      2440
           4       0.56      0.13      0.21       353
           5       0.48      0.07      0.13       206
           6       0.76      0.20      0.32       110
           7       0.71      0.27      0.39       437
           8       0.78      0.36      0.49       855
           9       0.60      0.14      0.23        21

    accuracy                           0.58      4962
   macro avg       0.63      0.25      0.31      4962
weighted avg       0.61      0.58      0.52      4962



In [20]:
print(xgb_preds_encoded[100])

3
