### Задание

Попробуйте поработать с датасетом юридических текстов. В датасете всего две важных колонки признаков: заголовок дела и его текст, а целевая переменная - case_outcome (мультиклассовая классификация).

В базовом варианте можно оставить только текст дела, если хотите поинтереснее - можно попробовать распарсить case_title, добыв оттуда дополнительные признаки.

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn import metrics

from sklearn.metrics import classification_report

from nltk.tokenize import word_tokenize

from sklearn.metrics import accuracy_score

import seaborn as sns

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('legal_text_classification.csv')
data.head()

Unnamed: 0,case_id,case_outcome,case_title,case_text
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...


In [3]:
case_outcome = data['case_outcome'].unique()
print(case_outcome)

['cited' 'applied' 'followed' 'referred to' 'related' 'considered'
 'discussed' 'distinguished' 'affirmed' 'approved']


In [4]:
data.info()

# есть NA

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24985 entries, 0 to 24984
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   case_id       24985 non-null  object
 1   case_outcome  24985 non-null  object
 2   case_title    24985 non-null  object
 3   case_text     24809 non-null  object
dtypes: object(4)
memory usage: 780.9+ KB


In [5]:
data = data.dropna()

In [6]:
# удаляю непринципиально важный 'case_id'

data.drop('case_id', axis=1, inplace=True)

In [7]:
# объединяю две важные текстовые колонки - пусть весь текст находится в одном столбце

data['text_title'] = data['case_title'] + data['case_text']

In [8]:
data = data.drop(['case_text', 'case_title'], axis=1)

In [9]:
data.sample(10)

Unnamed: 0,case_outcome,text_title
2272,discussed,Pfizer Overseas Pharmaceuticals v Eli Lilly &a...
4236,cited,Saltman Engineering Coy Ltd v Campbell Enginee...
23967,cited,Cook v Pasminco Ltd (No 2) [2000] FCA 1819 ; (...
21377,referred to,Wyong-Gosford Progressive Community Radio Inc ...
24922,referred to,Hong v Minister for Immigration and Indigenous...
16183,cited,The Silver Fox Company Pty Ltd as Trustee for ...
11937,distinguished,Saffron v Societe Miniere Cafrika [1958] HCA 5...
7212,applied,Roskell v Snelgrove [2008] FCA 427to the first...
4434,cited,In the Matter of Village Roadshow Limited (No ...
7962,referred to,Westpac Banking Corporation v Totterdell (1998...


In [10]:
x_train, x_test, y_train, y_test = train_test_split(data.text_title, data.case_outcome)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

from nltk import ngrams

In [12]:
# преобразование слов в вектора; униграммы

vec = CountVectorizer(ngram_range=(1, 1))
bow = vec.fit_transform(x_train)

In [13]:
list(vec.vocabulary_.items())[:10]

[('ampolex', 18640),
 ('ltd', 37557),
 ('perpetual', 42775),
 ('trustee', 52493),
 ('company', 23832),
 ('canberra', 22216),
 ('1996', 4974),
 ('hca', 32487),
 ('15', 2821),
 ('137', 2180)]

In [14]:
clf = LogisticRegression(solver='liblinear', random_state=42)
clf.fit(bow, y_train)



In [15]:
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

               precision    recall  f1-score   support

     affirmed       0.38      0.60      0.46        15
      applied       0.17      0.36      0.23       280
     approved       0.28      0.33      0.31        27
        cited       0.88      0.59      0.71      4573
   considered       0.19      0.43      0.27       184
    discussed       0.22      0.39      0.28       144
distinguished       0.21      0.52      0.30        66
     followed       0.19      0.39      0.25       257
  referred to       0.30      0.51      0.37       647
      related       0.32      0.60      0.41        10

     accuracy                           0.55      6203
    macro avg       0.31      0.47      0.36      6203
 weighted avg       0.71      0.55      0.60      6203



In [16]:
# преобразование слов в вектора; биграммы и триграммы

vec = CountVectorizer(ngram_range=(2, 3))
bow = vec.fit_transform(x_train)

In [17]:
list(vec.vocabulary_.items())[:10]

[('ampolex ltd', 386560),
 ('ltd perpetual', 1637465),
 ('perpetual trustee', 2036814),
 ('trustee company', 2759364),
 ('company canberra', 814191),
 ('canberra ltd', 717118),
 ('ltd 1996', 1632985),
 ('1996 hca', 80132),
 ('hca 15', 1311193),
 ('15 1996', 37232)]

In [None]:
# К сожалению, эта ячейка бесконечно долго считается. Я пока в поисках объяснений...

clf = LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced')
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

In [12]:
# TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
vec = TfidfVectorizer(ngram_range=(1, 1))
bow = vec.fit_transform(x_train)
clf = LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced') # возможно, поможет немного сбалансировать классы
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

               precision    recall  f1-score   support

     affirmed       0.76      0.35      0.48        62
      applied       0.28      0.29      0.28       588
     approved       0.24      0.12      0.16        64
        cited       0.66      0.66      0.66      3001
   considered       0.26      0.23      0.24       480
    discussed       0.38      0.22      0.28       458
distinguished       0.44      0.21      0.29       277
     followed       0.27      0.34      0.30       466
  referred to       0.36      0.55      0.44       724
      related       0.59      0.23      0.33        83

     accuracy                           0.49      6203
    macro avg       0.42      0.32      0.35      6203
 weighted avg       0.50      0.49      0.48      6203



In [None]:
# удаление пунктуации, стоп-слов

In [14]:
import nltk
from nltk.corpus import stopwords
from string import punctuation

nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [15]:
noise = stopwords.words('english') + list(punctuation)

In [17]:
import nltk
nltk.download('punkt')

vec = CountVectorizer(ngram_range=(1, 1), tokenizer=word_tokenize, stop_words=noise)
bow = vec.fit_transform(x_train)
clf = LogisticRegression(solver='liblinear', random_state=42)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


               precision    recall  f1-score   support

     affirmed       0.48      0.70      0.57        20
      applied       0.22      0.31      0.26       440
     approved       0.09      0.21      0.12        14
        cited       0.79      0.65      0.71      3682
   considered       0.21      0.35      0.26       242
    discussed       0.21      0.35      0.26       158
distinguished       0.26      0.44      0.33        80
     followed       0.24      0.38      0.29       380
  referred to       0.51      0.48      0.49      1169
      related       0.34      0.61      0.44        18

     accuracy                           0.55      6203
    macro avg       0.34      0.45      0.37      6203
 weighted avg       0.62      0.55      0.58      6203



In [None]:
# Даже с удаленной пунктуацией, стоп-словами - результат не отличается принципиально.