# Final Project

## Basic Information


| **Title:**       | Deep Learning and Natural Language Processing applied to the legal texts |
|------------------|----------------------------------------------------------|
| **Abstract:**    |                                                        |
| **Author:**      | Thiago Raulino Dal Pont                                |
| **Affiliation:** | Graduate Program in Automation and Systems Engineering  |
| **Date**         | July 14, 2022                                          |


## Goals of the project

- ...

## Project structure
- Preprocessing
- Representation
- Modeling
- Evaluation


## Requirements


``pip install -r requirements.txt``

``python3 -m spacy download pt_core_news_sm``


## Importing dependencies

In [1]:
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from src.modeling.util import get_class_weight
from src.preprocessing.preprocessing_shallow_ml import PreProcessingShallowML

import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import tqdm
from sklearn.linear_model import LogisticRegression

from gensim.models import KeyedVectors
from keras import Input, Model, metrics, regularizers
from keras.callbacks import EarlyStopping
from keras.layers import Embedding, Reshape, Conv2D, MaxPooling2D, concatenate, Flatten, Dropout, Dense
from keras.optimizers import Adam, SGD
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical


In [2]:
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

    except RuntimeError as e:
        print(e)

2022-07-17 20:57:16.420690: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2022-07-17 20:57:16.467101: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-17 20:57:16.467485: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce 940MX computeCapability: 5.0
coreClock: 1.189GHz coreCount: 3 deviceMemorySize: 1.96GiB deviceMemoryBandwidth: 37.33GiB/s
2022-07-17 20:57:16.467718: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2022-07-17 20:57:16.469190: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2022-07-17 20:57:16.470500: I tensorflow/stream_executor/platform

## Dataset basic information

In [3]:
DATASET_2CLASS_PATH = os.path.join("Data", "final_dataset_2l_wo_result", "")

preprocessor = PreProcessingShallowML()
preprocessor.load_dataset(DATASET_2CLASS_PATH)


Loading dataset
{'labels': {'ganha': None, 'perde': None}}
  -> Found 1044 files inside Data/final_dataset_2l_wo_result/ganha/*.txt
  -> Found 116 files inside Data/final_dataset_2l_wo_result/perde/*.txt


# Data preparation

- In this project, we implemented a class to handle the text preprocessing in such a way that we can easily select distinct methods.

In [4]:
preprocessor.preprocess_corpus(
    keep_raw=True,
    lowercase=True,
    stemming=False,
    remove_html=True,
    remove_punct=True,
    remove_stopwords=True
)

dataset_shallow_ml = preprocessor.df_corpora

Preprocessing corpus
  -> Converting to lowercase
  -> Removing HTML
  -> Tokenizing
  -> Removing punctuation
  -> Removing Stopwords
  -> Joining tokens into string


In [5]:
preprocessor.preprocess_corpus(
    keep_raw=True,
    lowercase=True,
    stemming=False,
    remove_html=True,
    remove_punct=True,
    remove_stopwords=False
)

dataset_dl = preprocessor.df_corpora

Preprocessing corpus
  -> Converting to lowercase
  -> Removing HTML
  -> Tokenizing
  -> Removing punctuation
  -> Joining tokens into string


- Dataset splitting

In [6]:
X = dataset_dl["processed_content"]
y = dataset_dl["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=123, stratify=y, shuffle=True)
print("Dataset shapes:")
print(" -> Train: X=%s\ty=%s" % (str(X_train.shape), str(y_train.shape)))
print(" -> Test:  X=%s\ty=%s" % (str(X_test.shape), str(y_test.shape)))

Dataset shapes:
 -> Train: X=(1044,)	y=(1044,)
 -> Test:  X=(116,)	y=(116,)


In [7]:

# Class weights
class_weights = get_class_weight(y_train)


## Modelling with Deep Learning

In this section, we apply the dataset

### Modelling with Temporal Convolutional Network

In [8]:

# fix random seed for reproducibility
seed = 42
np.random.seed(seed)

In [9]:

EMBED_SIZE = 100
EPOCHS=50
BATCH_SIZE=16
NUM_WORDS = 10000

In [10]:
t = Tokenizer(oov_token='<UNK>',  num_words=NUM_WORDS)
# fit the tokenizer on the documents
t.fit_on_texts(X_train)
t.word_index['<PAD>'] = 0
word_index = t.word_index

VOCAB_SIZE = len(t.word_index)

In [11]:
train_sequences = t.texts_to_sequences(X_train)
test_sequences = t.texts_to_sequences(X_test)

In [12]:
print("Vocabulary size={}".format(len(t.word_index)))
print("Number of Documents={}".format(t.document_count))

Vocabulary size=18589
Number of Documents=1044


In [13]:
MAX_SEQUENCE_LENGTH = 2000

In [14]:
# pad dataset to a maximum review length in words
X_train = sequence.pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
X_test = sequence.pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
X_train.shape, X_test.shape

((1044, 2000), (116, 2000))

In [15]:
le = LabelEncoder()
num_classes=2 # positive -> 1, negative -> 0

In [16]:
from tensorflow.keras.utils import to_categorical
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [17]:
# Build the embeddings
word_vectors = KeyedVectors.load_word2vec_format("Data/pre-trained_embeddings/glove.txt", binary=False)

In [18]:
vocabulary_size = min(len(word_index) + 1, NUM_WORDS)
embedding_matrix = np.zeros((vocabulary_size, EMBED_SIZE))

count = 0
random_count = 0
vec = np.random.rand(EMBED_SIZE)
for word, i in word_index.items():
    if i >= MAX_SEQUENCE_LENGTH:
        continue

    count += 1
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        vec = np.random.rand(EMBED_SIZE)
        embedding_matrix[i] = vec
        random_count += 1


print("Random %.2f" % (random_count/count))
embedding_matrix.shape

Random 0.01


(10000, 100)

In [19]:
import sklearn

accs = []
f1s = []
models = []
hists = []

BATCH_SIZE = 16

class_weights[1]=10

for i in tqdm.tqdm(range(10)):


    X_train_i, X_val_i, y_train_i, y_val_i = train_test_split(X_train, y_train, test_size=0.1, random_state=42,  stratify=y_train, shuffle=True)

    # create the model
    tf.keras.backend.clear_session()
    embedding_layer = Embedding(vocabulary_size, EMBED_SIZE, weights=[embedding_matrix], trainable=True)

    filter_sizes = [2, 3]
    num_filters = 20
    drop = 0.5

    inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))
    embedding = embedding_layer(inputs)
    reshape = Reshape((MAX_SEQUENCE_LENGTH, EMBED_SIZE, 1))(embedding)

    convs = []
    maxpools = []

    for filter_size in filter_sizes:
        conv = Conv2D(num_filters, (filter_size, EMBED_SIZE), activation='relu',
                      kernel_regularizer=regularizers.l2(0.01))(reshape)

        maxpool = MaxPooling2D(
            (MAX_SEQUENCE_LENGTH - filter_size + 1, 1), strides=(1, 1))(conv)

        maxpools.append(maxpool)
        convs.append(conv)

    merged_tensor = concatenate(maxpools, axis=1)

    flatten = Flatten()(merged_tensor)
    # reshape = Reshape((3 * num_filters,))(flatten)
    dropout = Dropout(drop)(flatten)
    #conc = Dense(40)(dropout)
    output = Dense(2, activation='softmax',
                   kernel_regularizer=regularizers.l2(0.01))(dropout)

    # this creates a model that includes
    model = Model(inputs, output)


    callbacks = [EarlyStopping(monitor='val_recall', patience=10, restore_best_weights=True, mode="max")]
    opt = SGD(lr=1e-3)
    model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy', tf.keras.metrics.Recall()])

    history = model.fit(X_train_i, y_train_i,
          validation_data=(X_val_i, y_val_i),
          epochs=EPOCHS,
          batch_size=BATCH_SIZE,
          class_weight=class_weights,
          verbose=0,
          callbacks=callbacks)

    y_pred = model.predict(X_val_i)
    accs.append(sklearn.metrics.accuracy_score(np.argmax(y_val_i, axis=1), np.argmax(y_pred, axis=1)))
    f1s.append(sklearn.metrics.f1_score(np.argmax(y_val_i, axis=1), np.argmax(y_pred, axis=1)))
    hists.append(history)
    models.append(model)

print("Acc: %.1f(%.1f)%%" % (100 * np.median(accs), 100 * np.std(accs)))
print("F1:  %.2f(%.2f)" % (np.median(f1s), np.std(f1s)))

best_model = models[np.argmax(f1s)]
best_history = hists[np.argmax(f1s)]

  0%|          | 0/10 [00:00<?, ?it/s]2022-07-17 20:57:54.675994: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2022-07-17 20:57:54.698900: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2899885000 Hz
2022-07-17 20:57:54.699598: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x5596e821b470 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-07-17 20:57:54.699658: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2022-07-17 20:57:54.751875: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-17 20:57:54.752113: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x5596e79a2920 initial

Acc: 94.3(1.0)%
F1:  0.76(0.04)





In [20]:
from sklearn.metrics import classification_report

y_pred = best_model.predict(X_test)
print(classification_report(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1)))

from sklearn.metrics import confusion_matrix

print(confusion_matrix(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1)))

              precision    recall  f1-score   support

           0       1.00      0.89      0.94       104
           1       0.52      1.00      0.69        12

    accuracy                           0.91       116
   macro avg       0.76      0.95      0.81       116
weighted avg       0.95      0.91      0.92       116

[[93 11]
 [ 0 12]]


## Modeling with Shallow ML

In [21]:

X = dataset_shallow_ml["processed_content"]
y = dataset_shallow_ml["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=123, stratify=y, shuffle=True)
print("Dataset shapes:")
print(" -> Train: X=%s\ty=%s" % (str(X_train.shape), str(y_train.shape)))
print(" -> Test:  X=%s\ty=%s" % (str(X_test.shape), str(y_test.shape)))

le = preprocessing.LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

Dataset shapes:
 -> Train: X=(1044,)	y=(1044,)
 -> Test:  X=(116,)	y=(116,)


In [22]:
vectorizer = TfidfVectorizer(max_df=0.9, min_df=0.05, max_features=5000)

X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [23]:
accs = []
f1s = []
models = []
for i in tqdm.tqdm(range(10)):
    X_train_i, X_val_i, y_train_i, y_val_i = train_test_split(X_train_bow, y_train, test_size=0.1, random_state=i,
                                                               stratify=y_train, shuffle=True)

    # Class weights
    class_weights = get_class_weight(y_train_i)

    #model = RandomForestClassifier(class_weight=class_weights, max_depth=10, n_estimators=200, n_jobs=3)
    model = MLPClassifier(batch_size=32)
    model.fit(X_train_i, y_train_i)

    y_pred = model.predict(X_val_i)

    accs.append(sklearn.metrics.accuracy_score(y_val_i, y_pred))
    f1s.append(sklearn.metrics.f1_score(y_val_i, y_pred))
    models.append(model)

print("Acc: %.1f(%.1f)%%" % (100 * np.median(accs), 100 * np.std(accs)))
print("F1:  %.2f(%.2f)" % (np.median(f1s), np.std(f1s)))

best_model = models[np.argmax(f1s)]


100%|██████████| 10/10 [02:18<00:00, 13.85s/it]

Acc: 96.2(2.6)%
F1:  0.78(0.19)





In [24]:
from sklearn.metrics import classification_report

y_pred = best_model.predict(X_test_bow)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       104
           1       1.00      0.83      0.91        12

    accuracy                           0.98       116
   macro avg       0.99      0.92      0.95       116
weighted avg       0.98      0.98      0.98       116



In [25]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, y_pred))

[[104   0]
 [  2  10]]
