## Text Embedding Analysis Through Legal-BERT

### Dataset Reading

In [1]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv("../data/metadata.csv")

df.head()

Unnamed: 0,filename,author,type,text
0,regina_v_wing_chong.txt,Crease,case,"CREASE, J. 1885. REGINA v. WING CHONG. \n\n14t..."
1,wong_hoy_woon_v_duncan.txt,Crease,case,"CREASE, J.\n\nWONG HOY WOON v. DUNCAN.\n\n1894..."
2,regina_v_mee_wah.txt,Begbie,case,BRITISH COLUMBIA REPORTS.\n\nREGINA v. MEE WAH...
3,regina_v_victoria.txt,Begbie,case,"OF BRITISH COLUMBIA.\n\nREGINA r, CORPORATION ..."
4,quong_wing_v_the_king.txt,Fitzpatrick,case,QUONG WING v. THE KING. CAN. \n\nSupreme Cour...


### Naive Word Embedding Analysis

In [2]:
# Define a function to clean the text
import re

def clean_text(text):
    
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    
    return text.strip()

In [3]:
from nltk.tokenize import word_tokenize

# Create the large corpus by joining all text from all authors
all_text = " ".join(df["text"].tolist())

clean_text = clean_text(all_text)

In [4]:
# Load the tokenizer and model from Hugging Face
from transformers import AutoTokenizer, AutoModel
import torch

# We will use the Legal-BERT model for this task
tokenizer = AutoTokenizer.from_pretrained('nlpaueb/legal-bert-base-uncased')
model = AutoModel.from_pretrained('nlpaueb/legal-bert-base-uncased')

# set the model to evaluation mode
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [5]:
# Create the word embeddings
# Tokenize the cleaned text into words
tokens = word_tokenize(clean_text)

# Get unique words to avoid redundant computation
unique_tokens = list(set(tokens))

# Print the shape of unique tokens
print(f'There are {len(unique_tokens)} unique tokens in this corpus.')

There are 4849 unique tokens in this corpus.


In [6]:
# Prepare a dictionary to store word embeddings
word_embeddings = {}

# For each word, get its BERT embedding by feeding it as a single-token input
for word in unique_tokens:
    word_inputs = tokenizer(word, return_tensors='pt', truncation=True, max_length=10)
    with torch.no_grad():
        word_outputs = model(**word_inputs)
        # Use the [CLS] token embedding as the word embedding
        word_embedding = word_outputs.last_hidden_state[:, 0, :].squeeze().numpy()
        word_embeddings[word] = word_embedding

In [7]:
# Print embedding for the word of interest 'chinese'

print(f"BERT embedding for 'chinese':\n{word_embeddings.get('chinese')}")

BERT embedding for 'chinese':
[-6.20929539e-01 -1.41670823e-01  6.38972700e-01  5.66699132e-02
  2.49502540e-01  3.55757505e-01 -9.64455083e-02  3.54799002e-01
 -2.72700071e-01 -6.37607515e-01  1.72131464e-01  5.87601185e-01
  5.80037721e-02 -1.98575929e-01 -6.22221410e-01  6.23443425e-01
 -2.84136593e-01 -2.01131850e-01 -1.16010755e-01  3.39487463e-01
 -1.49680659e-01  4.16029960e-01  4.64205593e-01 -4.62918848e-01
  3.87409419e-01  6.31607294e-01  6.86673880e-01  2.19446510e-01
 -3.76841813e-01  1.29365414e-01 -2.28451476e-01 -2.85087526e-01
  3.50298733e-01  4.33137774e-01 -4.69815671e-01  2.95415729e-01
  5.21581918e-02 -2.85912603e-02  4.41664994e-01  2.89366961e-01
  3.54161382e-01 -7.48492539e-01  7.74241015e-02 -1.15738958e-01
 -1.74300909e-01  1.22695386e-01 -2.15352607e+00 -3.29316437e-01
  1.01312399e-02 -3.54919508e-02 -1.23483628e-01  6.59714639e-01
 -8.31658393e-03  6.29764616e-01  6.69252157e-01 -4.71154869e-01
  7.91465193e-02 -6.24100566e-01 -4.18076426e-01 -6.55551255

In [8]:
# Compute cosine similarity between all words with Chinese in the model
from scipy.spatial.distance import cosine

similarity_scores = {}

for other_word in word_embeddings.keys():
    if other_word != "chinese":
        similarity = 1 - cosine(word_embeddings["chinese"], word_embeddings[other_word])
        similarity_scores[other_word] = similarity

# Sort by cosine similarity
sorted_similarity = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)

# Print the top 10 most similar words
print("Top 10 most similar words to 'chinese':")
for word, score in sorted_similarity[:10]:
    print(f"{word}: {score:.4f}")

Top 10 most similar words to 'chinese':
japanese: 0.8790
chong: 0.8652
alien: 0.8581
fourteen: 0.8564
jaw: 0.8557
king: 0.8519
hong: 0.8516
contradiction: 0.8485
cousin: 0.8480
inferior: 0.8472


In [9]:
# Generate a t-SNE plot for visualization
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=42)

embeddings = np.array(list(word_embeddings.values()))
tsne_results = tsne.fit_transform(embeddings)

In [22]:
# Create a DataFrame for visualization
import plotly.express as px

df_tsne = pd.DataFrame(tsne_results, columns=['x', 'y'])
df_tsne['word'] = list(word_embeddings.keys())
# Highlight the word 'chinese' in the plot
df_tsne['highlight'] = df_tsne['word'].apply(lambda x: 'chinese' if x == 'chinese' else '')

fig = px.scatter(
    df_tsne,
    x='x',
    y='y',
    title='t-SNE Visualization of legal-BERT Word Embeddings',
    color='highlight',                        
    hover_data=['word'], 
    text= 'highlight',
    height=600,
    width=800
)

fig.show()





### Text Embedding Analusis of Crease and Begbie Corpus

In [57]:
# Compile the Crease texts into a single text list
crease_texts = df[df['author'] == 'Crease']['text'].tolist()

# Compile the Begbie texts into a single text list
begbie_texts = df[df['author'] == 'Begbie']['text'].tolist()

# Combine both lists in a dictionary
judge_dict = {
    'Crease': crease_texts,
    'Begbie': begbie_texts
}

In [58]:
# Define a function to embed text using the model
from typing import Union, List

def embed_text(
    text: str,
    focus_token: Union[str, List[str]] = None,
    window: int = 5,
    tokenizer=tokenizer,
    model=model):
    """
    text: the raw string
    focus_token: either a single word, or a list of words to look for
    window: how many tokens on each side to include
    tokenizer: HuggingFace tokenizer
    model: BERT model
    """
    # Run the model once
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state.squeeze(0)  # (seq_len, hidden_dim)

    if focus_token is None:
        return hidden[0].cpu().numpy()
    
    # Normalize to list
    keywords = (
        [focus_token] if isinstance(focus_token, str)
        else focus_token
    )

    # Pre-tokenize each keyword to its subtoken ids
    kw_token_ids = {
        kw: tokenizer.convert_tokens_to_ids(tokenizer.tokenize(kw))
        for kw in keywords
    }

    input_ids = inputs["input_ids"].squeeze(0).tolist()
    spans = []  # list of (start, end) index pairs

    # find every match of every keyword
    for kw, sub_ids in kw_token_ids.items():
        L = len(sub_ids)
        for i in range(len(input_ids) - L + 1):
            if input_ids[i:i+L] == sub_ids:
                spans.append((i, i+L))

    if not spans:
        # fallback on CLS vector
        return hidden[0].cpu().numpy()

    # For each span, grab the window around it
    vecs = []
    for (start, end) in spans:
        lo = max(1, start - window)
        hi = min(hidden.size(0), end + window)
        # mean‑pool over all tokens in this extended window
        span_vec = hidden[lo:hi].mean(dim=0).cpu().numpy()
        vecs.append(span_vec)

    # Average across all spans
    return np.mean(np.stack(vecs, axis=0), axis=0)

In [59]:
from nltk import sent_tokenize 
# Create a dictionary to hold the mentionings of "Chinese" by author
judge_snippets = {}

keywords = ["Chinese", "China", "Chinaman", "Chinamen"]
for auth, texts in judge_dict.items():
    snippets = []
    for txt in texts:
        sentence = sent_tokenize(txt)
        for sent in sentence:
            if any(keyword in sent for keyword in keywords):
                snippets.append(sent)
    judge_snippets[auth] = snippets

In [60]:
# Investigate the length of the snippets
n_snippet = {auth: len(snippets) for auth, snippets in judge_snippets.items()}

print("Snippet size by author:")
for auth, num in n_snippet.items():
    print(f"{auth}: {num}")
    

Snippet size by author:
Crease: 140
Begbie: 107


In [61]:
# Define an ethnicity anchor, not including "chinese"
ethnicities = ["Japanese", "Indian", "Korean", "Vietnamese", "Filipino", 
               "Canadian", "American", "European", "British", "African",
               "French", "German", "Italian", "Spanish", "Portuguese", "Australian"]
# Create embeddings
eth_vecs = []
for e in ethnicities:
    eth_vecs.append(embed_text(e))
    
eth_anchor = np.mean(eth_vecs, axis=0)

In [62]:
# Create embeddings and subtract the ethnicity anchor
embeddings_dict = {'Crease': [], 'Begbie': []}

for auth, snippets in judge_snippets.items():
    for snip in snippets:
        v = embed_text(snip, focus_token=keywords, window=15)
        embeddings_dict[auth].append(v - eth_anchor)


In [63]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute per author mean and cosine similarity
mean_crease = np.mean(embeddings_dict["Crease"], axis=0, keepdims=True)  
mean_begbie = np.mean(embeddings_dict["Begbie"], axis=0, keepdims=True)

# Compute the pairwise cosine similarity
sim_crease_begbie = cosine_similarity(mean_crease, mean_begbie)[0, 0] 

print(f"Cosine similarity between Crease and Begbie: {sim_crease_begbie:.4f}")

Cosine similarity between Crease and Begbie: 0.9954


In [64]:
# Create UMAP projection for visualization
import umap 

all_vecs = np.vstack(embeddings_dict["Crease"] + embeddings_dict["Begbie"])
labels  = (["Crease"] * len(embeddings_dict["Crease"])) + (["Begbie"] * len(embeddings_dict["Begbie"]))

reducer = umap.UMAP(n_neighbors=15, min_dist=0.1)
proj = reducer.fit_transform(all_vecs) 

proj_means = reducer.transform(np.vstack([mean_crease, mean_begbie]))  

In [65]:
# plot using plotly to further explore
import plotly.express as px
import textwrap

def wrap_text(text, width=60):
    return '<br>'.join(textwrap.wrap(text, width=width))

umap_df = pd.DataFrame(proj, columns=['UMAP 1', 'UMAP 2'])
umap_df['Author'] = labels
umap_df['Text'] = [snip for auth in judge_snippets for snip in judge_snippets[auth]]
umap_df['Text'] = umap_df['Text'].apply(lambda t: wrap_text(t, width=50))

fig = px.scatter(umap_df, x='UMAP 1', y='UMAP 2', 
                 color='Author', hover_data=['Text'], 
                 width=800, height=500 )
fig.update_traces(marker=dict(size=5))
fig.update_layout(title='UMAP Projection of Word Embeddings by Author (Ethnicty Anchor Subtracted)')
fig.show()

fig.write_html("umap_projection.html")





In [66]:
# Print out the 10 most similar embedding sentences to Crease's mean embedding
from sklearn.metrics.pairwise import cosine_similarity

crease_similarity_df = pd.DataFrame(columns=['Author', 'Text', 'Similarity Score'])

# Iterate through the embeddings and their corresponding sentences
for auth, snippets in judge_snippets.items():
    for snippet, emb in zip(snippets, embeddings_dict[auth]):
        similarity = cosine_similarity(emb.reshape(1, -1), mean_crease)[0][0]
        crease_similarity_df.loc[len(crease_similarity_df)] = [
            auth, snippet, similarity
        ]

# Sort by similarity score
crease_sorted_similarity = crease_similarity_df.sort_values(by='Similarity Score', ascending=False)

print("Top 10 most similar sentences to Crease's mean embedding:\n")

for _, row in crease_sorted_similarity.head(10).iterrows():
    wrapped_para = textwrap.fill(row['Text'], width=100)
    print(f"Author: {row['Author']}\nSentence: {wrapped_para}\nSimilarity Score: {row['Similarity Score']:.4f}\n")

Top 10 most similar sentences to Crease's mean embedding:

Author: Begbie
Sentence: Statutes were by their title and preamble REGINA v. MEE WAH expressly aimed at Chinamen by name;
that this distinction also renders inapplicable all the United States' cases cited; that this
enactment is quite general extending to all laundries without exception and we must not look beyond
the words of the enactment to enquire what its object was; that there is in fact one laundry in
Victoria not conducted by Chinamen on which the tax will fall with equal force so that it is
impossible to say that Chinamen are hereby exclusively selected for taxation; the circumstance that
they are chiefly affected being a mere coincidence; that the bylaw only imposes $100.00 per annum,
keeping far within the limit of $150.00 permitted by the Statute; that the tax clearly is calculated
to procuring additional Municipal revenue and that no other object is hinted at.
Similarity Score: 0.9679

Author: Crease
Sentence: The 

In [67]:
# Print out the 10 most similar embedding sentences to Begbie's mean embedding
begbie_similarity_df = pd.DataFrame(columns=['Author', 'Text', 'Similarity Score'])

# Iterate through the embeddings and their corresponding sentences
for auth, snippets in judge_snippets.items():
    for snippet, emb in zip(snippets, embeddings_dict[auth]):
        similarity = cosine_similarity(emb.reshape(1, -1), mean_begbie)[0][0]
        begbie_similarity_df.loc[len(begbie_similarity_df)] = [
            auth, snippet, similarity
        ]

# Sort by similarity score
begbie_sorted_similarity = begbie_similarity_df.sort_values(by='Similarity Score', ascending=False)

print("Top 10 most similar sentences to Begbie's mean embedding:\n")

for _, row in begbie_sorted_similarity.head(10).iterrows():
    
    wrapped_para = textwrap.fill(row['Text'], width=100)
    
    print(f"Author: {row['Author']}\nSentence: {wrapped_para}\nSimilarity Score: {row['Similarity Score']:.4f}\n")

Top 10 most similar sentences to Begbie's mean embedding:

Author: Crease
Sentence: Though possessed of all the qualities I have described, Chinamen do not make good settlers in the
sense of raising up citizens of a free.
Similarity Score: 0.9687

Author: Begbie
Sentence: Whites who have evil communications with Chinese must themselves be lamentably depraved beforehand;
and so, I should be disposed to say, immoral Chinese are not only not more injurious, but they are
quite innocuous to the morals of the whites, in comparison with white people of similar or allied
habits.
Similarity Score: 0.9665

Author: Crease
Sentence: I know of retired officers and persons of settled incomes who would not have thought of coming here
if they had not known that Chinese servants could he had here, though very indifferent compared with
those one can obtain in China itself.
Similarity Score: 0.9643

Author: Crease
Sentence: The strike of the Chinese in Victoria when resisting an intentionally discriminat