In [307]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import spacy
nlp = spacy.load('en_core_web_sm')
import string

import gensim
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# ! conda install -c conda-forge sentence-transformers

from sentence_transformers import SentenceTransformer

from gensim.models import Word2Vec

# ! pip install transformers -q
# ! pip install simpletransformers wandb pytorch-lightning
# ! pip install -U transformers torch sentencepiece
# ! pip install -U summa


# $\textbf{All Pre-processing Functions on Top}$

In [308]:
def read_data(path):
    df = pd.read_pickle(path,compression = 'zstd')
    df = df.dropna(how="any")
    return df

def tokenize_and_remove_stopwords(X):
    stop_words = set(stopwords.words('english'))
    return X['tokens'].apply(lambda x: [word for word in x if word not in stop_words])


def preprocess(df):

    def remove_space(match_obj):
        if match_obj.group() is not None:
            return match_obj.group().replace(' ','|')

    df['Text'] = df['Text'].str.replace(r'[\S]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nel|se|no|es|mil)[\S]*\s?', 'URL', regex=True)


    # remove '|' so we can use it as a unique space seperator for cases
    df['Text'] = df['Text'].str.replace(r'|', '', regex=True)


    df['Text'] = df['Text'].str.replace(r'U\. S\. C\.', 'U.S.C.', regex=True)
    df['Text'] = df['Text'].str.replace(r'U\. S\.', 'U.S.', regex=True)
    df['Text'] = df['Text'].str.replace(r'No\. ', 'No.', regex=True)
    r"F\.( \d+-?\w*,?)+( \([A-Z]*\d* ?\d*\),?)*"
    # df['Text'] = df['Text'].str.replace(r"F\.( \d+-?\w*,?)+( \([A-Z]*\d* ?\d*\),?)*", remove_space, regex=True)
    text = []
    for element in df['Text']:
        element = re.sub(r"\bF\.( \d+-?\w*,?)+( \([A-Z]*\d* ?\d*\),?)*", remove_space, element)
        element = re.sub(r"\b(\d+ )?U\.S\.C\. [A-Z]+\d*", remove_space, element)
        element = re.sub(r"\b(\d+ )?U\.S\.(( |-)\(?\d+,?\)?)+", remove_space, element)
        element = re.sub(r"\bArt\. [A-Z]+,?( [A-Z]\d+)?", remove_space, element)
        element = re.sub(r"\b\d{4} [A-Z]+ \d+,?( \*\d*)?", remove_space, element)
        element = re.sub(r"\bn\.( \d+)+( \(.+?\))?", remove_space, element)
        element = re.sub(r"\bPp\. [0-9\-]+", remove_space, element)
        # element = re.sub(r"\b([A-Z]+[a-z]*[A-Z\.']+,? )+v\.( [A-Z]+[a-z]*[A-Z\.']+,?)+ ?", remove_space, element)
        element = re.sub(r"\b\d+ U\.S.[_,\- ]*\(\d+\)", remove_space, element)
        text.append(element)
    df['Text'] = text
    # Disabled as legal cases need punctuations to work
    # # remove non-alphabetical
    # df['Text'] = df['Text'].str.replace('[^a-zA-Z0-9\'\".!()]', ' ', regex=True).str.strip()


    # remove extra spaces
    df['Text'] = df['Text'].str.replace(' +', ' ', regex=True).str.strip()


    return df

def process_VS_data(df):

    #assert tokens column is present
    assert 'tokens' in df.columns

    new_tokens = []
    for case in df['tokens']:
        new_token = []
        case_found = False
        for tokenIdx in range(len(case)):
            new_token.append(case[tokenIdx])
            # Handling cases
            #United States v. Rostenkowski, 59 F. 3d 1291, 1297 (CADC 1995).
            #United States Supreme Court AARON J. SCHOCK v. UNITED STATES(2019) No. 18-406
            if case[tokenIdx] == 'v.':
                case_found = True
            elif case_found and (case[tokenIdx].startswith('No.') or case[tokenIdx][0].islower() or case[tokenIdx][0].isnumeric() or case[tokenIdx-1].lower().startswith('al.')):
                # we need to deal with this
                last_word = new_token.pop()
                castStr = ''
                while len(new_token) > 0 and (new_token[-1] == 'v.' or (new_token[-1].lower() != 'see' and new_token[-1][0].isupper())) :
                    castStr =  new_token.pop() + '|' + castStr
                new_token.append(castStr[:-1])
                case_found = False
                new_token.append(last_word)
        new_tokens.append(new_token)
    df['tokens'] = new_tokens

    # change all tokens to lower case
    df['tokens'] = df['tokens'].apply(lambda x: [item.lower() for item in x])

    return df


In [309]:
# read data
path = 'data/all_year.pkl.zst'
df = read_data(path)[:15]

## Pre-processing here

In [310]:
df = preprocess(df)
#split the processed text into tokens for further processing with pipes | as the combinator
df['tokens'] = df['Text'].str.split() 
df['tokens'] = process_VS_data(df)['tokens']
df['tokens'] = tokenize_and_remove_stopwords(df)
df['new_text'] = df['tokens'].apply(lambda x: ' '.join(x))
#apply nlp to the new text
df['nlp'] = df['new_text'].apply(lambda x: nlp(x))
#for each nlp object, get lemma and pos
df['lemma'] = df['nlp'].apply(lambda x: [token.lemma_ for token in x])
# remove punctuations from lemma
df['lemma'] = df['lemma'].apply(lambda x: [token for token in x if token not in string.punctuation])
# create a new column and convert lemma to string
df['final_text'] = df['lemma'].apply(lambda x: ' '.join(x))
# remove '\'s' from final_text
df['final_text'] = df['final_text'].str.replace(r'\'s', '', regex=True)

# $\textbf{Text Summarization Functions}$

In [311]:
from transformers import pipeline
from summa import summarizer

def summarize_pipeline(df):
    summarizer = pipeline("summarization")
    df["summary_pipeline"] = df["final_text"].apply(lambda x: summarizer(x, truncation=True, max_length=1024, min_length=300, do_sample=False)[0]['summary_text'])
    return df


def summarize_summa(df):
    df["summary_summa"] = df["final_text"].apply(lambda x: summarizer.summarize(x, ratio=0.02))
    return df

In [312]:
# applied pipeline method to summarize text
df = summarize_pipeline(df)

# applied summa method to summarize text
df = summarize_summa(df)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)
Your max_length is set to 1024, but you input_length is only 775. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=387)


# $\textbf{Vectorization Functions}$

## Doc2Vec

In [313]:
def tagged_document(list_of_list_of_words):
       for i, list_of_words in enumerate(list_of_list_of_words):
      		yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

def process_doc2vec_similarity(X,case_id):

	documents = list(tagged_document(X))
		
	model = gensim.models.doc2vec.Doc2Vec(documents, vector_size=300, window=11, min_count=10, epochs=30)


	# Only handle words that appear in the doc2vec pretrained vectors. enwiki_ebow model contains 669549 vocabulary size.
	# tokens = list(filter(lambda x: x in model.wv.vocab.keys(), documents[case_id].words))

	base_vector = model.infer_vector(documents[case_id].words.split())

	vectors = []
	for i, document in enumerate(documents):

		# tokens = list(filter(lambda x: x in model.wv.vocab.keys(), document))
		vector = model.infer_vector(document.words.split())
		vectors.append(vector)

		# print("making vector at index:", i)

	scores = cosine_similarity([base_vector], vectors).flatten()

	# top 10 highest scores
	highest_score_indices = scores.argsort()[-5:][::-1]

	
	print(highest_score_indices)

	highest_score = 0
	highest_score_index = 0
	for i, score in enumerate(scores):
		if highest_score < score and i != case_id:
			highest_score = score
			highest_score_index = i

	most_similar_document = documents[highest_score_index]
	print("Most similar document by Doc2vec with the score:", highest_score, highest_score_index)


In [320]:
print("Final Text")
process_doc2vec_similarity(df['final_text'], 1)

print("Summary Pipeline")
process_doc2vec_similarity(df['summary_pipeline'], 1)

print("Summary Summa")
process_doc2vec_similarity(df['summary_summa'], 1)

Final Text
[1 0 9 3 7]
Most similar document by Doc2vec with the score: 0.99545455 0
Summary Pipeline
[ 4 13  1 12  7]
Most similar document by Doc2vec with the score: 0.99699116 4
Summary Summa
[ 6  2  5  1 12]
Most similar document by Doc2vec with the score: 0.9946988 6


## TF-IDF

In [315]:
def vectorize_tfidf(X):
    tfidf=TfidfVectorizer(min_df = 0.01, max_df=0.95, ngram_range = (1,3), max_features=1500, norm='l2')
    X_data = tfidf.fit_transform(X)
    return X_data.toarray()

def process_tfidf_similarity(X,case_id):
    X_data = vectorize_tfidf(X)
    cosine_similarities = linear_kernel(X_data[case_id:case_id+1], X_data).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-5:-1]
    print(related_docs_indices)


In [319]:
print("Final Text")
process_tfidf_similarity(df['final_text'], 1)

print("Summary Pipeline")
process_tfidf_similarity(df['summary_pipeline'], 1)

print("Summary Summa")
process_tfidf_similarity(df['summary_summa'], 1)

Final Text
[ 1 14 10 12]
Summary Pipeline
[ 1 14 10  5]
Summary Summa
[ 1 14 12  5]


## BERT

In [317]:
def bert_similarity(X,case_id):
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    sentences = X
    sentence_embeddings = model.encode(sentences)
    query = sentences[case_id]
    query_embedding = model.encode(query)

    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    number_top_matches = 5
    similar_sentences = cosine_similarity([query_embedding], sentence_embeddings)[0].argsort()[-number_top_matches:][::-1]
    print(similar_sentences)

In [318]:
print("Final Text")
bert_similarity(df['final_text'], 1)

print("Summary Pipeline")
bert_similarity(df['summary_pipeline'], 1)

print("Summary Summa")
bert_similarity(df['summary_summa'], 1)

Final Text
[ 1  7  9 13  6]
Summary Pipeline
[ 1  7  9 13  3]
Summary Summa
[ 1 14  0 12  2]


## Word2Vec

In [115]:
def train_word2vec(X):
    model = Word2Vec(X, vector_size=100, window=5, min_count=1, workers=4)  
    return model



In [132]:
sentences = df['final_text'].apply(lambda x: x.split(" "))
w2v_model = train_word2vec(sentences)

In [134]:
# len(w2v_model.wv.index_to_key)
w2v_model.wv.most_similar('supreme')

[('decision', 0.998510479927063),
 ('sixth', 0.9983252286911011),
 ('writ', 0.9982919096946716),
 ('circuit', 0.9981789588928223),
 ('jurisdiction', 0.9979424476623535),
 ('preliminary', 0.9979386925697327),
 ('reverse', 0.9979249835014343),
 ('grant', 0.9978001117706299),
 ('petition', 0.9977887272834778),
 ('federal', 0.9977492690086365)]