In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import contractions
#import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import Perceptron

import nltk
from nltk.corpus import stopwords
from nltk import tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 


nltk.download('punkt', quiet = True)
nltk.download('wordnet', quiet = True)
nltk.download('averaged_perceptron_tagger', quiet = True)
nltk.download('stopwords', quiet = True)



[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>
[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify
[nltk_data]     failed: unable to get local issuer certificate
[nltk_data]     (_ssl.c:997)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


False

In [2]:
def preprocess_data(X):
    #expand contractions
    X['Text'] = X['Text'].apply(lambda x: [contractions.fix(word) for word in x.split()])
    #join back words
    X['Text'] = [' '.join(map(str, l)) for l in X['Text']]
    #lowercase the reviews
    X['Text'] = X['Text'].str.lower()
    #remove html and url form reviews
    X['Text'] = X['Text'].str.replace(r'\s*https?://\S+(\s+|$)', '', regex=True).str.strip()
    #remove non-alphabetical characters
    X['Text'] = X['Text'].str.replace('[^a-zA-Z]', ' ', regex=True)
    #remove extra spaces
    X['Text'] = X['Text'].replace(r'\s+', ' ', regex=True)
    return X

In [3]:
def tokenize_and_remove_stopwords(X):
    stop_words = set(stopwords.words('english'))
    X['Text'] = X['Text'].apply(tokenize.word_tokenize)
    return X['Text'].apply(lambda x: [word for word in x if word not in stop_words])

In [4]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [5]:
def vectorize_tfidf(X):
    tfidf=TfidfVectorizer(min_df = 50, max_df=0.95, ngram_range = (1,3), max_features=1500, norm='l2')
    X_data = tfidf.fit_transform(X)
    return X_data.toarray()

In [6]:
#read csv file and convert to dataframe
df = pd.read_csv('csv/data.csv')

# Establish base line performance

In [7]:
# preprocess case_text using preocess_text function
data = df[:100]


In [8]:
case_text = preprocess_data(data)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Text'] = X['Text'].apply(lambda x: [contractions.fix(word) for word in x.split()])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Text'] = [' '.join(map(str, l)) for l in X['Text']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Text'] = X['Text'].str.lower()
A value is trying to be set on

In [9]:
case_text = tokenize_and_remove_stopwords(case_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Text'] = X['Text'].apply(tokenize.word_tokenize)


In [10]:
case_text = case_text.apply(nltk.tag.pos_tag)


In [11]:
case_text = case_text.apply(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])

In [12]:
lemma = WordNetLemmatizer()
case_text = case_text.apply(lambda x: [lemma.lemmatize(word, tag) for word, tag in x])

In [13]:
case_text = [' '.join(map(str, l)) for l in case_text]

In [14]:
#vectorize the text using tfidf vectorizer
X_data = vectorize_tfidf(case_text)

In [15]:
X_data.shape

(100, 434)

In [16]:
! pip install gensim
import gensim.downloader as api
from gensim.models import Word2Vec



In [17]:
w2v_model = Word2Vec(case_text, vector_size=300, window=5, min_count=10)

In [71]:
def create_word2vec_embeddings(X):
    embeddings = []
    for sentence in X:
        sentence_embedding = np.zeros(300)
        for word in sentence:
            if word in w2v_model.wv:
                sentence_embedding += w2v_model.wv[word]
        embeddings.append(sentence_embedding)
    return np.array(embeddings)

In [19]:
w2v_vectors = create_word2vec_embeddings(case_text)

In [20]:
w2v_vectors.shape

(100, 300)

## TF-IDF Model

In [73]:
#find most similar cases using cosine similarity and tfidf vectors
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_cases(X, case_id, n):
    pairwise_similarities=np.dot(X,X.T)
    cosine_similarities = cosine_similarity(X[case_id].reshape(1,-1), X).flatten()

    highest_score_indices = cosine_similarities.argsort()[-10:][::-1]
    print(highest_score_indices)

    return highest_score_indices

#find most similar cases using cosine similarity and word2vec vectors

find_similar_cases(X_data, 0, 10)


[ 0  5  4  8  6  7  3 44 58  1]


array([ 0,  5,  4,  8,  6,  7,  3, 44, 58,  1])

## Doc2Vec

In [69]:
import gensim
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity

def tagged_document(list_of_list_of_words):
       for i, list_of_words in enumerate(list_of_list_of_words):
      		yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

def process_doc2vec_similarity(X,case_id):

	documents = list(tagged_document(X))
		
	model = gensim.models.doc2vec.Doc2Vec(documents, vector_size=300, window=11, min_count=10, epochs=30)


	# Only handle words that appear in the doc2vec pretrained vectors. enwiki_ebow model contains 669549 vocabulary size.
	# tokens = list(filter(lambda x: x in model.wv.vocab.keys(), documents[case_id].words))

	base_vector = model.infer_vector(documents[case_id].words.split())

	vectors = []
	for i, document in enumerate(documents):

		# tokens = list(filter(lambda x: x in model.wv.vocab.keys(), document))
		vector = model.infer_vector(document.words.split())
		vectors.append(vector)

		# print("making vector at index:", i)

	scores = cosine_similarity([base_vector], vectors).flatten()

	# top 10 highest scores
	highest_score_indices = scores.argsort()[-10:][::-1]

	
	print(highest_score_indices)



	highest_score = 0
	highest_score_index = 0
	for i, score in enumerate(scores):
		if highest_score < score:
			highest_score = score
			highest_score_index = i

	most_similar_document = documents[highest_score_index]
	print("Most similar document by Doc2vec with the score:", most_similar_document, highest_score)


In [70]:
process_doc2vec_similarity(case_text, 0)

[ 0 37  9 40 36 71 11 38 12 46]
Most similar document by Doc2vec with the score: TaggedDocument<united state supreme court hunt v palao argue decide january motion make bring record case decide territorial court appeal florida previously admission florida state motion follow mr westcott behalf john hunt submit court certify copy record opinion say court appeal say judgment say case suggest court say court appeal defunct admission territory florida state th march last record paper say court appeal record aforesaid say case place act general assembly say state custody keep clerk supreme court say state also say case case federal jurisdiction move court allow writ error remove say record judgment court direction clerk court direct judge say supreme court say state clerk aforesaid custody say record aforesaid order say record judgment may certify court return say writ error make say clerk say supreme court say state mr chief justice taney deliver opinion court motion make process court bri

In [64]:
process_doc2vec_similarity(case_text,309)

[  0 309 459 366  19 149  22 162 277  52]
Most similar document by Doc2vec with the score: TaggedDocument<united, [0]> 1.0


In [38]:
import gensim
import gensim.downloader as api

def tagged_document(list_of_list_of_words):
   for i, list_of_words in enumerate(list_of_list_of_words):
      yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

def doc2vec_vectors(X):
   documents = list(tagged_document(X[0].split()))
   model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=2, epochs=80)
   model.build_vocab(documents)
   model.train(documents, total_examples=model.corpus_count, epochs=80)

   # model = gensim.models.doc2vec.Doc2Vec(documents, vector_size=300, window=11, min_count=10, epochs=30)
   return np.array([model.infer_vector(doc.split()) for doc in X]),model

In [39]:
doc2vec,model = doc2vec_vectors(case_text)

In [40]:
model.docvecs.most_similar(0)

  model.docvecs.most_similar(0)


[(455, 0.996326208114624),
 (185, 0.9962396621704102),
 (433, 0.9962393641471863),
 (225, 0.9962005615234375),
 (437, 0.9961450695991516),
 (385, 0.9961084127426147),
 (379, 0.99603670835495),
 (330, 0.9960185289382935),
 (414, 0.9960132837295532),
 (200, 0.9960082769393921)]

## Word2Vec Model

In [24]:
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences

ImportError: dlopen(/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/google/protobuf/pyext/_message.cpython-310-darwin.so, 0x0002): symbol not found in flat namespace (__ZNK6google8protobuf10TextFormat21FastFieldValuePrinter19PrintMessageContentERKNS0_7MessageEiibPNS1_17BaseTextGeneratorE)

In [None]:
#find the max sequence length of word2vec vectors\
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer=Tokenizer()
tokenizer.fit_on_texts(case_text)
tokenized_documents=tokenizer.texts_to_sequences(case_text)
tokenized_paded_documents=pad_sequences(tokenized_documents,maxlen=5000,padding='post')
vocab_size=len(tokenizer.word_index)+1


(5000,)


In [None]:
w2v_vectors = create_word2vec_embeddings(case_text)

## BERT Model

In [43]:
! conda install -c conda-forge sentence-transformers

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Retrieving notices: ...working... done


In [44]:
! pip install sentence-transformers
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('bert-base-nli-mean-tokens')
sentence_embeddings = model.encode(case_text)



ModuleNotFoundError: No module named 'sentence_transformers'

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

similar = cosine_similarity(
    [sentence_embeddings[0]],
    sentence_embeddings[:]
)
ind = np.argsort(similar[0])[::-1][:5]