In [11]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import contractions
#import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import Perceptron

import nltk
from nltk.corpus import stopwords
from nltk import tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 


nltk.download('punkt', quiet = True)
nltk.download('wordnet', quiet = True)
nltk.download('averaged_perceptron_tagger', quiet = True)
nltk.download('stopwords', quiet = True)



True

In [12]:
def preprocess_data(X):
    #expand contractions
    X['Text'] = X['Text'].apply(lambda x: [contractions.fix(word) for word in x.split()])
    #join back words
    X['Text'] = [' '.join(map(str, l)) for l in X['Text']]
    #lowercase the reviews
    X['Text'] = X['Text'].str.lower()
    #remove html and url form reviews
    X['Text'] = X['Text'].str.replace(r'\s*https?://\S+(\s+|$)', '', regex=True).str.strip()
    #remove non-alphabetical characters
    X['Text'] = X['Text'].str.replace('[^a-zA-Z]', ' ', regex=True)
    #remove extra spaces
    X['Text'] = X['Text'].replace(r'\s+', ' ', regex=True)
    return X

In [13]:
def tokenize_and_remove_stopwords(X):
    stop_words = set(stopwords.words('english'))
    X['Text'] = X['Text'].apply(tokenize.word_tokenize)
    return X['Text'].apply(lambda x: [word for word in x if word not in stop_words])

In [14]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [15]:
def vectorize_tfidf(X):
    tfidf=TfidfVectorizer(min_df = 50, max_df=0.95, ngram_range = (1,3), max_features=1500, norm='l2')
    X_data = tfidf.fit_transform(X)
    return X_data.toarray()

In [16]:
#read csv file and convert to dataframe
df = pd.read_csv('csv/data.csv')

# Establish base line performance

In [17]:
# preprocess case_text using preocess_text function
data = df[:100]


In [18]:
case_text = preprocess_data(data)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Text'] = X['Text'].apply(lambda x: [contractions.fix(word) for word in x.split()])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Text'] = [' '.join(map(str, l)) for l in X['Text']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Text'] = X['Text'].str.lower()
A value is trying to be set on

In [19]:
case_text = tokenize_and_remove_stopwords(case_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Text'] = X['Text'].apply(tokenize.word_tokenize)


In [20]:
case_text = case_text.apply(nltk.tag.pos_tag)


In [21]:
case_text = case_text.apply(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])

In [22]:
lemma = WordNetLemmatizer()
case_text = case_text.apply(lambda x: [lemma.lemmatize(word, tag) for word, tag in x])

In [23]:
case_text = [' '.join(map(str, l)) for l in case_text]

In [24]:
#vectorize the text using tfidf vectorizer
X_data = vectorize_tfidf(case_text)

In [25]:
X_data.shape

(100, 434)

In [26]:
import gensim.downloader as api
from gensim.models import Word2Vec

In [27]:
w2v_model = Word2Vec(case_text, vector_size=300, window=5, min_count=10)

In [28]:
def create_word2vec_embeddings(X):
    embeddings = []
    for sentence in X:
        sentence_embedding = np.zeros(300)
        for word in sentence:
            if word in w2v_model.wv:
                sentence_embedding += w2v_model.wv[word]
        embeddings.append(sentence_embedding)
    return np.array(embeddings)

In [29]:
w2v_vectors = create_word2vec_embeddings(case_text)

In [30]:
w2v_vectors.shape

(100, 300)

## TF-IDF Model

In [31]:
import gensim
import gensim.downloader as api

def tagged_document(list_of_list_of_words):
   for i, list_of_words in enumerate(list_of_list_of_words):
      yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

def doc2vec_vectors(X):
    documents = list(tagged_document(X[0].split()))
    model = gensim.models.doc2vec.Doc2Vec(documents, vector_size=300, window=5, min_count=10)
    return np.array([model.infer_vector(doc.split()) for doc in X])

In [32]:
doc2vec = doc2vec_vectors(case_text)

In [33]:
#find most similar cases using cosine similarity and tfidf vectors
def find_similar_cases(X, case_id, n):
    pairwise_similarities=np.dot(X,X.T)
    most_similar = pairwise_similarities[case_id].argsort()[:-n-1:-1]
    return most_similar

#find most similar cases using cosine similarity and word2vec vectors

find_similar_cases(X_data, 0, 4)


array([0, 5, 4, 8], dtype=int64)

## Word2Vec Model

In [34]:
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences

In [40]:
#find the max sequence length of word2vec vectors\
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer=Tokenizer()
tokenizer.fit_on_texts(case_text)
tokenized_documents=tokenizer.texts_to_sequences(case_text)
tokenized_paded_documents=pad_sequences(tokenized_documents,maxlen=5000,padding='post')
vocab_size=len(tokenizer.word_index)+1


(5000,)


In [None]:
w2v_vectors = create_word2vec_embeddings(case_text)

## BERT Model

In [41]:
! pip install sentence-transformers
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('bert-base-nli-mean-tokens')
sentence_embeddings = model.encode(case_text)

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
     ---------------------------------------- 86.0/86.0 kB 2.4 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
     ---------------------------------------- 5.5/5.5 MB 8.8 MB/s eta 0:00:00
Collecting torch>=1.6.0
  Downloading torch-1.13.0-cp39-cp39-win_amd64.whl (167.2 MB)
     -------------------------------------- 167.2/167.2 MB 6.5 MB/s eta 0:00:00
Collecting torchvision
  Downloading torchvision-0.14.0-cp39-cp39-win_amd64.whl (1.1 MB)
     ---------------------------------------- 1.1/1.1 MB 11.6 MB/s eta 0:00:00
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-win_amd64.whl (1.1 MB)
     ---------------------------------------- 1.1/1.1 MB 10.2 MB/s eta 0:00:00
Collecting huggingface-hub>=0.4.0
  Using cach

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [42]:
from sklearn.metrics.pairwise import cosine_similarity

similar = cosine_similarity(
    [sentence_embeddings[0]],
    sentence_embeddings[:]
)
ind = np.argsort(similar[0])[::-1][:5]

In [43]:
ind

array([ 0, 98,  2,  7, 88], dtype=int64)