References:

1) Author: Borchers, Oliver

https://github.com/oborchers/Fast_Sentence_Embeddings

2) Author: kawine

https://github.com/kawine/usif

3) Author: Radim Řehůřek

https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/phrases.py

# Training Models

In [41]:
# Loading Packages

!pip install fse

import nltk
nltk.download('stopwords')
nltk.download('wordnet')

import pandas as pd
import numpy as np
import re

import gensim
import string
import operator
import pickle
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from gensim.utils import any2unicode, any2utf8
from gensim.models import word2vec, KeyedVectors
from gensim.models.phrases import Phraser, Phrases
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings("ignore")

from fse import CSplitIndexedList
from fse.models import Average, uSIF, sif
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
tqdm.pandas()
# Show full cell contents when displaying DataFrames. None means "no limit";
# the former -1 sentinel has been deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Preprocess

# English stop words. Stored as a set because the collection is only ever used
# for `in` / `not in` checks, once per token, so hashing beats a list scan.
stop_words = set(stopwords.words('english'))

def prep(text):
  """Normalise raw text into a list of lemmatized tokens.

  Steps, in order: lowercase, drop anything inside square brackets, strip
  punctuation (keeping ``_`` and ``@``), remove digit runs, split on
  whitespace, discard tokens containing ``@`` or ``http``, discard tokens
  shorter than two characters, and lemmatize what is left.

  Parameters:
    text: any object; it is converted with ``str()`` first.

  Returns:
    list of lemmatized token strings (possibly empty).
  """
  lowered = str(text).lower()
  without_brackets = re.sub(r'\[.*?\]', '', lowered)

  # '_' and '@' survive punctuation stripping: '@' is needed by the mention
  # filter below, '_' is kept as part of the token.
  removable = string.punctuation.replace('_','').replace('@','')
  cleaned = without_brackets.translate(str.maketrans("","",removable)).strip()
  cleaned = re.sub(r'\d+','',cleaned)

  tokens = []
  for token in cleaned.split():
    if '@' in token or 'http' in token:
      continue
    if len(token) < 2:
      continue
    tokens.append(lemmatizer.lemmatize(token))
  return tokens

def prep_s2v(text):
  """Tokenise `text` for the sentence-embedding model.

  Runs `prep`, passes the tokens through the bigram phraser, and drops
  every resulting token that is a stop word.

  Returns:
    list of tokens with stop words removed.
  """
  phrased_tokens = bigram_phraser_model[prep(text)]
  kept = []
  for token in phrased_tokens:
    if token not in stop_words:
      kept.append(token)
  return kept

In [3]:
# Loading data and preprocessing

# tqdm._instances.clear()

# Read the reviews file and keep a reproducible random sample of 10,000 rows.
data = pd.read_csv('/content/drive/MyDrive/Application/Case Studies/Hotel Reviews - Topic Modelling/Hotel_Reviews.csv')
data = data.sample(n=10000, random_state=10)

# Only the negative-review text is modelled; tokenise it with a progress bar.
data['text'] = data['Negative_Review']
data['processed_text'] = data['text'].progress_apply(prep)

# Keep just the raw text and its token list.
data = data[['text', 'processed_text']]

100%|██████████| 10000/10000 [00:02<00:00, 3713.93it/s]


In [4]:
# Training Phraser, Word2Vec, Sentence2Vec Models

tqdm._instances.clear()

# 1) Learn bigram statistics from the tokenised reviews.
bigram_phraser_model = Phrases(
  data['processed_text'],
  min_count=3,
  threshold=2.5,
)

# 2) Apply the phraser to each review and drop stop words afterwards.
bigram_features = data['processed_text'].progress_apply(
  lambda tokens: [tok for tok in bigram_phraser_model[tokens] if tok not in stop_words]
)

# 3) Train Word2Vec on the phrased tokens (sg = 0 as in the original call).
w2v_model = word2vec.Word2Vec(
  bigram_features,
  size=100,
  sg=0,
  workers=8,
  min_count=3,
  window=5,
  iter=100,
)

# 4) Train the uSIF sentence model on the raw texts; prep_s2v is the tokeniser
#    that the indexed list applies to each text.
indexed_sentences = CSplitIndexedList(list(data['text']), custom_split=prep_s2v)
s2v_model = uSIF(w2v_model, workers=2, lang_freq="en")
s2v_model.train(indexed_sentences)

100%|██████████| 10000/10000 [00:01<00:00, 9843.14it/s]


(9941, 87036)

In [5]:
# Optimizing and saving Sentence2Vec Model

# Keep only the pieces of the trained sentence model that inference reads:
# a word -> row-index lookup, the per-word weights, the word vectors, and the
# model's `svd_res` attribute.
optimized_model_s2v_usif = {
  "word_index": {word: position for position, word in enumerate(s2v_model.wv.index2word)},
  "word_weights": s2v_model.word_weights,
  "word_vectors": s2v_model.wv.vectors,
  "svd_residuals": s2v_model.svd_res,
}

with open('/content/drive/MyDrive/Application/Case Studies/W2V S2V Optimization/optimized_model_s2v_usif.pkl','wb') as f:
  pickle.dump(optimized_model_s2v_usif, f, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
# Optimizing and saving Phraser Model

# Record the vocab size BEFORE pruning; it is stored alongside the model as
# "vocab_length".
phraser_dict = {"vocab_length": len(bigram_phraser_model.vocab)}

# Prune the phraser vocab to entries whose decoded key is also in the sentence
# model's word index. NOTE: this replaces the vocab on the in-memory model as
# well, not only in the pickled copy.
bigram_phraser_model.vocab = defaultdict(
  int,
  {
    key: count
    for key, count in bigram_phraser_model.vocab.items()
    if any2unicode(key) in optimized_model_s2v_usif["word_index"]
  },
)
phraser_dict["phraser_model"] = bigram_phraser_model

with open('/content/drive/MyDrive/Application/Case Studies/W2V S2V Optimization/optimized_model_bigram_phraser.pkl','wb') as f:
  pickle.dump(phraser_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

# Using Models

In [7]:
import pickle
import re
import string
from numpy import float32 as REAL, sum as np_sum, multiply as np_mult
from gensim.utils import any2utf8, any2unicode
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [9]:
# Preprocess input sentence and convert to phrases

# Words that analyze_sentence treats as connectors: a word in this set is
# collected between two phrase words instead of starting a new candidate.
# The set is empty here, so no word is treated as a connector.
common_terms = frozenset()
def prep(text):
  """Normalise raw text into lemmatized tokens, passed through ``any2utf8``.

  Steps, in order: lowercase, drop anything inside square brackets, strip
  punctuation (keeping ``_`` and ``@``), remove digit runs, split on
  whitespace, discard tokens containing ``@`` or ``http``, discard tokens
  shorter than two characters, lemmatize, and convert each token with
  ``any2utf8``.

  Parameters:
    text: any object; it is converted with ``str()`` first.

  Returns:
    list of converted tokens (possibly empty).
  """
  lowered = str(text).lower()
  without_brackets = re.sub(r'\[.*?\]', '', lowered)

  # '_' and '@' survive punctuation stripping: '@' is needed by the mention
  # filter below, '_' is kept as part of the token.
  removable = string.punctuation.replace('_','').replace('@','')
  cleaned = without_brackets.translate(str.maketrans("","",removable)).strip()
  cleaned = re.sub(r'\d+','',cleaned)

  tokens = []
  for token in cleaned.split():
    if '@' in token or 'http' in token:
      continue
    if len(token) < 2:
      continue
    tokens.append(any2utf8(lemmatizer.lemmatize(token)))
  return tokens

def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
  """Score a bigram from its counts.

  Computes ``(bigram_count - min_count) / worda_count / wordb_count * len_vocab``.

  Parameters:
    worda_count: occurrences of the first word.
    wordb_count: occurrences of the second word.
    bigram_count: occurrences of the joined phrase.
    len_vocab: vocabulary size used as the scaling factor.
    min_count: count subtracted from ``bigram_count``.
    corpus_word_count: accepted but not used in the computation.

  Returns:
    the score as a float.
  """
  excess_count = bigram_count - min_count
  # Division order is kept as written so floating-point results are unchanged.
  return excess_count / worda_count / wordb_count * len_vocab

def score_candidate(word_a, word_b, in_between, phraser_dict):
  """Decide whether ``word_a [in_between ...] word_b`` forms a known phrase.

  Parameters:
    word_a: first word of the candidate (bytes, as stored in the vocab).
    word_b: last word of the candidate (bytes).
    in_between: list of words sitting between ``word_a`` and ``word_b``.
    phraser_dict: dict with "phraser_model" (an object exposing ``vocab``,
      ``min_count``, ``threshold`` and ``corpus_word_count``) and
      "vocab_length".

  Returns:
    ``(phrase, score)`` when both words and the joined phrase have a positive
    count and the score exceeds the model threshold; ``(None, None)``
    otherwise.
  """
  model = phraser_dict["phraser_model"]
  vocab = model.vocab

  # Use .get() rather than vocab[key]: the vocab is saved as a
  # defaultdict(int), so indexing an unseen key would insert it and make the
  # vocab grow with every out-of-vocabulary word that is queried.
  word_a_cnt = vocab.get(word_a, 0)
  if word_a_cnt <= 0:
    return None, None

  word_b_cnt = vocab.get(word_b, 0)
  if word_b_cnt <= 0:
    return None, None

  phrase = b'_'.join([word_a] + in_between + [word_b])
  phrase_cnt = vocab.get(phrase, 0)
  if phrase_cnt <= 0:
    return None, None

  score = original_scorer(worda_count = word_a_cnt, wordb_count = word_b_cnt, bigram_count = phrase_cnt,
                          len_vocab = phraser_dict["vocab_length"],
                          min_count = model.min_count,
                          corpus_word_count = model.corpus_word_count)

  # Scores at or below the threshold are not accepted as phrases.
  if score <= model.threshold:
    return None, None

  return phrase, score


def analyze_sentence(sentence, phraser_dict):
  """Tokenise `sentence` and yield its tokens with known phrases merged.

  The sentence is run through `prep`; consecutive tokens are then offered to
  `score_candidate`. When a pair scores as a phrase the joined phrase is
  yielded, otherwise the tokens are yielded one by one in their original
  order.

  Parameters:
    sentence: raw input text.
    phraser_dict: passed through unchanged to `score_candidate`.

  Yields:
    tokens or joined phrases, in sentence order.
  """
  pending, connectors = None, []
  for token in prep(sentence):
    if token in common_terms:
      # A connector is held back only while a candidate start is pending;
      # otherwise it is emitted straight away.
      if pending:
        connectors.append(token)
      else:
        yield token
      continue

    if not pending:
      # No candidate open yet: this token becomes the candidate start.
      pending, connectors = token, []
      continue

    phrase, score = score_candidate(pending, token, connectors, phraser_dict)
    if score is None:
      # Not a phrase: flush what was held and restart from the current token.
      yield pending
      yield from connectors
      pending, connectors = token, []
    else:
      yield phrase
      pending, connectors = None, []

  # Flush whatever is still held once the sentence ends.
  if pending:
    yield pending
    yield from connectors

# Compute Sentence Vector for a given input sentence
def compute_sentence_vector(sentence):
  """Compute the sentence embedding for one raw input sentence.

  The sentence is tokenised and phrased with `analyze_sentence`; tokens that
  are present in the saved word index are looked up, their vectors are
  multiplied by their word weights and averaged, and finally the projection
  onto `w_comp` is subtracted.

  Parameters:
    sentence: raw input text.

  Returns:
    the sentence vector, or None when the sentence yields no token that is
    in the word index (including the case where it yields no tokens at all).
  """
  word_index = optimized_model_s2v_usif["word_index"]
  tokens = (any2unicode(word) for word in analyze_sentence(sentence, phraser_dict))
  word_indices = [word_index[token] for token in tokens if token in word_index]

  # Without this guard a sentence made only of out-of-vocabulary words would
  # reach the averaging step with zero indices and raise ZeroDivisionError.
  if not word_indices:
    return None

  weighted_vector = np_sum(np_mult(optimized_model_s2v_usif["word_vectors"][word_indices],
                                   optimized_model_s2v_usif["word_weights"][word_indices][:,None]), axis=0)
  weighted_vector *= 1/len(word_indices)
  sentence_vector = weighted_vector - weighted_vector.dot(w_comp.transpose()).dot(w_comp)
  return sentence_vector

In [12]:
# Loading the optimized version of Phraser and Sentence2Vec models
# NOTE(security): unpickling can execute arbitrary code. Only load these files
# when they were produced by the training cells of this notebook.
# The files are opened with `with` so each handle is closed after loading.
with open('/content/drive/MyDrive/Application/Case Studies/W2V S2V Optimization/optimized_model_bigram_phraser.pkl', 'rb') as f:
  phraser_dict = pickle.load(f)
with open('/content/drive/MyDrive/Application/Case Studies/W2V S2V Optimization/optimized_model_s2v_usif.pkl', 'rb') as f:
  optimized_model_s2v_usif = pickle.load(f)

# Normalise the squared first element of 'svd_residuals' so it sums to 1, then
# scale the rows of the second element by those weights. compute_sentence_vector
# subtracts the projection onto the resulting `w_comp`.
# NOTE(review): presumably element 0 holds singular values and element 1 the
# matching components - confirm against the fse uSIF implementation.
svd_weights = (optimized_model_s2v_usif['svd_residuals'][0]**2)/(optimized_model_s2v_usif['svd_residuals'][0]**2).sum().astype(REAL)
w_comp = optimized_model_s2v_usif['svd_residuals'][1]*(svd_weights[:,None].astype(REAL))

In [40]:
# Testing on examples
# Two paraphrases of the same complaint; their vectors should be similar.
sentence1 = "room was very small"
sentence2 = "Room was a bit small"

# Embed both sentences with the loaded models.
vector1, vector2 = (compute_sentence_vector(text) for text in (sentence1, sentence2))

# Cosine similarity of the pair (cosine_similarity returns a 1x1 matrix here).
cosine_similarity([vector1], [vector2])[0][0]

0.65712184