In [1]:
import gensim
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import re

def _ensure_nltk_resource(resource_path, package_name):
    """Download the NLTK package ``package_name`` only if ``resource_path`` is missing."""
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)


# Make sure the tokenizer model and the stopword corpus are available so the
# notebook runs from a fresh environment without uncommenting download calls.
# NOTE(review): recent NLTK releases may look for "punkt_tab" rather than
# "punkt" -- confirm against the installed NLTK version.
_ensure_nltk_resource("tokenizers/punkt", "punkt")
_ensure_nltk_resource("corpora/stopwords", "stopwords")

# English stopwords kept as a set for fast membership tests during cleaning.
stop_words = set(nltk.corpus.stopwords.words('english'))


In [2]:
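# Optional: fetch the pre-trained vectors through gensim's downloader and print
# the local path of the downloaded file (uncomment to run).
#import gensim.downloader as api
#path = api.load("word2vec-google-news-300", return_path=True)
#print(path)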

In [3]:
# Load the pre-trained Word2Vec vectors from a local file stored in the binary
# word2vec format (hence binary=True).
word2vec_model_path = "GoogleNews-vectors-negative300.bin"  # location of the vectors file
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_model_path,
    binary=True,
)

In [4]:
# Standard phrases that candidate phrases from the input text are compared against.
std_phrases = [
    "Optimal performance",
    "Utilise resources",
    "Enhance productivity",
    "Conduct an analysis",
    "Maintain a high standard",
    "Implement best practices",
    "Ensure compliance",
    "Streamline operations",
    "Foster innovation",
    "Drive growth",
    "Leverage synergies",
    "Demonstrate leadership",
    "Exercise due diligence",
    "Maximize stakeholder value",
    "Prioritise tasks",
    "Facilitate collaboration",
    "Monitor performance metrics",
    "Execute strategies",
    "Gauge effectiveness",
    "Champion change",
]


# Text to scan for phrases that resemble the standard phrases.
# Written as adjacent string literals (one per sentence) that Python joins
# into a single string.
input_text = (
    "In today's meeting, we discussed a variety of issues affecting our department. "
    "The weather was unusually sunny, a pleasant backdrop to our serious discussions. "
    "We came to the consensus that we need to do better in terms of performance. "
    "Sally brought doughnuts, which lightened the mood. "
    "It's important to make good use of what we have at our disposal. "
    "During the coffee break, we talked about the upcoming company picnic. "
    "We should aim to be more efficient and look for ways to be more creative in our daily tasks. "
    "Growth is essential for our future, but equally important is building strong relationships with our team members. "
    "As a reminder, the annual staff survey is due next Friday. "
    "Lastly, we agreed that we must take time to look over our plans carefully and consider all angles before moving forward. "
    "On a side note, David mentioned that his cat is recovering well from surgery."
)


In [5]:
def cleanText(text):
    """Lower-case ``text``, strip punctuation, tokenize, and drop stopwords.

    Returns the surviving word tokens as a list, in their original order.
    """
    lowered = text.lower()
    # Remove every character that is neither a word character nor whitespace.
    stripped = re.sub(r'[^\w\s]', '', lowered)
    kept = []
    for token in nltk.word_tokenize(stripped):
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

In [6]:
def phrase_vector(words):
    """Average the Word2Vec vectors of the words found in ``word2vec_model``.

    Words missing from the model are ignored. Returns None when none of the
    given words are present in the model.
    """
    known = [w for w in words if w in word2vec_model]
    if not known:
        return None
    total = sum(word2vec_model[w] for w in known)
    return total / len(known)

In [7]:
def phrase_generator(sentence):
    """Build every run of 2 to 5 consecutive cleaned words from ``sentence``.

    The sentence is first passed through ``cleanText``; the resulting phrases
    are space-joined strings, ordered by length and then by start position.
    """
    tokens = cleanText(sentence)
    longest = min(5, len(tokens))
    return [
        " ".join(tokens[start:start + size])
        for size in range(2, longest + 1)
        for start in range(len(tokens) - size + 1)
    ]

In [8]:
def calculate_similarity(phrase, std_phrase):
    """Cosine similarity between the averaged embeddings of two phrases.

    Both phrases are cleaned with ``cleanText`` and embedded with
    ``phrase_vector`` before being compared.

    Returns:
        The cosine similarity score, or 0.0 when either phrase has no
        embedding (``phrase_vector`` returned None), so callers can apply a
        threshold without special-casing that situation.
    """
    embd_std_phrase = phrase_vector(cleanText(std_phrase))
    embd_phrase = phrase_vector(cleanText(phrase))
    # cosine_similarity cannot handle None rows; treat "no embedding" as
    # "no similarity" instead of raising.
    if embd_std_phrase is None or embd_phrase is None:
        return 0.0
    return cosine_similarity([embd_std_phrase], [embd_phrase])[0][0]

In [9]:
def display_phrases(phrase, input_text):
    """Recover the original wording (stopwords included) behind a cleaned phrase.

    Searches the lower-cased ``input_text`` for the shortest span that starts
    with the first word of ``phrase`` and ends with its last word, both matched
    as whole words, and returns that span with whitespace collapsed to single
    spaces.

    Args:
        phrase: Space-separated words, e.g. as produced by ``phrase_generator``.
        input_text: Text in which to look up the original wording.

    Returns:
        The matched span in lower case; the phrase itself (single-spaced) when
        no span is found or the phrase has only one word; "" for an empty phrase.
    """
    # The phrase is already a space-joined token sequence, so a whitespace
    # split recovers its tokens without re-running the tokenizer.
    tokens = phrase.split()
    if not tokens:
        return ""
    if len(tokens) == 1:
        return tokens[0]

    # Escape the tokens so regex metacharacters in them are matched literally.
    first = re.escape(tokens[0])
    last = re.escape(tokens[-1])
    # The capturing group sits inside a lookahead so that overlapping candidate
    # spans (one per occurrence of the first word) are all reported.
    span_pattern = re.compile(
        rf"(?=((?<!\w){first}(?!\w).*?(?<!\w){last}(?!\w)))",
        re.DOTALL,
    )
    candidates = [m.group(1) for m in span_pattern.finditer(input_text.lower())]
    if not candidates:
        return " ".join(tokens)
    # The shortest candidate is the tightest span around the phrase.
    return " ".join(min(candidates, key=len).split())

In [10]:
# A candidate phrase is reported when its similarity to a standard phrase is
# strictly between SIMILARITY_THRESHOLD and 1.
SIMILARITY_THRESHOLD = 0.6

sentences = nltk.sent_tokenize(input_text)
# The header carries a leading "#" column for the running counter that starts
# every printed row.
print("#\tOriginal Phrases\t\tRecommended Substitute Phrases\t\tCosine Similarity Score")
print("-" * 80)
cnt = 0
for sentence in sentences:
    phrases_list = phrase_generator(sentence)
    for std_phrase in std_phrases:
        for phrase in phrases_list:
            similarity_score = calculate_similarity(phrase, std_phrase)
            if SIMILARITY_THRESHOLD < similarity_score < 1:
                cnt += 1
                # Look the phrase up in the sentence it came from, so the
                # displayed span cannot stretch across other sentences.
                original = display_phrases(phrase, sentence).capitalize()
                print(f"{cnt}\t{original}\t\t{std_phrase.capitalize()}\t\t{similarity_score:.2f}")


Original Phrases	Recomended Substitude Phrases	Cosine Similarity Score
--------------------------------------------------------------------------------
1	Better  in terms of  performance		Optimal performance		0.64
2	Need  to do better in terms of  performance		Optimal performance		0.61
3	Daily   tasks		Prioritise tasks		0.83
4	Creative  in our daily  tasks		Prioritise tasks		0.74
5	Ways  to be more creative in our daily  tasks		Prioritise tasks		0.70
6	Look  for ways to be more creative in our daily  tasks		Prioritise tasks		0.66
7	Growth  is  essential		Drive growth		0.67
8	Growth  is essential for our  future		Drive growth		0.61
