In [1]:
# Uncomment the following lines if these packages are not installed on your system
# !pip install termcolor
# !pip install -U gensim
# !pip install nltk

In [1]:

import gensim
from gensim.models import KeyedVectors
import re
import string
import nltk
# nltk.download()
import sys
from nltk.corpus import stopwords
from termcolor import colored

In [2]:
###### Global variables (notebook-wide configuration) ######
type1_colr = 'red' # display colour for words marked with '!' (and for test words matched to them)
type2_colr = 'blue' # display colour for words marked with '*' (and for test words matched to them)
sim_threshold = 0.3 # intended minimum embedding similarity for a test word to count as the same category as a marked training word

In [3]:
# Load pre-trained word vectors stored in binary word2vec format.
# The file name is a relative path, so the file must be present in the kernel's working directory.
wv_embeddings = KeyedVectors.load_word2vec_format("GoogleNews-125k.bin.gz", binary=True)

In [37]:
def find_sim_words(s):
    '''
    Collect the marked words of a training text, grouped by their marker.

    A marked word is a '!' or '*' immediately followed by one or more word
    characters. The marker itself is not part of the returned word.

    Arguments:
    s -- training sentence(s) in which some words are prefixed with '!' or '*'

    Returns:
    type1 -- list of words that were prefixed with '!', in order of appearance
    type2 -- list of words that were prefixed with '*', in order of appearance

    '''
    type1, type2 = [], []
    for match in re.finditer(r'[!*]\w+', s):
        token = match.group()
        # First character is the marker; everything after it is the word.
        bucket = type1 if token[0] == '!' else type2
        bucket.append(token[1:])
    return type1, type2


# def find_similarity_train(t,wv_embeddings,simlar_threshold=0.3):
#     '''
#     Finds similarity between words found in function find_sim_words 

#     Arguments:
#     t -- List of words starting either from ! or *  
#     wv_embeddings -- word embeddings used for this task
#     simlar_threshold -- Minimum required similarity threshold between words to belong to same/similar category 
    
#     Returns:
#     similarity - list of words if their embedding similarity is less than simlar_threshold

#     '''
#     similarity = []
#     for i in range(len(t)-1):
#         sim = wv_embeddings.similarity(t[i],t[i+1])
#         if sim < simlar_threshold:
#             similarity.append([t[i],t[i+1]])
#     return similarity

# def similarity_error(s1,s2):
#     if len(sim1)>0:
#         raise Exception("{} following elements starting with '!' desn't belong to same category".format(sim1))
#     if len(sim2)>0:
#         raise Exception("{} following elements starting with '*' desn't belong to same category".format(sim2))
        
def test_train_similarity(t1,t2,s,wv_embeddings,threshold=0.3):
    '''
    
    Finds similarity between train special words(starting from ! or *) and selected test words
    
    Arguments:
    t1 -- list of words from training data starting from !
    t2 -- list of words from training data starting from *
    s -- list of words from test data after removing punctuation and stop words 
    wv_embeddings -- pre trained embeddings of words used for this task
    threshold -- minimum threshold for words to belong to same category
    
    Returns:
    type1_test -- words from test data that belong to same/similar category as the words starting from ! in train data
    type2_test -- words from test data that belong to same/similar category as the words starting from * in train data    
    
    '''
    type1_test = []
    type2_test = []
    for i in s:
        t1_sim = 0
        t2_sim = 0
        for x in t1:
#             print(x,i)
#             print(wv_embeddings.similarity(x,i),wv_embeddings.similarity(x,first_letter(i,wv_embeddings)))
            if (wv_embeddings.similarity(x,i)>threshold) or (wv_embeddings.similarity(x,first_letter(i,wv_embeddings))>threshold):
                t1_sim+=1
        if len(t1)==t1_sim>0:
            type1_test.append(i)
        for x in t2:
#             print(x,i)
#             print(wv_embeddings.similarity(x,i),wv_embeddings.similarity(x,first_letter(i,wv_embeddings)))
            if (wv_embeddings.similarity(x,i)>threshold) or (wv_embeddings.similarity(x,first_letter(i,wv_embeddings))>threshold):
                t2_sim+=1
        if len(t2)==t2_sim>0:
            type2_test.append(i)           
    return type1_test,type2_test


def remove_punctutaion(s):
    '''
    Removes punctuation and stop words from the test data

    Every ASCII punctuation character (string.punctuation) is deleted, the
    text is split on runs of whitespace, and tokens that appear in NLTK's
    English stop-word list are dropped. The stop-word comparison is
    case-sensitive, so capitalised words such as "My" or "I" are kept.

    Arguments:
    s -- test sentence(s)

    Returns:
    words -- list of words in test sentence without punctuations and stop words

    '''
    # Delete all punctuation in one pass instead of one replace() per character.
    cleaned = s.translate(str.maketrans('', '', string.punctuation))
    stop = set(stopwords.words('english'))
    # split() without an argument splits on any whitespace run (spaces, tabs,
    # newlines) and never yields empty strings, unlike split(' ').
    words = [token for token in cleaned.split() if token not in stop]
    return words


def add_color(s,words,color):
    '''
    Add color to specified words

    Only whole-word occurrences are coloured: a word is matched when it is not
    directly preceded or followed by another word character, so "cat" does not
    colour part of "category". Duplicate entries in `words` are processed once
    (so a word is never wrapped twice) and empty strings are ignored.

    Arguments:
    s -- sentence containing selected words
    words -- words whose colour needs to be changed
    color -- colour to be used for changing

    Returns:
    s -- sentence with updated colour for the selected words
    '''
    # dict.fromkeys de-duplicates while keeping the original order.
    for w in dict.fromkeys(words):
        if not w:
            continue
        # Lookarounds rather than \b so words with non-word edge characters still match.
        pattern = r'(?<!\w)' + re.escape(w) + r'(?!\w)'
        # A callable replacement keeps the coloured text from being parsed as a regex template.
        s = re.sub(pattern, lambda m: colored(m.group(0), color), s)
    return s


def check_words(words):
    '''
    Check whether selected words exist in the vocabulary. If a word doesn't exist in the vocabulary, the
    program prints a message naming that word and exits (raises SystemExit via sys.exit()).

    The lookup is done against the notebook-level `wv_embeddings` object.

    Arguments:
    words -- list of words to be checked

    Returns:
    words -- the same list that was passed in, returned unchanged when every word is known

    '''
    for w in words:
        # A membership test answers "is this word known?" directly; running a
        # most_similar() neighbour search just to provoke a KeyError is wasted work.
        if w not in wv_embeddings:
            print("Word --> '%s' <-- not found in vocabulary. if you have entered wrong word, kindly update it. "% w)
            sys.exit()
    return words

def prepare_train(train_input,find_sim_words,add_color):
    '''
    Prepare train data by identifying words starting from ! and *, and adding color to these words

    Only a '!' or '*' that directly precedes a word character is treated as a
    marker and removed from the displayed text; other '!' / '*' characters
    (e.g. an exclamation mark ending a sentence) are left in place.

    Arguments:
    train_input -- training sentence(s) with some words prefixed with '!' and '*'
    find_sim_words -- function 'find_sim_words'
    add_color -- function 'add_color'

    Returns:
    type1 -- list of words starting from '!'
    type2 -- list of words starting from '*'
    train_input -- updated train input, ready to be displayed

    '''
    type1,type2 = find_sim_words(train_input)
    # Abort early (check_words prints the offending word and exits) if any marked word is unknown.
    check_words(type1+type2)
    # Strip the markers before colouring so that only real markers are removed;
    # a blanket removal of every '!' and '*' would also eat ordinary punctuation.
    train_input = re.sub(r'[!*](?=\w)', '', train_input)
    train_input = add_color(train_input,type1,type1_colr)
    train_input = add_color(train_input,type2,type2_colr)
    return type1, type2, train_input

def first_letter(w,wv_embeddings):
    '''
    Sometimes discrepancies between the case of the first letter can impact the similarity between words.
    For example, lion and tiger have similarity around 0.5, however, similarity between Lion and tiger is
    less than 0.2. This function flips the case of the first letter.

    Arguments:
    w -- word, for which case of 1st letter is to be flipped
    wv_embeddings -- embeddings (or any container supporting `in`) used to check that the flipped word is known

    Returns:
    w_new -- the case-flipped word if it exists in the vocab, else the original word
             (an empty string is returned unchanged)
    '''
    if not w:
        # Nothing to flip; avoids an IndexError on w[0].
        return w
    head = w[0]
    w_new = (head.upper() if head.islower() else head.lower()) + w[1:]
    # Membership on the embeddings object itself does not rely on the `.vocab` attribute.
    if w_new not in wv_embeddings:
        w_new = w
    return w_new
    
def get_test_output(test_input,type1,type2,remove_punctutaion,check_words,test_train_similarity,add_color):
    '''
    Generates the output

    The test text is tokenised, every token is checked against the vocabulary,
    tokens similar to the training words are selected using the notebook-level
    `wv_embeddings` and `sim_threshold`, and the selected tokens are coloured
    in the original text with `type1_colr` / `type2_colr`.

    Arguments:
    test_input -- test input for which categories of word similar to train data, need to be identified
    type1 -- words from train data starting from !
    type2 -- words from train data starting from *
    remove_punctutaion -- function 'remove_punctutaion'
    check_words -- function 'check_words'
    test_train_similarity -- function 'test_train_similarity'
    add_color -- function 'add_color'

    Returns:
    op - output data to be displayed
    '''
    test = remove_punctutaion(test_input)
    check_words(test)
    # Use the configured threshold rather than a second hard-coded copy of its value.
    test1_words,test2_words = test_train_similarity(type1,type2,test,wv_embeddings,threshold = sim_threshold)
    op = add_color(test_input,test1_words,type1_colr)
    op = add_color(op,test2_words,type2_colr)
    return op

### Enter the training data in the following cell

In [42]:
# Training text: prefix a word with '!' to put it in the first category or with '*' for the second category.
train_input = "I saw a *Lion in the zoo yesterday. Today my *dog is missing. I'm going to !India but I may end up in !Germany."

In [43]:
# Extract the '!' / '*' marked words and build the colourised text for display.
# NOTE: this rebinds train_input to the processed text (markers removed), so re-running
# this cell without first re-running the cell that defines train_input yields empty word lists.
type1,type2,train_input = prepare_train(train_input,find_sim_words,add_color)
print('Train input after processing -->',train_input)

Train input after processing --> I saw a [34mLion[0m in the zoo yesterday. Today my [34mdog[0m is missing. I'm going to [31mIndia[0m but I may end up in [31mGermany[0m.


-----------------------------------------------------------------------------------------------------------

### Enter the test data in the following cell

In [48]:
# Test text: plain sentence(s) whose words are compared with the marked training words.
test_input = "My parrot is crazy. I should have bought a cat instead when i visited Australia."


In [49]:
# Colour the test words that fall into the same category as the marked training words, then show the result.
test_op = get_test_output(test_input,type1,type2,remove_punctutaion,check_words,test_train_similarity,add_color)
print(test_op)

My [34mparrot[0m is crazy. I should have bought a [34mcat[0m instead when i visited [31mAustralia[0m
