In [1]:
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
stopw = set(stopwords.words('english')) 
def readFile(file):
    f = open(file, 'r', encoding='utf-8')
    text = f.read()
    
    # Tokenization - sentences and words
    
    sentences = nltk.sent_tokenize(text)
    
    data = []
    
    for sent in sentences:
        words = nltk.word_tokenize(sent)
        words = [w.lower() for w in words if len(w) > 2 and w not in stopw]
        data.append(words)
    
    return data

In [3]:
text = readFile("Dataset/bollywood_news.txt")

In [4]:
print(text[:3])

[['deepika', 'padukone', 'ranveer', 'singh', 'wedding', 'one', 'biggest', 'bollywood', 'events', 'happened', '2018'], ['the', 'deepveer', 'celebrations', 'hooked', 'phones', 'waiting', 'come', 'also', 'gave', 'enough', 'reason', 'believe', 'stylish', 'two', 'couple'], ['from', 'airport', 'looks', 'reception', 'parties', 'everything', 'entire', 'timeline', 'ranveer', 'wedding', 'style', 'file']]


## Create Model

In [5]:

model = Word2Vec(text, size=300, window=10, min_count=1)
print(model)

Word2Vec(vocab=915, size=300, alpha=0.025)


In [6]:
words = list(model.wv.vocab)
print(words[:20])

['deepika', 'padukone', 'ranveer', 'singh', 'wedding', 'one', 'biggest', 'bollywood', 'events', 'happened', '2018', 'the', 'deepveer', 'celebrations', 'hooked', 'phones', 'waiting', 'come', 'also', 'gave']


In [7]:
# print(model["deepika"].shape)
print(model.wv.__getitem__("deepika").shape)

(300,)


## Create Analogies

In [8]:
def predict_actor(a, b, c, word_vectors):
    """Accepts a triad of words and return d such that a is to b : c is to d"""
    
    a, b, c = a.lower(), b.lower(), c.lower()
    max_similarity = -100
    
    d = None
    
    wa, wb, wc = word_vectors[a], word_vectors[b], word_vectors[c]
    options = ["ranveer", "deepika", "padukone", "singh", "nick", "jonas", "chopra", "priyanka", "virat", "anushka", "ginni"]
    
    for w in options:
        if w in [a, b, c]:
            continue
        
        wv = word_vectors[w]
        sim = cosine_similarity([wb-wa], [wv-wc])
        
        if sim > max_similarity:
            max_similarity = sim
            d = w
    return d

## Test Your Model

In [9]:
triad = ("nick", "priyanka", "virat")

print(predict_actor(*triad, model.wv))

padukone


In [10]:
triad = ("ranveer", "deepika", "nick")

print(predict_actor(*triad, model.wv))

chopra


In [11]:
triad = ("ranveer", "singh", "deepika")
print(predict_actor(*triad, model.wv))

jonas


In [12]:
traid = ("deepika", "padukone", "priyanka")
print(predict_actor(*triad, model.wv))

jonas


In [13]:
traid = ("priyanka", "jonas", "nick")
print(predict_actor(*triad, model.wv))

jonas
