In [16]:
import logging, gensim
import numpy as np
from functools import reduce

In [2]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
# Use pre-trained embeddings

In [4]:
model = gensim.models.KeyedVectors.load_word2vec_format('C:\\Users\A70170\Desktop\datasets\GoogleNews-vectors-negative300.bin.gz', binary=True)

2018-09-20 09:56:43,618 : INFO : loading projection weights from C:\Users\A70170\Desktop\datasets\GoogleNews-vectors-negative300.bin.gz
2018-09-20 09:58:11,609 : INFO : loaded (3000000, 300) matrix from C:\Users\A70170\Desktop\datasets\GoogleNews-vectors-negative300.bin.gz


In [5]:
len(model.wv.vocab)

  """Entry point for launching an IPython kernel.


3000000

In [6]:
# woman + king - man = queen
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

  
2018-09-20 10:04:26,262 : INFO : precomputing L2-norms of word weight vectors


[('queen', 0.7118192911148071)]

In [7]:
# London is to England as Paris is to __
model.wv.most_similar(positive=['Paris', 'England'], negative=['London'], topn=1)

  


[('France', 0.667637825012207)]

In [8]:
# Pick out the oddball word in a sentence
model.wv.doesnt_match("duck bear cat tree".split())

  


'tree'

In [9]:
# Grab a similarity score between 0 and 1
model.wv.similarity('woman', 'man')

  


0.7664012230995352

In [10]:
model.wv.similarity('tree', 'man')

  """Entry point for launching an IPython kernel.


0.22937458713940162

In [11]:
# Helper function to try to grab embeddings for a word and returns None if that word is not found
def get_embedding(string):
    try:
        return model.wv[string]
    except:
        return None

In [12]:
sentences = [
    "this is about a dog",
    "this is about a cat",
    "this is about nothing"
]

In [22]:
# Zero matrix of shape (3300)
vectorized_sentences = np.zeros((len(sentences), 300))
# For every sentence
for i, sentence in enumerate(sentences):
    # Tokenize sentence into words
    words = sentence.split(' ')
    # Embed whichever words that we can 
    embedded_words = [get_embedding(w) for w in words]
    embedded_words = list(filter(lambda x:x is not None, embedded_words))
    # Take a mean of the vectors, to get an estimate vectorization of the entire title
    vectorized_sentence = reduce(lambda x,y:x+y, embedded_words) / len(embedded_words)
    # Change the ith row (in place) to be the ith vectorization
    vectorized_sentences[i:] = vectorized_sentence

  after removing the cwd from sys.path.


In [23]:
vectorized_sentences.shape

(3, 300)

In [24]:
# We want sentence most similar to the reference word "dog"
reference_word = 'dog'

# Take a dot product between the embedding of 'dog' and our vectorized matrix
best_sentence_idx = np.dot(vectorized_sentences, get_embedding(reference_word)).argsort()[-1]

# Output the most relevant sentence
sentences[best_sentence_idx]

  after removing the cwd from sys.path.


'this is about a dog'

In [26]:
# We want sentence most similar to the reference word "cat"
reference_word = 'cat'

# Take a dot product between the embedding of 'cat' and our vectorized matrix
best_sentence_idx = np.dot(vectorized_sentences, get_embedding(reference_word)).argsort()[-1]

# Output the most relevant sentence
sentences[best_sentence_idx]

  after removing the cwd from sys.path.


'this is about a cat'

In [27]:
# We want sentence most similar to the reference word "canine"
reference_word = 'canine'

# Take a dot product between the embedding of 'canine' and our vectorized matrix
best_sentence_idx = np.dot(vectorized_sentences, get_embedding(reference_word)).argsort()[-1]

# Output the most relevant sentence
sentences[best_sentence_idx]

  after removing the cwd from sys.path.


'this is about a dog'

In [28]:
# We want sentence most similar to the reference word "tiger"
reference_word = 'tiger'

# Take a dot product between the embedding of 'tiger' and our vectorized matrix
best_sentence_idx = np.dot(vectorized_sentences, get_embedding(reference_word)).argsort()[-1]

# Output the most relevant sentence
sentences[best_sentence_idx]

  after removing the cwd from sys.path.


'this is about a cat'

In [29]:
# Chapter titles from Sinan's first book, "The Principles of Data Science"
sentences = """How to Sound Like a Data Scientist
Types of Data
The Five Steps of Data Science 
Basic Mathematics
A Gentle Introduction to Probability
Advanced Probability 
Basic Statistics
Advanced Statistics
Communicating Data 
Machine Learning Essentials
Beyond the Essentials
Case Studies """.split('\n')

In [30]:
vectorized_sentencesvectoriz  = np.zeros((len(sentences),300))
for i, sentence in enumerate(sentences):
    words = sentence.split(' ')
    embedded_words = [get_embedding(w) for w in words if get_embedding(w) is not None]
    vectorized_sentence = reduce(lambda x,y:x+y, embedded_words) / len(embedded_words)
    vectorized_sentences[i:] = vectorized_sentence

  after removing the cwd from sys.path.


In [31]:
# Find chapters about math
reference_word = 'math'
best_sentence_idx = np.dot(vectorized_sentences, get_embedding(reference_word)).argsort()[-3:][::-1]
[sentences[b] for b in best_sentence_idx]

  after removing the cwd from sys.path.


['The Five Steps of Data Science ',
 'How to Sound Like a Data Scientist',
 'Types of Data']

In [32]:
# Which chapters are about giving talks about data
reference_word = 'talk'
best_sentence_idx = np.dot(vectorized_sentences, get_embedding(reference_word)).argsort()[-3:][::-1]
[sentences[b] for b in best_sentence_idx]

  after removing the cwd from sys.path.


['How to Sound Like a Data Scientist',
 'Types of Data',
 'The Five Steps of Data Science ']

In [33]:
# Which chapters are about AI  
reference_word = 'AI'
best_sentence_idx = np.dot(vectorized_sentences, get_embedding(reference_word)).argsort()[-3:][::-1]

[sentences[b] for b in best_sentence_idx]

  after removing the cwd from sys.path.


['Types of Data',
 'How to Sound Like a Data Scientist',
 'The Five Steps of Data Science ']