In [2]:
# Wikipedia redirects "Tumor" to "Neoplasm".
# Wikidata has 5 items with the label "Tumor".
# We want to figure out which one of the 5 items is the "Tumor" Wikipedia is referring to.
# We'll use word2vec and cosine similarity to figure this out.

In [None]:
from pyspark.ml.feature import StopWordsRemover, Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [3]:
target = { 'id': 'Q1216998', 'label': 'neoplasm',
          'desc': 'abnormal mass of tissue as a result of abnormal growth '
              'or division of cells'}
redirects = [
    {'id': 'Q37752422', 'label': 'Tumor', 'desc': 'river in Russia'},
    {'id': 'Q31537161', 'label': 'Tumor', 'desc': 'mountain in South Africa'},
    {'id': 'Q133212', 'label': 'tumor',
     'desc': 'abnormal growth of tissue forming a mass'},
    {'id': 'Q14753271', 'label': 'Tumor', 'desc': 'genus of insects'},
    {'id': 'Q12777830', 'label': 'Tumor',
     'desc': 'Wikimedia disambiguation page'}
]

In [4]:
targetDF = spark.createDataFrame([(target['id'], target['desc'].split(' '))], ['id', 'text'])
redirectDF = spark.createDataFrame([(x['id'], x['desc'].split(' ')) for x in redirects], ['id', 'text'])

# remove stop words
# Available languages: “danish”, “dutch”, “english”, “finnish”, “french”, “german”,
# “hungarian”, “italian”, “norwegian”, “portuguese”, “russian”, “spanish”, “swedish”
# and “turkish”
# See https://spark.apache.org/docs/2.1.0/ml-features.html#stopwordsremover
# We may want to skip this step for languages that don't have stop words or create
# stopwords and send them to spark.
stopwords = StopWordsRemover.loadDefaultStopWords('english')
remover = StopWordsRemover(inputCol="text", outputCol="words", stopWords=stopwords)
targetDF = remover.transform(targetDF)
redirectDF = remover.transform(redirectDF)

# train the model
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="words", outputCol="result")
model = word2Vec.fit(redirectDF)
target_result = model.transform(targetDF)
target_vector = target_result.take(1)[0].result.reshape(1, -1)
redirect_results = model.transform(redirectDF)

# calculate cosine similarity
similarities = []
for row in redirect_results.collect():
    print(row)
    vector = row[-1]
    similarity = cosine_similarity(target_vector, vector.reshape(1, -1))
    print('Cosine similarity: %f\n' % (similarity))
    similarities.append((row[0], similarity))
    
similarities.sort(key=lambda x: x[1], reverse=True)
similarities
print('Most similar: %s' % similarities[0][0])

Row(id='Q37752422', text=['river', 'in', 'Russia'], words=['river', 'Russia'], result=DenseVector([-0.0018, 0.0764, 0.095]))
Cosine similarity: 0.498661

Row(id='Q31537161', text=['mountain', 'in', 'South', 'Africa'], words=['mountain', 'South', 'Africa'], result=DenseVector([-0.0912, -0.0128, -0.0671]))
Cosine similarity: -0.754389

Row(id='Q133212', text=['abnormal', 'growth', 'of', 'tissue', 'forming', 'a', 'mass'], words=['abnormal', 'growth', 'tissue', 'forming', 'mass'], result=DenseVector([0.0587, 0.0162, 0.0382]))
Cosine similarity: 0.831477

Row(id='Q14753271', text=['genus', 'of', 'insects'], words=['genus', 'insects'], result=DenseVector([0.028, 0.0923, -0.129]))
Cosine similarity: 0.391865

Row(id='Q12777830', text=['Wikimedia', 'disambiguation', 'page'], words=['Wikimedia', 'disambiguation', 'page'], result=DenseVector([-0.0435, -0.0078, 0.0637]))
Cosine similarity: -0.375902

Most similar: Q133212
