# Word2Vec and Pyspark Similarity

Process and tokenize Webhose article bodies and train a Word2Vec model  using Spark MLLib library. Demonstrate a search query implementation and retrieved article titles.

**Word2Vec Pyspark** 

In [1]:
#!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/8e/b0/bf9020b56492281b9c9d8aae8f44ff51e1bc91b3ef5a884385cb4e389a40/pyspark-3.0.0.tar.gz (204.7MB)
[K     |████████████████████████████████| 204.7MB 64kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 43.0MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.0-py2.py3-none-any.whl size=205044182 sha256=5321e96b41dad0c1710dfd6d8b284c2ea3b9ed9070e442b86f6e000fde174478
  Stored in directory: /root/.cache/pip/wheels/57/27/4d/ddacf7143f8d5b76c45c61ee2e43d9f8492fc5a8e78ebd7d37
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.0


**Install Libraries**

In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.session import SparkSession
sc = SparkContext() 
sqlContext = SQLContext(sc)
spark = SparkSession(sc)
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.clustering import LDA, LDAModel
from nltk.stem.wordnet import WordNetLemmatizer
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec

print("Using Apache Spark Version", sc.version)

Using Apache Spark Version 3.0.0


**Load obtained dataset of Webhose news articles into a Spark dataframe**

In [2]:
crunchbase_df = sqlContext.read.option("header", "true").option("delimiter", ",") \
                    .option("inferSchema", "true") \
                    .json("/content/sample_data/webhose_apple.json")


In [3]:
crunchbase_df.show()

+--------------------+--------------------+--------------------+---------------+--------------------+-------------+--------------------+--------------+--------+-------------+----------+-------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|              author|             crawled|            entities|external_images|      external_links|highlightText|highlightThreadTitle|highlightTitle|language|ord_in_thread|parent_url|          published|rating|                text|              thread|               title|                 url|                uuid|
+--------------------+--------------------+--------------------+---------------+--------------------+-------------+--------------------+--------------+--------+-------------+----------+-------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|   Roland Hutchinson| 2020-06-03 07:10:13|[[[

**Cleans up and tokenizes article bodies using the RegexTokenizer and Stopword remover functions**

In [4]:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load( disable=['parser', 'tagger','ner'] )

def cleanup_pretokenize(text):
    #text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'http\S+', '', text)
    text = text.replace("'s", " ")
    text = text.replace("n't", " not ")
    text = text.replace("'ve", " have ")
    text = text.replace("'re", " are ")
    text = text.replace("I'm"," I am ")
    text = text.replace("you're"," you are ")
    text = text.replace("You're"," You are ")
    text = text.replace("-"," ")
    text = text.replace("/"," ")
    text = text.replace("("," ")
    text = text.replace(")"," ")
    text = text.replace("%"," percent ")
    return text

lmtzr = WordNetLemmatizer()
def text_cleanup(row):
    desc = row[2].strip().lower()
    tokens = [w.lemma_ for w in nlp(cleanup_pretokenize(desc))]
    tokens = [token for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if len(token) > 3]
    #tokens = [lmtzr.lemmatize(token,'v') for token in tokens]
    row[2] = ' '.join(tokens)
    return row

regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'text', outputCol = 'tokens')
swr = StopWordsRemover(inputCol = 'tokens', outputCol = 'tokens_sw_removed')

In [5]:
crunchbase_data = crunchbase_df['uuid','title','text']

In [6]:
df_tokens = regexTokenizer.transform(crunchbase_data)
desc_swr = swr.transform(df_tokens)
desc_swr.show(3)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                uuid|               title|                text|              tokens|   tokens_sw_removed|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|958670c1717dd8f1e...|New iPad Air may ...|Apple is expected...|[apple, is, expec...|[apple, expected,...|
|4aa124a2c78843f84...|iOS 14 Will Repor...|iOS 14 Will Suppo...|[ios, 14, will, s...|[ios, 14, support...|
|26ebb8ab008ed759d...|iPhone Looters Be...|in: News iPhone L...|[in, news, iphone...|[news, iphone, lo...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



**Trains a Word2Vec model based on the text column**

In [7]:
def cossim(v1, v2): 
    return np.dot(v1, v2) / np.sqrt(np.dot(v1, v1)) / (np.sqrt(np.dot(v2, v2))+.1)

In [8]:
word2vec = Word2Vec(vectorSize = 300, minCount = 5, inputCol = 'tokens_sw_removed', outputCol = 'wordvectors')
model = word2vec.fit(desc_swr)
wordvectors = model.transform(desc_swr)
#wordvectors.select('wordvectors').show(1, truncate = True)
crunchbase_desc = wordvectors.select('uuid','title','wordvectors').rdd.toDF()
crunchbase_desc.show(10)

+--------------------+--------------------+--------------------+
|                uuid|               title|         wordvectors|
+--------------------+--------------------+--------------------+
|958670c1717dd8f1e...|New iPad Air may ...|[-0.0327451773791...|
|4aa124a2c78843f84...|iOS 14 Will Repor...|[0.02522358425096...|
|26ebb8ab008ed759d...|iPhone Looters Be...|[-0.0403018059497...|
|cb43510b88a39af75...|Apple bug exposed...|[-0.0362135766530...|
|cfe464ff046a7ad47...|French govt's Sto...|[-0.0102957411447...|
|f96cdd7df78fdcbe1...|American Companie...|[-0.0279899754667...|
|67dadbbd72117060c...|iOS 13.5.1 vs iOS...|[0.06957130055058...|
|d81d04e2538487a10...|Mr. Ranjeet Sundh...|[0.02859138866347...|
|8c3c8567e9b1ed83b...|Apple TV Users Ca...|[-0.0809914982685...|
|017660f92bfbef23c...|Tech giants conde...|[-0.0061011842118...|
+--------------------+--------------------+--------------------+
only showing top 10 rows



In [9]:
synonyms = model.findSynonyms("tiktok", 10)   
synonyms.show()

+---------------+-------------------+
|           word|         similarity|
+---------------+-------------------+
|       telegram| 0.5649581551551819|
|      flipboard| 0.5230107307434082|
|        tiktook|   0.51828932762146|
|      instagram| 0.5145618915557861|
|      smartnews|0.49309733510017395|
|       kuaishou| 0.4913390278816223|
|davidphelan2009| 0.4827902317047119|
|         douyin| 0.4825259745121002|
|davidphelantech|   0.47369584441185|
|      pinterest| 0.4644016921520233|
+---------------+-------------------+



**Implements any sample search query**

In [10]:
#chunk = crunchbase_desc.filter(lambda r: r[1]>=0 and r[1]<1000).collect()
chunk = crunchbase_desc.take(50000)
#chunk = crunchbase_desc.collect()

In [11]:
SEARCH_QUERY = "I love bacon cheeseburger"

In [12]:
query_df  = sc.parallelize([(1,SEARCH_QUERY)]).toDF(['index','text'])
query_tok = regexTokenizer.transform(query_df)
query_swr = swr.transform(query_tok)
query_swr.show()
query_vec = model.transform(query_swr)
query_vec = query_vec.select('wordvectors').collect()[0][0]
query_vec

+-----+--------------------+--------------------+--------------------+
|index|                text|              tokens|   tokens_sw_removed|
+-----+--------------------+--------------------+--------------------+
|    1|I love bacon chee...|[i, love, bacon, ...|[love, bacon, che...|
+-----+--------------------+--------------------+--------------------+



DenseVector([-0.055, 0.0312, 0.0163, 0.0527, 0.0418, 0.0508, -0.1263, 0.0107, 0.0522, -0.1525, 0.0123, -0.0097, 0.0569, 0.0047, 0.0283, -0.0125, 0.0006, 0.001, 0.0009, 0.0716, -0.0026, 0.071, -0.0728, 0.0201, -0.0875, -0.0735, 0.0095, 0.0253, 0.0196, 0.0719, -0.1274, -0.0668, -0.0557, 0.0356, 0.033, -0.123, 0.0218, -0.0357, -0.0067, -0.0225, 0.0221, 0.0257, 0.0527, -0.0662, -0.0231, -0.0444, 0.0524, -0.0056, 0.0372, 0.0236, -0.0226, -0.0675, 0.0672, -0.0502, 0.0527, 0.0642, 0.0426, 0.0082, 0.0087, -0.0067, -0.0047, -0.1092, -0.0528, -0.0891, 0.0583, 0.0541, -0.1014, -0.0297, 0.0105, 0.0242, 0.0326, 0.0618, 0.0171, -0.0027, -0.0869, 0.0105, -0.0812, -0.0343, 0.0341, -0.0492, -0.0291, 0.0042, 0.0531, -0.029, -0.0143, 0.0987, -0.0951, 0.0298, -0.0423, 0.0441, -0.03, -0.0296, -0.0522, 0.0748, -0.0488, -0.013, 0.0139, 0.0012, 0.0048, 0.0937, -0.0365, -0.0621, 0.0461, 0.0474, -0.0746, -0.0675, 0.1117, 0.1574, -0.004, -0.0032, -0.0663, 0.0301, -0.0092, -0.0226, 0.0318, -0.0071, 0.0472, 0.0253

**Produces matching article titles**

In [13]:
import numpy as np
sim_rdd = sc.parallelize((i[0], i[1], float(cossim(query_vec, i[2]))) for i in chunk)
sim_df  = sqlContext.createDataFrame(sim_rdd).\
                   withColumnRenamed('_1', 'crunchbase_uuid').\
                   withColumnRenamed('_2', 'title').\
                   withColumnRenamed('_3', 'similarity').\
                   orderBy("similarity", ascending = False)
sim_df.show(5, truncate = False)

+----------------------------------------+--------------------------------------------------------------------+------------------+
|crunchbase_uuid                         |title                                                               |similarity        |
+----------------------------------------+--------------------------------------------------------------------+------------------+
|a4be8c58a900b7afb1ba4a2434484d5e0d503b78|SpongeBob: Patty Pursuit Launches on Apple Arcade                   |0.4610071773652834|
|b31aca513fbe1448f5dabe4d73a91caf7a1ccf2f|Apple TV+’s Central Park Finds the Musical Joy in Being Outside     |0.4521597186787441|
|5926befef806aa08a1bbb0dca8707259a5d0b628|Music and Family Combine in Apple TV's Wonderful Central Park       |0.4511184847199503|
|61f5ad11fa2647c4fa4c05df22e9aed3d0ec9421|Review: ‘Central Park’ is a Refreshing Shot of Family-Friendly Funny|0.4462476544760373|
|ba678715246f4a0d2b0ede51ebdae96a7466b5a0|Stars say 'Central Park' celebrates famil