# WORD EMBEDDINGS 

### word2vec 

In [1]:
! pip install spark-nlp
! pip install nltk



In [2]:
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to /home/jovyan/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [3]:
import sparknlp
from nltk.corpus import brown

spark = sparknlp.start()

In [4]:
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [5]:
def detokenize(sentence):
    text = ''
    for token in sentence:
        if text and any(c.isalnum() for c in token):
            token += ' '
        text += token
    return text

In [6]:
texts = []

for fid in brown.fileids():
    text = [detokenize(s) for s in brown.sents(fid)]
    text = ' '.join(text)
    texts.append((text,))
    
texts = spark.createDataFrame(texts, ['text'])

In [13]:
from pyspark.ml import Pipeline

from sparknlp import DocumentAssembler, Finisher
from sparknlp.annotator import *

In [17]:
assembler = DocumentAssembler()\
.setInputCol('text')\
.setOutputCol('document')

sentence = SentenceDetector() \
.setInputCols(["document"]) \
.setOutputCol("sentences") \
.setExplodeSentences(True)

tokenizer = Tokenizer()\
.setInputCols(['sentences'])\
.setOutputCol('token')

normalizer = Normalizer()\
.setCleanupPatterns(['[^a-zA-Z.-]+','^[^a-zA-Z]+','[^a-zA-Z]+$',])\
.setInputCols(['token'])\
.setOutputCol('normalized')\
.setLowercase(True)

finisher = Finisher()\
.setInputCols(['normalized'])\
.setOutputCols(['normalized'])\
.setOutputAsArray(True)

pipeline = Pipeline().setStages([assembler, sentence, tokenizer,
                                 normalizer, finisher ]).fit(texts)

In [18]:
sentences = pipeline.transform(texts)
sentences = sentences.select('normalized').collect()
sentences = [r['normalized'] for r in sentences]

print(len(sentences)) # number of sentences

59477


#### word2vec is traditional model which does not take context in account, we won't implement word2vec but we use Glove and BERT methods of word embeddings. BERT takes context in account Golve doesn't but Glove performs better than word2vec. Also SparkNlp gives us pretrained models for better performance. We will use one of those in this case.

### GloVe 

In [20]:
from sparknlp.embeddings import WordEmbeddingsModel

In [21]:
glove = WordEmbeddingsModel.pretrained(name='glove_100d') \
    .setInputCols(['document', 'normalized']) \
    .setOutputCol('embeddings')

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [22]:
pipeline = Pipeline().setStages([assembler, sentence, tokenizer,
                                 normalizer, glove ]).fit(texts)

In [23]:
pipeline.transform(texts).select('embeddings.embeddings') \
.first()['embeddings']

[[0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [-0.22317999601364136,
  -0.527400016784668,
  0.8286799788475037,
  -0.3293200135231018,
  0.7647799849510193,
  0.9371899962425232,
  0.4973900020122528,
  1.4427000284194946,
  0.11255999654531479,
  -0.19446000456809998,
  -1.1759999990463257,
  -0.2777000069618225,
  0.2835800051689148,
  0.0

### BERT 

In [24]:
from sparknlp.pretrained import BertEmbeddings

In [25]:
bert = BertEmbeddings.pretrained()\
.setInputCols(['sentences', 'normalized'])\
.setOutputCol('bert')

bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]


In [26]:
pipeline = Pipeline().setStages([assembler, sentence, tokenizer,
                                 normalizer, bert ]).fit(texts)

In [27]:
pipeline.transform(texts).select('bert.embeddings')\
.first()['embeddings']

[[-1.1980020999908447,
  0.3962576389312744,
  0.7419608235359192,
  0.7973726391792297,
  -1.0004487037658691,
  0.8969652056694031,
  -0.3867361843585968,
  0.17260757088661194,
  -0.26956284046173096,
  -0.3295387625694275,
  0.6897872686386108,
  -0.39754605293273926,
  0.5433205366134644,
  0.7603238224983215,
  0.27005520462989807,
  -1.9060765504837036,
  -0.9963284730911255,
  0.9765754342079163,
  -0.26099157333374023,
  0.369789719581604,
  -0.43807271122932434,
  0.7488493919372559,
  -0.5378984808921814,
  0.8647488951683044,
  -0.14781543612480164,
  0.23633432388305664,
  -0.6737574934959412,
  -0.8317551016807556,
  0.49321049451828003,
  0.22750777006149292,
  0.886649489402771,
  -2.1890995502471924,
  -0.3851615786552429,
  0.6819204092025757,
  0.2723952531814575,
  -0.274985134601593,
  0.8294179439544678,
  -1.1374274492263794,
  -0.9435877203941345,
  -0.20892567932605743,
  0.9044886827468872,
  -0.5634653568267822,
  -0.4172305464744568,
  0.3451480269432068,
  