In [25]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy 
import polars as pl

from datasets import load_dataset
from gensim.models.phrases import Phrases, Phraser

import multiprocessing

from gensim.models import Word2Vec

In [4]:
# Load the 'wikitext-2-v1' configuration of the 'wikitext' dataset.
# The result is indexed by split name below (e.g. dataset['train']).
dataset = load_dataset('wikitext', 'wikitext-2-v1')

In [16]:
# Build the cleaned training corpus in a single pass over the raw lines:
#   1. skip lines that are exactly the empty string,
#   2. delete apostrophes so contractions stay one token,
#   3. collapse every run of non-alphanumeric (ASCII) characters to one space,
#   4. lower-case the result.
data = [
    re.sub(r"[^a-zA-Z0-9]+", ' ', line.replace("'", '')).lower()
    for line in dataset['train']['text']
    if line != ''
]

In [20]:
# Whitespace-tokenise every cleaned line into a list of words.
sent = [line.split() for line in data]

# Collect bigram statistics over the tokenised corpus; only word pairs
# seen at least `min_count` times are kept as phrase candidates.
phrases = Phrases(
    sent,
    min_count=30,
    progress_per=10000,
)

In [22]:
# Wrap the collected phrase statistics in a Phraser for applying them.
bigram = Phraser(phrases)
# Apply the bigram model to the whole tokenised corpus `sent`.
# NOTE(review): indexing with a corpus presumably yields a lazy, re-iterable
# transform rather than a materialised list — confirm before relying on len().
sentences = bigram[sent]

In [23]:
# Count how often each token (single word or detected bigram) occurs in the
# phrase-transformed corpus.
word_freq = defaultdict(int)
# The loop variable is deliberately NOT called `sent`: that name holds the
# full tokenised corpus used by other cells, and rebinding it here would
# silently break them on re-execution.
for tokens in sentences:
    for token in tokens:
        word_freq[token] += 1
# Vocabulary size (number of distinct tokens) is shown as the cell output.
len(word_freq)

29083

In [24]:
# Ten most frequent tokens, highest count first (ties keep insertion order).
sorted(word_freq, key=lambda token: word_freq[token], reverse=True)[:10]

['the', 'of', 'unk', 'and', 'in', 'to', 'a', 'was', 's', 'on']

In [26]:
# Number of CPUs reported by the OS; used to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()

In [28]:
# Configure (but do not yet train) the Word2Vec model.
#   min_count   - ignore tokens with a total frequency below this value
#   window      - maximum distance between the current and predicted word
#   vector_size - dimensionality of the word vectors
#   sample      - threshold for down-sampling high-frequency words
#   alpha       - initial learning rate
#   min_alpha   - learning rate the schedule decays to during training
#   negative    - number of "noise" words drawn for negative sampling
#   workers     - training threads; one core is left free for the system,
#                 but never fewer than 1 (on a single-CPU machine
#                 `cores - 1` would be 0 and leave no thread to train).
w2v_model = Word2Vec(min_count=20,
                     window=3,
                     vector_size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=max(1, cores - 1))

In [29]:
# Build the model vocabulary from the corpus and report the wall-clock cost.
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
elapsed_mins = round((time() - t) / 60, 2)

print('Time to build vocab: {} mins'.format(elapsed_mins))

Time to build vocab: 0.04 mins


In [30]:
# Train for 30 epochs over the corpus and report the wall-clock cost.
# `total_examples` reuses the sentence count recorded by build_vocab().
t = time()
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)
elapsed_mins = round((time() - t) / 60, 2)

print('Time to train the model: {} mins'.format(elapsed_mins))

Time to train the model: 1.64 mins


In [31]:
# Scale every word vector to unit length in place. This is the supported
# gensim 4 call; `init_sims(replace=True)` is deprecated and only emits a
# warning before doing the same thing.
# Destructive: the model cannot sensibly be trained further after this.
w2v_model.wv.unit_normalize_all()

  w2v_model.init_sims(replace=True)


In [37]:
# Sanity check: nearest neighbours of "cold" in the learned vector space,
# returned as (token, similarity) pairs.
w2v_model.wv.most_similar(positive=["cold"])

[('warm', 0.40926748514175415),
 ('cooler', 0.40275728702545166),
 ('temperate', 0.3903796076774597),
 ('harsh', 0.3820638954639435),
 ('unusually', 0.3762211203575134),
 ('clouds', 0.36724525690078735),
 ('climate', 0.3467746376991272),
 ('mild', 0.3439057469367981),
 ('hot', 0.33299770951271057),
 ('dry', 0.32854825258255005)]

In [49]:
# Vector-arithmetic query: tokens closest to "boy" + "girl" - "man",
# limited to the top 3 results.
w2v_model.wv.most_similar(positive=["boy", "girl"], negative=["man"], topn=3)

[('madonna', 0.37118417024612427),
 ('baby', 0.3553946614265442),
 ('rendition', 0.34274280071258545)]