In [11]:
import gensim
import re
import logging
import numpy as np
import pandas as pd
import multiprocessing

from re import sub
from time import time 
from unidecode import unidecode
from gensim.models import Word2Vec
from collections import defaultdict
from gensim.models import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.models.phrases import Phrases, Phraser

logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [12]:
file_model = pd.read_csv('musk_clean.csv')
sentences = file_model.text
sentences

0        ['twitter', 'prior', 'musk', 'takeover', 'talk...
1        ['article', 'ay', 'imply', 'tates', 'feature',...
2          ['og', 'musk', 'would', 'uck', 'lives', 'wall']
3        ['would', 'are', 'peak', 'way', 'great', 'powe...
4        ['cannot', 'wait', 'finally', 'excuse', 'hower...
                               ...                        
61285    ['like', 'brain', 'would', 'ead', 'piranhas', ...
61286    ['lying', 'agenda', 'please', 'correct', 'erro...
61287    ['hard', 'would', 'isagree', 'think', 'parody'...
61288    ['yeah', 'think', 'many', 'things', 'lining', ...
61289    ['good', 'choice', 'musk', 'impossible', 'are'...
Name: text, Length: 61290, dtype: object

In [5]:
w2v_model = Word2Vec(min_count=3,
                     window=4,
                     vector_size=300,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)

start = time()

w2v_model.build_vocab(sentences, progress_per=50000)

print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

INFO - 14:29:49: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2023-05-20T14:29:49.884169', 'gensim': '4.3.1', 'python': '3.9.9 (v3.9.9:ccb0e6a345, Nov 15 2021, 13:06:05) \n[Clang 13.0.0 (clang-1300.0.29.3)]', 'platform': 'macOS-13.3-arm64-arm-64bit', 'event': 'created'}
INFO - 14:29:49: collecting all words and their counts
INFO - 14:29:49: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 14:29:50: PROGRESS: at sentence #50000, processed 19589894 words, keeping 75 word types
INFO - 14:29:51: collected 75 word types from a corpus of 24174807 raw words and 61290 sentences
INFO - 14:29:51: Creating a fresh vocabulary
INFO - 14:29:51: Word2Vec lifecycle event {'msg': 'effective_min_count=3 retains 72 unique words (96.00% of original 75, drops 3)', 'datetime': '2023-05-20T14:29:51.068284', 'gensim': '4.3.1', 'python': '3.9.9 (v3.9.9:ccb0e6a345, Nov 15 2021, 13:06:05) \n[Clang 13.0.0 (clang-1300.0.29.3)]', 'platfor

Time to build vocab: 0.02 mins


In [7]:
start = time()

w2v_model.train(sentences,
                total_examples=w2v_model.corpus_count,
                epochs=30,
                report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))

w2v_model.init_sims(replace=True)

INFO - 14:52:36: Word2Vec lifecycle event {'msg': 'training model with 7 workers on 72 vocabulary and 300 features, using sg=0 hs=0 sample=1e-05 negative=20 window=4 shrink_windows=True', 'datetime': '2023-05-20T14:52:36.240795', 'gensim': '4.3.1', 'python': '3.9.9 (v3.9.9:ccb0e6a345, Nov 15 2021, 13:06:05) \n[Clang 13.0.0 (clang-1300.0.29.3)]', 'platform': 'macOS-13.3-arm64-arm-64bit', 'event': 'train'}
INFO - 14:52:37: EPOCH 0 - PROGRESS: at 56.87% examples, 226994 words/s, in_qsize 2, out_qsize 3
INFO - 14:52:37: EPOCH 0: training on 24174807 raw words (398029 effective words) took 1.7s, 228544 effective words/s
INFO - 14:52:38: EPOCH 1 - PROGRESS: at 57.12% examples, 227500 words/s, in_qsize 14, out_qsize 11
INFO - 14:52:39: EPOCH 1: training on 24174807 raw words (398958 effective words) took 1.7s, 230334 effective words/s
INFO - 14:52:40: EPOCH 2 - PROGRESS: at 57.40% examples, 228539 words/s, in_qsize 2, out_qsize 3
INFO - 14:52:41: EPOCH 2: training on 24174807 raw words (39771

Time to train the model: 0.88 mins


In [8]:
w2v_model.save("word2vec.model")

INFO - 14:54:27: Word2Vec lifecycle event {'fname_or_handle': 'word2vec.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-05-20T14:54:27.804429', 'gensim': '4.3.1', 'python': '3.9.9 (v3.9.9:ccb0e6a345, Nov 15 2021, 13:06:05) \n[Clang 13.0.0 (clang-1300.0.29.3)]', 'platform': 'macOS-13.3-arm64-arm-64bit', 'event': 'saving'}
INFO - 14:54:27: not storing attribute cum_table
INFO - 14:54:27: saved word2vec.model


In [9]:
from sklearn.cluster import KMeans
word_vectors = Word2Vec.load("word2vec.model").wv

model = KMeans(n_clusters=2,
               max_iter=1000,
               random_state=True,
               n_init=50)
model.fit(X=word_vectors.vectors.astype('double'))

INFO - 14:54:31: loading Word2Vec object from word2vec.model
INFO - 14:54:31: loading wv recursively from word2vec.model.wv.* with mmap=None
INFO - 14:54:31: setting ignored attribute cum_table to None
INFO - 14:54:31: Word2Vec lifecycle event {'fname': 'word2vec.model', 'datetime': '2023-05-20T14:54:31.385925', 'gensim': '4.3.1', 'python': '3.9.9 (v3.9.9:ccb0e6a345, Nov 15 2021, 13:06:05) \n[Clang 13.0.0 (clang-1300.0.29.3)]', 'platform': 'macOS-13.3-arm64-arm-64bit', 'event': 'loaded'}


In [10]:
word_vectors.similar_by_vector(model.cluster_centers_[1], topn=10, restrict_vocab=None)

[('N', 0.9932374954223633),
 ('G', 0.991351306438446),
 ('F', 0.9886654615402222),
 ('M', 0.9886630177497864),
 ('X', 0.983725368976593),
 ('K', 0.9813916683197021),
 ('H', 0.9813132286071777),
 ('D', 0.9808835983276367),
 ('L', 0.98053377866745),
 ('B', 0.9789462685585022)]