# Word2Vec CBOW



In [1]:
!pip install gensim tqdm



In [2]:
from gensim.models import Word2Vec;
import pandas;
import re;
from tqdm import tqdm;
import numpy;

from google.colab import drive;

In [3]:
drive.mount("/content/drive");

Mounted at /content/drive


## Data Loading and Preparation

This part will took the dataset and load it to pandas.

In [4]:
dataframe = pandas.read_csv("/content/drive/MyDrive/Collab Dataset/nlp-dl-self-assignment/data.csv");
dataframe.head()

Unnamed: 0,sentimen,Tweet,Unnamed: 2
0,-1,lagu bosan apa yang aku save ni huhuhuhuhuhuhu...,
1,-1,kita lanjutkan saja diam ini hingga kau dan ak...,
2,1,doa rezeki tak putus inna haa zaa larizquna ma...,
3,1,makasih loh ntar kita bagi hasil aku 99 9 sisa...,
4,-1,aku tak faham betul jenis orang malaysia yang ...,


This part will pre-process the data by this methodology below:

1. For every sentence within dataframe["Tweet"]
1.1 Remove punctuations from the sentence
1.2 Split the sentence by white spaces
1.3 Save the splitted array of words into array called "data"

In [5]:
data = [];

print("Pre-process tweet data")
for sentence in tqdm(dataframe["Tweet"]):
    punct_regex = r'[^\w\s]';
    sentence = re.sub(punct_regex, "", sentence);

    splitted_sentence = sentence.split(" ");
    data.append(splitted_sentence);

Pre-process tweet data


100%|██████████| 10806/10806 [00:00<00:00, 250835.12it/s]


## Model Training


In [6]:
model = Word2Vec(data, vector_size = 100, window = 5, min_count = 1, workers = 4, sg = 0);

In [7]:
model.save("my_w2v_cbow_model.bin")

## Checking Word Embeddings

In [8]:
word_embedding = model.wv["aku"];

# Get embedding for a sentence (average of word embeddings)
sentence_embedding = numpy.mean([model.wv[word] for word in sentence.split()], axis=0)

# Print embeddings
print("Embedding for 'aku':", word_embedding)
print("Embedding for sentence:", sentence_embedding)

Embedding for 'aku': [-7.0697367e-01  4.7030479e-01  9.6763887e-02  1.0746889e+00
  8.1333882e-01 -2.1019027e+00  2.1456613e-01  3.1800764e+00
 -7.5135279e-01 -1.2411007e+00 -8.4103519e-01 -1.9404410e+00
 -7.1374577e-01  1.8925910e+00  1.0097566e+00 -6.3822842e-01
  9.0855896e-01 -1.4958897e+00  4.2479977e-01 -2.0287948e+00
  1.3261129e+00  6.0954195e-01  8.7748927e-01 -9.7576475e-01
  2.0769100e-01  2.4843700e-01 -1.1233425e+00  4.8612702e-01
 -9.2332780e-01  6.6354734e-01  2.2373037e+00 -1.9914128e-01
  2.4516624e-01 -1.5450888e+00 -1.2155213e-01  7.7873617e-01
  8.6646074e-01  3.5256186e-01  8.7979174e-01 -1.3422451e+00
 -3.7881047e-01 -1.6048138e+00 -8.4712243e-01 -3.7954688e-01
 -4.3819312e-02  4.7392389e-01 -4.8253146e-01 -1.9983907e-01
  8.4360981e-01  8.6481178e-01  1.5872471e+00 -9.9135834e-01
  2.8546140e-02 -1.1858507e+00 -7.7179343e-02  9.3250787e-01
  1.0660596e+00  6.8558186e-02 -3.5738575e-01  1.1303509e+00
 -9.4266677e-01  6.6170841e-01 -1.8496683e-02 -1.0765434e-01
 -1

# How to use pre-trained bin model

In [10]:
model = Word2Vec.load("/content/my_w2v_cbow_model.bin");

word_embedding = model.wv["hahahaha"];

# Get embedding for a sentence (average of word embeddings)
sentence_embedding = numpy.mean([model.wv[word] for word in sentence.split()], axis=0)

# Print embeddings
print("Embedding for 'hahahaha':", word_embedding)
print("Embedding for sentence:", sentence_embedding)

Embedding for 'hahahaha': [-0.18738927  0.1442243   0.03316022  0.29020005  0.22683218 -0.60077
  0.05892345  0.89477766 -0.22453932 -0.34844005 -0.24348073 -0.55942726
 -0.19411692  0.5281236   0.28073955 -0.1886024   0.24290206 -0.4440072
  0.11525843 -0.59573746  0.38506877  0.19162588  0.27489972 -0.27894804
  0.05554562  0.07517253 -0.3183262   0.13347343 -0.27312526  0.19058599
  0.63549614 -0.05007578  0.07939372 -0.42809433 -0.02284859  0.25274742
  0.22368276  0.06020533  0.22574456 -0.40918073 -0.1085393  -0.4524283
 -0.25363633 -0.11836179  0.00925066  0.10730036 -0.15371625 -0.04617687
  0.24828635  0.24411637  0.4564128  -0.30054536 -0.00606474 -0.3391953
 -0.02738485  0.2633174   0.29743156  0.00875307 -0.10586538  0.31810868
 -0.260664    0.18212415 -0.00578182 -0.03827673 -0.32264933  0.43497452
  0.2843057   0.17330006 -0.49393442  0.53989035 -0.05238528 -0.09948247
  0.46569052 -0.01113288  0.33967218 -0.18982595 -0.10650408  0.08757679
 -0.09847882 -0.0599691  -0.545