In [2]:
import pandas as pd
import re
from gensim.models import Word2Vec


In [3]:
data = {
    "article_text": [
        "information retrieval is the process of obtaining information",
        "text mining and information retrieval are related fields",
        "search engine uses information retrieval techniques",
        "word embedding represents words as vectors"
    ]
}

df = pd.DataFrame(data)


In [4]:
def preprocess(text):
    text = text.lower()                     # lowercase
    text = re.sub(r'[^a-z\s]', '', text)    # hapus simbol
    tokens = text.split()                   # tokenisasi
    return tokens

df['tokens'] = df['article_text'].apply(preprocess)
df['tokens']


0    [information, retrieval, is, the, process, of,...
1    [text, mining, and, information, retrieval, ar...
2    [search, engine, uses, information, retrieval,...
3    [word, embedding, represents, words, as, vectors]
Name: tokens, dtype: object

In [5]:
model = Word2Vec(
    sentences=df['tokens'],
    vector_size=100,   # ukuran vektor
    window=5,          # konteks window
    min_count=1,       # minimal kemunculan kata
    workers=4,
    sg=1               # 1 = Skip-gram, 0 = CBOW
)


In [6]:
vector_ir = model.wv['information']
print(vector_ir)
print("Panjang vektor:", len(vector_ir))


[-5.3659163e-04  2.3666486e-04  5.1044468e-03  9.0084802e-03
 -9.3049398e-03 -7.1182284e-03  6.4604352e-03  8.9744320e-03
 -5.0162170e-03 -3.7635425e-03  7.3826737e-03 -1.5332866e-03
 -4.5394995e-03  6.5539097e-03 -4.8626214e-03 -1.8159528e-03
  2.8788224e-03  9.9110615e-04 -8.2872696e-03 -9.4514471e-03
  7.3134988e-03  5.0708596e-03  6.7612301e-03  7.6123292e-04
  6.3512716e-03 -3.4058816e-03 -9.4598409e-04  5.7686744e-03
 -7.5229313e-03 -3.9362879e-03 -7.5127580e-03 -9.2994922e-04
  9.5410654e-03 -7.3208390e-03 -2.3343961e-03 -1.9397155e-03
  8.0780759e-03 -5.9321634e-03  4.5860106e-05 -4.7556497e-03
 -9.6061369e-03  5.0088447e-03 -8.7593012e-03 -4.3927436e-03
 -3.4098714e-05 -2.9472820e-04 -7.6625361e-03  9.6169394e-03
  4.9831397e-03  9.2354324e-03 -8.1610726e-03  4.4982596e-03
 -4.1370410e-03  8.2379940e-04  8.5003525e-03 -4.4652107e-03
  4.5191231e-03 -6.7871823e-03 -3.5488161e-03  9.4014416e-03
 -1.5770510e-03  3.2055390e-04 -4.1423207e-03 -7.6836897e-03
 -1.5076330e-03  2.46934

In [7]:
model.wv.most_similar('information', topn=5)


[('mining', 0.21881316602230072),
 ('uses', 0.21615716814994812),
 ('fields', 0.09310433268547058),
 ('word', 0.09298862516880035),
 ('process', 0.08406124264001846)]

In [8]:
similarity = model.wv.similarity('information', 'retrieval')
print("Similarity:", similarity)


Similarity: -0.010850367
