<a href="https://colab.research.google.com/github/suminarwb/word2vec_thesis/blob/main/Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word2Vec

In [5]:
import os
import pandas as pd
from tqdm.auto import tqdm
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Prepare Corpus

In [6]:
df = pd.read_csv("re_dataset.csv", encoding='Windows-1252')
df.head()

Unnamed: 0,Tweet,HS,Abusive,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,- disaat semua cowok berusaha melacak perhatia...,1,1,1,0,0,0,0,0,1,1,0,0
1,RT USER: USER siapa yang telat ngasih tau elu?...,0,1,0,0,0,0,0,0,0,0,0,0
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",0,0,0,0,0,0,0,0,0,0,0,0
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0,0,0,0,0,0,0,0,0,0,0,0
4,USER USER Kaum cebong kapir udah keliatan dong...,1,1,0,1,1,0,0,0,0,0,1,0


In [7]:
sentences = [word_tokenize(Tweet.lower()) for Tweet in tqdm(df.Tweet)]
sentences[:5]

HBox(children=(FloatProgress(value=0.0, max=13169.0), HTML(value='')))




[['-',
  'disaat',
  'semua',
  'cowok',
  'berusaha',
  'melacak',
  'perhatian',
  'gue',
  '.',
  'loe',
  'lantas',
  'remehkan',
  'perhatian',
  'yg',
  'gue',
  'kasih',
  'khusus',
  'ke',
  'elo',
  '.',
  'basic',
  'elo',
  'cowok',
  'bego',
  '!',
  '!',
  '!',
  "'"],
 ['rt',
  'user',
  ':',
  'user',
  'siapa',
  'yang',
  'telat',
  'ngasih',
  'tau',
  'elu',
  '?',
  'edan',
  'sarap',
  'gue',
  'bergaul',
  'dengan',
  'cigax',
  'jifla',
  'calis',
  'sama',
  'siapa',
  'noh',
  'licew',
  'juga',
  "'"],
 ['41.',
  'kadang',
  'aku',
  'berfikir',
  ',',
  'kenapa',
  'aku',
  'tetap',
  'percaya',
  'pada',
  'tuhan',
  'padahal',
  'aku',
  'selalu',
  'jatuh',
  'berkali-kali',
  '.',
  'kadang',
  'aku',
  'merasa',
  'tuhan',
  'itu',
  'ninggalkan',
  'aku',
  'sendirian',
  '.',
  'ketika',
  'orangtuaku',
  'berencana',
  'berpisah',
  ',',
  'ketika',
  'kakakku',
  'lebih',
  'memilih',
  'jadi',
  'kristen',
  '.',
  'ketika',
  'aku',
  'anak',
  'te

Training

In [8]:
model = Word2Vec(sentences, size=100, window=2, min_count=3, sg=0, workers=4, iter=1000)

In [9]:
os.makedirs("model/w2v/", exist_ok=True)
model.save("model/w2v/contoh_model.w2v")

Load Model

In [10]:
model = Word2Vec.load("model/w2v/contoh_model.w2v")

Continue training

In [11]:
"Kaum cebong kapir udah keliatan dongoknya dari awal tambah dongok lagi hahahah".lower().split()

['kaum',
 'cebong',
 'kapir',
 'udah',
 'keliatan',
 'dongoknya',
 'dari',
 'awal',
 'tambah',
 'dongok',
 'lagi',
 'hahahah']

In [12]:
contoh_data = [
    ['deklarasi',
      'pilkada',
      '2018',
      'aman',
      'dan',
      'anti',
      'hoax',
      'warga',
      'dukuh',
      'sari',
      'jabon'],
    ['kaum',
      'cebong',
      'kapir',
      'udah',
      'keliatan',
      'dongoknya',
      'dari',
      'awal',
      'tambah',
      'dongok',
      'lagi',
      'hahahah']
]

In [13]:
model.train(contoh_data, total_examples=len(contoh_data), epochs=1)

(22, 23)

In [14]:
model.save("model/w2v/contoh_model.w2v")

In [15]:
w2v = model.wv

w2v.index2word

['user',
 ',',
 '.',
 "'",
 '?',
 ';',
 'yg',
 'dan',
 'di',
 '#',
 '!',
 '...',
 'yang',
 ':',
 'itu',
 'ini',
 'ada',
 '(',
 'rt',
 ')',
 'jokowi',
 'ya',
 "''",
 'presiden',
 'jadi',
 'dari',
 'url',
 'aja',
 'orang',
 'bisa',
 'islam',
 'gak',
 'ga',
 'agama',
 'sama',
 'aku',
 'indonesia',
 'juga',
 '``',
 'kita',
 'dia',
 'ke',
 'dengan',
 'mau',
 'tidak',
 'tapi',
 'apa',
 'saya',
 'untuk',
 '&',
 'lagi',
 'bukan',
 'pak',
 'nya',
 'kalo',
 'akan',
 'lu',
 'cebong',
 'gue',
 'si',
 'tak',
 'kalau',
 'cina',
 'mereka',
 'pada',
 'lo',
 'atau',
 'lebih',
 'gubernur',
 'asing',
 'udah',
 'kan',
 'banyak',
 'adalah',
 'semua',
 'dalam',
 'rakyat',
 'karena',
 'negara',
 'rezim',
 'kristen',
 'dgn',
 'sudah',
 'buat',
 '2019gantipresiden',
 'amp',
 'jangan',
 'baru',
 'pki',
 'komunis',
 'gua',
 'punya',
 'ahok',
 'anda',
 'ekonomi',
 'harus',
 'oleh',
 'masih',
 'dulu',
 'sih',
 'hanya',
 'allah',
 'kok',
 'ulama',
 'cuma',
 'budaya',
 'kafir',
 'tdk',
 'ð',
 'kamu',
 'pilkada',
 'g

In [16]:
w2v.vectors

array([[-3.9280134e-01, -1.6588101e+00,  9.5378363e-01, ...,
         3.3521360e-01, -4.3679872e-01,  1.3048853e-01],
       [-1.0376625e+00, -1.1512564e-01,  4.8504961e-01, ...,
         1.5375288e-01, -5.7503796e-01, -2.2115375e-01],
       [-6.0902745e-01, -5.5789262e-01,  6.5633893e-01, ...,
         6.1145560e-03, -4.7054327e-01, -3.7204331e-01],
       ...,
       [ 9.0536818e-02,  2.4880285e+00, -9.8139483e-01, ...,
         1.7567397e+00,  1.6208062e+00, -1.2055549e-01],
       [-4.2510238e+00, -3.9164156e-01, -7.4219310e-01, ...,
         1.2816662e-01,  1.7060778e+00,  3.7895328e-01],
       [-7.9146940e-01,  5.6028157e-01, -1.1514118e+00, ...,
         2.3538932e-01,  2.6488397e-01, -4.5705863e-04]], dtype=float32)

In [17]:
w2v.vector_size

100

In [30]:
w2v["goblok"]

array([-0.7708222 , -0.43149805,  2.463286  , -1.1012535 , -0.92488796,
        0.25767776, -1.4365212 ,  1.40173   ,  0.25019965,  1.6799302 ,
        1.5853355 , -0.39803883, -0.47243562,  0.5645968 , -1.7748227 ,
        1.8798733 ,  3.4864824 ,  2.1162317 , -2.0766811 ,  0.09738442,
       -1.375027  , -0.99651974, -0.17478625, -0.44363126, -1.2156191 ,
       -2.066548  , -2.7958062 , -3.5743513 ,  0.49649093, -2.8651671 ,
       -1.1874176 , -1.5303837 , -1.5523103 ,  3.6681721 , -0.30347556,
        0.31592852, -1.3968124 , -2.1515625 ,  1.4841522 , -0.8415857 ,
        0.709291  ,  0.9705112 ,  0.34370586,  0.4472792 ,  2.8928049 ,
       -2.2686703 , -1.9268979 ,  3.3325756 , -2.0720603 ,  0.33544412,
        2.305171  ,  4.108708  ,  0.384902  ,  0.65149087, -0.17254652,
        1.3695755 ,  0.51219624, -1.2069936 ,  0.79390454, -1.9859614 ,
       -3.0793333 , -0.5433971 , -1.730737  ,  0.5149281 , -0.37531698,
        3.067021  ,  1.1441159 ,  2.8406816 ,  1.0500903 ,  0.93

Similiar word

In [37]:
w2v.similar_by_word("goblok", topn=5)

[('dongo', 0.3798598647117615),
 ('jahat', 0.3776898682117462),
 ('bego', 0.36662396788597107),
 ('emang', 0.36342936754226685),
 ('pintar', 0.35950368642807007)]

higher order visualization

In [20]:
from umap import UMAP
import numpy as np
import pandas as pd
import plotly.express as px

In [21]:
X = UMAP().fit_transform(w2v.vectors)

In [22]:
df = pd.DataFrame(X, columns=["umap1", "umap2"])
df["text"] = w2v.index2word

In [23]:
df

Unnamed: 0,umap1,umap2,text
0,-4.203959,3.895638,user
1,-3.779823,3.773434,","
2,-3.680412,3.916084,.
3,-4.418407,3.498641,'
4,-3.913689,3.792019,?
...,...,...,...
8071,-10.747257,5.941481,menyucikan
8072,-7.758178,4.520923,\nteriak2
8073,-7.615200,4.464147,mslh
8074,-7.830553,4.659325,looo


In [24]:
fig = px.scatter(df, x="umap1", y="umap2", text="text")
fig.update_traces(textposition='top center')
fig.update_layout(
    height=800,
    title_text='Reduced word2vec visualization'
)
fig.show()