In [26]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import numpy as np
from python_file.w2v_utils import *
import pandas as pd
import collections

In [2]:
words, word_to_vec_map = read_glove_vecs('DATASET/W2V_150.txt')

# 1. Cosine similarity

In [3]:
def cosine_similarity(u, v):
    distance = 0.0

    # Compute the dot product between u and v (≈1 line)
    dot = np.dot(u,v)
    # Compute the L2 norm of u (≈1 line)
    norm_u = np.sqrt(np.sum(u * u))
    
    # Compute the L2 norm of v (≈1 line)
    norm_v = np.sqrt(np.sum(v * v))
    # Compute the cosine similarity defined by formula (1) (≈1 line)
    cosine_similarity = dot / (norm_u * norm_v)
    
    return cosine_similarity

In [4]:
file='DATASET/Visim-400.txt'
data_split = pd.read_csv(file, sep=" ")

In [5]:
words1 = []
words2 = []
for i in data_split['Word1\tWord2\tPOS\tSim1\tSim2\tSTD']:
    chunks = i.split('\t')
    words1.append(chunks[0])
    words2.append(chunks[1])

In [9]:
sims = []
for i_w in range(len(words1)):
    try:
        s_w1 = word_to_vec_map[words1[i_w]]
        s_w2 = word_to_vec_map[words2[i_w]]
        sim = cosine_similarity(s_w1, s_w2)
        print(words1[i_w],words2[i_w])
        print(sim)
        sims.append(sim)
    except KeyError:
        pass


biến ngập
-0.004912339469670027
nhà_thi_đấu nhà
0.08252318329211765
động tĩnh
0.2770859598682774
khuyết ưu
0.1767986283562671
thủ_pháp biện_pháp
0.40236612919430603
kết_duyên thành_hôn
0.4630084020140722
cấp_tiến bảo_thủ
0.25694700889961236
nước_lớn nguy_hiểm
0.18519202240211619
hoa nhị
0.2230696045254854
bất_lợi thuận_lợi
0.5348913029963392
phân_ly sum_họp
0.07770108930396218
diễu_hành tuần_hành
0.6438835401838533
cao_trào thoái_trào
0.36103255277281715
thịnh suy
0.059602370849219266
con_voi ngà
0.23895503536998694
cá_chuối cá_quả
0.16647413914742723
đổ_bê_tông biếu_xén
-0.1728400720961947
con_đẻ con_ruột
0.6417888034717878
uỷ_nhiệm phân_công
0.2875352779105757
bay_lượn chuyển_động
0.3881971694440945
khêu_gợi khơi_gợi
0.2559647287960097
thăng_bằng cân_bằng
0.48043194238556186
chọn_lựa tuyển_chọn
0.47242629887420834
có_ích hữu_ích
0.662409385234064
cứng_rắn nhỏ_nhắn
0.22523314998168
lỗ_đen hố_đen
0.7340683342360697
vùi đóng
-0.013609544519483876
tính_tình tính_khí
0.6878254420258124
mụ

# 2. K-nearest words

In [15]:
def k_nearest_words(word, all_words):
    all_sims = []
    top_k = dict()
    try:
        word_vector = word_to_vec_map[word]
        for idx in all_words:
            try:
                idx_vector = word_to_vec_map[idx]
                similarity = cosine_similarity(word_vector, idx_vector)
                top_k.update({idx:similarity})
            except KeyError:
                pass
        return (word, sorted(top_k.items(), key=lambda x: x[1], reverse=True)[:10]) 
    except KeyError:
        pass

In [16]:
for e_w in words1:
    print(k_nearest_words(e_w, words2),'\n')

('biến', [('loại_bỏ', 0.39982202546609236), ('trừ_khử', 0.3577853111161979), ('phát_triển', 0.35468234283556915), ('cải_biến', 0.3499645711349526), ('thuần_hoá', 0.32730778322543275), ('tập_hợp', 0.2970481096627491), ('đảo_ngược', 0.29586987173526696), ('biến_đổi', 0.2715603489232827), ('chuyển_hướng', 0.25468541368467756), ('khơi_gợi', 0.2513066830497953)]) 

('nhà_thi_đấu', [('khán_đài', 0.382081003073068), ('tập_luyện', 0.3760700761773554), ('trận_đấu', 0.3380625391209835), ('sân_khấu', 0.2549420655275232), ('trận_chiến', 0.22814374864687503), ('tiểu_học', 0.21454221562473896), ('giở_chứng', 0.18710773491409094), ('đêm', 0.18002264612377802), ('công_trình_phụ', 0.1772424345078385), ('bảng_tổng_sắp', 0.17701107414308503)]) 

('động', [('nhị', 0.28317334174810116), ('huyệt', 0.28082542192849613), ('tĩnh', 0.2770859598682774), ('hữu_tình', 0.2752016691050024), ('trạng_thái', 0.25779429897218265), ('giảm_xóc', 0.2528922326467602), ('cửa_sông', 0.24725027107005226), ('âm_thanh', 0.247224

('bằng_chứng', [('chứng_cứ', 0.728347957294409), ('tài_liệu', 0.5240658319702595), ('nguyên_do', 0.40140287385975804), ('phẩm_chất', 0.3101297373357707), ('biện_pháp', 0.30737569470737797), ('phi_nghĩa', 0.3070210757599062), ('tung_tích', 0.2885306290155611), ('đĩa_bay', 0.2416547133623973), ('người_bị_hại', 0.23555511772043902), ('bình_luận', 0.23474507807607625)]) 

('hạ_màn', [('mở_màn', 0.48128296649134483), ('thành_hôn', 0.38272991256771477), ('thoái_trào', 0.3478225779747293), ('dời', 0.22831761154295635), ('mất', 0.22730031625209843), ('cạn', 0.2102917941578103), ('ngấm', 0.20826240782244823), ('cuối_cùng', 0.2009589340450491), ('ồn_ào', 0.19821844069934472), ('giở_chứng', 0.19535665346292277)]) 

('ca_tụng', [('khen', 0.4421157101237601), ('đền_đáp', 0.441323985097661), ('nói', 0.3494175568178161), ('vặn_vẹo', 0.32428820518310764), ('cải_biến', 0.3165453540558331), ('thuần_hoá', 0.3129452862926807), ('biểu_lộ', 0.304156764581453), ('giấu', 0.303335462229177), ('kìm_nén', 0.3011

('cá_hồng', [('cá_quả', 0.5338537478050818), ('rau_quả', 0.4433508726689447), ('rau', 0.3586093199048912), ('bánh_giò', 0.34916378321864505), ('cây_hoa_màu', 0.29556583935045416), ('xuất_xứ', 0.2785295905239845), ('tế_bào', 0.24468932536836446), ('bệnh_tật', 0.24229711211334645), ('răng', 0.24192172304763865), ('lô', 0.2334562442724884)]) 

('bằng', [('xuyên', 0.1946120546019391), ('tẩm', 0.19038165307460791), ('gọn_ghẽ', 0.18740666891998814), ('trong', 0.1853972127918262), ('biểu_lộ', 0.17412186379235225), ('trên', 0.16803248448498304), ('cấp_phát', 0.16715166484354918), ('loại_bỏ', 0.16162132932011825), ('để_dành', 0.1606478271911813), ('phân_bổ', 0.16012930934569689)]) 

('tĩnh_dưỡng', [('tĩnh_dưỡng', 1.0), ('tập_luyện', 0.4364856222311624), ('trông_nom', 0.3930407978311433), ('chăm_sóc', 0.32362353156576185), ('sum_họp', 0.31572805440629687), ('nuôi_dưỡng', 0.31351349848273113), ('đoàn_tụ', 0.2988725794391695), ('thả_lỏng', 0.2771440248593115), ('cải_tạo', 0.2743779923258784), ('mỏ

('điều_hành', [('lãnh_đạo', 0.35763513139116254), ('phân_công', 0.3565616440334476), ('phát_triển', 0.3330068069029153), ('chăm_sóc', 0.30517328455519466), ('phân_bổ', 0.28713856946043614), ('trông_nom', 0.27723394504654214), ('tuyển_chọn', 0.2714453610108174), ('suy_xét', 0.24537122193579816), ('trừ_khử', 0.23658345031117953), ('lộn_xộn', 0.22291050371112003)]) 

None 

('tự_do', [('rộng_rãi', 0.36972316741059286), ('thả_lỏng', 0.3689931398149896), ('cân_bằng', 0.26919106127786946), ('lợi', 0.2619854110343203), ('chuyển_động', 0.23274517284782864), ('bí_mật', 0.23250607641505946), ('thuận_lợi', 0.21977251493928493), ('mới_mẻ', 0.21610841415822118), ('bảo_thủ', 0.21445162294847167), ('thông_suốt', 0.19553280304273607)]) 

('thu_ngân', [('bìa', 0.2879985507662735), ('gọn_gàng', 0.2678460318775316), ('thường_lệ', 0.26254955026192073), ('ung_dung', 0.2476070594803435), ('dụng_cụ', 0.23023154239591442), ('phân_công', 0.22554205381567455), ('trang_sức', 0.2145474046150375), ('quần_áo', 0.21

('hạn_hán', [('lụt_lội', 0.6742159346587001), ('bệnh_tật', 0.40610574273809), ('thua_lỗ', 0.3833514013290569), ('ngập', 0.34557084261193505), ('biến_đổi', 0.3443188068476998), ('khô_kiệt', 0.3372227819530151), ('tai_hoạ', 0.3245363015988043), ('nhiễm_khuẩn', 0.31983286222013196), ('ách_tắc', 0.3026986056750762), ('cây_hoa_màu', 0.282582731126053)]) 

('lưỡng_lự', [('cân_nhắc', 0.4423547838132342), ('chệch_choạc', 0.367428899155318), ('suy_xét', 0.3364765145057857), ('mỏi_mệt', 0.3168267524984822), ('thoái_trào', 0.3145184680229755), ('thụ_động', 0.309815458130227), ('ngúng_nguẩy', 0.30327059436741016), ('khiếm_nhã', 0.30105906641777386), ('ác_cảm', 0.29815859491387137), ('nhích', 0.2935827501556912)]) 

('tung', [('thả', 0.4374870609572257), ('ngấm', 0.3987977302733555), ('lộ', 0.35654846944574353), ('phát', 0.34300909662184437), ('bay', 0.325330212503116), ('thâm_nhập', 0.3195929483542107), ('nguỵ_trang', 0.3123220562247388), ('chuyển_hướng', 0.3102251496578722), ('vặn_vẹo', 0.3012687

# 3. Synonym-antonym classification

In [17]:
ant = open('DATASET/Antonym_vietnamese.txt', 'r').read().strip('\n').split('\n')
syn = open('DATASET/Synonym_vietnamese.txt', 'r').read().strip('\n').split('\n')

In [18]:
corpus = []
for pair in ant:
    corpus.extend(pair.split(' '))
for pair in syn:
    corpus.extend(pair.split(' '))
corpus = list(set(corpus))
corpus = {word: i for (i, word) in enumerate(corpus)}
corpus['UNK'] = len(corpus)
X = []
y = []
print(len(corpus))

14150


In [19]:
corpus

{'': 0,
 'cởi': 1,
 'sửa_chữa': 2,
 'hồng_hoa': 3,
 'thoả_thuê': 4,
 'ốm_nhách': 5,
 'ăn_mặc': 6,
 'trái_phiếu': 7,
 'cật_lực': 8,
 'thòi': 9,
 'chia_cắt': 10,
 'mó_máy': 11,
 'cho_qua': 12,
 'gọi': 13,
 'phá_lưới': 14,
 'chợn': 15,
 'cùn_đời': 16,
 'nhớ_mong': 17,
 'đen_sì': 18,
 'cộm': 19,
 'sột_soạt': 20,
 'kí_sự': 21,
 'phát_đơn': 22,
 'nổi_tam_bành': 23,
 'bảo_thủ': 24,
 'để_ý': 25,
 'ánh_ỏi': 26,
 'khư_khư': 27,
 'điêu_trá': 28,
 'vùng_vẫy': 29,
 'tam_giác': 30,
 'lầm_rầm': 31,
 'mao_mạch': 32,
 'am_hiểu': 33,
 'nạnh': 34,
 'đan': 35,
 'vang_dậy': 36,
 'tức_thì': 37,
 'thẳng_tuột': 38,
 'chiến_lược_quân_sự': 39,
 'chứa_chấp': 40,
 'ương_ngạnh': 41,
 'dành': 42,
 'tàu_bè': 43,
 'đảo': 44,
 'thăng_thiên': 45,
 'thước_ta': 46,
 'chả_hạn': 47,
 'dăn_dúm': 48,
 'chi_phí': 49,
 'chăm_chú': 50,
 'bõ_bèn': 51,
 'luật_bài_trung': 52,
 'gợn': 53,
 'chấm_hỏi': 54,
 'lạnh_giá': 55,
 'chí_thân': 56,
 'mất_công': 57,
 'tiếng_nói': 58,
 'phơi': 59,
 'tạp_phẩm': 60,
 'do_dự': 61,
 'hanh_hao': 62

In [20]:
for pair in ant:
    pair = pair.split(' ')
    if len(pair) == 2: # contain 2 words
        w1, w2 = pair[0], pair[1]
        # one-hot encoding
        one_hot1 = np.zeros(len(corpus))
        one_hot1[corpus[w1]] = 1
        one_hot2 = np.zeros(len(corpus))
        one_hot2[corpus[w2]] = 1
        X.append([one_hot1, one_hot2])
        y.append(0)

In [21]:
for pair in syn:
    pair = pair.split(' ')
    if len(pair) == 2: # contain 2 words
        w1, w2 = pair[0], pair[1]
        one_hot1 = np.zeros(len(corpus))
        one_hot1[corpus[w1]] = 1
        one_hot2 = np.zeros(len(corpus))
        one_hot2[corpus[w2]] = 1
        X.append([one_hot1, one_hot2])
        y.append(1)

In [22]:
X = np.array(X)
y = np.array(y)
X = X.reshape(X.shape[0], -1)
print(X.shape, y.shape)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

(13562, 28300) (13562,)


In [23]:
epochs = 100

clf = MLPClassifier(hidden_layer_sizes=(256, 100, 100, 50), random_state=1, max_iter=epochs, verbose=True).fit(X_train, y_train)
pred = clf.predict(X_val)
score = f1_score(pred, y_val)
print("F1 for validate dataset is: ", score)

Iteration 1, loss = 0.43750922
Iteration 2, loss = 0.17219359
Iteration 3, loss = 0.04485404
Iteration 4, loss = 0.02017860
Iteration 5, loss = 0.01047814
Iteration 6, loss = 0.00826773
Iteration 7, loss = 0.00615374
Iteration 8, loss = 0.00371194
Iteration 9, loss = 0.00207706
Iteration 10, loss = 0.00115505
Iteration 11, loss = 0.00098236
Iteration 12, loss = 0.00087462
Iteration 13, loss = 0.00079998
Iteration 14, loss = 0.00083895
Iteration 15, loss = 0.00074640
Iteration 16, loss = 0.00072955
Iteration 17, loss = 0.00071223
Iteration 18, loss = 0.00075186
Iteration 19, loss = 0.00067084
Iteration 20, loss = 0.00067832
Iteration 21, loss = 0.00068254
Iteration 22, loss = 0.00065496
Iteration 23, loss = 0.00064031
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
F1 for validate dataset is:  0.9605234460196292


In [24]:
#TEST WITH ViCon-400 DATASET
nouns ='DATASET/ViCon-400/400_noun_pairs.txt'
verbs = 'DATASET/ViCon-400/400_verb_pairs.txt'
adjs = 'DATASET/ViCon-400/600_adj_pairs.txt'

nouns_split = pd.read_csv(nouns, sep=" ")
verbs_split = pd.read_csv(verbs, sep=" ")
adjs_split = pd.read_csv(adjs, sep=" ")

In [25]:
all_words = []
labels = []
X_test, y_test = [], []

#Concat three dataset to one
for i in nouns_split['Word1\tWord2\tRelation']: 
    w1, w2, label = i.split('\t')
    one_hot1 = np.zeros(len(corpus))
    if corpus.get(w1) is None:
        w1 = 'UNK'
    if corpus.get(w2) is None:
        w2 = 'UNK'
    one_hot1[corpus[w1]] = 1
    one_hot2 = np.zeros(len(corpus))
    one_hot2[corpus[w2]] = 1
    X_test.append([one_hot1, one_hot2])
    label = 1 if label=='SYN' else 0
    y_test.append(label)

for i in verbs_split['Word1\tWord2\tRelation']: 
    w1, w2, label = i.split('\t')
    one_hot1 = np.zeros(len(corpus))
    if corpus.get(w1) is None:
        w1 = 'UNK'
    if corpus.get(w2) is None:
        w2 = 'UNK'
    one_hot1[corpus[w1]] = 1
    one_hot2 = np.zeros(len(corpus))
    one_hot2[corpus[w2]] = 1
    X_test.append([one_hot1, one_hot2])
    label = 1 if label=='SYN' else 0
    y_test.append(label)
    
for i in adjs_split['Word1\tWord2\tRelation']: 
    w1, w2, label = i.split('\t')
    one_hot1 = np.zeros(len(corpus))
    if corpus.get(w1) is None:
        w1 = 'UNK'
    if corpus.get(w2) is None:
        w2 = 'UNK'
    one_hot1[corpus[w1]] = 1
    one_hot2 = np.zeros(len(corpus))
    one_hot2[corpus[w2]] = 1
    X_test.append([one_hot1, one_hot2])
    label = 1 if label=='SYN' else 0
    y_test.append(label)

In [27]:
X_test = np.array(X_test)
y_test = np.array(y_test)
X_test = X_test.reshape(X_test.shape[0], -1)
pred = clf.predict(X_test)
f1_score = f1_score(pred, y_test)
recall = recall_score(pred, y_test)
precision = precision_score(pred, y_test)

print("The F1-score of Vicon-400 dataset is: "+ str(f1_score*100) + "%")
print("Recall is:", recall)
print("Precision is: ", precision)

The F1-score of Vicon-400 dataset is: 97.65791341376863%
Recall is: 0.9703808180535967
Precision is:  0.9828571428571429


In [36]:
mis_cls = X_test[np.where(y_test+pred==1)]
mis_cls = np.reshape(mis_cls, (mis_cls.shape[0], 2, -1))

In [37]:
mis_cls.shape

(33, 2, 14150)