In [15]:
words = []
vectors = []

with open("../word2vec/W2V_150.txt") as f:
    cnt = 0
    for line in f.readlines():
        if cnt <=1: 
            cnt+=1
            continue
        word, vector = line.split(" ", 1)
        words.append(word)
        vectors.append(vector)

print(f"length of words and vectors: {len(words)}, {len(vectors)}")

length of words and vectors: 77021, 77021


In [16]:
dict_word_2_vec = dict(zip((words), (vectors)))

In [5]:
def load_train_data(path: str) -> list:
    list_word_pair = [] 
    with open(path, 'r' , encoding = 'utf-8') as f: 
        for line in f: 
            word_pair = line[0: len(line) - 1].split(" ")
            list_word_pair.append([word_pair[0], word_pair[1]])
    return list_word_pair 

In [6]:
antonyms_list = load_train_data("../antonym-synonym set/Antonym_vietnamese.txt")
synonyms_list = load_train_data("../antonym-synonym set/Synonym_vietnamese.txt")

In [7]:
antonyms_list

[['ác', 'hiền'],
 ['ác', 'thiện'],
 ['ác_cảm', 'thiện_cảm'],
 ['ác_độc', 'hiền_lành'],
 ['ác_độc', 'hiền_từ'],
 ['ác_tính', 'lành_tính'],
 ['ác_ý', 'thiện_ý'],
 ['ám_muội', 'minh_bạch'],
 ['an_tâm', 'lo_lắng'],
 ['anh_dũng', 'hèn'],
 ['anh_dũng', 'hèn_nhát'],
 ['ăn_khách', 'ế'],
 ['ẩm', 'khô'],
 ['ẩm', 'khô_ráo'],
 ['ẩm', 'ráo'],
 ['ẩm_thấp', 'khô_ráo'],
 ['ẩm_ướt', 'khô_ráo'],
 ['ấm_áp', 'lạnh_lẽo'],
 ['ẩn', 'hiện'],
 ['bác', 'chấp_nhận'],
 ['bác', 'chấp_thuận'],
 ['bác_bỏ', 'chấp_nhận'],
 ['bác_bỏ', 'chấp_thuận'],
 ['bạc', 'hậu'],
 ['bạc_đãi', 'trọng_đãi'],
 ['bạc_màu', 'màu_mỡ'],
 ['bạc_mầu', 'màu_mỡ'],
 ['bãi_nhiệm', 'bổ_dụng'],
 ['bãi_nhiệm', 'bổ_nhiệm'],
 ['bại', 'thành'],
 ['bại', 'thắng'],
 ['bán', 'mua'],
 ['bao_dung', 'hẹp_hòi'],
 ['bao_vây', 'giải_vây'],
 ['báo_ân', 'báo_oán'],
 ['báo_ân', 'báo_thù'],
 ['báo_ân', 'trả_thù'],
 ['bần_tiện', 'hào_phóng'],
 ['bẩn', 'sạch'],
 ['bẩn', 'sạch_sẽ'],
 ['bẩn_thỉu', 'sạch'],
 ['bẩn_thỉu', 'sạch_sẽ'],
 ['bận', 'rảnh'],
 ['bận', 'rảnh_ran

In [17]:
def get_vec_from_word(word):
    if dict_word_2_vec.get(word) is None:
        return None
    res = (dict_word_2_vec.get(word).split())
    return [float(l) for l in res ]

In [18]:
def get_feature_of_pair_word(word1, word2):
    vector1 = get_vec_from_word(word1)
    vector2 = get_vec_from_word(word2)

    if vector1 is not None and vector2 is not None:
        return vector1 + vector2
    else:
        return None

In [19]:
def generate_train_data():
    x_train = []
    y_train = []

    for word_pair in synonyms_list:
        feature = get_feature_of_pair_word(word_pair[0], word_pair[1])
        if feature is not None:
            y_train.extend([1])
            x_train.append(feature)

    for word_pair in antonyms_list:
        feature = get_feature_of_pair_word(word_pair[0], word_pair[1])
        if feature is not None:
            y_train.extend([0])
            x_train.append(feature)
    
    return x_train, y_train

In [20]:
def generate_test_data(path):
    x_test = []
    y_test = []
    word_pair_list= []

    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            splited = line[0:-1].split("\t")
            word_pair_list.append([splited[0], splited[1], splited[2]])

    for word_pair in word_pair_list:
        feature = get_feature_of_pair_word(word_pair[0], word_pair[1])
        if feature is not None:
            y_test.extend([1] if word_pair[2] == "SYN" else [0])
            x_test.append(feature)

    return x_test, y_test

In [21]:
x_train, y_train = generate_train_data()

In [22]:
from sklearn.linear_model import LogisticRegression
classifier_model = LogisticRegression()

classifier_model.fit(x_train, y_train)

In [23]:
from sklearn.metrics import classification_report

In [24]:
x_test, y_test = generate_test_data("../datasets/ViCon-400/600_adj_pairs.txt")
y_pred = classifier_model.predict(x_test)

print(classification_report(y_test, y_pred, target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.84      0.66      0.74       282
           1       0.67      0.85      0.75       227

    accuracy                           0.74       509
   macro avg       0.76      0.75      0.74       509
weighted avg       0.76      0.74      0.74       509

