In [10]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from IPython.display import display
from sklearn.neural_network import MLPClassifier

In [2]:
# Data manipulation
import pandas as pd
import numpy as np


# Splitting data
from sklearn.model_selection import train_test_split

N_FOLDS = 5
MAX_EVALS = 5

In [3]:
from scipy import stats
from scipy import spatial
from gensim.models import KeyedVectors


In [4]:
word2vec = KeyedVectors.load_word2vec_format('../word2vec/W2V_150.txt', binary=False)

In [5]:
print(word2vec['a_dua'])
import random
print(np.random.randn(150))

[-1.096853    0.1238883   1.713642    2.17519    -0.5785002  -2.027245
  0.4998034  -0.7409065  -1.753515   -1.095152    0.7456962   1.484329
  1.235524    1.025137   -0.4033328  -0.9089296  -0.2022127   0.9581299
  0.6957457   1.385916    0.4481485   1.238998    0.1075599   3.29651
  1.315556    0.4587597   2.277924   -1.313732    0.9624746  -0.2428055
 -0.9468833   0.8783497   0.3211554  -0.5890836   0.9027938  -1.178614
 -0.7399899  -1.011717    1.766519   -2.162154    0.9139196  -2.295288
 -1.206698   -2.222441   -1.830276   -0.4457759   1.122905    0.3227312
 -0.03411892 -0.7967375   0.1875879   0.3892174  -0.7192345  -0.5432259
 -0.6546717  -0.7268866   0.5100322   0.9201084   0.2751797   1.194452
  0.5092697  -0.3654743  -1.434594   -1.484043    1.105507    0.02227577
 -1.812047   -1.204937   -3.369251    0.6383803   0.1732161  -0.478294
 -0.5762202   0.1084443  -0.1128882   0.647718   -0.3459211   0.5320156
  0.239954   -2.538636   -0.7471865   1.441165    0.02344817 -0.9431902

In [6]:
def get_emb(s):
    if s in word2vec:
        return word2vec[s]
    else:
        # print("not found ", s)
        return np.random.randn(150)

In [168]:
# load test data
p1 = pd.read_table("../datasets/ViCon-400/400_noun_pairs.txt", sep="\t")
p2 = pd.read_table("../datasets/ViCon-400/400_verb_pairs.txt", sep="\t")
p3 = pd.read_table("../datasets/ViCon-400/600_adj_pairs.txt", sep="\t")

test_data = pd.concat([p1, p2, p3])

test_data['label'] = test_data.apply(lambda row: 1 if row['Relation'] == 'ANT' else 0, axis=1)
test_data = test_data.reset_index().drop(columns = ['Relation', 'index'])
display(test_data)
test_features = test_data.copy()

test_features['first_emb'] = test_features.apply(lambda row: get_emb(row['Word1']), axis=1)
test_features['second_emb'] = test_features.apply(lambda row: get_emb(row['Word2']), axis=1)

test_features['cos'] = test_features.apply(lambda row: 1 - spatial.distance.cosine(row['first_emb'], row['second_emb']), axis=1)
test_features['norm_of_diff'] = test_features.apply(lambda row: np.linalg.norm(row['first_emb'] - row['second_emb']), axis=1)
test_features['pearson'] = test_features.apply(lambda row: pearson(row['first_emb'], row['second_emb']), axis=1)
test_features['spearman_rank'] = test_features.apply(lambda row: spearman_rank(row['first_emb'], row['second_emb']), axis=1)
test_features['dot_product'] = test_features.apply(lambda row: dot_product(row['first_emb'], row['second_emb']), axis=1)
test_features['euclidean'] = test_features.apply(lambda row: euclidean(row['first_emb'], row['second_emb']), axis=1)

test_features['emb_miss'] = test_features.apply(lambda row: 0 if (row['Word1'] in word2vec and row['Word2'] in word2vec) else 1, axis=1)

X_test = test_features.drop(columns = ['Word1', 'Word2', 'first_emb', 'second_emb', 'label'])
y_test = np.array(test_features['label'].astype(np.int32)).reshape((-1, ))
X_test

Unnamed: 0,Word1,Word2,label
0,khoái_lạc,nỗi_đau,1
1,yếu_kém,sức_mạnh,1
2,thanh_danh,ô_nhục,1
3,dây,dọc,0
4,bằng_cớ,chứng_cớ,0
...,...,...,...
1395,ấm_áp,lạnh_lẽo,1
1396,mập,ngẳng,1
1397,chóng,lâu,1
1398,chậm,sớm,1


Unnamed: 0,cos,norm_of_diff,pearson,spearman_rank,dot_product,euclidean,emb_miss
0,0.245766,16.834578,0.249734,0.271264,40.296856,-15.834578,0
1,0.171372,18.374029,0.171366,0.166683,34.760422,-17.374029,0
2,0.088986,16.266695,0.088963,0.034229,12.025043,-15.266695,0
3,0.141662,16.180637,0.150397,0.135496,21.552877,-15.180637,0
4,-0.149954,21.862047,-0.149358,-0.144826,-30.237897,-20.862047,1
...,...,...,...,...,...,...,...
1395,0.629572,12.678332,0.628367,0.613533,136.498199,-11.678332,0
1396,0.042300,19.060446,0.043895,0.039385,8.018007,-18.060446,1
1397,0.075829,21.526340,0.086772,0.106567,19.005964,-20.526340,0
1398,0.415295,14.935099,0.414778,0.404510,75.847366,-13.935099,0


In [127]:

def pearson(e1, e2):
    return stats.pearsonr(e1, e2)[0]

def spearman_rank(e1, e2):
    return stats.spearmanr(e1, e2)[0]

def dot_product(e1, e2):
    return np.dot(e1, e2)

def euclidean(e1, e2):
    return 1 - spatial.distance.euclidean(e1, e2)

In [175]:

# load train data and remove leakage data 

anto = pd.read_table("../antonym-synonym set/Antonym_vietnamese.txt", sep=" ", header=None)
syn = pd.read_table("../antonym-synonym set/Synonym_vietnamese.txt", sep=" ", header=None)
display(anto.tail())
display(syn.tail())
anto["label"] = 1
syn["label"] = 0
data = pd.concat([syn, anto])
data.columns = ["first", "second", "label"]

data = data[~data.set_index(['first','second']).index.isin(test_features.set_index(['Word1','Word2']).index)].reset_index().drop(columns="index")

features = data.copy()
features['first_emb'] = features.apply(lambda row: get_emb(row['first']), axis=1)
features['second_emb'] = features.apply(lambda row: get_emb(row['second']), axis=1)

# features = features[features['first_emb'] != "missing"]
# features = features[features['second_emb'] != "missing"]

display(features)

features['cos'] = features.apply(lambda row: 1 - spatial.distance.cosine(row['first_emb'], row['second_emb']), axis=1)
features['norm_of_diff'] = features.apply(lambda row: np.linalg.norm(row['first_emb'] - row['second_emb']), axis=1)
features['pearson'] = features.apply(lambda row: pearson(row['first_emb'], row['second_emb']), axis=1)
features['spearman_rank'] = features.apply(lambda row: spearman_rank(row['first_emb'], row['second_emb']), axis=1)
features['dot_product'] = features.apply(lambda row: dot_product(row['first_emb'], row['second_emb']), axis=1)
features['euclidean'] = features.apply(lambda row: euclidean(row['first_emb'], row['second_emb']), axis=1)
features['emb_miss'] = features.apply(lambda row: 0 if (row['first'] in word2vec and row['second'] in word2vec) else 1, axis=1)

X = features.drop(columns = ['first', 'second', 'first_emb', 'second_emb', 'label'])
y = np.array(features['label'].astype(np.int32)).reshape((-1, ))

display(X)

Unnamed: 0,0,1
1995,bẩn_thỉu,sạch_sẽ
1996,bận,rảnh
1997,bận,rảnh_rang
1998,bận,rảnh_rỗi
1999,bận,rỗi


Unnamed: 0,0,1
11557,yêu_mến,mến_yêu
11558,yêu_quí,yêu_quý
11559,yêu_thương,thương_yêu
11560,yếu_tố,nguyên_tố
11561,yếu_tố,nhân_tố


Unnamed: 0,first,second,label,first_emb,second_emb
0,a_dua,a_tòng,0,"[-1.096853, 0.1238883, 1.713642, 2.17519, -0.5...","[-0.6841270145344219, 0.6210153314595848, 1.82..."
1,a_dua,vào_hùa,0,"[-1.096853, 0.1238883, 1.713642, 2.17519, -0.5...","[-0.1686348, 0.5090736, -0.06419589, 0.2851182..."
2,a_ma_tơ,tài_tử,0,"[-0.010034900921348604, -0.6676683288266474, 0...","[-0.8984336, 0.04499664, -1.577926, -1.10538, ..."
3,a_tòng,vào_hùa,0,"[-1.982758876623794, 0.23818018281052428, 1.16...","[-0.1686348, 0.5090736, -0.06419589, 0.2851182..."
4,à_ơi,ạ_ơi,0,"[1.133512, 0.4620706, -0.5596249, 1.021484, 0....","[0.41024300375332723, 0.7200859575705622, 0.41..."
...,...,...,...,...,...
11599,báo_ân,báo_thù,1,"[0.4099637, 0.02193844, -0.5297902, 0.2062404,...","[-0.8177037, 0.4749404, -1.174358, -0.679406, ..."
11600,bần_tiện,hào_phóng,1,"[0.6604001453911763, -0.04054231411991766, 0.3...","[1.40416, 2.75442, 0.2111574, 0.9635316, 1.135..."
11601,bận,rảnh,1,"[0.3057049, 3.127424, -1.285929, 0.6471007, -0...","[0.0487476, 1.200912, 0.1983149, -1.580838, 0...."
11602,bận,rảnh_rỗi,1,"[0.3057049, 3.127424, -1.285929, 0.6471007, -0...","[0.2521331, 0.4609424, -1.257721, -0.9162588, ..."


Unnamed: 0,cos,norm_of_diff,pearson,spearman_rank,dot_product,euclidean,emb_miss
0,-0.045687,19.548653,-0.048868,-0.012749,-8.332974,-18.548653,1
1,0.288864,14.071492,0.287837,0.272092,34.252293,-13.071491,0
2,-0.046811,19.542981,-0.047176,-0.048800,-8.502854,-18.542981,1
3,0.109036,14.184158,0.107692,0.036227,11.427121,-13.184158,1
4,-0.162591,14.634972,-0.163899,-0.180564,-13.339455,-13.634972,1
...,...,...,...,...,...,...,...
11599,0.044962,17.528137,0.053884,0.080386,5.950936,-16.528139,0
11600,0.055960,18.720365,0.058679,0.069843,10.366479,-17.720365,1
11601,0.471292,18.200443,0.471963,0.438350,146.509811,-17.200443,0
11602,0.479427,18.138960,0.479693,0.475966,150.581177,-17.138960,0


In [185]:
from sklearn.linear_model import *
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import random 

kf = KFold(n_splits=10, shuffle=True)

cnt = 1
'''
clf = MLPClassifier(hidden_layer_sizes=(2, 1), 
                                       solver='adam', 
                                       learning_rate='invscaling', 
                                       learning_rate_init=0.2,
                                      max_iter=1000)
'''
clfs = [make_pipeline(StandardScaler(), LogisticRegression(solver="liblinear", max_iter=10000, class_weight="balanced",random_state=random.randint(0, 10000)))] * kf.n_splits               
result = list()

for train_index, test_index in kf.split(X, y):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')
    
    clfs[cnt - 1].fit(X.iloc[train_index], y[train_index])
    score = f1_score(y[test_index], clfs[cnt - 1].predict(X.iloc[test_index]))
    print(score)
    result.append(score)
    cnt += 1


print("average F1-score: ", np.asarray(result).mean())

Fold:1, Train set: 10443, Test set:1161
0.2207505518763797
Fold:2, Train set: 10443, Test set:1161
0.19117647058823528
Fold:3, Train set: 10443, Test set:1161
0.20361990950226244
Fold:4, Train set: 10443, Test set:1161
0.19999999999999998
Fold:5, Train set: 10444, Test set:1160
0.22174840085287845
Fold:6, Train set: 10444, Test set:1160
0.2537634408602151
Fold:7, Train set: 10444, Test set:1160
0.25165562913907286
Fold:8, Train set: 10444, Test set:1160
0.18009478672985782
Fold:9, Train set: 10444, Test set:1160
0.19532908704883226
Fold:10, Train set: 10444, Test set:1160
0.2100656455142232
average F1-score:  0.2128203922111957


In [186]:
display(X_test)
np.set_printoptions(threshold=10000000)
for i in range(kf.n_splits):
    print(f1_score(y_test, clfs[i].predict(X_test)))
print(y_test)
print(clfs[0].predict(X_test))


Unnamed: 0,cos,norm_of_diff,pearson,spearman_rank,dot_product,euclidean,emb_miss
0,0.245766,16.834578,0.249734,0.271264,40.296856,-15.834578,0
1,0.171372,18.374029,0.171366,0.166683,34.760422,-17.374029,0
2,0.088986,16.266695,0.088963,0.034229,12.025043,-15.266695,0
3,0.141662,16.180637,0.150397,0.135496,21.552877,-15.180637,0
4,-0.149954,21.862047,-0.149358,-0.144826,-30.237897,-20.862047,1
...,...,...,...,...,...,...,...
1395,0.629572,12.678332,0.628367,0.613533,136.498199,-11.678332,0
1396,0.042300,19.060446,0.043895,0.039385,8.018007,-18.060446,1
1397,0.075829,21.526340,0.086772,0.106567,19.005964,-20.526340,0
1398,0.415295,14.935099,0.414778,0.404510,75.847366,-13.935099,0


0.7206740116655866
0.7206740116655866
0.7206740116655866
0.7206740116655866
0.7206740116655866
0.7206740116655866
0.7206740116655866
0.7206740116655866
0.7206740116655866
0.7206740116655866
[1 1 1 0 0 1 1 0 0 1 1 0 1 0 0 1 1 1 1 1 1 1 0 0 1 1 1 1 0 1 0 1 0 0 0 1 0
 1 1 0 1 1 1 0 0 1 1 1 1 1 0 0 1 0 1 1 0 0 0 1 0 1 1 1 1 0 0 1 1 1 1 0 0 0
 0 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 1 1 0 1 0 0 0 1 0 0 1 1 1 0 1 0 1 0 0 1
 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 1 1 0 0 0
 1 0 1 0 1 1 1 0 1 1 0 0 1 0 1 0 1 0 0 0 1 1 1 1 0 1 0 1 1 1 1 0 1 0 1 0 1
 0 1 1 0 1 1 0 1 0 1 0 1 0 1 1 1 0 1 0 1 1 0 1 0 0 1 0 0 0 1 1 0 0 1 0 1 0
 0 1 1 0 1 0 1 0 0 1 1 0 1 1 0 0 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 1 1 0 1 0 1
 0 1 1 1 0 1 0 1 0 0 1 1 0 1 1 1 0 1 1 0 1 0 0 1 1 0 0 0 1 1 0 1 0 0 1 1 1
 0 1 0 0 1 0 1 0 1 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 1 1
 0 1 1 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 0 1
 1 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 1 1 0 0 0 0 1 0 1