In [1]:
import pickle
import numpy as np

In [2]:
# if we use the updated ngrams: ngrams that do not include white spaces
updated_ngrams = True

In [3]:
processed_dataset_path = '/mounts/data/proj/yihong/newhome/ConceptNetwork/eva/sentence_classification'
if updated_ngrams:
    processed_dataset_path = '/mounts/data/proj/yihong/newhome/ConceptNetwork/eva/sentence_classification/updated'
    
train_size = 860
number_of_languages = 50

with open(f"{processed_dataset_path}/{number_of_languages}/train/{train_size}/train.pickle", 'rb') as handle:
    train_set_to_store = pickle.load(handle)

with open(f"{processed_dataset_path}/{number_of_languages}/valid/valid.pickle", 'rb') as handle:
    valid_set_to_store = pickle.load(handle)

with open(f"{processed_dataset_path}/{number_of_languages}/test/test.pickle", 'rb') as handle:
    test_set_to_store = pickle.load(handle)

In [4]:
print(f"Number of languages in train: {len(train_set_to_store)}")
print(f"Number of languages in valid: {len(valid_set_to_store)}")
print(f"Number of languages in test: {len(test_set_to_store)}")

Number of languages in train: 1262
Number of languages in valid: 1262
Number of languages in test: 1262


In [5]:
# load concept networks
from gensim.models import KeyedVectors

emb_dim = 200
num_epochs = 10

embedding_path = f"/mounts/data/proj/yihong/newhome/ConceptNetwork/network_related/" + \
                 f"expandednet_vectors_{emb_dim}_{num_epochs}.wv"

if updated_ngrams:
    embedding_path = f"/mounts/data/proj/yihong/newhome/ConceptNetwork/network_related/" + \
                     f"expandednet_vectors_minlang_{number_of_languages}_{emb_dim}_{num_epochs}_updated.wv"

loaded_n2v = KeyedVectors.load(embedding_path)

In [6]:
# transform the concept (string) representations to dense vectors
mapping = {'Grace': 1, 'Violence': 2, 'Sin': 3, 'Recommendation': 4, 'Faith': 5, 'Description': 6}

In [7]:
def return_data_labels(data_dict, embedding_m, label_mapping_dict, lang):
    labels = []
    representations = []

    for pair in data_dict:

        # transform strings
        strings = pair[1]
        representation = np.zeros(embedding_m.vector_size)

        for string in strings:
            representation += np.array(embedding_m[string]) if lang == 'eng' \
            else np.array(embedding_m[f"{lang}:{string}"])
        representations.append(representation)

        # transform labels
        label = pair[2]
        labels.append(label_mapping_dict[label])    
    
    return representations, labels

In [8]:
def creating_train_test_set(lang, label_mapping_dict, embedding_m, train_dict, test_dict, valid_dict=None):
    representations_train, labels_train = return_data_labels(train_dict[lang], embedding_m, 
                                                             label_mapping_dict, lang)
    representations_test, labels_test = return_data_labels(test_dict[lang], embedding_m, 
                                                           label_mapping_dict, lang)
    if valid_dict is None:
        return representations_train, labels_train, representations_test, labels_test
    else:
        representations_valid, labels_valid = return_data_labels(valid_dict[lang], embedding_m, 
                                                                 label_mapping_dict, lang)
        return representations_train, labels_train, representations_valid, labels_valid, \
               representations_test, labels_test

In [9]:
# define the training language
train_language = 'eng'

r_train, l_train, r_valid, l_valid, r_test, l_test = creating_train_test_set(train_language, mapping, loaded_n2v, 
                                                                             train_set_to_store,
                                                                             test_set_to_store,
                                                                             valid_set_to_store, 
                                                                             )

In [10]:
print(f"Train data shape: {np.shape(r_train)}")
print(f"Train labels shape: {len(l_train)}")
print(f"Valid data shape: {np.shape(r_valid)}")
print(f"Valid labels shape: {len(l_valid)}")
print(f"Test data shape: {np.shape(r_test)}")
print(f"Test labels shape: {len(l_test)}")

Train data shape: (860, 200)
Train labels shape: 860
Valid data shape: (106, 200)
Valid labels shape: 106
Test data shape: (111, 200)
Test labels shape: 111


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

model_name = 'lr'
if model_name == 'lr':
    model = LogisticRegression(random_state=114514)
elif model_name == 'svm':
    model = SVC(kernel='rbf', random_state=114514)
elif model_name == 'knn':
    model = KNeighborsClassifier(n_neighbors=20)
model.fit(r_train, l_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=114514)

In [12]:
predictions = model.predict(r_test)

In [13]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

print(f"Train and test on {train_language} data: ")
print(f"Accuracy score: {accuracy_score(l_test, predictions)}")
print(f"Precision score: {precision_score(l_test, predictions, average='macro')}")
print(f"Recall score: {recall_score(l_test, predictions, average='macro')}")
print(f"Macro F1 score: {f1_score(l_test, predictions, average='macro')}")

Train and test on eng data: 
Accuracy score: 0.6216216216216216
Precision score: 0.607183467929082
Recall score: 0.6226967036290992
Macro F1 score: 0.6104063224097981


In [14]:
zero_shot_langs = list(train_set_to_store.keys())
print(f"Number of zero-shot languages: {len(zero_shot_langs) - 1}")  # filter english

f1_performance = {}

save_file_name = f"./{train_size}_{train_language}_{model_name}_zero_shot_results_minlang_{number_of_languages}.txt"

if updated_ngrams:
    save_file_name = f"./{train_size}_{train_language}_{model_name}_zero_shot_results_updated_minlang_{number_of_languages}.txt"
    
with open(save_file_name, 'w', encoding='utf-8') as f:
    for test_lang in zero_shot_langs:
        if test_lang == train_language:
            continue
        _, _, r_test, l_test = creating_train_test_set(test_lang, mapping, loaded_n2v, 
                                                       train_set_to_store, test_set_to_store)
        
        if len(r_test) < 50:
            continue
            # some languages do not have any available test sets
        
        sent = f"{test_lang}:"
        print(sent)
        f.write(sent + '\n')
        
        zero_shot_predictions = model.predict(r_test)
        
        sent = f"Accuracy score: {round(accuracy_score(l_test, zero_shot_predictions), 2)}"
        print(sent)
        f.write(sent + '\n')
        
        sent = f"Precision score: {round(precision_score(l_test, zero_shot_predictions, average='macro'), 2)}"
        print(sent)
        f.write(sent + '\n')
        
        sent = f"Recall score: {round(recall_score(l_test, zero_shot_predictions, average='macro'), 2)}"
        print(sent)
        f.write(sent + '\n')
        
        sent = f"Macro F1 score: {round(f1_score(l_test, zero_shot_predictions, average='macro'), 2)}"
        print(sent)
        f.write(sent + '\n')
        
        f1_performance[test_lang] = round(f1_score(l_test, zero_shot_predictions, average='macro'), 2)
        
        sent = ''
        print(sent)
        f.write(sent + '\n')

Number of zero-shot languages: 1261
aai:
Accuracy score: 0.55
Precision score: 0.54
Recall score: 0.53
Macro F1 score: 0.53

aak:
Accuracy score: 0.42
Precision score: 0.42
Recall score: 0.42
Macro F1 score: 0.41

aau:
Accuracy score: 0.58
Precision score: 0.56
Recall score: 0.55
Macro F1 score: 0.55

aaz:
Accuracy score: 0.58
Precision score: 0.58
Recall score: 0.58
Macro F1 score: 0.58

abt:
Accuracy score: 0.49
Precision score: 0.48
Recall score: 0.5
Macro F1 score: 0.48

abx:
Accuracy score: 0.54
Precision score: 0.51
Recall score: 0.52
Macro F1 score: 0.51

aby:
Accuracy score: 0.47
Precision score: 0.43
Recall score: 0.42
Macro F1 score: 0.42

acd:
Accuracy score: 0.41
Precision score: 0.4
Recall score: 0.37
Macro F1 score: 0.37

ace:
Accuracy score: 0.51
Precision score: 0.49
Recall score: 0.5
Macro F1 score: 0.49

acf:
Accuracy score: 0.52
Precision score: 0.5
Recall score: 0.53
Macro F1 score: 0.51

ach:
Accuracy score: 0.46
Precision score: 0.43
Recall score: 0.45
Macro F1 sc

aze:
Accuracy score: 0.49
Precision score: 0.47
Recall score: 0.47
Macro F1 score: 0.46

azg:
Accuracy score: 0.5
Precision score: 0.48
Recall score: 0.48
Macro F1 score: 0.47

azz:
Accuracy score: 0.51
Precision score: 0.5
Recall score: 0.52
Macro F1 score: 0.5

bak:
Accuracy score: 0.58
Precision score: 0.57
Recall score: 0.57
Macro F1 score: 0.56

bam:
Accuracy score: 0.45
Precision score: 0.44
Recall score: 0.46
Macro F1 score: 0.44

ban:
Accuracy score: 0.51
Precision score: 0.52
Recall score: 0.52
Macro F1 score: 0.5

bao:
Accuracy score: 0.57
Precision score: 0.57
Recall score: 0.55
Macro F1 score: 0.53

bar:
Accuracy score: 0.5
Precision score: 0.49
Recall score: 0.48
Macro F1 score: 0.48

bav:
Accuracy score: 0.48
Precision score: 0.46
Recall score: 0.45
Macro F1 score: 0.45

bba:
Accuracy score: 0.49
Precision score: 0.47
Recall score: 0.49
Macro F1 score: 0.48

bbb:
Accuracy score: 0.45
Precision score: 0.45
Recall score: 0.45
Macro F1 score: 0.43

bbj:
Accuracy score: 0.48


cag:
Accuracy score: 0.5
Precision score: 0.47
Recall score: 0.47
Macro F1 score: 0.46

cak:
Accuracy score: 0.51
Precision score: 0.51
Recall score: 0.51
Macro F1 score: 0.48

cao:
Accuracy score: 0.49
Precision score: 0.46
Recall score: 0.45
Macro F1 score: 0.46

cap:
Accuracy score: 0.55
Precision score: 0.57
Recall score: 0.55
Macro F1 score: 0.55

caq:
Accuracy score: 0.5
Precision score: 0.5
Recall score: 0.51
Macro F1 score: 0.49

car:
Accuracy score: 0.51
Precision score: 0.52
Recall score: 0.51
Macro F1 score: 0.5

cas:
Accuracy score: 0.55
Precision score: 0.51
Recall score: 0.53
Macro F1 score: 0.52

cat:
Accuracy score: 0.52
Precision score: 0.5
Recall score: 0.51
Macro F1 score: 0.5

cav:
Accuracy score: 0.51
Precision score: 0.5
Recall score: 0.5
Macro F1 score: 0.49

cax:
Accuracy score: 0.45
Precision score: 0.48
Recall score: 0.43
Macro F1 score: 0.44

cbc:
Accuracy score: 0.55
Precision score: 0.53
Recall score: 0.53
Macro F1 score: 0.52

cbi:
Accuracy score: 0.55
Pre

deu:
Accuracy score: 0.48
Precision score: 0.47
Recall score: 0.48
Macro F1 score: 0.47

dgc:
Accuracy score: 0.5
Precision score: 0.46
Recall score: 0.45
Macro F1 score: 0.45

dgi:
Accuracy score: 0.51
Precision score: 0.5
Recall score: 0.48
Macro F1 score: 0.49

dgr:
Accuracy score: 0.54
Precision score: 0.55
Recall score: 0.56
Macro F1 score: 0.53

dgz:
Accuracy score: 0.53
Precision score: 0.51
Recall score: 0.52
Macro F1 score: 0.51

dhm:
Accuracy score: 0.53
Precision score: 0.52
Recall score: 0.52
Macro F1 score: 0.52

dig:
Accuracy score: 0.47
Precision score: 0.44
Recall score: 0.45
Macro F1 score: 0.44

dik:
Accuracy score: 0.48
Precision score: 0.51
Recall score: 0.49
Macro F1 score: 0.49

dip:
Accuracy score: 0.53
Precision score: 0.53
Recall score: 0.51
Macro F1 score: 0.51

dis:
Accuracy score: 0.54
Precision score: 0.56
Recall score: 0.57
Macro F1 score: 0.55

dje:
Accuracy score: 0.55
Precision score: 0.53
Recall score: 0.55
Macro F1 score: 0.53

djk:
Accuracy score: 0.

gng:
Accuracy score: 0.48
Precision score: 0.45
Recall score: 0.46
Macro F1 score: 0.45

gnn:
Accuracy score: 0.5
Precision score: 0.55
Recall score: 0.5
Macro F1 score: 0.51

gnw:
Accuracy score: 0.51
Precision score: 0.5
Recall score: 0.49
Macro F1 score: 0.49

gof:
Accuracy score: 0.51
Precision score: 0.51
Recall score: 0.5
Macro F1 score: 0.5

gog:
Accuracy score: 0.5
Precision score: 0.5
Recall score: 0.51
Macro F1 score: 0.49

gor:
Accuracy score: 0.52
Precision score: 0.52
Recall score: 0.51
Macro F1 score: 0.51

gqr:
Accuracy score: 0.48
Precision score: 0.51
Recall score: 0.51
Macro F1 score: 0.47

grt:
Accuracy score: 0.53
Precision score: 0.51
Recall score: 0.5
Macro F1 score: 0.5

gso:
Accuracy score: 0.48
Precision score: 0.46
Recall score: 0.48
Macro F1 score: 0.46

gub:
Accuracy score: 0.5
Precision score: 0.48
Recall score: 0.5
Macro F1 score: 0.47

guc:
Accuracy score: 0.58
Precision score: 0.59
Recall score: 0.56
Macro F1 score: 0.54

gud:
Accuracy score: 0.55
Precis

irk:
Accuracy score: 0.56
Precision score: 0.54
Recall score: 0.55
Macro F1 score: 0.53

iry:
Accuracy score: 0.45
Precision score: 0.41
Recall score: 0.46
Macro F1 score: 0.42

isd:
Accuracy score: 0.5
Precision score: 0.45
Recall score: 0.44
Macro F1 score: 0.44

isl:
Accuracy score: 0.48
Precision score: 0.47
Recall score: 0.46
Macro F1 score: 0.46

ita:
Accuracy score: 0.5
Precision score: 0.5
Recall score: 0.51
Macro F1 score: 0.5

itv:
Accuracy score: 0.47
Precision score: 0.46
Recall score: 0.43
Macro F1 score: 0.44

ium:
Accuracy score: 0.51
Precision score: 0.5
Recall score: 0.48
Macro F1 score: 0.47

ivb:
Accuracy score: 0.59
Precision score: 0.56
Recall score: 0.54
Macro F1 score: 0.55

ivv:
Accuracy score: 0.5
Precision score: 0.45
Recall score: 0.46
Macro F1 score: 0.46

iws:
Accuracy score: 0.46
Precision score: 0.44
Recall score: 0.45
Macro F1 score: 0.43

ixl:
Accuracy score: 0.46
Precision score: 0.43
Recall score: 0.45
Macro F1 score: 0.43

izr:
Accuracy score: 0.5
Pr

kpx:
Accuracy score: 0.61
Precision score: 0.6
Recall score: 0.63
Macro F1 score: 0.61

kpz:
Accuracy score: 0.55
Precision score: 0.55
Recall score: 0.55
Macro F1 score: 0.54

kqe:
Accuracy score: 0.52
Precision score: 0.49
Recall score: 0.51
Macro F1 score: 0.49

kqo:
Accuracy score: 0.41
Precision score: 0.4
Recall score: 0.4
Macro F1 score: 0.39

kqp:
Accuracy score: 0.42
Precision score: 0.41
Recall score: 0.42
Macro F1 score: 0.41

kqs:
Accuracy score: 0.53
Precision score: 0.52
Recall score: 0.55
Macro F1 score: 0.53

kqy:
Accuracy score: 0.56
Precision score: 0.52
Recall score: 0.53
Macro F1 score: 0.52

krc:
Accuracy score: 0.56
Precision score: 0.55
Recall score: 0.56
Macro F1 score: 0.55

kri:
Accuracy score: 0.42
Precision score: 0.4
Recall score: 0.4
Macro F1 score: 0.4

krj:
Accuracy score: 0.56
Precision score: 0.53
Recall score: 0.53
Macro F1 score: 0.53

ksc:
Accuracy score: 0.53
Precision score: 0.51
Recall score: 0.5
Macro F1 score: 0.48

ksd:
Accuracy score: 0.5
Pre

mak:
Accuracy score: 0.52
Precision score: 0.52
Recall score: 0.49
Macro F1 score: 0.5

mal:
Accuracy score: 0.48
Precision score: 0.49
Recall score: 0.46
Macro F1 score: 0.46

mam:
Accuracy score: 0.54
Precision score: 0.52
Recall score: 0.52
Macro F1 score: 0.52

maq:
Accuracy score: 0.57
Precision score: 0.55
Recall score: 0.57
Macro F1 score: 0.55

mar:
Accuracy score: 0.54
Precision score: 0.52
Recall score: 0.53
Macro F1 score: 0.52

mau:
Accuracy score: 0.5
Precision score: 0.46
Recall score: 0.45
Macro F1 score: 0.45

mav:
Accuracy score: 0.5
Precision score: 0.51
Recall score: 0.47
Macro F1 score: 0.48

maw:
Accuracy score: 0.54
Precision score: 0.55
Recall score: 0.54
Macro F1 score: 0.53

maz:
Accuracy score: 0.46
Precision score: 0.45
Recall score: 0.44
Macro F1 score: 0.44

mbb:
Accuracy score: 0.54
Precision score: 0.51
Recall score: 0.52
Macro F1 score: 0.51

mbc:
Accuracy score: 0.51
Precision score: 0.48
Recall score: 0.49
Macro F1 score: 0.48

mbd:
Accuracy score: 0.3

mqj:
Accuracy score: 0.59
Precision score: 0.56
Recall score: 0.58
Macro F1 score: 0.56

mqy:
Accuracy score: 0.57
Precision score: 0.57
Recall score: 0.56
Macro F1 score: 0.56

mri:
Accuracy score: 0.51
Precision score: 0.54
Recall score: 0.52
Macro F1 score: 0.51

mrw:
Accuracy score: 0.47
Precision score: 0.46
Recall score: 0.46
Macro F1 score: 0.45

msa:
Accuracy score: 0.56
Precision score: 0.54
Recall score: 0.54
Macro F1 score: 0.54

msb:
Accuracy score: 0.46
Precision score: 0.44
Recall score: 0.44
Macro F1 score: 0.44

mse:
Accuracy score: 0.58
Precision score: 0.57
Recall score: 0.55
Macro F1 score: 0.56

msk:
Accuracy score: 0.58
Precision score: 0.58
Recall score: 0.57
Macro F1 score: 0.56

msm:
Accuracy score: 0.46
Precision score: 0.44
Recall score: 0.44
Macro F1 score: 0.42

msy:
Accuracy score: 0.56
Precision score: 0.55
Recall score: 0.53
Macro F1 score: 0.52

mta:
Accuracy score: 0.59
Precision score: 0.58
Recall score: 0.57
Macro F1 score: 0.56

mtg:
Accuracy score: 

njm:
Accuracy score: 0.58
Precision score: 0.59
Recall score: 0.56
Macro F1 score: 0.56

njn:
Accuracy score: 0.53
Precision score: 0.53
Recall score: 0.52
Macro F1 score: 0.52

njo:
Accuracy score: 0.46
Precision score: 0.47
Recall score: 0.48
Macro F1 score: 0.46

njz:
Accuracy score: 0.55
Precision score: 0.53
Recall score: 0.53
Macro F1 score: 0.53

nko:
Accuracy score: 0.53
Precision score: 0.53
Recall score: 0.52
Macro F1 score: 0.52

nlc:
Accuracy score: 0.61
Precision score: 0.61
Recall score: 0.59
Macro F1 score: 0.59

nld:
Accuracy score: 0.46
Precision score: 0.43
Recall score: 0.44
Macro F1 score: 0.43

nma:
Accuracy score: 0.5
Precision score: 0.49
Recall score: 0.5
Macro F1 score: 0.49

nmf:
Accuracy score: 0.42
Precision score: 0.43
Recall score: 0.44
Macro F1 score: 0.41

nmo:
Accuracy score: 0.43
Precision score: 0.43
Recall score: 0.45
Macro F1 score: 0.43

nmz:
Accuracy score: 0.55
Precision score: 0.5
Recall score: 0.49
Macro F1 score: 0.49

nnb:
Accuracy score: 0.4

poy:
Accuracy score: 0.55
Precision score: 0.52
Recall score: 0.54
Macro F1 score: 0.52

ppk:
Accuracy score: 0.54
Precision score: 0.54
Recall score: 0.53
Macro F1 score: 0.53

ppo:
Accuracy score: 0.61
Precision score: 0.58
Recall score: 0.58
Macro F1 score: 0.58

prf:
Accuracy score: 0.54
Precision score: 0.51
Recall score: 0.52
Macro F1 score: 0.51

pri:
Accuracy score: 0.55
Precision score: 0.53
Recall score: 0.54
Macro F1 score: 0.53

prk:
Accuracy score: 0.43
Precision score: 0.42
Recall score: 0.44
Macro F1 score: 0.42

prs:
Accuracy score: 0.56
Precision score: 0.53
Recall score: 0.51
Macro F1 score: 0.51

pse:
Accuracy score: 0.57
Precision score: 0.56
Recall score: 0.57
Macro F1 score: 0.56

ptp:
Accuracy score: 0.45
Precision score: 0.45
Recall score: 0.44
Macro F1 score: 0.45

ptu:
Accuracy score: 0.55
Precision score: 0.54
Recall score: 0.54
Macro F1 score: 0.53

pua:
Accuracy score: 0.54
Precision score: 0.48
Recall score: 0.47
Macro F1 score: 0.47

pwg:
Accuracy score: 

sps:
Accuracy score: 0.49
Precision score: 0.43
Recall score: 0.43
Macro F1 score: 0.43

spy:
Accuracy score: 0.46
Precision score: 0.46
Recall score: 0.45
Macro F1 score: 0.45

sqi:
Accuracy score: 0.38
Precision score: 0.36
Recall score: 0.39
Macro F1 score: 0.37

sri:
Accuracy score: 0.48
Precision score: 0.47
Recall score: 0.49
Macro F1 score: 0.47

srm:
Accuracy score: 0.38
Precision score: 0.36
Recall score: 0.35
Macro F1 score: 0.35

srn:
Accuracy score: 0.38
Precision score: 0.36
Recall score: 0.37
Macro F1 score: 0.36

srp:
Accuracy score: 0.48
Precision score: 0.44
Recall score: 0.44
Macro F1 score: 0.43

srq:
Accuracy score: 0.49
Precision score: 0.47
Recall score: 0.47
Macro F1 score: 0.46

ssd:
Accuracy score: 0.54
Precision score: 0.53
Recall score: 0.53
Macro F1 score: 0.52

ssg:
Accuracy score: 0.5
Precision score: 0.48
Recall score: 0.48
Macro F1 score: 0.48

ssw:
Accuracy score: 0.56
Precision score: 0.54
Recall score: 0.56
Macro F1 score: 0.54

ssx:
Accuracy score: 0

tuk:
Accuracy score: 0.53
Precision score: 0.52
Recall score: 0.5
Macro F1 score: 0.5

tum:
Accuracy score: 0.56
Precision score: 0.56
Recall score: 0.59
Macro F1 score: 0.57

tuo:
Accuracy score: 0.53
Precision score: 0.5
Recall score: 0.52
Macro F1 score: 0.51

tur:
Accuracy score: 0.56
Precision score: 0.55
Recall score: 0.56
Macro F1 score: 0.54

twi:
Accuracy score: 0.43
Precision score: 0.41
Recall score: 0.41
Macro F1 score: 0.4

twu:
Accuracy score: 0.61
Precision score: 0.58
Recall score: 0.58
Macro F1 score: 0.58

txu:
Accuracy score: 0.53
Precision score: 0.52
Recall score: 0.53
Macro F1 score: 0.52

tyv:
Accuracy score: 0.51
Precision score: 0.49
Recall score: 0.47
Macro F1 score: 0.46

tzh:
Accuracy score: 0.53
Precision score: 0.52
Recall score: 0.52
Macro F1 score: 0.52

tzj:
Accuracy score: 0.53
Precision score: 0.51
Recall score: 0.52
Macro F1 score: 0.51

tzo:
Accuracy score: 0.55
Precision score: 0.52
Recall score: 0.5
Macro F1 score: 0.51

ubr:
Accuracy score: 0.48


zae:
Accuracy score: 0.55
Precision score: 0.51
Recall score: 0.51
Macro F1 score: 0.51

zai:
Accuracy score: 0.51
Precision score: 0.48
Recall score: 0.49
Macro F1 score: 0.47

zam:
Accuracy score: 0.56
Precision score: 0.53
Recall score: 0.51
Macro F1 score: 0.52

zao:
Accuracy score: 0.53
Precision score: 0.51
Recall score: 0.49
Macro F1 score: 0.49

zar:
Accuracy score: 0.54
Precision score: 0.51
Recall score: 0.51
Macro F1 score: 0.5

zas:
Accuracy score: 0.49
Precision score: 0.47
Recall score: 0.47
Macro F1 score: 0.45

zat:
Accuracy score: 0.55
Precision score: 0.52
Recall score: 0.53
Macro F1 score: 0.52

zav:
Accuracy score: 0.49
Precision score: 0.45
Recall score: 0.45
Macro F1 score: 0.44

zaw:
Accuracy score: 0.48
Precision score: 0.48
Recall score: 0.47
Macro F1 score: 0.47

zca:
Accuracy score: 0.49
Precision score: 0.51
Recall score: 0.47
Macro F1 score: 0.48

zho:
Accuracy score: 0.58
Precision score: 0.57
Recall score: 0.59
Macro F1 score: 0.57

zia:
Accuracy score: 0

In [15]:
len(f1_performance)

1249

In [16]:
# computing the average
total_scores = []
for lang, f1 in f1_performance.items():
    total_scores.append(f1)
    if f1 <0.1:
        print(lang, f1)

print(f"Average F1 score on all {len(total_scores)} langauges is {sum(total_scores)/len(total_scores)}")

Average F1 score on all 1249 langauges is 0.4872057646116899


In [20]:
# updated (200-dim, minlang=1)
# eng, 860, linear regression: 0.5054328832406668
# arb, 860, linear regression: 0.46376489277204214
# zho, 860, linear regression: 0.49533756949960256

# updated (200-dim, minlang=5)
# eng, 860, linear regression: 0.4925178713264494
# arb, 860, linear regression: 0.4745591739475786
# zho, 860, linear regression: 0.4944400317712471

# updated (200-dim, minlang=10)
# eng, 860, linear regression: 0.4831771247021451
# arb, 860, linear regression: 0.46275615567911144
# zho, 860, linear regression: 0.5144479745830025

# updated (200-dim, minlang=20)
# eng, 860, linear regression: 0.48216838760921454
# arb, 860, linear regression: 0.45776012708498937
# zho, 860, linear regression: 0.4618109610802235

# updated (200-dim, minlang=50)
# eng, 860, linear regression: 0.4872057646116899
# arb, 860, linear regression: 0.469714058776808
# zho, 860, linear regression: 0.4858856235107226
# rus, 860, linear regression: 0.47795869737887264

# updated (200-dim, minlang=100)
# eng, 860, linear regression: 0.44710881652104995
# arb, 860, linear regression: 0.44469420174741947
# zho, 860, linear regression: 0.45768864177919083