In [None]:
import pandas as pd
from my_utils import *

In [None]:
# Run every domain's stored classifier (loaded from ../models/naive_bayes)
# on that domain's .dev file and keep one prediction table per domain.
source_domains = ["wiki", "news", "religious", "combined"]
models, vectorizers, results = {}, {}, {}

for domain in source_domains:
    # Texts and gold labels of the dev file for this domain.
    txts_dev, golds_dev = read_data(f"../langid4/data/domain.0.{domain}.dev")

    # Load both artefacts first, then register them for later cells.
    nb = load_model(f"../models/naive_bayes/{domain}")
    vectorizer = load_model(f"../models/naive_bayes/vectorizers/{domain}")
    models[domain], vectorizers[domain] = nb, vectorizer

    x_dev = vectorizer.transform(txts_dev)

    # Predictions are stored as plain strings.
    pred = [str(label) for label in nb.predict(x_dev)]

    df = pd.DataFrame({"txt": txts_dev, "gold": golds_dev, "pred": pred})
    results[domain] = df


In [None]:
# Write each domain's prediction table to its own CSV file.
for domain_name, df in results.items():
    df.to_csv(f"../results/error_analysis/{domain_name}.csv")

In [None]:
# Directed confusion counts: (gold, pred) and (pred, gold) are separate keys.
# Counts are accumulated over every entry in source_domains.
all_pairs = {}

for domain in source_domains:
    golds = results[domain]["gold"]
    preds = results[domain]["pred"]

    # Walk both columns in lockstep (assumes the default 0..n-1 index that
    # the frames get when built from plain sequences -- TODO confirm).
    for gold_label, pred_label in zip(golds, preds):
        if gold_label != pred_label:
            pair = (gold_label, pred_label)
            all_pairs[pair] = all_pairs.get(pair, 0) + 1

# Most frequent confusions first.
all_pairs_sorted = sorted(all_pairs.items(), key=lambda item: item[1], reverse=True)
all_pairs_sorted

In [None]:
# Undirected confusion counts on the "combined" results only: a confusion is
# counted under whichever orientation of the pair was seen first.
# Note: this rebinds all_pairs / all_pairs_sorted from the directed cell.
all_pairs = {}

domain = "combined"
golds = results[domain]["gold"]
preds = results[domain]["pred"]

# Assumes the default 0..n-1 index on both columns -- TODO confirm.
for gold_label, pred_label in zip(golds, preds):
    if gold_label != pred_label:
        forward = (gold_label, pred_label)
        backward = (pred_label, gold_label)
        # Reuse the reversed key only when the forward one does not exist yet.
        use_backward = forward not in all_pairs and backward in all_pairs
        key = backward if use_backward else forward
        all_pairs[key] = all_pairs.get(key, 0) + 1

# Most frequent confusions first; show the top 30.
all_pairs_sorted = sorted(all_pairs.items(), key=lambda item: item[1], reverse=True)
all_pairs_sorted[:30]

In [None]:
from swadesh.swadesh import Swadesh

# Print one LaTeX table row per language pair for the 30 most frequent
# confusions, with the similarity score reported by the Swadesh helper.
# NOTE(review): absolute, machine-specific path -- consider a configurable
# data directory so the notebook runs elsewhere.
path = "/home/victor/Documents/ITU/Thesis/langid4/swadesh/data/swadesh_merged"
sw = Swadesh(path)

for ((label1, label2), count) in all_pairs_sorted[:30]:
    label1 = clean_language_name(label1)
    label2 = clean_language_name(label2)

    dist_score = sw.get_similarity(label1, label2)

    # An earlier lang2vec-based variant (get_lang_dists, falling back to the
    # "lang2vec_knn" value) was disabled in favour of the Swadesh score.

    # Format numeric scores to four decimals. Anything that cannot take the
    # "f" format (presumably None or a string when a language is missing --
    # TODO confirm) is printed unchanged. Only formatting errors are caught,
    # so KeyboardInterrupt and unrelated failures are no longer swallowed.
    try:
        dist_score = "{:.4f}".format(dist_score)
    except (TypeError, ValueError):
        pass

    # LaTeX row: "<lang1>-<lang2> & <score> &  \\"
    print(f"{label1}-{label2} & {dist_score} &  \\\\")

In [None]:
from pathlib import Path

# List every label in the top-30 pairs; when no "<cleaned name>.txt" file
# exists in the Swadesh data directory, the cleaned name is printed as well.
path = Path("/home/victor/Documents/ITU/Thesis/langid4/swadesh/data/swadesh_merged")
pairs = all_pairs_sorted[:30]

for (pair, _) in pairs:
    for lang in pair:
        print(lang)
        cleaned_name = clean_language_name(lang)
        file_path = path / f"{cleaned_name}.txt"
        if file_path.exists():
            continue
        print(cleaned_name)
        

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# Horizontal bar chart of the 15 most frequent confusion pairs currently
# held in all_pairs_sorted.
top = all_pairs_sorted[:15]

# "<lang1>-<lang2>" labels built from the cleaned language names.
labels = [tuple(map(clean_language_name, pair)) for pair, _ in top]
labels = [f"{pair[0]}-{pair[1]}" for pair in labels]

values = [value for _, value in top]

plt.figure(figsize=(12, 6))
sns.barplot(x=values, y=labels)

plt.xticks(rotation=45)
# Set the title before tight_layout(): the layout only reserves space for
# text that already exists, so a title added afterwards can be clipped.
plt.title('Number of errors per language pair')
plt.tight_layout()

plt.show()



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# Horizontal bar chart of the 15 most frequent confusion pairs, saved as PDF.
# Other cells in this notebook write the same "Errorcounts.pdf" file, so the
# cell executed last determines the file's content.
top = all_pairs_sorted[:15]

labels = [
    f"{first}-{second}"
    for first, second in (tuple(map(clean_language_name, pair)) for pair, _ in top)
]
values = [count for _, count in top]

plt.figure(figsize=(8, 8))
ax = sns.barplot(x=values, y=labels)
ax.set_xlabel("Number of errors", fontsize=16)
ax.set_ylabel("Language pairs", fontsize=16)

plt.xticks(rotation=45, fontsize=14)
plt.yticks(rotation=0, fontsize=14)
plt.tight_layout(rect=[0, 0, 1, 1])

plt.savefig("Errorcounts.pdf", format="pdf", bbox_inches="tight")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# Vertical bar chart (pairs on the x axis) of the 15 most frequent confusion
# pairs, saved as PDF. Other cells in this notebook write the same
# "Errorcounts.pdf" file, so the cell executed last wins.
top = all_pairs_sorted[:15]

cleaned = [tuple(clean_language_name(lang) for lang in pair) for pair, _ in top]
labels = [f"{names[0]}-{names[1]}" for names in cleaned]
values = [count for _, count in top]

plt.figure(figsize=(8, 8))
sns.barplot(x=labels, y=values)
plt.xlabel("Language pairs", fontsize=16)
plt.ylabel("Number of errors", fontsize=16)

plt.xticks(rotation=45, fontsize=14)
plt.yticks(rotation=0, fontsize=14)
plt.tight_layout(rect=[0, 0, 1, 1])

plt.savefig("Errorcounts.pdf", format="pdf", bbox_inches="tight")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

sns.set_style("whitegrid")

# Vertical bar chart of the 15 most frequent confusion pairs with a
# reference curve max(values) / rank drawn on top, saved as PDF and shown.
top = all_pairs_sorted[:15]

cleaned = [tuple(clean_language_name(lang) for lang in pair) for pair, _ in top]
labels = [f"{names[0]}-{names[1]}" for names in cleaned]
values = [count for _, count in top]

# Ranks 1..len(values); the curve starts at the largest count and decays as 1/rank.
rank = np.arange(1, len(values) + 1)
zipfian_curve = max(values) / rank

plt.figure(figsize=(8, 8))
sns.barplot(x=labels, y=values)

# Reference curve plotted against the same categorical x positions.
plt.plot(labels, zipfian_curve, color="red", marker="o", linestyle="--", linewidth=2)

plt.xlabel("Language pairs", fontsize=20)
plt.ylabel("Number of errors", fontsize=20)
plt.xticks(rotation=45, fontsize=17)
plt.yticks(rotation=0, fontsize=17)

plt.tight_layout(rect=[0, 0, 1, 1])
plt.savefig("Errorcounts.pdf", format="pdf", bbox_inches="tight")

plt.show()


In [None]:
# Directed (gold, pred) confusion counts, kept separately for each domain and
# sorted from most to least frequent.
pairs_per_domain = {}

for domain in source_domains:
    golds = results[domain]["gold"]
    preds = results[domain]["pred"]

    pairs = {}
    # Assumes the default 0..n-1 index on both columns -- TODO confirm.
    for gold_label, pred_label in zip(golds, preds):
        if gold_label != pred_label:
            pair = (gold_label, pred_label)
            pairs[pair] = pairs.get(pair, 0) + 1

    pairs_sorted = sorted(pairs.items(), key=lambda item: item[1], reverse=True)
    pairs_per_domain[domain] = pairs_sorted

In [None]:
# Show the 20 most frequent (gold, pred) confusions for the "combined" entry.
pairs_per_domain['combined'][:20]

### Plot to see domain effect on number of errors

In [None]:
# Show every (gold, pred) confusion recorded for the "religious" entry,
# most frequent first.
pairs_per_domain['religious']

In [None]:
# For every confusion pair seen in the "combined" results, record how many
# times the same pair occurs in each individual domain (0 when it does not).
targets = [pair for (pair, _) in pairs_per_domain['combined']]

# Exclude "combined" by name rather than by position: a later cell rebinds
# source_domains to the three single domains, and slicing off the last item
# would then silently drop "religious" if this cell were re-run.
single_domains = [name for name in source_domains if name != "combined"]

# One lookup table per domain, so each pair is found without a linear scan.
error_counts_by_domain = {name: dict(pairs_per_domain[name]) for name in single_domains}

number_of_errors_per_domain_by_language_pair = {}

for target in targets:
    # .get(..., 0) also covers a domain with no errors at all, which
    # previously ended up without a key.
    all_domains = {
        name: error_counts_by_domain[name].get(target, 0)
        for name in single_domains
    }
    number_of_errors_per_domain_by_language_pair[target] = all_domains
    print(target, all_domains)

In [None]:
# Dump the whole pair -> {domain: error count} mapping as plain text.
print(number_of_errors_per_domain_by_language_pair)

In [None]:
# Per-domain error counts for the pair keyed ('__label__cat', '__label__rus');
# keys are built as (gold, pred) tuples in the cells above.
number_of_errors_per_domain_by_language_pair[('__label__cat', '__label__rus')]

In [None]:
def get_error_difference_in_top_2_domains_normalized(x):
    """Score how strongly one domain dominates the errors of a language pair.

    Args:
        x: A ``(key, domains)`` tuple; ``key`` is ignored and ``domains`` maps
            a domain name to its error count.

    Returns:
        ``(largest - second largest) / total`` over the counts, or ``0`` when
        the counts sum to zero (including an empty mapping). When only one
        count is present the missing runner-up is treated as 0, instead of
        raising ``IndexError``.
    """
    (_, domains) = x
    errors_list = list(domains.values())
    total = sum(errors_list)
    if total == 0:
        return 0

    top_two = sorted(errors_list, reverse=True)[:2]
    # A single-domain mapping has no runner-up; count it as zero errors.
    runner_up = top_two[1] if len(top_two) > 1 else 0
    return (top_two[0] - runner_up) / total

# Rank the language pairs by the normalized gap between their two highest
# per-domain error counts, largest gap first.
sorted_list = list(number_of_errors_per_domain_by_language_pair.items())
sorted_list.sort(key=get_error_difference_in_top_2_domains_normalized, reverse=True)

# Display each pair next to its score.
[(entry[0], get_error_difference_in_top_2_domains_normalized(entry)) for entry in sorted_list]

-------------------------

In [None]:
# Rows of the "news" results whose gold label is rus and whose prediction is cat.
# The mask previously compared the "gold" column with both labels, which can
# never hold at the same time, so the selection was always empty.
# NOTE(review): the second comparison now targets "pred", following the
# gold/pred pattern of the ukr/bul cell. Confirm the intended direction
# (gold=rus, pred=cat versus gold=cat, pred=rus).
r = results['news']
a = r[(r["gold"] == "__label__rus") & (r["pred"] == "__label__cat")]
a

In [None]:
# Rows of the "combined" results with gold label ukr that were predicted as bul.
r = results['combined']
gold_is_ukr = r["gold"] == "__label__ukr"
pred_is_bul = r["pred"] == "__label__bul"
a = r[gold_is_ukr & pred_is_bul]
a


# Compiling all data

In [None]:
# Stack the train and dev files of the three single domains into one frame
# with a fresh 0..n-1 index.
# NOTE(review): this rebinds source_domains (earlier cells use a list that
# also contains "combined"); re-running earlier cells afterwards will see
# this shorter list.
source_domains = ["wiki", "news", "religious"]
extensions = ["train", "dev"]

# Collect one frame per file and concatenate once, rather than re-copying a
# growing frame on every iteration.
frames = []
for domain in source_domains:
    for extension in extensions:
        txts, golds = read_data(f"../langid4/data/domain.0.{domain}.{extension}")
        frames.append(pd.DataFrame({"txt": txts, "gold": golds}))

all_data = pd.concat(frames, ignore_index=True)

all_data


In [None]:
# All rows of the compiled data whose gold label is "__label__hrv".
is_hrv = all_data["gold"] == "__label__hrv"
res = all_data[is_hrv]
res

In [None]:
# Number of distinct values in each column of the compiled data.
all_data.nunique()