In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import accuracy_score

In [2]:
def read_data(path):
    """
    Read a tab-separated, conll-like file into two parallel lists.

    Every non-blank line is expected to carry the gold label in the first
    column and the text in the second column. Blank (or whitespace-only)
    lines are skipped.

    Parameters
    ----------
    path : str or path-like
        Location of the file to read. The file is decoded as UTF-8.

    Returns
    -------
    txts : list of str
        Second column of every non-blank line, in file order.
    golds : list of str
        First column of every non-blank line, in file order.

    Raises
    ------
    IndexError
        If a non-blank line has no tab-separated second column.
    """
    txts = []
    golds = []
    # The context manager guarantees the handle is closed, and the explicit
    # encoding keeps the result independent of the platform's default locale.
    with open(path, encoding="utf-8") as handle:
        for line in handle:
            tok = line.strip().split('\t')
            if tok == ['']:
                # Blank line (e.g. a trailing newline at end of file): there
                # is no label/text pair to collect.
                continue
            txts.append(tok[1])
            golds.append(tok[0])
    return txts, golds

In [3]:
# Load the training texts and their gold labels for the single-run baseline.
train_path = "../langid4/data/domain.0.combined.train"
txts_train, golds_train = read_data(path=train_path)

In [None]:
# Character 1- to 4-gram count features.
# `lowercase=False` leaves the text exactly as read, which is what the former
# identity `preprocessor` achieved. The `tokenizer` argument is dropped because
# scikit-learn ignores it whenever `analyzer != "word"` and only emits a
# UserWarning about it.
count_vectorizer = CountVectorizer(ngram_range=(1, 4), analyzer="char", lowercase=False)

# Learn the n-gram vocabulary on the training texts and build the count matrix.
x_train = count_vectorizer.fit_transform(txts_train)


In [None]:
# Multinomial Naive Bayes with default hyper-parameters, fitted on the
# training count matrix and the gold labels.
mnb = MultinomialNB()
mnb.fit(X=x_train, y=golds_train)

In [6]:
# Read the development file into texts and gold labels.
dev_path = "../langid4/data/domain.0.combined.dev"
txts_dev, golds_dev = read_data(path=dev_path)

# Vectorize with the already-fitted vocabulary (transform only, no refit),
# then predict one label per development text.
x_dev = count_vectorizer.transform(raw_documents=txts_dev)
y_pred = mnb.predict(X=x_dev)

In [None]:
# Fraction of development texts whose predicted label equals the gold label.
accuracy_score(y_true=golds_dev, y_pred=y_pred)

In [None]:
# Cross-domain evaluation with character n-gram features: train one model per
# source domain and score it on the dev split of every test domain.
source_domains = ["wiki", "news", "religious", "combined"]
test_domains = ["wiki", "news", "religious", "rights", "social"]
seed_number = 0
# Flat list of accuracies in row-major order: all test domains for the first
# source domain, then all test domains for the second, and so on.
scores = []

# The dev files do not depend on the source domain, so read each one once
# instead of once per (source, test) pair.
dev_sets = {
    test_domain: read_data(f"../langid4/data/domain.{seed_number}.{test_domain}.dev")
    for test_domain in test_domains
}

for source_domain in source_domains:

    train_path = f"../langid4/data/domain.{seed_number}.{source_domain}.train"
    txts_train, golds_train = read_data(train_path)

    # Character 1- to 4-grams on the unmodified text. `lowercase=False`
    # replaces the former identity `preprocessor`; the `tokenizer` argument is
    # gone because scikit-learn ignores it (with a UserWarning) whenever
    # `analyzer != "word"`.
    count_vectorizer = CountVectorizer(ngram_range=(1, 4), analyzer="char", lowercase=False)
    x_train = count_vectorizer.fit_transform(txts_train)

    mnb = MultinomialNB()
    mnb.fit(x_train, golds_train)

    for test_domain in test_domains:

        txts_dev, golds_dev = dev_sets[test_domain]
        # Only transform here: the vocabulary must stay the one learned on
        # the current source domain's training data.
        x_dev = count_vectorizer.transform(txts_dev)

        y_pred = mnb.predict(x_dev)
        accuracy = accuracy_score(golds_dev, y_pred)
        scores.append(accuracy)
        print(f"{source_domain} / {test_domain}: {accuracy}")

print(scores)

In [None]:
# scores = [0.9632319716164879, 0.9024782608695652, 0.8670869565217392, 0.8294947994056464, 0.8211305007587253, 0.8913527354275099, 0.9572028985507246, 0.8564202898550725, 0.8298662704309064, 0.7926783004552352, 0.8508192057302563, 0.844231884057971, 0.9676956521739131, 0.7968053491827637, 0.7987481031866465, 0.9560858869010761, 0.9571304347826087, 0.9659710144927536, 0.8577265973254086, 0.8537556904400607]
source_domains = ["combined"]
# Defined here as well so the cell does not rely on a value left behind by an
# earlier cell.
test_domains = ["wiki", "news", "religious", "rights", "social"]
# Hard-coded accuracies, one per test domain for the single source row.
# NOTE(review): presumably copied from an earlier run — confirm which one.
scores = [0.8542332602547152, 0.893927536231884, 0.8888260869565218, 0.7035661218424963, 0.6817147192716236]
# `reshape` raises if the number of scores does not match the grid, whereas
# `np.resize` would silently repeat or truncate values and plot a wrong matrix.
score_matrix = np.asarray(scores, dtype=float).reshape(len(source_domains), len(test_domains))
# Convert fractions to percentages for the annotations and the colour scale.
score_matrix = score_matrix * 100

ax = sns.heatmap(score_matrix, annot=True, cmap="viridis", yticklabels=source_domains, xticklabels=test_domains, fmt=".1f", vmin=50, vmax=100, annot_kws={'size': 16})
plt.xticks(rotation=45, fontsize=14)
plt.yticks(rotation=0, fontsize=14)
ax.set_ylabel("Source", fontsize=16)
ax.set_xlabel("Target", fontsize=16)

plt.savefig("NaiveBayes.pdf", format="pdf", bbox_inches="tight")

#### With glot500 tokenizer

In [None]:
from transformers import AutoTokenizer
import tqdm

# Load the pretrained tokenizer that the word-level vectorizer below uses to
# split texts into tokens.
glot500_checkpoint = 'cis-lmu/glot500-base'
tokenizer = AutoTokenizer.from_pretrained(glot500_checkpoint)

In [None]:
# Cross-domain evaluation with glot500 token n-gram features: train one model
# per source domain and score it on the dev split of every test domain.
source_domains = ["wiki", "news", "religious", "combined"]
test_domains = ["wiki", "news", "religious", "rights", "social"]
# NOTE(review): the character n-gram experiment uses seed 0 — confirm that
# evaluating this variant on seed 1 is intended.
seed_number = 1
# Flat list of accuracies in row-major order (source domain major, test
# domain minor).
scores_glot500_tokenizer = []

# The dev files do not depend on the source domain, so read each one once
# instead of once per (source, test) pair.
dev_sets = {
    test_domain: read_data(f"../langid4/data/domain.{seed_number}.{test_domain}.dev")
    for test_domain in test_domains
}

for source_domain in source_domains:

    train_path = f"../langid4/data/domain.{seed_number}.{source_domain}.train"
    txts_train, golds_train = read_data(train_path)

    # Uni- and bi-grams over the tokens produced by `tokenizer.tokenize`.
    # * `token_pattern=None`: the default regex is unused once a custom
    #   tokenizer is given, and leaving it set makes scikit-learn warn.
    # * `lowercase=False`: keeps the text unmodified, replacing the former
    #   identity `preprocessor`.
    # * The bound method is passed directly instead of wrapping it in a lambda.
    count_vectorizer = CountVectorizer(
        ngram_range=(1, 2),
        analyzer="word",
        tokenizer=tokenizer.tokenize,
        token_pattern=None,
        lowercase=False,
    )
    x_train = count_vectorizer.fit_transform(txts_train)

    mnb = MultinomialNB()
    mnb.fit(x_train, golds_train)

    for test_domain in test_domains:

        txts_dev, golds_dev = dev_sets[test_domain]
        # Only transform here: the vocabulary must stay the one learned on
        # the current source domain's training data.
        x_dev = count_vectorizer.transform(txts_dev)

        y_pred = mnb.predict(x_dev)
        accuracy = accuracy_score(golds_dev, y_pred)
        scores_glot500_tokenizer.append(accuracy)
        print(f"{source_domain} / {test_domain}: {accuracy}")

print(scores_glot500_tokenizer)

In [None]:
# scores = [0.9632319716164879, 0.9024782608695652, 0.8670869565217392, 0.8294947994056464, 0.8211305007587253, 0.8913527354275099, 0.9572028985507246, 0.8564202898550725, 0.8298662704309064, 0.7926783004552352, 0.8508192057302563, 0.844231884057971, 0.9676956521739131, 0.7968053491827637, 0.7987481031866465, 0.9560858869010761, 0.9571304347826087, 0.9659710144927536, 0.8577265973254086, 0.8537556904400607]
source_domains = ["wiki", "news", "religious", "combined"]
test_domains = ["wiki", "news", "religious", "rights", "social"]

# scores = [0.9720683480327348, 0.8947246376811594, 0.8568695652173913, 0.7722882615156018, 0.6608497723823976, 0.8905159573577895, 0.9647391304347827, 0.8489275362318841, 0.7778603268945022, 0.6079286798179059, 0.8443090723478319, 0.8378115942028985, 0.975463768115942, 0.7444279346210996, 0.6570561456752656, 0.9728214482954831, 0.968, 0.978304347826087, 0.8086924219910847, 0.7463960546282246]
# `reshape` raises if the score list is incomplete (for example after an
# interrupted run), whereas `np.resize` would silently repeat or truncate
# values and plot a wrong matrix.
score_matrix = np.asarray(scores_glot500_tokenizer, dtype=float).reshape(len(source_domains), len(test_domains))
# Convert fractions to percentages for the annotations and the colour scale.
score_matrix = score_matrix * 100

ax = sns.heatmap(score_matrix, annot=True, cmap="viridis", yticklabels=source_domains, xticklabels=test_domains, fmt=".1f", vmin=50, vmax=100)
plt.xticks(rotation=45)
plt.yticks(rotation=0)
ax.set_ylabel("Source")
ax.set_xlabel("Target")

plt.title("Naive Bayes with GLOT500 tokenizer")
plt.savefig("NaiveBayesGlot500.pdf", format="pdf", bbox_inches="tight")