In [1]:
from pathlib import Path
import Levenshtein

In [2]:
class Swadesh:
    """Reader and comparer for per-language Swadesh word lists.

    Every language lives in its own ``<lang>.txt`` file inside one directory.
    A usable line looks like ``english:translation``; the translation part may
    carry tab-separated extra columns and ``|``-separated alternatives, of
    which only the first alternative of the first column is kept.
    """

    def __init__(self, path) -> None:
        """Remember the directory that holds the ``<lang>.txt`` files."""
        self.path = Path(path)

    def _resolve(self, lang):
        """Return the path of the word-list file for ``lang``.

        ``lang`` is normally a bare language code (``"cat"``), but a file name
        or a full path that already ends in ``.txt`` is accepted as well, so
        that ``.txt`` is never appended twice.
        """
        name = str(lang)
        if not name.endswith(".txt"):
            name += ".txt"
        # pathlib keeps an absolute right-hand operand as-is, so full paths
        # bypass ``self.path`` while bare names are looked up inside it.
        return self.path / name

    def read_swadesh(self, lang):
        """Read one word list.

        Args:
            lang: Language code (or ``.txt`` file name / path) to load.

        Returns:
            A pair ``(english, translation)`` of equally long lists: the text
            before the first ``:`` of each line, and the cleaned first
            translation found after it.

        Raises:
            OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        """
        english = []
        translation = []

        # ``with`` closes the handle deterministically; the encoding is pinned
        # so non-ASCII word lists do not depend on the platform default.
        with open(self._resolve(lang), encoding="utf-8") as handle:
            for line in handle:
                fields = line.split(':')
                if len(fields) < 2:
                    # Blank or colon-less line: there is no translation to read.
                    continue

                english.append(fields[0])
                translation.append(fields[1].split('\t')[0].split('|')[0].strip())

        return english, translation

    def get_common_words(self, eng1, trans1, eng2, trans2):
        """Align the translations of the English entries present in both lists.

        Args:
            eng1, trans1: English entries and translations of the first language.
            eng2, trans2: English entries and translations of the second language.

        Returns:
            Two equally long lists of translations, pairwise aligned on the
            same English entry and ordered by first appearance in ``eng1``.
            When an English entry is repeated, its last translation wins.
        """
        dict1 = dict(zip(eng1, trans1))
        dict2 = dict(zip(eng2, trans2))

        # Walking dict1 (insertion-ordered) instead of a set keeps the output
        # order stable from one run to the next.
        shared = [key for key in dict1 if key in dict2]

        common_words_1 = [dict1[key] for key in shared]
        common_words_2 = [dict2[key] for key in shared]

        return common_words_1, common_words_2

    def get_similarity(self, lang1: str, lang2: str):
        """Mean ``Levenshtein.ratio`` over the entries shared by two languages.

        Args:
            lang1: First language code.
            lang2: Second language code.

        Returns:
            The arithmetic mean of the per-pair ratios.

        Raises:
            ValueError: If the two lists have no English entry in common, in
                which case the mean is undefined.
            OSError: If either word-list file cannot be opened.
        """
        eng1, trans1 = self.read_swadesh(lang1)
        eng2, trans2 = self.read_swadesh(lang2)

        lang1_words, lang2_words = self.get_common_words(eng1, trans1, eng2, trans2)
        if not lang1_words:
            raise ValueError(
                f"No common Swadesh entries between {lang1!r} and {lang2!r}"
            )

        similarities = [Levenshtein.ratio(w1, w2) for w1, w2 in zip(lang1_words, lang2_words)]
        mean_similarity = sum(similarities) / len(similarities)
        return mean_similarity


In [None]:
# Directory holding one "<lang>.txt" Swadesh list per language.
# NOTE(review): machine-specific absolute path; consider a configurable, relative data dir.
path = "/home/victor/Documents/ITU/Thesis/langid4/swadesh/data/swadesh_merged"
sw = Swadesh(path)

# read_swadesh takes a bare language code: it appends ".txt" and resolves the
# name inside `path` itself, so passing a full ".../cat.txt" path would look
# for ".../cat.txt.txt".
cat_eng, cat_trans = sw.read_swadesh("cat")
spa_eng, spa_trans = sw.read_swadesh("spa")

In [None]:
# Align the "cat" and "spa" translations on the English entries both lists share.
a, b = sw.get_common_words(cat_eng, cat_trans, spa_eng, spa_trans)

# Display the aligned translation pairs.
list(zip(a,b))

In [None]:
# Fresh reader over the same `path` directory.
sw = Swadesh(path)
sw.get_similarity("srp", "mkd")  # Serbian vs. Macedonian

In [None]:
sw.get_similarity("ukr", "bul")  # Ukrainian vs. Bulgarian

In [None]:
sw.get_similarity("cat", "spa")  # Catalan vs. Spanish

In [None]:
sw.get_similarity("rus", "ukr")  # Russian vs. Ukrainian

In [None]:
sw.get_similarity("eng", "gle")  # English vs. Irish

In [None]:
# Serbian vs. Croatian.
# TODO: check srp.txt and hrv.txt — the two lists may be written in different
# scripts even though the languages are similar, which would presumably
# distort a string-level comparison.
sw.get_similarity("srp", "hrv")

In [None]:
# Kinyarwanda vs. Rundi.
# NOTE: some languages are not available in the data set, so this call may fail.
sw.get_similarity("kin", "run")

In [28]:
# Keep only the first list returned for "eng" (the English entries); the translations are discarded.
eng, _ = sw.read_swadesh("eng")

In [None]:
# Print every English entry on its own line.
for word in eng:
    print(word)

#### Panlex

In [5]:
import pandas as pd

# Load the Panlex table from a semicolon-delimited CSV.
# The path is relative, so it depends on the notebook's working directory.
panlex_data = pd.read_csv("../langid4/panlex/panlex.csv", sep=';')

In [None]:
# Inspect the column names of the Panlex table.
panlex_data.columns

In [None]:
# Select the rows whose '639-3' column equals "fra"
# (presumably the ISO 639-3 code for French — confirm against the data).
test = panlex_data[panlex_data['639-3'] =="fra"]
test