In [1]:
from pyfasttext import FastText
import pycld2 as cld2
import pandas as pd

In [2]:
# Parse the labelled corpus into parallel lists, one entry per record.
# Each record has the form
#     <text> ||| <cld2 label> ||| <fastText label> ||| <google label> |end
# The text part may contain newlines, so physical lines are buffered until
# one ends with the " |end" terminator.
text_l, cld2_l, ft_l, g_l = [], [], [], []
# The encoding is explicit so parsing does not depend on the platform locale.
# NOTE(review): UTF-8 is assumed for the corpus file -- confirm.
with open("../data/lang_data.txt", "r", encoding="utf-8") as f:
    pending = []
    for line in f:
        pending.append(line)
        # Only the newest line can complete a record. Stripping the newline
        # first also accepts a final record with no trailing newline.
        if line.rstrip("\n").endswith(" |end"):
            record = "".join(pending).strip()
            # Split from the right: the text itself may contain " ||| ".
            text, cld2_, ft, g = record.rsplit(" ||| ", 3)
            text_l.append(text)
            cld2_l.append(cld2_)
            ft_l.append(ft)
            g_l.append(g.replace(" |end", ""))
            pending = []

data = pd.DataFrame({"text": text_l, "cld2": cld2_l, "ft": ft_l, "google": g_l})

In [3]:
# Load the fastText model used by the prediction cells below.
# NOTE(review): presumably the pre-trained lid.176 language-identification
# model, judging by the file name -- confirm.
model = FastText('../model/lid.176.bin')

In [4]:
def cld2_pred(text_):
    """Return the language label CLD2 ranks first for ``text_``.

    Parameters
    ----------
    text_ : str
        Text to classify.

    Returns
    -------
    str
        ``temp[2][0][1]`` of the ``cld2.detect`` result (presumably the
        language code of the top-ranked detection -- confirm against the
        pycld2 docs), or ``"unc"`` when detection raises.
    """
    try:
        temp = cld2.detect(text_)
        return temp[2][0][1]
    # Best-effort labelling: any ordinary failure maps to "unc". Catching
    # Exception rather than using a bare except lets KeyboardInterrupt and
    # SystemExit propagate, so a long labelling loop can still be interrupted.
    except Exception:
        return "unc"
    
def ft_pred(text_):
    """Return the first element of one prediction entry, or ``'unc'``.

    Parameters
    ----------
    text_ : sequence
        Despite the name, this is not raw text: the notebook passes each
        element of ``model.predict(...)`` here.

    Returns
    -------
    object
        ``text_[0]``, or ``'unc'`` when indexing fails (e.g. an empty entry).
    """
    try:
        return text_[0]
    # Catch Exception rather than using a bare except so KeyboardInterrupt
    # and SystemExit are not silently turned into an 'unc' label.
    except Exception:
        return 'unc'

In [5]:
# Preview the first rows of the parsed corpus.
data.head()

Unnamed: 0,cld2,ft,google,text
0,ru,ru,ru,каждый раз переебывает с того как в один момен...
1,en,en,en,Safari\n#sandiegosafaripark #sandiego #safari ...
2,en,en,en,Shinsuke Nakamura knew there was only one Wres...
3,es,es,es,Esto es muy pero muy cierto
4,ru,ru,ru,"Полузащитник ""Манчестер Юнайтед"" Поль Погба за..."


In [6]:
# Time CLD2 over the whole corpus. The timed statement also overwrites the
# "cld2" column loaded from the file with fresh predictions. %timeit repeats
# the statement (the recorded output reports 7 runs), so the column is
# recomputed on every run.
%timeit data["cld2"] = [cld2_pred(i) for i in data["text"].values.tolist()]

6.5 s ± 810 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# Time fastText over the whole corpus: model.predict receives the full list of
# texts in one call and ft_pred reduces each returned entry to its first
# element. The timed statement also overwrites the "ft" column loaded from the
# file. %timeit repeats the statement (the recorded output reports 7 runs).
%timeit data["ft"] = [ft_pred(i) for i in model.predict(data["text"].values.tolist())]

7.66 s ± 411 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
def lang_summary(lang, col, df=None):
    """Score one detector on one language, using the "google" column as truth.

    Parameters
    ----------
    lang : str
        Language label to evaluate.
    col : str
        Name of the column holding the detector's predictions.
    df : pandas.DataFrame, optional
        Frame with a "google" column and the ``col`` column. Defaults to the
        notebook-level ``data`` frame.

    Returns
    -------
    tuple
        ``(precision, recall, f1)``, each rounded to 3 decimals. Precision is
        NaN when ``lang`` is never predicted and recall is NaN when ``lang``
        never occurs in "google"; F1 is then NaN too. When precision and
        recall are both 0, F1 is 0.0 rather than the NaN that 0/0 would give.
    """
    if df is None:
        df = data
    # Build each mask once instead of re-evaluating it for every lookup.
    predicted = df[col] == lang
    actual = df["google"] == lang
    # Precision: share of rows predicted as `lang` that really are `lang`.
    prec = actual[predicted].mean()
    # Recall: share of rows that really are `lang` that were predicted as such.
    rec = predicted[actual].mean()
    denom = prec + rec
    # A NaN denominator is truthy, so undefined inputs still yield NaN here.
    f1 = 2 * prec * rec / denom if denom else 0.0
    return round(prec, 3), round(rec, 3), round(f1, 3)

In [9]:
# Score both detectors on the 20 most frequent "google" labels.
# The nested dict is keyed as results[detector][language].
top_langs = data["google"].value_counts().index[:20]
results = {
    detector: {lang: lang_summary(lang, detector) for lang in top_langs}
    for detector in ["cld2", "ft"]
}

In [10]:
# Turn the nested dict into a table: outer keys (approaches) become columns,
# inner keys (languages) become the index, and each cell holds the
# (precision, recall, f1) tuple returned by lang_summary.
res = pd.DataFrame.from_dict(results)

In [11]:
# Expand each approach's (precision, recall, f1) tuple into three scalar
# columns, then drop the tuple columns. Elements are picked by position with
# `.str[pos]`; tuple-unpacking the `.str` accessor itself relies on accessor
# iteration, which newer pandas releases no longer support.
for approach in ["cld2", "ft"]:
    for pos, metric in enumerate(["prec", "rec", "f1"]):
        res[approach + "_" + metric] = res[approach].str[pos]
res = res.drop(columns=["cld2", "ft"])

In [12]:
# Replace the flat metric column names with a two-level header:
# approach on the outer level, metric on the inner one. The product order
# matches the existing column order (all cld2 metrics, then all ft metrics).
res.columns = pd.MultiIndex.from_product(
    [["cld2", "ft"], ["precision", "recall", "f1_score"]],
    names=["approach", "metrics"],
)

In [15]:
# Show the per-language precision / recall / F1 table for both approaches.
res

approach,cld2,cld2,cld2,ft,ft,ft
metrics,precision,recall,f1_score,precision,recall,f1_score
ar,0.992,0.725,0.838,0.918,0.697,0.793
az,0.95,0.752,0.839,0.888,0.547,0.677
bg,0.529,0.136,0.217,0.286,0.178,0.219
en,0.949,0.844,0.894,0.885,0.869,0.877
es,0.987,0.653,0.786,0.709,0.814,0.758
fr,0.991,0.713,0.829,0.53,0.803,0.638
id,0.763,0.543,0.634,0.481,0.404,0.439
it,0.975,0.466,0.631,0.519,0.778,0.622
ja,0.994,0.899,0.944,0.602,0.842,0.702
ka,0.962,0.995,0.979,0.959,0.905,0.931
