# Creation of Language Training Profiles

## 1. Required Imports

In [90]:
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd

## 2. Read Wili-2018 Dataset and filter it for 37 European Languages

In [134]:
data = pd.read_csv('/Users/tofii/Downloads/Language Detection/wili-2018/x_train.txt', sep='\r\n', header=None, engine='python')
res = pd.read_csv('/Users/tofii/Downloads/Language Detection/wili-2018/y_train.txt', sep='\r\n', header=None, engine='python')
langs = ['eng','deu','fra','ita','spa']
#'nld','swe']
#,'tur','lat','dan','por','fra','bos','bul','gle','be-tarask','spa','isl','cym','srp','ita','fin','cat','ron','ces','rus','lit','sqi','slk','ukr','hrv','eng','mlt','hye','nno','hun','deu','mwl','pol','ell','ltz']
data['1'] = res
data.columns = ['text', 'lang']
data = data[data['lang'].isin(langs)].reset_index(drop=True)
data.head()

Unnamed: 0,text,lang
0,Association de recherche et de sauvegarde de l...,fra
1,"En Navidad de 1974, poco después de que interp...",spa
2,La chirurgie comprenant principalement l'ablat...,fra
3,"Dès les années 1920, les communes voisines d'A...",fra
4,"L'ufficio progettazione, sembra ormai certo, s...",ita


## 3. Print filtered Lang Samples and Shape of Dataframe

In [135]:
print(data.head())
data.shape

                                                text lang
0  Association de recherche et de sauvegarde de l...  fra
1  En Navidad de 1974, poco después de que interp...  spa
2  La chirurgie comprenant principalement l'ablat...  fra
3  Dès les années 1920, les communes voisines d'A...  fra
4  L'ufficio progettazione, sembra ormai certo, s...  ita


(2500, 2)

## 4. Preprocess Lang Data(Remove punctuations and digits)

In [136]:
data['text'] = data['text'].str.replace(r'[^\w\s]+', '')
data['text'] = data['text'].str.replace(r'[\d]+', '')
data['text'] = data['text'].astype(str).str.lower()
data.shape

(2500, 2)

## 4. Tokenize Lang Data to Prepare Lang Profiles

In [137]:
#data = data.groupby('lang')['text'].apply(' '.join).reset_index()
data['text'] = data.text.apply(word_tokenize)
data.shape

(2500, 2)

## 5. Function to Create N-GRAMS

In [138]:
def word2ngrams(text, n=3):
  """ Convert word into character ngrams. """
  return [text[i:i+n] for i in range(len(text)-n+1)]

## 6. Generate Lang Profiles using Mono, Bi and Tri GRAMS

In [139]:
ngrams = []
data['ngrams'] = ''
for lang in data.text:
    lang_ngram = []
    for word in lang:
        lang_ngram = lang_ngram + word2ngrams(word,1) + word2ngrams(word,2) + word2ngrams(word, 3)
    lang_ngram = sorted(list(set(lang_ngram)), key=len)
    ngrams.append(lang_ngram)

for i in range(len(ngrams)):
    data.at[i, 'ngrams'] = ngrams[i]
data.shape

(2500, 3)

In [140]:
data.head()

Unnamed: 0,text,lang,ngrams
0,"[association, de, recherche, et, de, sauvegard...",fra,"[b, y, e, é, t, m, v, s, r, l, p, u, g, d, h, ..."
1,"[en, navidad, de, poco, después, de, que, inte...",spa,"[b, y, e, á, é, t, m, v, s, r, p, l, u, q, g, ..."
2,"[la, chirurgie, comprenant, principalement, la...",fra,"[b, e, x, é, t, m, v, s, r, l, p, u, g, d, h, ..."
3,"[dès, les, années, les, communes, voisines, da...",fra,"[x, r, l, p, g, e, v, s, u, m, ê, q, n, è, à, ..."
4,"[lufficio, progettazione, sembra, ormai, certo...",ita,"[b, e, t, m, v, s, r, l, p, u, g, d, h, i, o, ..."


## 7. Write Generated Lang Profiles in JSON

In [141]:
data.to_json('/Users/tofii/Downloads/Language Detection/wili-2018/lang_profiles.json', orient='values')

# Detect Language using Test Language Profiles

## 1. Load Test Data

In [142]:
test_data = pd.read_csv('/Users/tofii/Downloads/Language Detection/wili-2018/x_test.txt', sep='\r\n', header=None, engine='python')
test_res = pd.read_csv('/Users/tofii/Downloads/Language Detection/wili-2018/y_test.txt', sep='\r\n', header=None, engine='python')
test_langs = ['nld']
    #'swe','nld','tur','lat','dan','por','fra','bos','bul','gle','be-tarask','spa','isl','cym','srp','ita','fin','cat','ron','ces','rus','lit','sqi','slk','ukr','hrv','eng','mlt','hye','nno','hun','deu','mwl','pol','ell','ltz']
test_data['1'] = test_res
test_data.columns = ['text', 'lang']
test_data = test_data[test_data['lang'].isin(langs)].reset_index(drop=True)
test_data.head()

Unnamed: 0,text,lang
0,La ciudad de San Cristóbal es sede del Hospita...,spa
1,Les supporters de l'ASM Clermont Auvergne ont ...,fra
2,Anton (or Antonius) Maria Schyrleus (also Schy...,eng
3,"Ralph Staub est un réalisateur, producteur, sc...",fra
4,"Spesso usata quale motto o incitazione, ha il ...",ita


## 2. Print filtered Lang Test Samples and Shape of Dataframe

In [143]:
print(test_data.head())
test_data.shape

                                                text lang
0  La ciudad de San Cristóbal es sede del Hospita...  spa
1  Les supporters de l'ASM Clermont Auvergne ont ...  fra
2  Anton (or Antonius) Maria Schyrleus (also Schy...  eng
3  Ralph Staub est un réalisateur, producteur, sc...  fra
4  Spesso usata quale motto o incitazione, ha il ...  ita


(2500, 2)

## 3. Preprocess Test Lang Data(Remove punctuations and digits)

In [144]:
test_data['text'] = test_data['text'].str.replace(r'[^\w\s]+', '')
test_data['text'] = test_data['text'].str.replace(r'[\d]+', '')
test_data['text'] = test_data['text'].astype(str).str.lower()
test_data.head()

Unnamed: 0,text,lang
0,la ciudad de san cristóbal es sede del hospita...,spa
1,les supporters de lasm clermont auvergne ont r...,fra
2,anton or antonius maria schyrleus also schyrl ...,eng
3,ralph staub est un réalisateur producteur scén...,fra
4,spesso usata quale motto o incitazione ha il s...,ita


## 4. Tokenize Test Lang Data to Prepare Lang Profiles

In [145]:
test_data['text'] = test_data.text.apply(word_tokenize)
test_data.head()

Unnamed: 0,text,lang
0,"[la, ciudad, de, san, cristóbal, es, sede, del...",spa
1,"[les, supporters, de, lasm, clermont, auvergne...",fra
2,"[anton, or, antonius, maria, schyrleus, also, ...",eng
3,"[ralph, staub, est, un, réalisateur, producteu...",fra
4,"[spesso, usata, quale, motto, o, incitazione, ...",ita


## 5. Generate Test Lang Profiles using Mono, Bi and Tri GRAMS

In [146]:
ngrams = []
test_data['ngrams'] = ''
for lang in test_data.text:
    lang_ngram = []
    for word in lang:
        lang_ngram = lang_ngram + word2ngrams(word,1) + word2ngrams(word,2) + word2ngrams(word, 3)
    lang_ngram = sorted(list(set(lang_ngram)), key=len)
    ngrams.append(lang_ngram)
#print(ngrams)
for i in range(len(ngrams)):
    test_data.at[i, 'ngrams'] = ngrams[i]
test_data.head()

Unnamed: 0,text,lang,ngrams
0,"[la, ciudad, de, san, cristóbal, es, sede, del...",spa,"[b, e, á, t, m, v, s, r, l, p, u, g, d, h, i, ..."
1,"[les, supporters, de, lasm, clermont, auvergne...",fra,"[e, x, é, t, m, v, s, ç, r, l, p, u, q, g, d, ..."
2,"[anton, or, antonius, maria, schyrleus, also, ...",eng,"[b, y, e, x, t, m, v, z, s, r, l, p, u, š, g, ..."
3,"[ralph, staub, est, un, réalisateur, producteu...",fra,"[b, j, e, é, t, m, s, r, p, l, u, g, d, h, i, ..."
4,"[spesso, usata, quale, motto, o, incitazione, ...",ita,"[e, t, m, v, s, r, p, l, u, q, g, d, h, i, o, ..."


In [147]:
test_data.shape

(2500, 3)

In [148]:
print(data['ngrams'][0])
print(test_data['ngrams'][0])

['b', 'y', 'e', 'é', 't', 'm', 'v', 's', 'r', 'l', 'p', 'u', 'g', 'd', 'h', 'i', 'o', 'c', 'n', 'f', 'a', 'è', 'ap', 'eg', 'is', 'un', 'oi', 're', 'cl', 'ar', 'ec', 'ra', 'de', 'au', 'as', 'bi', 'oc', 'gr', 'vo', 'hi', 'at', 'he', 'er', 'ns', 'rs', 'ir', 'ph', 'ie', 've', 'ro', 'bl', 'si', 'et', 'rc', 'so', 'li', 'du', 'og', 'ia', 'sa', 'mo', 'nc', 'ga', 'la', 'ss', 'io', 'lh', 'on', 'st', 'mé', 'en', 'hr', 'iè', 'ye', 'ci', 'fr', 'rf', 'an', 'ib', 'sy', 'sh', 'ti', 'ém', 'ch', 'rd', 'nf', 'da', 'ce', 'to', 'le', 'èc', 'uv', 'rec', 'hrf', 'auv', 'hie', 'nfr', 'tio', 'che', 'anc', 'cia', 'lhi', 'ion', 'erc', 'oir', 'sso', 'dun', 'rsh', 'ech', 'rch', 'yen', 'soc', 'her', 'ass', 'siè', 'lio', 'ega', 'rap', 'gar', 'ssy', 'moi', 'toi', 'ois', 'ècl', 'ard', 'ist', 'cle', 'ars', 'his', 'bli', 'nce', 'oci', 'mém', 'voi', 'sye', 'ans', 'aph', 'ire', 'sau', 'émo', 'phi', 'dan', 'fra', 'uve', 'bib', 'iog', 'sto', 'ran', 'ati', 'ogr', 'rde', 'gra', 'iat', 'ièc', 'ibl', 'enf', 'roi', 'veg', 'shr', 

In [149]:
data.shape

(2500, 3)

In [None]:
UB = 0.95
LB = 0.92

for i in range(len(test_data)):
    for j in range(len(data)):
        data.at[j, 'match_rate_lang_'+str(i)] = len(set(test_data.iloc[i]['ngrams']).intersection(data.iloc[j]['ngrams']))/len(data.iloc[j]['ngrams'])
data.head()

In [129]:
print('Language\tScore')
for i in range(len(test_data)):
    print(data.loc[data['match_rate_lang_'+str(i)].idxmax()]['lang']+'\t'+str(data.loc[data['match_rate_lang_'+str(i)].idxmax()]['match_rate_lang_'+str(i)]))

Language	Score
nld	0.5915492957746479
swe	0.9767441860465116
swe	0.5657894736842105
swe	0.5664335664335665
nld	0.7853107344632768
nld	0.42613636363636365
nld	0.7810650887573964
swe	0.8275862068965517
swe	0.6713286713286714
swe	0.7045454545454546
nld	0.42962962962962964
swe	0.43243243243243246
swe	0.6063829787234043
swe	0.7158469945355191
nld	0.3898305084745763
swe	0.923469387755102
swe	0.7784090909090909
nld	0.6637931034482759
nld	0.7928994082840237
swe	0.7954545454545454
nld	0.5638297872340425
nld	0.6513761467889908
nld	0.6190476190476191
nld	0.562962962962963
nld	0.6206896551724138
nld	0.5460526315789473
swe	0.9678899082568807
swe	0.45495495495495497
swe	0.5257142857142857
swe	0.8026315789473685
swe	0.8768472906403941
nld	0.4619289340101523
swe	0.9906542056074766
nld	0.7276595744680852
swe	0.46835443037974683
nld	0.43783783783783786
swe	0.5070422535211268
swe	0.7114093959731543
swe	0.9736842105263158
nld	0.6699029126213593
nld	0.6810344827586207
swe	0.7738095238095238
nld	0.655172413

nld	0.7909604519774012
nld	0.52
nld	0.5422535211267606
swe	0.9732620320855615
nld	0.5302013422818792
nld	0.373015873015873
nld	0.6379310344827587
swe	0.7857142857142857
nld	0.5862068965517241
nld	0.47413793103448276
nld	0.47413793103448276
nld	0.6991869918699187
swe	0.9039301310043668
nld	0.75
swe	0.7612903225806451
swe	0.7557251908396947
swe	0.9719626168224299
nld	0.5927835051546392
swe	0.7065217391304348
swe	0.9732620320855615
nld	0.7793103448275862
nld	0.5684210526315789
nld	0.4878048780487805
nld	0.5086206896551724
swe	0.5530973451327433
swe	0.5893416927899686
swe	0.9634703196347032
swe	0.8928571428571429
nld	0.5639097744360902
nld	0.7588235294117647
nld	0.5263157894736842
nld	0.757396449704142
nld	0.5298013245033113
nld	0.6616541353383458
swe	0.9797297297297297
swe	0.9819004524886877
swe	0.624113475177305
nld	0.6190476190476191
swe	0.8103448275862069
nld	0.8806584362139918
nld	0.5140845070422535
swe	0.5490196078431373
nld	0.8806584362139918
swe	0.9672897196261683
swe	0.54225352112

swe	0.6163522012578616
swe	0.7197452229299363
nld	0.7302631578947368
nld	0.5086206896551724
swe	0.8275862068965517
nld	0.5759493670886076
nld	0.4968553459119497
swe	0.4935897435897436
nld	0.47368421052631576
nld	0.46710526315789475
nld	0.4897959183673469
nld	0.4391891891891892
nld	0.5555555555555556
swe	0.5288135593220339
swe	0.47183098591549294
nld	0.8265895953757225
swe	0.592057761732852
nld	0.6120689655172413
swe	0.7692307692307693
swe	1.0
swe	0.9625668449197861
nld	0.7810650887573964
swe	0.9408866995073891
nld	0.6459143968871596
swe	0.7677165354330708
nld	0.47904191616766467
swe	0.6547619047619048
nld	0.6111111111111112
swe	0.990521327014218
swe	0.8068181818181818
swe	0.9411764705882353
nld	0.47413793103448276
nld	0.6052631578947368
nld	0.8734939759036144
nld	0.4396551724137931
swe	0.7988165680473372
nld	0.48623853211009177
swe	0.8275862068965517
nld	0.7218045112781954
swe	0.8655913978494624
swe	0.9864253393665159
swe	0.9625668449197861
nld	0.5
swe	0.4444444444444444
swe	0.92903225

In [130]:
results = []
for i in range(len(test_data)):
    results.append({'Language':data.loc[data['match_rate_lang_'+str(i)].idxmax()]['lang'], 'Score':data.loc[data['match_rate_lang_'+str(i)].idxmax()]['match_rate_lang_'+str(i)]})
res_df = pd.DataFrame(results)
res_df.head()

Unnamed: 0,Language,Score
0,nld,0.591549
1,swe,0.976744
2,swe,0.565789
3,swe,0.566434
4,nld,0.785311


In [133]:
res_df['Score'].groupby(res_df['Language']).mean()

Language
nld    0.628586
swe    0.727538
Name: Score, dtype: float64