# Creation of Language Training Profiles

## 1. Required Imports

In [442]:
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd

## 2. Read Wili-2018 Dataset and filter it for 34 European Languages

In [443]:
data = pd.read_csv('/Users/ashwinkumar/Desktop/MLProj/wili-2018/x_train.txt', sep='\r\n', header=None, engine='python')
res = pd.read_csv('/Users/ashwinkumar/Desktop/MLProj/wili-2018/y_train.txt', sep='\r\n', header=None, engine='python')
langs = ['slk']
#'nld','swe']
#,'tur','lat','dan','por','fra','bos','bul','gle','be-tarask','spa','isl','cym','srp','ita','fin','cat','ron','ces','rus','lit','sqi','slk','ukr','hrv','eng','mlt','hye','nno','hun','deu','mwl','pol','ell','ltz']
data['1'] = res
data.columns = ['text', 'lang']
data = data[data['lang'].isin(langs)].reset_index(drop=True)
data.head()

Unnamed: 0,text,lang
0,január 2000 – po voľbách sa časť poslancov zvo...,slk
1,Kniha obsahuje kompiláciu výrokov a rozhovorov...,slk
2,Waters bývalých spoluhráčov zažaloval za neopr...,slk
3,Třebovice je obec v Česku v okrese Ústí nad Or...,slk
4,Vydávala periodiká Časopis Muzeálnej slovenske...,slk


## 3. Print filtered Lang Samples and Shape of Dataframe

In [444]:
print(data.head())
data.shape

                                                text lang
0  január 2000 – po voľbách sa časť poslancov zvo...  slk
1  Kniha obsahuje kompiláciu výrokov a rozhovorov...  slk
2  Waters bývalých spoluhráčov zažaloval za neopr...  slk
3  Třebovice je obec v Česku v okrese Ústí nad Or...  slk
4  Vydávala periodiká Časopis Muzeálnej slovenske...  slk


(500, 2)

## 4. Preprocess Lang Data(Remove punctuations and digits)

In [445]:
data['text'] = data['text'].str.replace(r'[^\w\s]+', '')
data['text'] = data['text'].str.replace(r'[\d]+', '')
data['text'] = data['text'].astype(str).str.lower()
data.shape

(500, 2)

## 4. Tokenize Lang Data to Prepare Lang Profiles

In [446]:
#data = data.groupby('lang')['text'].apply(' '.join).reset_index()
data['text'] = data.text.apply(word_tokenize)
data.shape

(500, 2)

## 5. Function to Create N-GRAMS

In [447]:
def word2ngrams(text, n=2):
  """ Convert word into character ngrams. """
  return [text[i:i+n] for i in range(len(text)-n+1)]

## 6. Generate Lang Profiles using Mono, Bi and Tri GRAMS

In [448]:
ngrams = []
data['ngrams'] = ''
for lang in data.text:
    lang_ngram = []
    for word in lang:
        lang_ngram = lang_ngram + word2ngrams(word,1) + word2ngrams(word,2) 
    lang_ngram = sorted(list(set(lang_ngram)), key=len)
    ngrams.append(lang_ngram)

for i in range(len(ngrams)):
    data.at[i, 'ngrams'] = ngrams[i]
data.shape

(500, 3)

In [449]:
data.head()

Unnamed: 0,text,lang,ngrams
0,"[január, po, voľbách, sa, časť, poslancov, zvo...",slk,"[í, a, e, r, š, ľ, t, ž, á, o, ô, ť, z, c, b, ..."
1,"[kniha, obsahuje, kompiláciu, výrokov, a, rozh...",slk,"[í, a, g, e, ó, r, š, t, ž, á, o, ô, ť, z, c, ..."
2,"[waters, bývalých, spoluhráčov, zažaloval, za,...",slk,"[í, a, e, g, ó, ď, r, š, t, w, ž, á, o, ť, z, ..."
3,"[třebovice, je, obec, v, česku, v, okrese, úst...",slk,"[í, a, e, r, ľ, t, ž, á, o, z, c, b, n, p, č, ..."
4,"[vydávala, periodiká, časopis, muzeálnej, slov...",slk,"[í, a, e, r, t, á, o, z, b, n, p, č, l, é, ý, ..."


## 7. Write Generated Lang Profiles in JSON

In [450]:
data.to_json('/Users/ashwinkumar/Desktop/MLProj/wili-2018/lang_profiles.json', orient='values')

# Detect Language using Test Language Profiles

## 1. Load Test Data

In [451]:
test_data = pd.read_csv('/Users/ashwinkumar/Desktop/MLProj/wili-2018/x_test.txt', sep='\r\n', header=None, engine='python')
test_res = pd.read_csv('/Users/ashwinkumar/Desktop/MLProj/wili-2018/y_test.txt', sep='\r\n', header=None, engine='python')
test_langs = ['slk']
    #'swe','nld','tur','lat','dan','por','fra','bos','bul','gle','be-tarask','spa','isl','cym','srp','ita','fin','cat','ron','ces','rus','lit','sqi','slk','ukr','hrv','eng','mlt','hye','nno','hun','deu','mwl','pol','ell','ltz']
test_data['1'] = test_res
test_data.columns = ['text', 'lang']
test_data = test_data[test_data['lang'].isin(langs)].reset_index(drop=True)
test_data.head()

Unnamed: 0,text,lang
0,Podujal sa preto rozlúštiť tento paradox. Syst...,slk
1,V roku 2002 ho prezident George W. Bush vymeno...,slk
2,"V hudbe sa za Underground označuje smer, ktorý...",slk
3,Do roku 1919 mladé talenty zo Slovenska navšte...,slk
4,George Hammond nastúpil ako generál v prvom di...,slk


## 2. Print filtered Lang Test Samples and Shape of Dataframe

In [452]:
print(test_data.head())
test_data.shape

                                                text lang
0  Podujal sa preto rozlúštiť tento paradox. Syst...  slk
1  V roku 2002 ho prezident George W. Bush vymeno...  slk
2  V hudbe sa za Underground označuje smer, ktorý...  slk
3  Do roku 1919 mladé talenty zo Slovenska navšte...  slk
4  George Hammond nastúpil ako generál v prvom di...  slk


(500, 2)

## 3. Preprocess Test Lang Data(Remove punctuations and digits)

In [453]:
test_data['text'] = test_data['text'].str.replace(r'[^\w\s]+', '')
test_data['text'] = test_data['text'].str.replace(r'[\d]+', '')
test_data['text'] = test_data['text'].astype(str).str.lower()
test_data.head()

Unnamed: 0,text,lang
0,podujal sa preto rozlúštiť tento paradox syste...,slk
1,v roku ho prezident george w bush vymenoval d...,slk
2,v hudbe sa za underground označuje smer ktorý ...,slk
3,do roku mladé talenty zo slovenska navštevova...,slk
4,george hammond nastúpil ako generál v prvom di...,slk


## 4. Tokenize Test Lang Data to Prepare Lang Profiles

In [454]:
test_data['text'] = test_data.text.apply(word_tokenize)
test_data.head()

Unnamed: 0,text,lang
0,"[podujal, sa, preto, rozlúštiť, tento, paradox...",slk
1,"[v, roku, ho, prezident, george, w, bush, vyme...",slk
2,"[v, hudbe, sa, za, underground, označuje, smer...",slk
3,"[do, roku, mladé, talenty, zo, slovenska, navš...",slk
4,"[george, hammond, nastúpil, ako, generál, v, p...",slk


## 5. Generate Test Lang Profiles using Mono, Bi and Tri GRAMS

In [455]:
ngrams = []
test_data['ngrams'] = ''
for lang in test_data.text:
    lang_ngram = []
    for word in lang:
        lang_ngram = lang_ngram + word2ngrams(word,1) + word2ngrams(word,2) 
    lang_ngram = sorted(list(set(lang_ngram)), key=len)
    ngrams.append(lang_ngram)
#print(ngrams)
for i in range(len(ngrams)):
    test_data.at[i, 'ngrams'] = ngrams[i]
test_data.head()

Unnamed: 0,text,lang,ngrams
0,"[podujal, sa, preto, rozlúštiť, tento, paradox...",slk,"[í, a, e, r, š, ľ, t, ž, á, o, ť, z, c, b, n, ..."
1,"[v, roku, ho, prezident, george, w, bush, vyme...",slk,"[a, g, e, r, š, t, w, á, o, ť, z, c, b, n, p, ..."
2,"[v, hudbe, sa, za, underground, označuje, smer...",slk,"[í, a, g, e, ó, r, š, ľ, t, ž, á, o, z, c, b, ..."
3,"[do, roku, mladé, talenty, zo, slovenska, navš...",slk,"[í, a, e, ó, r, š, t, ž, á, o, ť, z, c, b, n, ..."
4,"[george, hammond, nastúpil, ako, generál, v, p...",slk,"[í, a, g, e, ď, r, t, w, ž, á, o, ô, ť, z, c, ..."


In [456]:
test_data.shape

(500, 3)

In [457]:
print(data['ngrams'][0])
print(test_data['ngrams'][0])

['í', 'a', 'e', 'r', 'š', 'ľ', 't', 'ž', 'á', 'o', 'ô', 'ť', 'z', 'c', 'b', 'n', 'p', 'č', 'l', 'ý', 'i', 'm', 'k', 'd', 's', 'u', 'ú', 'j', 'h', 'v', 'ča', 'ol', 'tr', 'em', 'ún', 'va', 'án', 'kú', 'ul', 'ľb', 'od', 'uá', 'mi', 'li', 'mo', 'áš', 'ši', 'nu', 'no', 'ik', 'ož', 'en', 'zu', 'ti', 'co', 'sd', 'aj', 'dz', 'ck', 'an', 'ko', 'ch', 'vo', 'ní', 'po', 'at', 'sť', 'in', 'de', 'os', 'do', 'oľ', 'sv', 'ím', 'rá', 'zv', 'ťa', 'ic', 'ni', 'sk', 'vú', 'sl', 'ja', 've', 'mk', 'ôv', 'ji', 'im', 'ri', 'dn', 'dk', 'st', 'nc', 'ša', 'za', 're', 'la', 'ži', 'ed', 'il', 'oj', 'ra', 'ok', 'me', 'du', 'ác', 'át', 'nd', 'lo', 'zi', 'ný', 'kr', 'ýc', 'it', 'sa', 'ku', 'bá', 'lá', 'le', 'ár', 'ur', 'iu', 'ns', 'as', 'iv', 'es', 'pô', 'vr', 'al', 'ov']
['í', 'a', 'e', 'r', 'š', 'ľ', 't', 'ž', 'á', 'o', 'ť', 'z', 'c', 'b', 'n', 'p', 'x', 'č', 'l', 'ý', 'i', 'm', 'y', 'k', 'd', 's', 'u', 'ú', 'j', 'h', 'v', 'by', 'dš', 'em', 'ol', 'kt', 'ar', 'ej', 'sú', 'vý', 'ky', 'ne', 'od', 'mi', 'li', 'uj', 'uk

In [458]:
data.shape

(500, 3)

In [459]:
UB = 0.95
LB = 0.92

for i in range(len(test_data)):
    for j in range(len(data)):
        data.at[j, 'match_rate_lang_'+str(i)] = len(set(test_data.iloc[i]['ngrams']).intersection(data.iloc[j]['ngrams']))/len(data.iloc[j]['ngrams'])
data.head()

Unnamed: 0,text,lang,ngrams,match_rate_lang_0,match_rate_lang_1,match_rate_lang_2,match_rate_lang_3,match_rate_lang_4,match_rate_lang_5,match_rate_lang_6,...,match_rate_lang_490,match_rate_lang_491,match_rate_lang_492,match_rate_lang_493,match_rate_lang_494,match_rate_lang_495,match_rate_lang_496,match_rate_lang_497,match_rate_lang_498,match_rate_lang_499
0,"[január, po, voľbách, sa, časť, poslancov, zvo...",slk,"[í, a, e, r, š, ľ, t, ž, á, o, ô, ť, z, c, b, ...",0.607692,0.515385,0.492308,0.623077,0.746154,0.484615,0.538462,...,0.415385,0.569231,0.676923,0.4,0.361538,0.684615,0.476923,0.4,0.715385,0.407692
1,"[kniha, obsahuje, kompiláciu, výrokov, a, rozh...",slk,"[í, a, g, e, ó, r, š, t, ž, á, o, ô, ť, z, c, ...",0.475113,0.361991,0.41629,0.511312,0.615385,0.343891,0.393665,...,0.361991,0.488688,0.615385,0.312217,0.289593,0.61086,0.429864,0.289593,0.59276,0.321267
2,"[waters, bývalých, spoluhráčov, zažaloval, za,...",slk,"[í, a, e, g, ó, ď, r, š, t, w, ž, á, o, ť, z, ...",0.5,0.39011,0.423077,0.521978,0.681319,0.384615,0.423077,...,0.384615,0.532967,0.692308,0.362637,0.351648,0.675824,0.412088,0.313187,0.67033,0.373626
3,"[třebovice, je, obec, v, česku, v, okrese, úst...",slk,"[í, a, e, r, ľ, t, ž, á, o, z, c, b, n, p, č, ...",0.660714,0.5,0.580357,0.642857,0.741071,0.544643,0.589286,...,0.419643,0.571429,0.714286,0.446429,0.410714,0.705357,0.517857,0.535714,0.75,0.446429
4,"[vydávala, periodiká, časopis, muzeálnej, slov...",slk,"[í, a, e, r, t, á, o, z, b, n, p, č, l, é, ý, ...",0.653846,0.490385,0.548077,0.682692,0.75,0.557692,0.461538,...,0.451923,0.692308,0.798077,0.5,0.413462,0.730769,0.538462,0.432692,0.740385,0.442308


In [460]:
print('Language\tScore')
for i in range(len(test_data)):
    print(data.loc[data['match_rate_lang_'+str(i)].idxmax()]['lang']+'\t'+str(data.loc[data['match_rate_lang_'+str(i)].idxmax()]['match_rate_lang_'+str(i)]))

Language	Score
slk	0.7289719626168224
slk	0.6428571428571429
slk	0.651685393258427
slk	0.7653061224489796
slk	0.8909090909090909
slk	0.6261682242990654
slk	0.6206896551724138
slk	0.723404255319149
slk	0.7264150943396226
slk	0.7916666666666666
slk	0.8426966292134831
slk	0.5833333333333334
slk	0.7037037037037037
slk	0.625
slk	0.7641509433962265
slk	0.651685393258427
slk	0.6701030927835051
slk	0.6292134831460674
slk	0.7191011235955056
slk	0.7254901960784313
slk	0.8085106382978723
slk	0.7962962962962963
slk	0.594059405940594
slk	0.7528089887640449
slk	0.6608695652173913
slk	0.9401709401709402
slk	0.6063829787234043
slk	0.7078651685393258
slk	0.6039603960396039
slk	0.6454545454545455
slk	0.7528089887640449
slk	0.5619047619047619
slk	0.7446808510638298
slk	0.6636363636363637
slk	0.6527777777777778
slk	0.6506024096385542
slk	0.7289719626168224
slk	0.7435897435897436
slk	0.5263157894736842
slk	0.5046728971962616
slk	0.635593220338983
slk	0.6506849315068494
slk	0.5154639175257731
slk	0.65822784

slk	0.7647058823529411
slk	0.6138613861386139
slk	0.7980769230769231
slk	0.8018867924528302
slk	0.8363636363636363
slk	0.7272727272727273
slk	0.851063829787234
slk	0.5490196078431373
slk	0.6956521739130435
slk	0.7128712871287128
slk	0.7663551401869159
slk	0.8194444444444444
slk	0.7352941176470589
slk	0.8061224489795918
slk	0.5730337078651685
slk	0.7211538461538461
slk	0.5862068965517241
slk	0.7391304347826086
slk	0.8144329896907216
slk	0.5138888888888888
slk	0.7391304347826086
slk	0.8545454545454545
slk	0.5670103092783505
slk	0.5979381443298969
slk	0.6111111111111112
slk	0.696969696969697
slk	0.4473684210526316
slk	0.5046728971962616
slk	0.6440677966101694
slk	0.8454545454545455
slk	0.5972222222222222
slk	0.5490196078431373
slk	0.5
slk	0.7872340425531915
slk	0.5833333333333334
slk	0.6632653061224489
slk	0.6434782608695652
slk	0.5217391304347826
slk	0.5660377358490566
slk	0.5074626865671642
slk	0.6261682242990654
slk	0.46846846846846846
slk	0.5607476635514018
slk	0.7884615384615384
slk	

In [461]:
results = []
for i in range(len(test_data)):
    results.append({'Language':data.loc[data['match_rate_lang_'+str(i)].idxmax()]['lang'], 'Score':data.loc[data['match_rate_lang_'+str(i)].idxmax()]['match_rate_lang_'+str(i)]})
res_df = pd.DataFrame(results)
res_df.head()

Unnamed: 0,Language,Score
0,slk,0.728972
1,slk,0.642857
2,slk,0.651685
3,slk,0.765306
4,slk,0.890909


In [462]:
res_df['Score'].groupby(res_df['Language']).mean()

Language
slk    0.686813
Name: Score, dtype: float64