# Creation of Language Training Profiles

## 1. Required Imports

In [2]:
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd

## 2. Read Wili-2018 Dataset and filter it for 34 European Languages

In [699]:
data = pd.read_csv('/Users/ashwinkumar/Desktop/MLProj/wili-2018/x_train.txt', sep='\r\n', header=None, engine='python')
res = pd.read_csv('/Users/ashwinkumar/Desktop/MLProj/wili-2018/y_train.txt', sep='\r\n', header=None, engine='python')
langs = ['slk']
#'nld','swe']
#,'tur','lat','dan','por','fra','bos','bul','gle','be-tarask','spa','isl','cym','srp','ita','fin','cat','ron','ces','rus','lit','sqi','slk','ukr','hrv','eng','mlt','hye','nno','hun','deu','mwl','pol','ell','ltz']
data['1'] = res
data.columns = ['text', 'lang']
data = data[data['lang'].isin(langs)].reset_index(drop=True)
data.head()

Unnamed: 0,text,lang
0,január 2000 – po voľbách sa časť poslancov zvo...,slk
1,Kniha obsahuje kompiláciu výrokov a rozhovorov...,slk
2,Waters bývalých spoluhráčov zažaloval za neopr...,slk
3,Třebovice je obec v Česku v okrese Ústí nad Or...,slk
4,Vydávala periodiká Časopis Muzeálnej slovenske...,slk


## 3. Print filtered Lang Samples and Shape of Dataframe

In [700]:
print(data.head())
data.shape

                                                text lang
0  január 2000 – po voľbách sa časť poslancov zvo...  slk
1  Kniha obsahuje kompiláciu výrokov a rozhovorov...  slk
2  Waters bývalých spoluhráčov zažaloval za neopr...  slk
3  Třebovice je obec v Česku v okrese Ústí nad Or...  slk
4  Vydávala periodiká Časopis Muzeálnej slovenske...  slk


(500, 2)

## 4. Preprocess Lang Data(Remove punctuations and digits)

In [701]:
data['text'] = data['text'].str.replace(r'[^\w\s]+', '')
data['text'] = data['text'].str.replace(r'[\d]+', '')
data['text'] = data['text'].astype(str).str.lower()
data.shape

(500, 2)

## 4. Tokenize Lang Data to Prepare Lang Profiles

In [702]:
#data = data.groupby('lang')['text'].apply(' '.join).reset_index()
data['text'] = data.text.apply(word_tokenize)
data.shape

(500, 2)

## 5. Function to Create N-GRAMS

In [703]:
def word2ngrams(text, n=1):
  """ Convert word into character ngrams. """
  return [text[i:i+n] for i in range(len(text)-n+1)]

## 6. Generate Lang Profiles using Mono, Bi and Tri GRAMS

In [704]:
ngrams = []
data['ngrams'] = ''
for lang in data.text:
    lang_ngram = []
    for word in lang:
        lang_ngram = lang_ngram + word2ngrams(word,1) 
    lang_ngram = sorted(list(set(lang_ngram)), key=len)
    ngrams.append(lang_ngram)

for i in range(len(ngrams)):
    data.at[i, 'ngrams'] = ngrams[i]
data.shape

(500, 3)

In [705]:
data.head()

Unnamed: 0,text,lang,ngrams
0,"[január, po, voľbách, sa, časť, poslancov, zvo...",slk,"[l, v, ž, k, n, á, a, ú, p, š, r, c, z, ť, m, ..."
1,"[kniha, obsahuje, kompiláciu, výrokov, a, rozh...",slk,"[l, v, ž, k, é, n, á, a, ú, p, š, r, c, z, m, ..."
2,"[waters, bývalých, spoluhráčov, zažaloval, za,...",slk,"[l, v, ž, k, ä, é, n, á, a, ú, p, š, r, c, z, ..."
3,"[třebovice, je, obec, v, česku, v, okrese, úst...",slk,"[l, v, ž, k, n, á, a, ú, p, r, c, z, m, č, d, ..."
4,"[vydávala, periodiká, časopis, muzeálnej, slov...",slk,"[l, v, k, é, n, á, a, ú, p, r, z, m, č, d, e, ..."


## 7. Write Generated Lang Profiles in JSON

In [706]:
data.to_json('/Users/ashwinkumar/Desktop/MLProj/wili-2018/lang_profiles.json', orient='values')

# Detect Language using Test Language Profiles

## 1. Load Test Data

In [707]:
test_data = pd.read_csv('/Users/ashwinkumar/Desktop/MLProj/wili-2018/x_test.txt', sep='\r\n', header=None, engine='python')
test_res = pd.read_csv('/Users/ashwinkumar/Desktop/MLProj/wili-2018/y_test.txt', sep='\r\n', header=None, engine='python')
test_langs = ['slk']
    #'swe','nld','tur','lat','dan','por','fra','bos','bul','gle','be-tarask','spa','isl','cym','srp','ita','fin','cat','ron','ces','rus','lit','sqi','slk','ukr','hrv','eng','mlt','hye','nno','hun','deu','mwl','pol','ell','ltz']
test_data['1'] = test_res
test_data.columns = ['text', 'lang']
test_data = test_data[test_data['lang'].isin(langs)].reset_index(drop=True)
test_data.head()

Unnamed: 0,text,lang
0,Podujal sa preto rozlúštiť tento paradox. Syst...,slk
1,V roku 2002 ho prezident George W. Bush vymeno...,slk
2,"V hudbe sa za Underground označuje smer, ktorý...",slk
3,Do roku 1919 mladé talenty zo Slovenska navšte...,slk
4,George Hammond nastúpil ako generál v prvom di...,slk


## 2. Print filtered Lang Test Samples and Shape of Dataframe

In [708]:
print(test_data.head())
test_data.shape

                                                text lang
0  Podujal sa preto rozlúštiť tento paradox. Syst...  slk
1  V roku 2002 ho prezident George W. Bush vymeno...  slk
2  V hudbe sa za Underground označuje smer, ktorý...  slk
3  Do roku 1919 mladé talenty zo Slovenska navšte...  slk
4  George Hammond nastúpil ako generál v prvom di...  slk


(500, 2)

## 3. Preprocess Test Lang Data(Remove punctuations and digits)

In [709]:
test_data['text'] = test_data['text'].str.replace(r'[^\w\s]+', '')
test_data['text'] = test_data['text'].str.replace(r'[\d]+', '')
test_data['text'] = test_data['text'].astype(str).str.lower()
test_data.head()

Unnamed: 0,text,lang
0,podujal sa preto rozlúštiť tento paradox syste...,slk
1,v roku ho prezident george w bush vymenoval d...,slk
2,v hudbe sa za underground označuje smer ktorý ...,slk
3,do roku mladé talenty zo slovenska navštevova...,slk
4,george hammond nastúpil ako generál v prvom di...,slk


## 4. Tokenize Test Lang Data to Prepare Lang Profiles

In [710]:
test_data['text'] = test_data.text.apply(word_tokenize)
test_data.head()

Unnamed: 0,text,lang
0,"[podujal, sa, preto, rozlúštiť, tento, paradox...",slk
1,"[v, roku, ho, prezident, george, w, bush, vyme...",slk
2,"[v, hudbe, sa, za, underground, označuje, smer...",slk
3,"[do, roku, mladé, talenty, zo, slovenska, navš...",slk
4,"[george, hammond, nastúpil, ako, generál, v, p...",slk


## 5. Generate Test Lang Profiles using Mono, Bi and Tri GRAMS

In [711]:
ngrams = []
test_data['ngrams'] = ''
for lang in test_data.text:
    lang_ngram = []
    for word in lang:
        lang_ngram = lang_ngram + word2ngrams(word,1) 
    lang_ngram = sorted(list(set(lang_ngram)), key=len)
    ngrams.append(lang_ngram)
#print(ngrams)
for i in range(len(ngrams)):
    test_data.at[i, 'ngrams'] = ngrams[i]
test_data.head()

Unnamed: 0,text,lang,ngrams
0,"[podujal, sa, preto, rozlúštiť, tento, paradox...",slk,"[l, v, ž, k, n, á, a, ú, p, š, r, z, c, ť, m, ..."
1,"[v, roku, ho, prezident, george, w, bush, vyme...",slk,"[l, v, k, é, n, á, a, ú, p, š, r, z, c, m, ť, ..."
2,"[v, hudbe, sa, za, underground, označuje, smer...",slk,"[l, v, ž, k, ä, é, n, á, a, p, š, r, z, c, m, ..."
3,"[do, roku, mladé, talenty, zo, slovenska, navš...",slk,"[l, v, ž, k, é, n, á, a, ú, p, š, r, z, c, m, ..."
4,"[george, hammond, nastúpil, ako, generál, v, p...",slk,"[l, v, ž, k, ä, é, n, á, a, ú, p, r, z, c, m, ..."


In [712]:
test_data.shape

(500, 3)

In [713]:
print(data['ngrams'][0])
print(test_data['ngrams'][0])

['l', 'v', 'ž', 'k', 'n', 'á', 'a', 'ú', 'p', 'š', 'r', 'c', 'z', 'ť', 'm', 'ô', 'č', 'd', 'e', 'j', 't', 'o', 'u', 'h', 'ý', 'í', 'i', 'b', 's', 'ľ']
['l', 'v', 'ž', 'k', 'n', 'á', 'a', 'ú', 'p', 'š', 'r', 'z', 'c', 'ť', 'm', 'č', 'd', 'e', 'j', 't', 'o', 'u', 'x', 'h', 'ý', 'í', 'i', 'y', 'b', 's', 'ľ']


In [714]:
data.shape


(500, 3)

In [715]:
UB = 0.95
LB = 0.92

for i in range(len(test_data)):
    for j in range(len(data)):
        data.at[j, 'match_rate_lang_'+str(i)] = len(set(test_data.iloc[i]['ngrams']).intersection(data.iloc[j]['ngrams']))/len(data.iloc[j]['ngrams'])
data.head()

Unnamed: 0,text,lang,ngrams,match_rate_lang_0,match_rate_lang_1,match_rate_lang_2,match_rate_lang_3,match_rate_lang_4,match_rate_lang_5,match_rate_lang_6,...,match_rate_lang_490,match_rate_lang_491,match_rate_lang_492,match_rate_lang_493,match_rate_lang_494,match_rate_lang_495,match_rate_lang_496,match_rate_lang_497,match_rate_lang_498,match_rate_lang_499
0,"[január, po, voľbách, sa, časť, poslancov, zvo...",slk,"[l, v, ž, k, n, á, a, ú, p, š, r, c, z, ť, m, ...",0.966667,0.833333,0.9,0.933333,0.933333,0.833333,0.933333,...,0.8,0.933333,0.966667,0.833333,0.7,0.966667,0.9,0.766667,0.933333,0.866667
1,"[kniha, obsahuje, kompiláciu, výrokov, a, rozh...",slk,"[l, v, ž, k, é, n, á, a, ú, p, š, r, c, z, m, ...",0.852941,0.852941,0.882353,0.941176,0.941176,0.764706,0.852941,...,0.823529,0.911765,0.970588,0.794118,0.735294,0.970588,0.852941,0.705882,0.911765,0.852941
2,"[waters, bývalých, spoluhráčov, zažaloval, za,...",slk,"[l, v, ž, k, ä, é, n, á, a, ú, p, š, r, c, z, ...",0.805556,0.833333,0.861111,0.888889,0.944444,0.722222,0.805556,...,0.777778,0.861111,0.944444,0.777778,0.722222,0.972222,0.777778,0.666667,0.916667,0.805556
3,"[třebovice, je, obec, v, česku, v, okrese, úst...",slk,"[l, v, ž, k, n, á, a, ú, p, r, c, z, m, č, d, ...",0.965517,0.827586,0.931034,0.931034,0.931034,0.896552,0.931034,...,0.758621,0.931034,0.965517,0.896552,0.724138,0.965517,0.896552,0.793103,0.965517,0.862069
4,"[vydávala, periodiká, časopis, muzeálnej, slov...",slk,"[l, v, k, é, n, á, a, ú, p, r, z, m, č, d, e, ...",0.961538,0.923077,0.961538,1.0,1.0,0.884615,0.961538,...,0.807692,1.0,1.0,1.0,0.807692,1.0,0.961538,0.846154,1.0,0.923077


In [716]:
print('Language\tScore')
for i in range(len(test_data)):
    print(data.loc[data['match_rate_lang_'+str(i)].idxmax()]['lang']+'\t'+str(data.loc[data['match_rate_lang_'+str(i)].idxmax()]['match_rate_lang_'+str(i)]))

Language	Score
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	0.9629629629629629
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	0.9583333333333334
slk	1.0
slk	0.96
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	0.9259259259259259
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	0.9583333333333334
slk	1.0
slk	1.0
slk	0.9655172413793104
slk	1.0
slk	1.0
slk	1.0
slk	0.9130434782608695
slk	0.9166666666666666
slk	1.0
slk	1.0
slk	0.9259259259259259
slk	1.0
slk	1.0
slk	0.9666666666666667
slk	0.9166666666666666
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	0.9583333333333334
slk	1.0
slk	0.9230769230769231
slk	0.9629629629629629
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	1.0
slk	0.9629629629629629
slk	0.9642857142857143
slk	1.0
slk	1.0
slk	0.9615384615384616
slk	1.0
slk	0.9615384615384616
slk	1.0
slk	0.9583333333333334
slk	0.8461538461538461
slk	0.9629629629629629
slk	1.0
slk	1.0
slk	

In [717]:
results = []
for i in range(len(test_data)):
    results.append({'Language':data.loc[data['match_rate_lang_'+str(i)].idxmax()]['lang'], 'Score':data.loc[data['match_rate_lang_'+str(i)].idxmax()]['match_rate_lang_'+str(i)]})
res_df = pd.DataFrame(results)
res_df.head()

Unnamed: 0,Language,Score
0,slk,1.0
1,slk,1.0
2,slk,1.0
3,slk,1.0
4,slk,1.0


In [718]:
res_df['Score'].groupby(res_df['Language']).mean()

Language
slk    0.986601
Name: Score, dtype: float64