In [5]:
import os
import random
import numpy as np
import tensorflow as tf
import time

import warnings
warnings.filterwarnings('ignore')

In [6]:
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [7]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
import keras.optimizers

In [90]:
if tf.test.is_gpu_available():
    BATCH_SIZE = 512
    EPOCHS = 12
else:
    BATCH_SIZE = 64
    EPOCHS = 5

In [50]:
LANGUAGES_DICT = {'en':0,'fr':1,'es':2,'it':3,'de':4,'sk':5,'cs':6}
MAX_LEN = 140
NUM_SAMPLES = 250000
SEED = 42

In [37]:
from support import define_alphabet

alphabet = define_alphabet()
print('String of all characters from all above languages ', '\n', alphabet[2])

VOCAB_SIZE = len(alphabet[2])
print('Total number of characters ', VOCAB_SIZE)

String of all characters from all above languages  
 abcdefghijklmnopqrstuvwxyzßàáâäæçèéêìíîïñòóôöùúûüýÿčďěĺľňœŕřšťůž !?¿¡ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÆÇÈÉÊÌÍÎÏÑÒÓÔÖÙÚÛÜÝČĎĚĹĽŇŒŔŘŠŤŮŸŽ
Total number of characters  132


In [62]:
data_directory = "data"
source_directory = os.path.join(data_directory, 'source')
cleaned_directory = os.path.join(data_directory, 'cleaned')
samples_directory = os.path.join('/tmp', 'samples')
train_test_directory = os.path.join('/tmp', 'train_test')

In [53]:
from support import clean_text

for lang_code in LANGUAGES_DICT:

    path_src = os.path.join(source_directory, lang_code+".txt")
    with open(path_src) as source_file:

        content = source_file.read()

        print('Language : ',lang_code)
        print('Content before cleaning :-> ', content[1000:1000+MAX_LEN])

    content = clean_text(content)
    
    print ('Content after cleaning :-> ', content[1000:1000+MAX_LEN])
    
    path_cl = os.path.join(cleaned_directory, lang_code + '_cleaned.txt')
    with open(path_cl,'w') as cleaned_file:
        cleaned_file.write(content)
    
    # Free the memory
    del content
    print ("Cleaning completed for : " + path_src,'->',path_cl)
    print (100*'-')
print ("END OF CLEANING")

Language :  en
Content before cleaning :->   Mark's Eve. Those sitting had to keep silent between the bell tolling at 11.00 p.m. until the bell struck 1.00 a.m. In Yorkshire it was nec
Content after cleaning :->  ing into the church. This practice took place throughout England, but was most prevalent in northern and western counties. Some accounts of 
Cleaning completed for : data/source/en.txt -> data/cleaned/en_cleaned.txt
----------------------------------------------------------------------------------------------------
Language :  fr
Content before cleaning :->  ow Wilson.

Au cours de sa carrière, il composa environ neuf cents toiles et plus de deux mille aquarelles, ainsi que d'innombrables croquis
Content after cleaning :->  quarelles, ainsi que d'innombrables croquis et dessins. Son œuvre documente ses voyages à travers le monde, de Venise au Tyrol, de Corfou au
Cleaning completed for : data/source/fr.txt -> data/cleaned/fr_cleaned.txt
-----------------------------------------

In [55]:
from support import get_sample_text, get_input_row
    
path = os.path.join(cleaned_directory, "en_cleaned.txt")

with open(path, 'r') as f:
    
    content = f.read()
    random_index = random.randrange(0,len(content)-2*MAX_LEN)
    sample_text = get_sample_text(content,random_index,MAX_LEN)
    print ("1. SAMPLE TEXT: \n", sample_text)
    
    all_characters = alphabet[0]+alphabet[1]
    print ("\n2. REFERENCE ALPHABET: \n", all_characters)
    
    sample_input_row = get_input_row(content, random_index, MAX_LEN, alphabet)
    print ("\n3. SAMPLE INPUT ROW: \n",sample_input_row)
    
    input_size = len(sample_input_row)
    if input_size != VOCAB_SIZE:
        print("Something strange happened!")
        
    print ("\n4. INPUT SIZE (VOCAB SIZE): ", input_size)
    
    count_characters = {}
    for i in range(VOCAB_SIZE):
        if sample_input_row[i] != 0:
           count_characters[all_characters[i]] = sample_input_row[i]
    
    print ("\n4. Associated character count: \n",count_characters)
    del content
    

1. SAMPLE TEXT: 
 small tied island connected by the largest tombolo in the UK to the south-western coast of the Mainland, Shetland, in Scotland. It is part

2. REFERENCE ALPHABET: 
 ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ß', 'à', 'á', 'â', 'ä', 'æ', 'ç', 'è', 'é', 'ê', 'ì', 'í', 'î', 'ï', 'ñ', 'ò', 'ó', 'ô', 'ö', 'ù', 'ú', 'û', 'ü', 'ý', 'ÿ', 'č', 'ď', 'ě', 'ĺ', 'ľ', 'ň', 'œ', 'ŕ', 'ř', 'š', 'ť', 'ů', 'ž', ' ', '!', '?', '¿', '¡', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'À', 'Á', 'Â', 'Ä', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ì', 'Í', 'Î', 'Ï', 'Ñ', 'Ò', 'Ó', 'Ô', 'Ö', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Č', 'Ď', 'Ě', 'Ĺ', 'Ľ', 'Ň', 'Œ', 'Ŕ', 'Ř', 'Š', 'Ť', 'Ů', 'Ÿ', 'Ž']

3. SAMPLE INPUT ROW: 
 [9, 2, 4, 6, 11, 1, 1, 6, 7, 0, 1, 8, 3, 10, 9, 1, 0, 3, 9, 16, 2, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [63]:
def size_mb(size):
    size_mb =  '{:.2f}'.format(size/(1000*1000.0))
    return size_mb + " MB"


sample_data = np.empty((NUM_SAMPLES*len(LANGUAGES_DICT),VOCAB_SIZE+1),dtype = np.uint16)
lang_seq = 0
jump_reduce = 0.2 # part of characters removed from jump to avoid passing the end of file

for lang_code in LANGUAGES_DICT:
    start_index = 0
    path = os.path.join(cleaned_directory, lang_code+"_cleaned.txt")
    with open(path, 'r') as f:
        print ("Processing file : " + path)
        file_content = f.read()
        content_length = len(file_content)
        remaining = content_length - MAX_LEN*NUM_SAMPLES
        jump = int(((remaining/NUM_SAMPLES)*3)/4)
        print('content_lenght {}, remaining_length {}, jump {}'.format(content_length,remaining,jump))
        print ("File size : ",size_mb(content_length),\
               " | # possible samples : ",int(content_length/VOCAB_SIZE),\
              "| # skip chars : " + str(jump))
        for idx in range(NUM_SAMPLES):
            input_row = get_input_row(file_content, start_index, MAX_LEN, alphabet)
            sample_data[NUM_SAMPLES*lang_seq+idx,] = input_row + [LANGUAGES_DICT[lang_code]]
            start_index += MAX_LEN + jump
        del file_content
    lang_seq += 1
    print (100*"-")

np.random.shuffle(sample_data)

print ("Vocab Size : ",VOCAB_SIZE )
print (100*"-")
print ("Samples array size : ",sample_data.shape )

if not os.path.exists(samples_directory):
    os.makedirs(samples_directory)

path_smpl = os.path.join(samples_directory,"lang_samples_"+str(VOCAB_SIZE)+".npz")
np.savez_compressed(path_smpl,data=sample_data)
print(path_smpl, "size : ", size_mb(os.path.getsize(path_smpl)))
del sample_data

Processing file : data/cleaned/en_cleaned.txt
content_lenght 101420888, remaining_length 66420888, jump 199
File size :  101.42 MB  | # possible samples :  768340 | # skip chars : 199
----------------------------------------------------------------------------------------------------
Processing file : data/cleaned/fr_cleaned.txt
content_lenght 98722777, remaining_length 63722777, jump 191
File size :  98.72 MB  | # possible samples :  747899 | # skip chars : 191
----------------------------------------------------------------------------------------------------
Processing file : data/cleaned/es_cleaned.txt
content_lenght 97562749, remaining_length 62562749, jump 187
File size :  97.56 MB  | # possible samples :  739111 | # skip chars : 187
----------------------------------------------------------------------------------------------------
Processing file : data/cleaned/it_cleaned.txt
content_lenght 101889621, remaining_length 66889621, jump 200
File size :  101.89 MB  | # possible samp

In [79]:
dt = np.load(path_smpl)['data']
dt = dt.astype(np.float32)
X = dt[:, 0:input_size]
Y = dt[:, input_size]
del dt

In [80]:
standard_scaler = preprocessing.StandardScaler().fit(X)
X = standard_scaler.transform(X)

In [81]:
Y = keras.utils.to_categorical(Y, num_classes=len(LANGUAGES_DICT))

In [83]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=SEED)
del X, Y

In [85]:
if not os.path.exists(train_test_directory):
    os.makedirs(train_test_directory)

path_tt = os.path.join(train_test_directory,"train_test_data_"+str(VOCAB_SIZE)+".npz")
np.savez_compressed(path_tt,X_train=X_train,Y_train=Y_train,X_test=X_test,Y_test=Y_test)
print(path_tt, "size : ",size_mb(os.path.getsize(path_tt)))
del X_train,Y_train,X_test,Y_test

/tmp/train_test/train_test_data_132.npz size :  94.93 MB


In [86]:
path_tt = os.path.join(train_test_directory, "train_test_data_"+str(VOCAB_SIZE)+".npz")
train_test_data = np.load(path_tt)

# Train Set
X_train = train_test_data['X_train']
print ("X_train: ",X_train.shape)
Y_train = train_test_data['Y_train']
print ("Y_train: ",Y_train.shape)

# Test Set
X_test = train_test_data['X_test']
print ("X_test: ",X_test.shape)
Y_test = train_test_data['Y_test']
print ("Y_test: ",Y_test.shape)

del train_test_data

X_train:  (1400000, 132)
Y_train:  (1400000, 7)
X_test:  (350000, 132)
Y_test:  (350000, 7)


In [101]:
model = Sequential()


model.add(Dense(500,input_dim=input_size, kernel_initializer="glorot_uniform", activation="tanh"))
model.add(Dropout(0.5))
model.add(Dense(300, kernel_initializer="glorot_uniform", activation="tanh"))
model.add(Dropout(0.5))
model.add(Dense(100, kernel_initializer="glorot_uniform", activation="tanh"))
model.add(Dropout(0.5))
model.add(Dense(len(LANGUAGES_DICT), kernel_initializer="glorot_uniform", activation="softmax"))
model_optimizer = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='categorical_crossentropy',optimizer=model_optimizer,metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 500)               66500     
_________________________________________________________________
dropout_7 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 300)               150300    
_________________________________________________________________
dropout_8 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 100)               30100     
_________________________________________________________________
dropout_9 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 7)                 707       
Total para

In [102]:
from keras.callbacks import TensorBoard

# Tensorboard
tensorboard = TensorBoard(log_dir="run")

history = model.fit(X_train,Y_train,epochs=EPOCHS,validation_split=0.1,batch_size=BATCH_SIZE,callbacks=[tensorboard],shuffle=True,verbose=2)

Train on 1260000 samples, validate on 140000 samples
Epoch 1/5
 - 194s - loss: 0.1266 - acc: 0.9621 - val_loss: 0.0953 - val_acc: 0.9708
Epoch 2/5
 - 614s - loss: 0.1128 - acc: 0.9665 - val_loss: 0.0894 - val_acc: 0.9725
Epoch 3/5
 - 664s - loss: 0.1104 - acc: 0.9671 - val_loss: 0.0874 - val_acc: 0.9733
Epoch 4/5
 - 215s - loss: 0.1092 - acc: 0.9675 - val_loss: 0.0886 - val_acc: 0.9732
Epoch 5/5
 - 210s - loss: 0.1087 - acc: 0.9678 - val_loss: 0.0878 - val_acc: 0.9736


In [103]:
scores = model.evaluate(X_test, Y_test, verbose=1)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

acc: 97.32%


In [104]:
Y_pred = model.predict_classes(X_test)
Y_pred = keras.utils.to_categorical(Y_pred, num_classes=len(LANGUAGES_DICT))
LABELS =  list(LANGUAGES_DICT.keys())

In [105]:
print(classification_report(Y_test, Y_pred, target_names=LABELS))

             precision    recall  f1-score   support

         en       0.96      0.97      0.97     49866
         fr       0.98      0.98      0.98     49871
         es       0.97      0.97      0.97     49995
         it       0.97      0.97      0.97     49909
         de       0.98      0.98      0.98     50364
         sk       0.97      0.98      0.97     49923
         cs       0.98      0.97      0.98     50072

avg / total       0.97      0.97      0.97    350000



In [106]:
model.save_weights('data/models/lang_identification_weights.h5')

In [107]:
from ipywidgets import interact_manual
from ipywidgets import widgets
from support import clean_text


def get_prediction(TEXT):
    if len(TEXT) < MAX_LEN:
        print("Text has to be at least {} chars long, but it is {}/{}".format(MAX_LEN, len(TEXT), MAX_LEN))
        return(-1)
    # Data cleaning
    cleaned_text = clean_text(TEXT)
    
    # Get the MAX_LEN char
    input_row = get_input_row(cleaned_text, 0, MAX_LEN, alphabet)
    
    # Data preprocessing (Standardization)
    test_array = standard_scaler.transform([input_row])
    
    raw_score = model.predict(test_array)
    pred_idx= np.argmax(raw_score, axis=1)[0]
    score = raw_score[0][pred_idx]*100
    
    # Prediction
    prediction = LABELS[model.predict_classes(test_array)[0]]
    print('TEXT:', TEXT, '\nPREDICTION:', prediction.upper(), '\nSCORE:', score)

interact_manual(get_prediction, TEXT=widgets.Textarea(placeholder='Type the text to identify here'));

interactive(children=(Textarea(value='', description='TEXT', placeholder='Type the text to identify here'), Bu…