In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import transformers as tr

import tokenizers
from tokenizers import Tokenizer, BertWordPieceTokenizer
from tokenizers import pre_tokenizers
import os
import re
import tqdm 

from sklearn.model_selection import train_test_split


In [2]:
dataset_dir = "../datasets/lang_det/"

In [3]:
with open (os.path.join(dataset_dir,"all.txt"), "r") as file :
    eng_fr_dataset_v1  = file.readlines()

with open (os.path.join(dataset_dir,"all_text_lang.txt"), "r") as file :
    tn_dataset_v1  = file.readlines()
    
with open (os.path.join(dataset_dir,"messages.txt"), "r") as file :
    tn_dataset_v2  = file.readlines()

In [4]:
def parse_train(line:str):
    _label = re.compile("__label__[\w]{3}").search(line).group()
    _text = line.replace(_label,"")
    
    return (_text, _label)

def parse_unlabled_train(line:str,label="__label__tun"):    
    return (line, label)


In [5]:
eng_fr_dataset_v1 = list(map(lambda x: parse_train(x),tqdm.tqdm(eng_fr_dataset_v1)))

100%|██████████| 3274302/3274302 [00:04<00:00, 772434.23it/s]


In [6]:
tn_dataset_v1 = list(map(lambda x: parse_train(x),tqdm.tqdm(tn_dataset_v1)))

100%|██████████| 100000/100000 [00:00<00:00, 800951.75it/s]


In [7]:
tn_dataset_v2 = list(map(lambda x: parse_unlabled_train(x),tqdm.tqdm(tn_dataset_v2)))

100%|██████████| 76122/76122 [00:00<00:00, 2759254.09it/s]


In [8]:
en = []
fr = []

for line in eng_fr_dataset_v1 :
    if line[1]=="__label__eng":
        en.append(line)
    else :
        fr.append(line)

tn = []
tn.extend(tn_dataset_v1)
tn.extend(tn_dataset_v2)

all_texts = []

all_texts.extend(en)
all_texts.extend(fr)
all_texts.extend(tn)

In [14]:
#pd.DataFrame(tn,columns=["text","label"])["text"].to_csv("tun.csv", index=False)

176122

In [15]:
bert_tokenizer  = BertWordPieceTokenizer()

In [16]:
bert_tokenizer.train_from_iterator(all_texts, vocab_size=50000)

In [33]:
bert_tokenizer.encode("").tokens

['waa', '##a']

In [21]:
train = []

min_length = min(len(en), len(fr), len(tn))

np.random.shuffle(tn)
np.random.shuffle(en)
np.random.shuffle(fr)

tn = tn[:min_length-1]
fr = fr[:min_length-1]
en = en[:min_length-1]


In [22]:

train.extend(tn)
train.extend(en)
train.extend(fr)

np.random.shuffle(train)


In [23]:
df = pd.DataFrame(train, columns=["text","label"])
df 

Unnamed: 0,text,label
0,Behy haw bech nab3athlek lien mta3 eli ena t3a...,__label__tun
1,My younger brother has a lot of money.\n,__label__eng
2,J'ai vu ce matin une jolie rue dont j'ai oubl...,__label__fra
3,Sami told Layla about the whole thing with Sa...,__label__eng
4,allah yar7mou w y na3mou w yaj3al mathweh el ...,__label__tun
...,...,...
528358,U.S. District Judge Joan Ericksen ruled in th...,__label__eng
528359,Behi n5amem we n9olek 5ater 3andi jamia fiha m...,__label__tun
528360,hmdlh 8dartna f s5ana hhhhh\n,__label__tun
528361,La neige est tombée sans discontinuer samedi ...,__label__fra


In [24]:
df.label.value_counts()

__label__fra    176121
__label__tun    176121
__label__eng    176121
Name: label, dtype: int64

In [25]:
#tokenizer = tr.AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en")

In [26]:
del en, fr,  eng_fr_dataset_v1

In [118]:
#label2id={"__label__fra":0,"__label__eng":1,"__label__tun":2}
#id2label={0:"__label__fra",1:"__label__eng",2:"__label__tun"}

label2id={"__label__fra":0,"__label__eng":1, "__label__tun":2 }
id2label={0:"__label__fra",1:"__label__eng", 2:"__label__tun"}


def tokenize(lines:str, max_length=16):
    if max_length is None :
        max_length = 16
    random_length = np.random.randint(1,max_length+1)
    
    text, label = lines[0], label2id[ lines[1] ]
    input_ids = bert_tokenizer.encode(text).ids[:random_length+1]
    
    return (input_ids, label)


In [119]:
def tokenize_array(lines, max_length=16):
    features = []
    labels = []
    for item in lines :
        fe,lab = tokenize(item, max_length)
        features.append(fe)
        labels.append(lab)
    if max_length is None :
        max_length = max(list(map(lambda x: len(x), features)))
   
    len_features = len(features)
    
    input_ids =  np.zeros((len_features, max_length))
    
    for index, item in enumerate(features) :
        input_ids[index][:min(len(item),max_length)] = item[:min(len(item),max_length)]
    
    return np.array(input_ids) ,  np.array(labels)
            

In [120]:
def steps(iterator, batch_size = 32) :
    return len(iterator)//batch_size


def generator(iterator, batch_size = 32, max_length= 16, steps=None, epochs=1):
    
    
    _len = len(iterator)
    
    for e in range(epochs) :
        if steps is None:
            for i  in range(0,_len,batch_size) :
                data = iterator[i:i+batch_size]
                yield tokenize_array(data, max_length)
        else :
            for i in range(steps):
                indexes = np.random.choice(range(_len),batch_size, replace=False)
                data =  [iterator[ind] for ind in  indexes]
                yield tokenize_array(data, max_length)

In [121]:
input_dim = bert_tokenizer.get_vocab_size()
embedding_size = 128

model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=input_dim,  output_dim=embedding_size),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.25),
    keras.layers.Dense(1024, activation="relu"),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.25),
    keras.layers.Dense(512,activation="relu"),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.25),
    #keras.layers.GRU(512, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    #keras.layers.GRU(512, return_sequences=False, dropout=0.2, recurrent_dropout=0.2),
    keras.layers.Dense(3,activation="softmax")

])

In [122]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         6400000   
_________________________________________________________________
global_average_pooling1d_1 ( (None, 128)               0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 128)               512       
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1024)              132096    
_________________________________________________________________
batch_normalization_4 (Batch (None, 1024)              4096      
_________________________________________________________________
dropout_4 (Dropout)          (None, 1024)             

In [123]:
model.compile(optimizer=keras.optimizers.Adam(), loss=keras.losses.SparseCategoricalCrossentropy(), metrics=["accuracy"])

In [124]:
X_train, X_test = train_test_split(train, test_size=0.15)

In [125]:
epochs = 30
batch_size = 16
max_length=16
train_steps =  200
test_steps = 200

In [126]:
X_train = generator(X_train, batch_size, max_length, steps=train_steps, epochs=epochs)
X_test = generator(X_test, batch_size, max_length,steps=test_steps,epochs=epochs)

In [127]:
model.fit(x=X_train, epochs=epochs, validation_data=X_test, steps_per_epoch=train_steps, validation_steps=test_steps)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fe434b62460>

In [136]:
_tokens = tf.expand_dims( bert_tokenizer.encode("are you doing fine ?").ids , 0)
model(_tokens)

<tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[7.362371e-07, 9.990735e-01, 9.257449e-04]], dtype=float32)>