In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

import os
import re
import tqdm 

import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from tensorflow.keras import callbacks
import string 
import emoji

In [6]:
with open ("../datasets/lang_det/all.txt", "r") as file :
    eng_fr_dataset_v1  = file.readlines()

with open ("../datasets/tunisien/scrapped_texts/comments.txt", "r") as file :
    tn_dataset_v1  = file.readlines()

with open ("../datasets/tunisien/scrapped_texts/messages.txt", "r") as file :
    tn_dataset_v2  = file.readlines()

In [7]:
trans = str.maketrans(string.punctuation," "*(len(string.punctuation)))

def transform_text(txt:str):
    _txt = txt
    _txt = _txt.translate(trans)
    _txt = emoji.get_emoji_regexp().sub("", _txt)
    _txt = _txt.strip().lower()
    _txt = re.sub("\s+", " ", _txt)
    return _txt

def parse_train(line:str, transform=False):
    _label = re.compile("__label__[\w]{3}").search(line).group()
    _text = line.replace(_label,"")
    if transform:
        _text = transform_text(_text)
    
    return (_text, _label)

def parse_unlabled_train(line:str,label="__label__tun"):    
    return (transform_text(line), label)


In [8]:
eng_fr_dataset_v1 = list(map(lambda x: parse_train(x),tqdm.tqdm(eng_fr_dataset_v1)))

100%|██████████| 3274302/3274302 [00:04<00:00, 795512.29it/s]


In [9]:
tn_dataset_v1 = list(map(lambda x: parse_unlabled_train(x),tqdm.tqdm(tn_dataset_v1)))

100%|██████████| 63176/63176 [00:15<00:00, 4099.49it/s]


In [10]:
tn_dataset_v2 = list(map(lambda x: parse_unlabled_train(x),tqdm.tqdm(tn_dataset_v2)))

100%|██████████| 76122/76122 [00:09<00:00, 7801.26it/s]


In [11]:
en = []
fr = []

for line in eng_fr_dataset_v1 :
    if line[1]=="__label__eng":
        en.append(line)
    else :
        fr.append(line)

tn = []
tn.extend(tn_dataset_v1)
tn.extend(tn_dataset_v2)

tn_texts= [text[0] for text in tn]
fr_texts = [text[0] for text in fr[:300000]]
en_texts= [text[0] for text in en[:300000]]

all_texts = []
all_texts.extend(tn_texts)
all_texts.extend(fr_texts)
all_texts.extend(en_texts)

In [12]:
tn_vectorizer = CountVectorizer(analyzer="char_wb", ngram_range=(2,3), min_df=2)

In [13]:
tn_vectorizer.fit(all_texts)

CountVectorizer(analyzer='char_wb', min_df=2, ngram_range=(2, 3))

In [14]:
with open("vectorizer.bin","wb") as file:
    pickle.dump(tn_vectorizer, file)

In [15]:
def extract_text(txt, max_length=12):
    _txt = txt.split()
    
    random_length = np.random.randint(1,max_length+1)
    return " ".join(_txt[:random_length])


def featurize(arr:[(str,str)], max_length=12):
    _texts = []
    _labels = []
    
    label2id={"__label__fra":0,"__label__eng":1, "__label__tun":2 }
    id2label={0:"__label__fra",1:"__label__eng", 2:"__label__tun"}
    
    for t,l in arr:
        _texts.append(extract_text(t,max_length=max_length))
        _labels.append(label2id[l])
    
    _features = tn_vectorizer.transform(_texts).toarray()
    
    return _features, np.array(_labels)

In [16]:
def steps(iterator, batch_size = 32) :
    return len(iterator)//batch_size


def generator(iterator, batch_size = 32, max_length= 12, steps=None, epochs=1):
    
    
    _len = len(iterator)
    
    for e in range(epochs) :
        if steps is None:
            for i  in range(0,_len,batch_size) :
                data = iterator[i:i+batch_size]
                yield featurize(data, max_length)
        else :
            for i in range(steps):
                indexes = np.random.choice(range(_len),batch_size, replace=False)
                data =  [iterator[ind] for ind in  indexes]
                yield featurize(data, max_length)

In [17]:
train = []

min_length = min(len(en), len(fr), len(tn))

np.random.shuffle(tn)
np.random.shuffle(en)
np.random.shuffle(fr)

tn = tn[:min_length-1]
fr = fr[:min_length-1]
en = en[:min_length-1]


train.extend(tn)
train.extend(en)
train.extend(fr)

np.random.shuffle(train)

In [18]:
input_dim = len(tn_vectorizer.vocabulary_)

model = keras.models.Sequential([
    keras.layers.InputLayer(input_dim),
    keras.layers.Dropout(0.25),
    keras.layers.Dense(1024, activation="relu"),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.25),
    keras.layers.Dense(512,activation="relu"),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.25),

    keras.layers.Dense(3,activation="softmax")

])

In [19]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout (Dropout)            (None, 28276)             0         
_________________________________________________________________
dense (Dense)                (None, 1024)              28955648  
_________________________________________________________________
batch_normalization (BatchNo (None, 1024)              4096      
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               524800    
_________________________________________________________________
batch_normalization_1 (Batch (None, 512)               2048      
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0

In [20]:
model.compile(optimizer=keras.optimizers.Adam(), loss=keras.losses.SparseCategoricalCrossentropy(), metrics=["accuracy"])

In [21]:
X_train, X_test = train_test_split(train, test_size=0.15)

In [23]:
epochs = 30
batch_size = 64
max_length=12
train_steps =  400
test_steps = steps(X_test, batch_size)
steps(X_test, batch_size)

979

In [24]:
tensorboard_callbacks = callbacks.TensorBoard("./logs_lang_detector")
early_callback = callbacks.EarlyStopping(patience=5)
lr_callback= callbacks.ReduceLROnPlateau(patience=2)
save_callback = callbacks.ModelCheckpoint("model_lang", save_best_only=True)

clbk = [tensorboard_callbacks, early_callback,lr_callback,save_callback]

In [25]:
X_train = generator(X_train, batch_size, max_length, steps=train_steps, epochs=epochs)
X_test = generator(X_test, batch_size, max_length,steps=test_steps,epochs=epochs)

In [26]:
model.fit(x=X_train, epochs=epochs, validation_data=X_test, steps_per_epoch=train_steps, validation_steps=test_steps, callbacks=clbk)

Epoch 1/30
INFO:tensorflow:Assets written to: model_lang/assets
Epoch 2/30
INFO:tensorflow:Assets written to: model_lang/assets
Epoch 3/30
INFO:tensorflow:Assets written to: model_lang/assets
Epoch 4/30
INFO:tensorflow:Assets written to: model_lang/assets
Epoch 5/30
INFO:tensorflow:Assets written to: model_lang/assets
Epoch 6/30
Epoch 7/30
INFO:tensorflow:Assets written to: model_lang/assets
Epoch 8/30
INFO:tensorflow:Assets written to: model_lang/assets
Epoch 9/30
INFO:tensorflow:Assets written to: model_lang/assets
Epoch 10/30
INFO:tensorflow:Assets written to: model_lang/assets
Epoch 11/30
INFO:tensorflow:Assets written to: model_lang/assets
Epoch 12/30
Epoch 13/30
Epoch 14/30
INFO:tensorflow:Assets written to: model_lang/assets
Epoch 15/30
INFO:tensorflow:Assets written to: model_lang/assets
Epoch 16/30
INFO:tensorflow:Assets written to: model_lang/assets
Epoch 17/30
Epoch 18/30
Epoch 19/30
INFO:tensorflow:Assets written to: model_lang/assets
Epoch 20/30
INFO:tensorflow:Assets writ

<tensorflow.python.keras.callbacks.History at 0x7fb927589e50>

In [34]:
def predict_lang(txt:str, model, vectorizer,label_dict):
    _txt = transform_text(txt)
    _txt = vectorizer.transform([_txt]).toarray()
    res = model(_txt)[0]
    print(res)
    res =  tf.argmax(res).numpy()
    return label_dict[res]

In [28]:
model = keras.models.load_model("./model_lang/")

In [78]:
with open("./vectorizer.bin", "rb") as file:
    vectorizer = pickle.load(file)

In [81]:
predict_lang("non", model, vectorizer, {0:"__label__fra",1:"__label__eng", 2:"__label__tun"})

tf.Tensor([0.53562284 0.08138265 0.38299447], shape=(3,), dtype=float32)


'__label__fra'