In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import transformers as tr

import tokenizers
from tokenizers import Tokenizer
from tokenizers import pre_tokenizers
import os
import re
import tqdm 

from sklearn.model_selection import train_test_split


In [2]:
dataset_dir = "../datasets/lang_det/"

In [54]:
with open (os.path.join(dataset_dir,"all.txt"), "r") as file :
    eng_fr_dataset_v1  = file.readlines()

with open (os.path.join(dataset_dir,"all_text_lang.txt"), "r") as file :
    tn_dataset_v1  = file.readlines()
    
with open (os.path.join(dataset_dir,"messages.txt"), "r") as file :
    tn_dataset_v2  = file.readlines()

In [55]:
def parse_train(line:str):
    _label = re.compile("__label__[\w]{3}").search(line).group()
    _text = line.replace(_label,"")
    
    return (_text, _label)

def parse_unlabled_train(line:str,label="__label__tun"):    
    return (line, label)


In [56]:
eng_fr_dataset_v1 = list(map(lambda x: parse_train(x),tqdm.tqdm(eng_fr_dataset_v1)))

100%|██████████| 3274302/3274302 [00:25<00:00, 129366.01it/s]


In [57]:
tn_dataset_v1 = list(map(lambda x: parse_train(x),tqdm.tqdm(tn_dataset_v1)))

100%|██████████| 100000/100000 [00:00<00:00, 126609.57it/s]


In [58]:
tn_dataset_v2 = list(map(lambda x: parse_unlabled_train(x),tqdm.tqdm(tn_dataset_v2)))

100%|██████████| 76122/76122 [00:00<00:00, 597685.49it/s]


In [59]:
en = []
fr = []

for line in eng_fr_dataset_v1 :
    if line[1]=="__label__eng":
        en.append(line)
    else :
        fr.append(line)

In [60]:
tn = []
tn.extend(tn_dataset_v1)
tn.extend(tn_dataset_v2)

In [61]:
train = []

min_length = min(len(en), len(fr))

np.random.shuffle(tn)
np.random.shuffle(en)
np.random.shuffle(fr)

tn = tn[:min_length-1]
fr = fr[:min_length-1]
en = en[:min_length-1]


In [62]:

#train.extend(tn)
train.extend(en)
train.extend(fr)

np.random.shuffle(train)


In [63]:
_test_size = 0.2
_train_size= round(len(train)*(1-_test_size))

_train = train[:_train_size]
_test = train[_train_size:]

with open("../datasets/lang_det/_train.txt", "w") as file:
    for text, label in _train :
        file.write(label+" "+text.strip()+"\n")

with open("../datasets/lang_det/_test.txt", "w") as file:
    for text, label in _test :
        file.write(label+" "+text.strip()+"\n")

In [12]:
df = pd.DataFrame(train, columns=["text","label"])
df 

Unnamed: 0,text,label
0,Une ardoise géante indique la quantité ou le ...,__label__fra
1,"""A: It is very hard to create that chemistry ...",__label__eng
2,Je regrette de ne pas avoir été plus gentil a...,__label__fra
3,"""Who cared that a grand jury had determined t...",__label__eng
4,Je vais fermer la porte maintenant.\n,__label__fra
...,...,...
1634189,Comment y répondre ? Face à la montée des ten...,__label__fra
1634190,Tokyo est loin d'ici.\n,__label__fra
1634191,Joe Blandino\n,__label__eng
1634192,Tom and Mary will get dirty if they do that.\n,__label__eng


In [13]:
df.label.value_counts()

__label__eng    817097
__label__fra    817097
Name: label, dtype: int64

In [14]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True, oov_token="[UNK]")

In [15]:
tokenizer.fit_on_texts(df.text)

In [16]:
del en, fr, tn, eng_fr_dataset_v1, tn_dataset_v1, tn_dataset_v2

In [17]:
#label2id={"__label__fra":0,"__label__eng":1,"__label__tun":2}
#id2label={0:"__label__fra",1:"__label__eng",2:"__label__tun"}

label2id={"__label__fra":0,"__label__eng":1}
id2label={0:"__label__fra",1:"__label__eng"}


def tokenize(lines:str):
    
    text, label = lines[0], label2id[ lines[1] ]
    input_ids = tokenizer.texts_to_sequences([text])[0]
    
    return (input_ids, label)

def simple_tokenize(line):
    return line[0], label2id[line[1]]

In [18]:
def tokenize_array(lines, max_length=None):
    features = []
    labels = []
    for item in lines :
        fe,lab = tokenize(item)
        features.append(fe)
        labels.append(lab)
    if max_length is None :
        max_length = max(list(map(lambda x: len(x), features)))
   
    len_features = len(features)
    
    input_ids =  np.zeros((len_features, max_length))
    attention_mask =  np.zeros((len_features, max_length))
    token_type_ids = np.zeros((len_features, max_length))
    
    for index, item in enumerate(features) :
        input_ids[index][:min(len(item),max_length)] = item[:min(len(item),max_length)]
        attention_mask[index][:min(len(item),max_length)] = 1
    
    return {"input_ids":tf.cast(input_ids, dtype = tf.int32) , 
            "token_type_ids":tf.cast(token_type_ids, dtype = tf.int32) , 
            "attention_mask":tf.cast(attention_mask, dtype = tf.int32)  }, tf.cast(labels,dtype=tf.int32)
            

In [19]:
def steps(iterator, batch_size = 32) :
    return len(iterator)//batch_size


def generator(iterator, batch_size = 32, max_length= 128):
    
    length = len(iterator)//batch_size
    for i  in range(0,len(iterator),batch_size) :
        data = iterator[i:i+batch_size]
        yield tokenize_array(data, max_length)
    

In [20]:
bert_model_config = tr.BertConfig(num_labels = 2, label2id=label2id,id2label=id2label, num_hidden_layers=6, num_attention_heads=6 )
bert_model_config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "__label__fra",
    "1": "__label__eng"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "__label__eng": 1,
    "__label__fra": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 6,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.5.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [21]:
model = tr.models.bert.TFBertForSequenceClassification(bert_model_config)

In [22]:
model.compile(optimizer=keras.optimizers.Adam(), loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=["accuracy"])

In [24]:
X_train, X_test = train_test_split(train, test_size=0.15)

In [25]:
epochs = 5
batch_size = 16
max_length=128
train_steps =  steps(X_train, batch_size)
test_steps = steps(X_test, batch_size)

In [26]:
X_train = generator(X_train, batch_size, max_length)
X_test = generator(X_test, batch_size, max_length)

In [27]:
model.fit(x=X_train, epochs=epochs, validation_data=X_test, steps_per_epoch=train_steps, validation_steps=test_steps)

Epoch 1/5
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
  236/86816 [..............................] - ETA: 35:34:10 - loss: 0.8547 - accuracy: 0.4883

KeyboardInterrupt: 

In [145]:
tmp = tokenize_array([("hello hello hello hello ","__label__eng"), ("je suis je suis", "__label__fra")])[0]
tmp

{'input_ids': <tf.Tensor: shape=(2, 24), dtype=int32, numpy=
 array([[17,  3, 11, 11, 10,  2, 17,  3, 11, 11, 10,  2, 17,  3, 11, 11,
         10,  2, 17,  3, 11, 11, 10,  2],
        [32,  3,  2,  6, 12,  7,  6,  2, 32,  3,  2,  6, 12,  7,  6,  0,
          0,  0,  0,  0,  0,  0,  0,  0]], dtype=int32)>,
 'token_type_ids': <tf.Tensor: shape=(2, 24), dtype=int32, numpy=
 array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]], dtype=int32)>,
 'attention_mask': <tf.Tensor: shape=(2, 24), dtype=int32, numpy=
 array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0]], dtype=int32)>}

In [146]:
#tf.metrics.categorical_accuracy( model.predict(tmp), keras.utils.to_categorical([[1,0]],2))

In [148]:
tmp2 = model(tmp)