In [40]:
import tensorflow as tf
import os
import pandas as pd
import sys
import numpy as np
project_root = os.path.dirname(os.getcwd())
sys.path.append("..")  # Adds the parent directory to the path

In [41]:
# Define custom loss
# Default per-class loss weights (three classes; the values sum to 1).
# NOTE(review): presumably normalised inverse class frequencies - confirm against training.
weights = pd.Series([0.104978, 0.328745,0.566277])
class WeightedCategoricalCrossEntropy(tf.keras.losses.Loss):
    """Categorical cross-entropy in which each class term is scaled by a fixed weight.

    Args:
        weights: one weight per class; cast to float32 and broadcast against the
            last axis of ``y_true`` / ``y_pred``.
        name: name of the loss instance.
        **kwargs: forwarded to ``tf.keras.losses.Loss`` (e.g. ``reduction``).
    """

    def __init__(self, weights=weights, name='weighted_categorical_crossentropy', **kwargs):
        # Forward name/kwargs: the bare super().__init__() call discarded both, so the
        # custom name was never applied and a serialised `reduction` was ignored on reload.
        super(WeightedCategoricalCrossEntropy, self).__init__(name=name, **kwargs)
        self.weights = tf.cast(weights, tf.float32)

    def call(self, y_true, y_pred):
        """Return the batch mean of the per-sample weighted cross-entropy.

        Assumes ``y_true`` and ``y_pred`` are (batch, n_class), with ``y_pred``
        holding probabilities - TODO confirm against the training pipeline.
        """
        y_true = tf.cast(y_true, tf.float32)
        y_pred = tf.cast(y_pred, tf.float32)
        # Clip y_pred to avoid log(0)
        y_pred = tf.clip_by_value(y_pred, tf.keras.backend.epsilon(), 1 - tf.keras.backend.epsilon())
        weighted_losses = -self.weights * y_true * tf.math.log(y_pred)
        return tf.reduce_mean(tf.reduce_sum(weighted_losses, axis=1))

    def get_config(self):
        """Serialise the class weights so a reloaded loss does not fall back to the default."""
        config = super(WeightedCategoricalCrossEntropy, self).get_config()
        config['weights'] = tf.keras.backend.get_value(self.weights).tolist()
        return config
    
    
# Define custom metrics
class PrecisionMulticlass(tf.keras.metrics.Metric):
    """Macro-averaged precision over ``n_class`` classes, accumulated across batches.

    Predictions are hardened with an argmax over axis 1 before counting, so each
    sample contributes to exactly one predicted class.

    Args:
        name: metric name.
        n_class: number of classes (columns of ``y_true`` / ``y_pred``).
        **kwargs: forwarded to ``tf.keras.metrics.Metric``.
    """

    def __init__(self, name='precision', n_class=3, **kwargs):
        super(PrecisionMulticlass, self).__init__(name=name, **kwargs)
        # Never updated or read by this class; kept so the set of metric variables
        # stays the same as in models saved with the previous definition.
        self.precision = self.add_weight(
            shape=(n_class,),
            name='precision',
            initializer='zeros')
        self.n_class = n_class
        self.true_positives = self.add_weight(name='true_positives', shape=(self.n_class,), initializer='zeros')
        self.false_positives = self.add_weight(name='false_positives', shape=(self.n_class,), initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate per-class true/false positive counts for one batch.

        ``y_true`` is indexed per class column, so it is expected to be one-hot with
        shape (batch, n_class). ``sample_weight`` is accepted for Keras API
        compatibility but is not used.
        """
        y_true = tf.cast(y_true, tf.int64)
        y_pred = tf.cast(tf.one_hot(tf.argmax(y_pred, axis=1), self.n_class), tf.int64)

        # Reduce over the batch axis to get all class counts at once instead of
        # issuing one scatter + assign per class.
        true_positive = tf.reduce_sum(y_true * y_pred, axis=0)
        false_positive = tf.reduce_sum(
            tf.cast(tf.logical_and(tf.equal(y_true, 0), tf.equal(y_pred, 1)), tf.int64),
            axis=0)

        self.true_positives.assign_add(tf.cast(true_positive, self.true_positives.dtype))
        self.false_positives.assign_add(tf.cast(false_positive, self.false_positives.dtype))

    def result(self):
        """Return the unweighted mean over classes of TP / (TP + FP + epsilon)."""
        precision_per_class = self.true_positives / (self.true_positives + self.false_positives + tf.keras.backend.epsilon())
        return tf.reduce_mean(precision_per_class)

    def reset_state(self):
        """Zero the accumulated counts."""
        self.true_positives.assign(tf.zeros(self.n_class))
        self.false_positives.assign(tf.zeros(self.n_class))

    def get_config(self):
        """Include ``n_class`` so a deserialised metric is rebuilt with the same size."""
        config = super(PrecisionMulticlass, self).get_config()
        config['n_class'] = self.n_class
        return config
        
        

class RecallMulticlass(tf.keras.metrics.Metric):
    """Macro-averaged recall over ``n_class`` classes, accumulated across batches.

    Predictions are hardened with an argmax over axis 1 before counting, so each
    sample contributes to exactly one predicted class.

    Args:
        name: metric name.
        n_class: number of classes (columns of ``y_true`` / ``y_pred``).
        **kwargs: forwarded to ``tf.keras.metrics.Metric``.
    """

    def __init__(self, name='recall', n_class=3, **kwargs):
        super(RecallMulticlass, self).__init__(name=name, **kwargs)
        # Never updated or read by this class; kept so the set of metric variables
        # stays the same as in models saved with the previous definition.
        self.recall = self.add_weight(
            shape=(n_class,),
            name='recall',
            initializer='zeros')
        self.n_class = n_class
        self.true_positives = self.add_weight(name='true_positives', shape=(self.n_class,), initializer='zeros')
        self.false_negatives = self.add_weight(name='false_negatives', shape=(self.n_class,), initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate per-class true positive / false negative counts for one batch.

        ``y_true`` is indexed per class column, so it is expected to be one-hot with
        shape (batch, n_class). ``sample_weight`` is accepted for Keras API
        compatibility but is not used.
        """
        y_true = tf.cast(y_true, tf.int64)
        y_pred = tf.cast(tf.one_hot(tf.argmax(y_pred, axis=1), self.n_class), tf.int64)

        # Reduce over the batch axis to get all class counts at once instead of
        # issuing one scatter + assign per class.
        true_positive = tf.reduce_sum(y_true * y_pred, axis=0)
        false_negative = tf.reduce_sum(
            tf.cast(tf.logical_and(tf.equal(y_true, 1), tf.equal(y_pred, 0)), tf.int64),
            axis=0)

        self.true_positives.assign_add(tf.cast(true_positive, self.true_positives.dtype))
        self.false_negatives.assign_add(tf.cast(false_negative, self.false_negatives.dtype))

    def result(self):
        """Return the unweighted mean over classes of TP / (TP + FN + epsilon)."""
        recall_per_class = self.true_positives / (self.true_positives + self.false_negatives + tf.keras.backend.epsilon())
        return tf.reduce_mean(recall_per_class)

    def reset_state(self):
        """Zero the accumulated counts."""
        self.true_positives.assign(tf.zeros(self.n_class))
        self.false_negatives.assign(tf.zeros(self.n_class))

    def get_config(self):
        """Include ``n_class`` so a deserialised metric is rebuilt with the same size."""
        config = super(RecallMulticlass, self).get_config()
        config['n_class'] = self.n_class
        return config
            

In [None]:
# Custom classes the saved model refers to by name; Keras needs them to rebuild
# the compiled loss and metrics.
bi_lstm_custom_objects = {
    "WeightedCategoricalCrossEntropy": WeightedCategoricalCrossEntropy,
    "PrecisionMulticlass": PrecisionMulticlass,
    "RecallMulticlass": RecallMulticlass,
}
model = tf.keras.models.load_model(
    os.path.join(project_root, 'models', 'bi_lstm_model'),
    custom_objects=bi_lstm_custom_objects,
)
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 300)         8861400   
                                                                 
 bidirectional_2 (Bidirecti  (None, None, 128)         186880    
 onal)                                                           
                                                                 
 lstm_layer (LSTM)           (None, 64)                49408     
                                                                 
 dense1 (Dense)              (None, 64)                4160      
                                                                 
 dense2 (Dense)              (None, 16)                1040      
                                                                 
 output (Dense)              (None, 3)                 51        
                                                      

### Try to predict from text

In [80]:
# Deliberately abusive sample sentence used to smoke-test the classifier; a later
# cell wraps it in a DataFrame and feeds it through clean_data.
input_text = "However this sentence is very insulting you should go fuck yourself bitch this is a test sentence just to see if the model works but you still are a dumb ass !"

In [13]:
import re
from utils.artifacts import REGEX_REMOVE, REGEX_REPLACE
import numpy as np
from textblob import TextBlob
import signal
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet, words, webtext, gutenberg, brown
from utils.artifacts import slang_dict

import tensorflow as tf
#from tensorflow.keras.utils import pad_sequences

# NLTK resources: request every corpus/model this cell relies on.
for _resource in ('stopwords', 'wordnet', 'words', 'punkt_tab', 'webtext', 'gutenberg', 'brown'):
    nltk.download(_resource, quiet=True)

# English stop words, dropped from the text by clean_data.
stop_words = stopwords.words('english')

# Lower-cased union of the word lists of all five corpora; used as the
# known-vocabulary reference when replacing unknown tokens.
combined_corpus = {
    token.lower()
    for corpus in (words, wordnet, webtext, gutenberg, brown)
    for token in corpus.words()
}


def clean_data(batch, stop_words = stop_words, slang_dict=slang_dict):
    """Add a ``corrected_text`` column holding a normalised copy of ``batch['text']``.

    Each text goes through regex cleanup (the project patterns REGEX_REMOVE and
    REGEX_REPLACE, then retweet prefix, URLs, jpeg/jpg words, newlines, @mentions,
    punctuation, digits, immediately repeated words and non-ASCII characters),
    slang replacement, stop-word removal and TextBlob spelling correction.
    No tokenisation or lemmatisation is performed here.

    Args:
        batch: DataFrame with a ``text`` column of strings. It is modified in place
            (the new column is added to the caller's object) and also returned.
        stop_words: iterable of words removed after slang replacement.
        slang_dict: mapping from a token to its replacement string.

    Returns:
        The same ``batch`` object with the ``corrected_text`` column added.
    """
    # A set gives constant-time membership tests; the default is a list, which was
    # scanned once per token.
    stop_word_set = set(stop_words)

    def clean_text(text: str) -> str:
        """Apply the regex clean-up steps, in order, and lower-case the result."""
        # Apply REGEX_REMOVE and REGEX_REPLACE
        for pattern in REGEX_REMOVE:
            text = re.sub(pattern, "", text)
        for pattern, repl in REGEX_REPLACE.items():
            text = re.sub(pattern, repl, text)

        # Apply additional text cleaning steps
        text = re.sub(r'^RT @\w+: ', '', text)                     # leading retweet prefix
        text = re.sub(r'http\S+', ' ', text)                       # URLs
        text = re.sub(r'\b\w*jpeg\w*\b|\b\w*jpg\w*\b', '', text)   # words containing jpeg/jpg
        text = re.sub(r'\n', ' ', text)
        # NOTE(review): the angle brackets are stripped by the punctuation rule below and
        # the text is lower-cased afterwards, so mentions end up as the plain word
        # "person". Left as is - presumably the models were trained on this output; confirm.
        text = re.sub(r'@\w+', '<PERSON>', text)
        text = re.sub(r'[^\w\s]', '', text)                        # punctuation
        text = re.sub(r'\d+', '', text)                            # digits
        # NOTE(review): replaces an immediately repeated word pair with '' - i.e. both
        # copies are removed rather than collapsed to one. Left as is for the same reason.
        text = re.sub(r'\b(\w+)\b\s+\1\b', '', text)
        text = text.strip().lower()
        # Every non-ASCII run becomes a space; nothing in \x80-\xFF can remain after this,
        # so the former follow-up substitution on that range never matched and was dropped.
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)
        return text

    def correct_text(text: str) -> str:
        """Replace slang tokens, drop stop words, then run TextBlob spelling correction."""
        replaced = (slang_dict.get(word, word) for word in text.split())
        kept = [word for word in replaced if word not in stop_word_set]
        return str(TextBlob(' '.join(kept)).correct())

    def combined_cleaning(text: str) -> str:
        """Full per-text pipeline: regex clean-up followed by correction."""
        return correct_text(clean_text(text))

    # Process each text in the batch
    batch['corrected_text'] = batch['text'].apply(combined_cleaning)
    return batch

























In [93]:
# Wrap the sample sentence in a one-row frame with the 'text' column that clean_data reads.
dataframe = pd.DataFrame([input_text], columns=['text'])

# Clean the text
# clean_data adds 'corrected_text' to the frame it is given and returns that same
# object, so `dataframe` and `cleaned_data` refer to one DataFrame.
# NOTE(review): the recorded output below shows a 'tokens' column that clean_data,
# as defined in this notebook, does not create - the output is from an older version.
cleaned_data = clean_data(dataframe, stop_words, slang_dict)
cleaned_data

Unnamed: 0,text,corrected_text,tokens
0,However this sentence is very insulting you sh...,however this sentence is very insulting you sh...,"[however, this, sentence, be, very, insult, yo..."


In [None]:
#def feature_engineering(data, max_length:int=170, vocab_size:int=29538):

data = cleaned_data.copy()

# clean_data in this notebook only produces 'corrected_text'; without a 'tokens'
# column the lookup below raises KeyError on a fresh kernel. Rebuild the tokens with
# the steps of the tokenizer that is commented out inside clean_data.
# NOTE(review): confirm this matches the tokenisation used when the model was trained.
if 'tokens' not in data.columns:
    _lemmatizer = WordNetLemmatizer()

    def _tokenize(text):
        """Tokenise, lemmatise as verb, noun, then adjective, and mask unknown words."""
        tokens = word_tokenize(text, preserve_line=True)
        for pos in ('v', 'n', 'a'):
            tokens = [_lemmatizer.lemmatize(token, pos) for token in tokens]
        return [token if token in combined_corpus else '<UNK>' for token in tokens]

    data['tokens'] = data['corrected_text'].apply(_tokenize)

# Vocabulary size and vocabulary list for embedding
embedding_matrix = np.load(os.path.join(project_root, 'models', 'embedding_matrix_300.npy'))
# NOTE(review): allow_pickle=True executes pickled objects on load - only use with a trusted file.
vocab = list(np.load(os.path.join(project_root, 'models', 'vocab.npy'), allow_pickle=True))
vocab.append("<PAD>")
vocab_size = len(vocab)
# Token to index
token_to_index = {token: idx for idx, token in enumerate(vocab)}
# Look the fallback index up once instead of once per token.
unk_index = token_to_index["<UNK>"]
# Convert text tokens to index
data['tokens_index'] = data['tokens'].apply(lambda tokens: [token_to_index.get(token, unk_index) for token in tokens])


# Pad sequences
from tensorflow.keras.utils import pad_sequences

max_length = 170

# NOTE(review): sequences are padded with the default value 0, while "<PAD>" was just
# appended as the LAST vocabulary index - confirm which index the model saw as padding.
# The result gets its own name so the imported pad_sequences function is not shadowed.
padded_sequences = pad_sequences(data['tokens_index'].tolist(), maxlen=max_length, padding='post', truncating='post')
data['padded_tokens'] = [list(row) for row in padded_sequences]

# Convert to tensor for inference: the padded array is already rectangular
# (n_rows, max_length), so no ragged round-trip is needed.
data_tensor = tf.convert_to_tensor(padded_sequences, dtype=tf.int32)

In [95]:
# Score the padded index tensor; the recorded output holds three values per input
# row, matching the 3-unit output layer in the model summary.
model.predict(data_tensor)



array([[0.98801464, 0.0088652 , 0.00312017]], dtype=float32)

In [5]:
from functools import partial
import tensorflow as tf
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
project_root = os.path.dirname(os.getcwd())
from utils.custom_metrics import (
    WeightedCategoricalCrossEntropy, 
    PrecisionMulticlass, 
    RecallMulticlass,
    F1ScoreMulticlass,
    weights
)


# Custom classes the saved bi_gru model refers to by name. The loss is supplied as a
# partial so that it is constructed with the imported `weights`.
bi_gru_custom_objects = {
    'PrecisionMulticlass': PrecisionMulticlass,
    'RecallMulticlass': RecallMulticlass,
    'F1ScoreMulticlass': F1ScoreMulticlass,
    'WeightedCategoricalCrossEntropy': partial(WeightedCategoricalCrossEntropy, weights=weights),
}

# Rebinds `model` to the bi_gru network.
model = tf.keras.models.load_model(
    os.path.join(project_root, "models", "bi_gru"),
    custom_objects=bi_gru_custom_objects,
)

2024-11-29 09:08:39.468465: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-11-29 09:08:39.468516: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-11-29 09:08:39.468534: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-11-29 09:08:39.468751: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-29 09:08:39.469137: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2024-11-29 09:08:41.478285: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [None]:
user_input = "I a."


def build_predict_dataset(data: pd.DataFrame, batch_size:int = 512):
    """Turn the ``text`` column of ``data`` into a batched ``tf.data.Dataset`` of strings.

    Args:
        data: DataFrame with a ``text`` column.
        batch_size: number of rows per batch. It was previously accepted but ignored,
            which forced every caller to batch the result itself.

    Returns:
        A dataset yielding batches of raw text values.
    """
    # NOTE(review): this reads the raw 'text' column, not the 'corrected_text' column
    # that clean_data adds - confirm which one the model was trained on.
    X = data["text"]

    # Create tf.data.Dataset and batch it, like the BERT variant of this helper does
    dataset = tf.data.Dataset.from_tensor_slices(X)

    return dataset.batch(batch_size)


data = pd.DataFrame({"text": [user_input]})
data = clean_data(data)
# The helper now returns a batched dataset, so no extra .batch(1) is applied here.
data = build_predict_dataset(data)

In [38]:
# Score the text dataset built in the previous cell; the recorded output holds two
# values per input row.
model.predict(data)



array([[0.0310078, 0.9689922]], dtype=float32)

### BERT Model Prediction 

In [5]:
import tensorflow as tf
import pandas as pd
import os
import sys
sys.path.append("..")  # Adds the parent directory to the path
project_root = os.path.dirname(os.getcwd())

In [7]:
# Load data from pickle
# NOTE(review): unpickling can execute arbitrary code - only read files you trust.
path = os.path.join(project_root, "datasets/processed/data.pkl")
data = pd.read_pickle(path)
# Merge label 2 into label 1; every other label value is left unchanged.
data['label'] = data['label'].apply(lambda x: 1 if x == 2 else x)

In [11]:
# Load Model
from utils.custom_metrics import RecallMulticlass, PrecisionMulticlass, F1ScoreMulticlass, WeightedCategoricalCrossEntropy
from transformers import TFBertModel, BertTokenizer

# Metric objects handed to compile(); each one is configured for two classes.
metrics = [
    RecallMulticlass(name="recall", n_class=2),
    PrecisionMulticlass(name="precision", n_class=2),
    F1ScoreMulticlass(name="f1", n_class=2),
]

# weights: inverse of each label's relative frequency (ordered by label value),
# rescaled so the weights sum to 1.
label_frequencies = data["label"].value_counts(normalize=True).sort_index().values
inverse_frequencies = 1/label_frequencies
weights = inverse_frequencies/inverse_frequencies.sum()

# loss
loss = WeightedCategoricalCrossEntropy(weights)


def build_bert_model(loss, metrics: list, name:str = "bert_model", max_length: int = 128, n_class: int = 2):
    """Build and compile a classifier made of a frozen BERT encoder and a dense head.

    Args:
        loss: loss passed unchanged to ``model.compile`` (a single loss object in
            this notebook, not a list as the previous annotation suggested).
        metrics: list of metrics passed unchanged to ``model.compile``.
        name: name given to the Keras model.
        max_length: length of the ``input_ids`` / ``attention_mask`` inputs; it must
            equal the length the tokenizer pads/truncates to. Defaults to the
            previously hard-coded 128.
        n_class: number of softmax output units. Defaults to the previously
            hard-coded 2.

    Returns:
        The compiled ``tf.keras.Model``. Its summary is printed as a side effect.
    """
    # Load the pre-trained BERT model
    bert_model = TFBertModel.from_pretrained('bert-base-uncased')

    # Freeze the BERT model layers
    for layer in bert_model.layers:  # Freeze all layers
        layer.trainable = False

    # Define the input layers
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

    # Get the output from the BERT model
    bert_outputs = bert_model(input_ids, attention_mask=attention_mask)

    # Use the pooled output for classification
    pooled_output = bert_outputs.pooler_output

    # Add custom layers (same layer types and order as before, so existing
    # checkpoints for the default configuration still line up)
    x = tf.keras.layers.Dense(128, activation='relu')(pooled_output)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    output = tf.keras.layers.Dense(n_class, activation='softmax')(x)

    # Create the model
    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output, name=name)

    # Compile the model
    model.compile(optimizer=tf.keras.optimizers.legacy.Adam(),
                  loss=loss,
                  metrics=metrics)

    # Summary of the model
    model.summary()

    return model

# Build the frozen-BERT classifier (this also prints its summary) ...
test_model = build_bert_model(loss, metrics, "bert_model")
# Load the weights
# ... then restore the weights stored under models/bert/bert_model_test.
test_model.load_weights(os.path.join(project_root, "models", "bert", "bert_model_test"))

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "bert_model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 128)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 128)]                0         []                            
 )                                                                                                
                                                                                                  
 tf_bert_model_3 (TFBertMod  TFBaseModelOutputWithPooli   1094822   ['input_ids[0][0]',           
 el)                         ngAndCrossAttentions(last_   40         'attention_mask[0][0]']      
                             hidden_state=(None, 128, 7                                  

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x368a168f0>

In [None]:
def build_predict_dataset(data: pd.DataFrame, tokenizer, max_length=128, batch_size=512):
    """Tokenize ``data['text']`` and return it as a batched ``tf.data.Dataset``.

    Args:
        data: DataFrame with a ``text`` column.
        tokenizer: callable tokenizer; invoked with the list of texts and asked to
            pad/truncate every text to ``max_length`` and return TensorFlow tensors.
        max_length: sequence length each text is padded or truncated to.
        batch_size: number of rows per batch.

    Returns:
        A dataset whose elements are dicts of the tokenizer's output fields,
        grouped into batches of ``batch_size``.
    """
    # Tokenize and encode every text to a fixed length
    encodings = tokenizer(
        data["text"].tolist(),
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )

    # A plain dict keeps the tokenizer's field names as the dataset's feature keys
    features = dict(encodings)

    # Slice row-wise, then batch
    return tf.data.Dataset.from_tensor_slices(features).batch(batch_size)


# Tokenizer initialization
# Uses the same 'bert-base-uncased' checkpoint name that build_bert_model loads.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


  inputs = self._flatten_to_reference_inputs(inputs)
2024-12-02 23:54:14.410689: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.




In [44]:
# Sample set: ten polite / constructive messages.
# Each of the `user_input` cells rebinds the same name; whichever ran last is what
# the prediction cells use.
user_input = [
    "Hi Team, I think there are some areas where we can improve this proposal to align with the client's goals. Let's discuss during the next meeting.",
    "People often misjudge others without knowing their full story. Let's try to be more understanding and empathetic.",
    "Hey, I saw your post earlier. It made me think a lot about how we treat each other in difficult times. Hope you're doing well!",
    "This video really opened my eyes to the challenges faced by different communities. Thank you for sharing this perspective.",
    "Good progress on the project so far! Let's make sure we meet the deadline by keeping communication open and addressing issues promptly.",
    "Sometimes it's not about agreeing with everyone but about respecting their right to a different opinion. Stay kind.",
    "I just wanted to say that I really appreciate the way you handled the discussion yesterday. You kept it professional and constructive.",
    "Great tutorial! I didn't understand this concept before, but your explanation made it so much clearer. Subscribed!",
    "Not every problem has an easy answer, but working together gets us closer to a solution. Keep going, everyone!",
    "Hi Sarah, I just wanted to thank you for your support during the project. It really made a difference, and I'm grateful for your help."
]

In [60]:
# Sample set: ten insulting / demeaning messages.
# Rebinds `user_input`, replacing whatever list was assigned before.
user_input = [
    "You are so incompetent, it's embarrassing to work with someone like you.",
    "What a joke of a person. I can't believe anyone would take you seriously.",
    "Nobody cares about your opinion, so just shut up and go away.",
    "You're clearly too dumb to understand basic things. Maybe stay quiet next time.",
    "People like you are the reason this country is going downhill.",
    "How delusional do you have to be to post something this idiotic?",
    "Your presence here is not just unwelcome, it's downright disgusting.",
    "Seriously, just quit already. You're the most useless person in the team.",
    "Why don't you go back to whatever rock you crawled out from?",
    "It's pathetic how you keep trying and still fail every single time."
]

In [58]:
# Sample set: twenty bullying-style messages.
# Rebinds `user_input`, replacing whatever list was assigned before.
user_input = [
    "You're such a loser, nobody cares about you here.",
    "Why are you even posting? No one wants to see your stupid opinions.",
    "You look so dumb in that picture, I can't believe you uploaded it.",
    "Nobody likes you, you're just a burden to everyone.",
    "You're so annoying, why don't you just leave everyone alone?",
    "Seriously, how are you even still friends with them? They must feel sorry for you.",
    "Why don't you just stop trying? You'll never be as good as them.",
    "You think you're so special, but you're just like everyone else – useless.",
    "You're so pathetic, no wonder everyone ignores you.",
    "Why do you always have to act like you're so smart? You're really not.",
    "You're just a kid. What do you even know? Your opinion doesn't matter.",
    "Oh wow, you're so brave talking behind a screen. Try saying it to my face.",
    "I feel sorry for you, but I guess that's what you get for being such a weirdo.",
    "Do you honestly think anyone cares about your life? Get over yourself.",
    "You're just a waste of space, no one's going to remember you.",
    "I wouldn't be surprised if you ended up alone forever, no one likes people like you.",
    "You think you're cool, but you're just embarrassing yourself.",
    "I can't believe you're still hanging out with them, they're only tolerating you.",
    "Why don't you just quit? You'll never make it anywhere.",
    "You look like you don't even belong here, just leave."
]


In [61]:
# One row per message, in the 'text' column the helpers below read.
data = pd.DataFrame({"text": user_input})


# Clean the data (if you have a cleaning function defined)
# NOTE(review): clean_data only adds a 'corrected_text' column, while the BERT
# build_predict_dataset tokenizes data["text"] - so the raw text is what gets scored.
data = clean_data(data)

# Build the prediction dataset
predict_dataset = build_predict_dataset(data, tokenizer)

# Make predictions
# One row of two scores per input message; the recorded output has 10 rows, so it
# came from one of the 10-message `user_input` lists.
predictions = test_model.predict(predict_dataset)
print(predictions)

[[0.6698157  0.33018425]
 [0.38523373 0.6147663 ]
 [0.09626018 0.9037398 ]
 [0.6853213  0.3146787 ]
 [0.4165259  0.5834741 ]
 [0.26706153 0.73293847]
 [0.6174586  0.3825414 ]
 [0.32340345 0.6765966 ]
 [0.045049   0.95495105]
 [0.25105333 0.74894667]]


In [43]:
# NOTE(review): builds a second copy of the architecture already held by `test_model`
# and loads the same checkpoint into it. It also rebinds `model` (used earlier for the
# bi_lstm / bi_gru networks) and passes the same `metrics` objects to a second compile.
model= build_bert_model(loss, metrics, "bert_model")
# Load the weights
model.load_weights(os.path.join(project_root, "models", "bert", "bert_model_test"))

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "bert_model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 128)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 128)]                0         []                            
 )                                                                                                
                                                                                                  
 tf_bert_model_4 (TFBertMod  TFBaseModelOutputWithPooli   1094822   ['input_ids[0][0]',           
 el)                         ngAndCrossAttentions(last_   40         'attention_mask[0][0]']      
                             hidden_state=(None, 128, 7                                  

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x384be0dc0>

In [57]:
# Same steps as the `test_model` prediction cell, run against `model` instead.
data = pd.DataFrame({"text": user_input})


# Clean the data (if you have a cleaning function defined)
# NOTE(review): clean_data only adds a 'corrected_text' column, while the BERT
# build_predict_dataset tokenizes data["text"] - so the raw text is what gets scored.
data = clean_data(data)

# Build the prediction dataset
predict_dataset = build_predict_dataset(data, tokenizer)

# Make predictions
# NOTE(review): the recorded output has 3 rows, but none of the `user_input` lists in
# this notebook has 3 entries - the output comes from an input that is no longer here.
predictions = model.predict(predict_dataset)
print(predictions)

[[0.17936371 0.8206363 ]
 [0.15537009 0.8446299 ]
 [0.03791552 0.9620845 ]]
