In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score

# Load the embeddings and labels
embeddings_list = np.load("Embeddings/MT5/ARABIC/arabic_mt5_transliterated_train_embeds.npy")
label_list = np.load("Embeddings/MT5/ARABIC/arabic_mt5_transliterated_train_labels.npy")

# Encode the labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(label_list)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(embeddings_list, labels, test_size=0.2, random_state=42)

# Define a simple deep neural network model
model = Sequential([
    Flatten(input_shape=(X_train.shape[1:])),  # Flatten the 3D embeddings to 1D
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Evaluate the model on test data
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", test_accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.9943749904632568


In [5]:
embeddings_list

array([[[-0.07504977,  0.8022625 , -0.65197575, ..., -1.5437002 ,
         -0.98965794,  1.0390407 ],
        [-0.39166138,  1.0930082 , -0.14132908, ..., -0.08762001,
         -0.42113453,  0.77042985],
        [-0.46579906,  0.9027376 , -0.21910281, ..., -0.08811714,
         -0.38355112,  0.845365  ],
        ...,
        [-0.24701774,  0.5131173 ,  0.06251713, ..., -0.11632759,
         -0.21936874,  0.25060797],
        [-0.24701774,  0.5131173 ,  0.06251713, ..., -0.11632759,
         -0.21936874,  0.25060797],
        [-0.24701774,  0.5131173 ,  0.06251713, ..., -0.11632759,
         -0.21936874,  0.25060797]],

       [[ 0.41156083,  0.40113848,  0.37044606, ..., -0.09604359,
          1.1331083 , -1.2186924 ],
        [ 0.26437944,  0.25263545,  0.46119848, ..., -0.27794218,
          0.8057969 , -1.301826  ],
        [ 0.3014643 ,  0.17994457,  0.46296954, ..., -0.27046788,
          0.80826545, -1.4705987 ],
        ...,
        [ 0.41753992,  0.41199958,  0.37560907, ..., -

In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = (model.predict(X_test) > 0.5).astype("int32")  # Convert probabilities to binary predictions
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Test Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Test Accuracy: 0.994375
Precision: 0.9937655860349127
Recall: 0.9950062421972534
F1 Score: 0.9943855271366188
