<a href="https://colab.research.google.com/github/zahidhamidi/ML-Project/blob/main/Training_NLP_false_positive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Bring `np`, `tf` and `hub` into scope before their first use so the notebook
# runs top-to-bottom on a fresh kernel (Restart Kernel -> Run All).
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

# Single seed value shared by the NumPy and TensorFlow generators below.
SEED = 42

# Set random seeds for reproducibility
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Clear the TensorFlow session and reset the computational graph
tf.keras.backend.clear_session()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Per-class 60/20/20 train/validation/test split of `balanced_df`.
# NOTE(review): `balanced_df` is not created in any visible cell; it must already
# exist in the kernel with "label_code" and "doc_text_original" columns.

# Fractions for train and validation; the remainder of each class becomes the test set.
train_frac = 0.6
val_frac = 0.2


def _split_by_fraction(frame, train_frac, val_frac, random_state=42):
    """Shuffle ``frame`` and cut it into consecutive train/validation/test slices.

    Args:
        frame: DataFrame holding the rows of a single class.
        train_frac: Fraction of rows assigned to the training slice.
        val_frac: Fraction of rows assigned to the validation slice.
        random_state: Seed passed to ``sklearn.utils.shuffle``.

    Returns:
        Tuple ``(train, val, test)`` of DataFrames. ``test`` receives every row
        left after the train and validation slices, so no row is dropped.
    """
    shuffled = shuffle(frame, random_state=random_state)
    train_end = int(train_frac * len(shuffled))
    val_end = train_end + int(val_frac * len(shuffled))
    # .iloc makes the positional (not label-based) slicing explicit.
    return (
        shuffled.iloc[:train_end],
        shuffled.iloc[train_end:val_end],
        shuffled.iloc[val_end:],
    )


# Split each class on its own so every subset keeps the class ratio of balanced_df.
train_class_0, val_class_0, test_class_0 = _split_by_fraction(
    balanced_df[balanced_df["label_code"] == 0], train_frac, val_frac
)
train_class_1, val_class_1, test_class_1 = _split_by_fraction(
    balanced_df[balanced_df["label_code"] == 1], train_frac, val_frac
)

# Recombine the per-class pieces and shuffle so the classes are interleaved.
train_df = shuffle(pd.concat([train_class_0, train_class_1]), random_state=42)
val_df = shuffle(pd.concat([val_class_0, val_class_1]), random_state=42)
test_df = shuffle(pd.concat([test_class_0, test_class_1]), random_state=42)

# Extract examples (raw text) and labels from the split DataFrames
train_examples, train_labels = train_df["doc_text_original"], train_df["label_code"]
val_examples, val_labels = val_df["doc_text_original"], val_df["label_code"]
test_examples, test_labels = test_df["doc_text_original"], test_df["label_code"]

# Now you have balanced train, validation, and test sets with equal distribution of classes.


In [None]:
# TF-Hub handle of the pre-trained text-embedding module.
# Kept under its own name: `model` is reserved for the Keras model built in the
# next cell, so re-running this cell can no longer overwrite a trained model
# with a plain string.
embedding_url = "https://tfhub.dev/google/nnlm-en-dim50/2"

# Wrap the module as a Keras layer that takes scalar strings (input_shape=[],
# dtype=tf.string); trainable=False keeps its weights frozen during training.
hub_layer = hub.KerasLayer(embedding_url, input_shape=[], dtype=tf.string, trainable=False)

In [None]:
# Classifier: frozen text embedding -> 16-unit ReLU layer -> single output unit.
# The last Dense layer has no activation, so the model emits one raw logit per example.
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1),
])

model.summary()

In [None]:
# The model outputs logits, so the loss applies the sigmoid itself (from_logits=True)
# and accuracy thresholds the raw output at 0.0.
bce_from_logits = tf.losses.BinaryCrossentropy(from_logits=True)
logit_accuracy = tf.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')

model.compile(optimizer='adam', loss=bce_from_logits, metrics=[logit_accuracy])

In [None]:
# Training configuration, collected in one place so it is easy to tune.
fit_options = dict(
    epochs=40,                                   # number of passes over the training set
    batch_size=512,                              # examples per gradient step
    validation_data=(val_examples, val_labels),  # evaluated at the end of every epoch
    verbose=1,                                   # 0 = silent, 1 = progress bar, 2 = one line per epoch
)

# Fit on the training split; `history` keeps the per-epoch metrics for the plots below.
history = model.fit(train_examples, train_labels, **fit_options)

In [None]:
# Score the trained model on the held-out test split and print what evaluate() returns.
results = model.evaluate(x=test_examples, y=test_labels)
print(results)

In [None]:
# `history` is the object returned by model.fit(); its `.history` attribute is the
# dict of recorded metrics that the plotting cells index by name.
history_dict = history.history
# Bare last expression: the notebook displays the available metric names.
history_dict.keys()

In [None]:
# Imported here because this is the first cell that plots; without it the cell
# raises NameError when the notebook is run top-to-bottom on a fresh kernel.
import matplotlib.pyplot as plt

# Per-epoch curves recorded during training (also reused by the accuracy plot cell).
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

# 1-based epoch numbers for the x-axis.
epochs = range(1, len(acc) + 1)

fig, ax = plt.subplots()
# "bo" is for "blue dot"
ax.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
ax.plot(epochs, val_loss, 'b', label='Validation loss')
ax.set(title='Training and validation loss', xlabel='Epochs', ylabel='Loss')
ax.legend()

plt.show()

In [None]:
# Start from an empty figure, then draw training (dots) and validation (line) accuracy.
plt.clf()

for curve, line_format, curve_label in (
    (acc, 'bo', 'Training acc'),
    (val_acc, 'b', 'Validation acc'),
):
    plt.plot(epochs, curve, line_format, label=curve_label)

plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
import seaborn as sns

# Requires the trained `model` and the test split from the earlier cells.

# Raw model outputs for the test data. The network ends in Dense(1) and was trained
# with from_logits=True, so these are logits (one per example), not probabilities.
predicted_labels = model.predict(test_examples)

# Convert logits to binary labels (0 or 1). A logit above 0 corresponds to a sigmoid
# probability above 0.5, which matches BinaryAccuracy(threshold=0.0) used at compile
# time. (np.argmax over axis=1 is wrong here: with a single output column it always
# returns 0, i.e. it would predict class 0 for every document.)
predicted_labels_binary = (np.asarray(predicted_labels).ravel() > 0.0).astype(int)

# Calculate the confusion matrix (rows = true class, columns = predicted class)
confusion = confusion_matrix(test_labels, predicted_labels_binary)

# Visualize the confusion matrix using seaborn
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues', square=True,
            xticklabels=True, yticklabels=True, ax=ax)
ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix')
plt.show()

# Calculate additional evaluation metrics
accuracy = accuracy_score(test_labels, predicted_labels_binary)
precision = precision_score(test_labels, predicted_labels_binary)
recall = recall_score(test_labels, predicted_labels_binary)
f1 = f1_score(test_labels, predicted_labels_binary)

# Print evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

In [None]:
import pandas as pd

# Requires `predicted_labels_binary` from the evaluation cell above.

# One row per test document: original text, true label, and the model's predicted label.
results_df = pd.DataFrame(
    data={
        'doc_text_original': test_examples,
        'label_code': test_labels,
        'predicted_label': predicted_labels_binary,
    }
)



In [None]:
# Display the DataFrame: as the cell's last expression it is rendered by the notebook.
results_df