<a href="https://colab.research.google.com/github/zahidhamidi/ML-Project/blob/main/Testing_Pre_trained_Model_Binary_False_Positive_Case_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import re
import spacy
from gensim import corpora, models
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
# Load spaCy's small English pipeline once; preprocess_text() calls `nlp` per document.
nlp = spacy.load("en_core_web_sm")

# Report the runtime environment (library versions, eager mode, GPU visibility)
# so that results can be traced back to the setup that produced them.
for label, value in (
    ("Version: ", tf.__version__),
    ("Eager mode: ", tf.executing_eagerly()),
    ("Hub version: ", hub.__version__),
    ("GPU is", "available" if tf.config.list_physical_devices("GPU") else "NOT AVAILABLE"),
):
    print(label, value)

In [None]:
# One seed value shared by NumPy and TensorFlow so repeated runs draw the same
# random numbers.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Discard Keras state left over from earlier runs in this kernel.
tf.keras.backend.clear_session()

In [None]:
# Location of the input CSV.
csv_file_path = '/content/true_positive_experimentation.csv'

# Read the whole file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=csv_file_path)

# Raw document texts, taken from the 'doc_text_original' column, as a plain list.
text_data = df['doc_text_original'].to_list()

In [None]:
def preprocess_text(text):
    """Normalise one raw document into a single space-separated token string.

    Steps, in order:
      1. Anything that is not a ``str`` (e.g. NaN) yields ``""``.
      2. ``'ve`` is rewritten to `` have`` and ``n't`` to `` not``.
      3. Every character that is neither a word character nor whitespace is removed.
      4. The text is run through the module-level spaCy pipeline ``nlp``.
         Title-case tokens are reduced to their lower-cased first letter;
         all other tokens are replaced by their lemma.

    Args:
        text: The raw document; any type is accepted.

    Returns:
        The processed tokens joined with single spaces, or ``""`` for
        non-string input.
    """
    # Guard clause: non-string cells never reach the regexes or spaCy.
    if not isinstance(text, str):
        return ""

    # Expand the two handled contractions before punctuation is stripped,
    # otherwise the apostrophes they rely on would already be gone.
    expanded = re.sub(r"n't", " not", re.sub(r"\'ve", " have", text))
    cleaned = re.sub(r'[^\w\s]', '', expanded)

    return " ".join(
        tok.text[0].lower() if tok.text.istitle() else tok.lemma_
        for tok in nlp(cleaned)
    )

# Clean every document once and keep the result next to the raw text.
# Non-string cells (e.g. NaN) come back from preprocess_text() as "".
df['preprocessed_data'] = df['doc_text_original'].apply(preprocess_text)

# Drop rows whose cleaned text is empty.
#  - .copy() makes `df` an independent frame, so later column assignments do not
#    operate on a slice of the original (SettingWithCopyWarning).
#  - reset_index(drop=True) rebuilds the index as 0..n-1, so label-based and
#    positional row selection agree in the cells that follow.
df = df[df['preprocessed_data'] != ""].copy().reset_index(drop=True)

# Reuse the cleaned column instead of running the spaCy pipeline a second time.
# Deriving the list from `df` guarantees exactly one entry per remaining row, in row
# order, so anything predicted from this list lines up with the DataFrame.
# (Previously, strings that cleaned down to "" stayed in this list but were dropped
# from `df`, so the two could differ in length.)
preprocessed_data = df['preprocessed_data'].tolist()

In [None]:
# Whitespace-separated token count for each preprocessed document
word_counts = list(map(lambda doc: len(doc.split()), preprocessed_data))

# Arithmetic mean of the per-document counts
mean_word_count = sum(word_counts) / len(word_counts)

# Histogram with one-unit-wide bins running from 1 up to the largest count
fig, ax = plt.subplots()
ax.hist(word_counts, bins=range(1, max(word_counts) + 2), alpha=0.5, edgecolor='black')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Frequency')
ax.set_title('Word Count Distribution')
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Mark the mean with a dashed red vertical line; its label feeds the legend
ax.axvline(x=mean_word_count, color='red', linestyle='--', label=f'Mean Word Count ({mean_word_count:.2f})')
ax.legend()

plt.show()



In [None]:
# Testing with pre-trained model

# Text-embedding module loaded from TensorFlow Hub
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"
# embedding = "https://tfhub.dev/google/nnlm-en-dim128/2"
# embedding = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

# Frozen embedding layer: takes a batch of scalar strings (input_shape=[]).
hub_layer = hub.KerasLayer(embedding, input_shape=[], dtype=tf.string, trainable=False)

# Each entry of `preprocessed_data` is already one space-joined document string.
# It must NOT be passed through " ".join() again: joining a string iterates over its
# characters and would turn "hello world" into "h e l l o   w o r l d", so the
# embedding would only ever see single letters.
# A string tensor is used so Keras receives one batched input.
X_test = tf.constant(preprocessed_data, dtype=tf.string)

# Number of models whose predictions are averaged
num_models = 5  # You can adjust this based on your preference

# NOTE(review): this rebinds `models`, which the import cell also binds to
# gensim's `models` module. Nothing visible in this notebook uses the gensim
# module afterwards, but re-run the import cell if you need it.
models = []

# NOTE(review): the classification head below is randomly initialised and never
# fitted (the fit() call is commented out), so the resulting predictions are not
# meaningful until real training code or saved weights are supplied.
for i in range(num_models):
    model = tf.keras.Sequential([
        hub_layer,
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # Binary cross-entropy matches the single sigmoid output unit
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # Train the model on your data (replace with your actual training code)
    # model.fit(X_train, y_train, epochs=num_epochs, batch_size=batch_size, validation_data=(X_val, y_val))

    models.append(model)

# One prediction array per model
predictions = [model.predict(X_test) for model in models]

# Element-wise mean across the models (axis 0 is the model axis)
average_predictions = np.mean(predictions, axis=0)

# Threshold the averaged sigmoid outputs at 0.5 to obtain 0/1 labels
binary_predictions = (average_predictions > 0.5).astype(np.int32)

In [None]:
# Store the binary predictions in df['predicted_label_code'], one per row.

# The model ends in a single output unit, so predictions typically arrive with a
# trailing axis of length 1; flatten to one value per document before assigning.
flat_predictions = np.asarray(binary_predictions).reshape(-1)
num_rows_to_update = len(flat_predictions)

# Predictions carry no row labels, so they can only be matched to rows by position.
# Refuse to continue if the counts differ: a partial or shifted assignment would
# silently pair predictions with the wrong documents.
if num_rows_to_update != len(df):
    raise ValueError(
        f"Got {num_rows_to_update} predictions for {len(df)} DataFrame rows; "
        "predictions must be generated from exactly the rows kept in 'df'."
    )

# `df` may be a filtered slice of the loaded frame: take an independent copy before
# adding a column, and assign the array positionally instead of via a label-based
# `.loc[:n - 1]` slice, which selects the wrong rows once the index has gaps.
df = df.copy()
df['predicted_label_code'] = flat_predictions


In [None]:
# Manual accuracy check using df['label_code'] and df['predicted_label_code'].

# Only rows that actually received a prediction take part in the score.
scored_rows = df['predicted_label_code'].notna()
matches = df.loc[scored_rows, 'label_code'] == df.loc[scored_rows, 'predicted_label_code']

# Numerator and denominator are both taken from the same set of rows in `df`
# (the denominator used to be len(preprocessed_data), which can differ from the
# number of rows being compared).
correct_predictions = int(matches.sum())
total_predictions = int(scored_rows.sum())

# Avoid ZeroDivisionError when nothing was scored; report NaN instead.
accuracy = (correct_predictions / total_predictions) * 100 if total_predictions else float('nan')

print(f"Number of correct predictions: {correct_predictions}")

print(f"Accuracy: {accuracy:.2f}%")


In [None]:
# Confusion matrix and summary metrics for df['label_code'] vs df['predicted_label_code'].
y_true = df['label_code']
y_pred = df['predicted_label_code']

# Predictions are thresholded to 0/1, so pin the class order explicitly. Without
# `labels`, the matrix shrinks to 1x1 whenever only one class occurs in the data,
# and the heatmap rows/columns can no longer be interpreted.
# NOTE(review): assumes 'label_code' is encoded as 0/1 as well - confirm.
class_labels = [0, 1]
cm = confusion_matrix(y_true, y_pred, labels=class_labels)

# Additional evaluation metrics (positive class is 1, the scikit-learn default).
# zero_division=0 keeps the value scikit-learn already returns for an undefined
# metric (0.0) but without emitting a warning.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

# Heatmap of the confusion matrix: rows are true classes, columns are predicted classes
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False,
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()

# Print evaluation metrics (fractions in [0, 1], not percentages)
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

In [None]:
import pandas as pd
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# ROC curve for the averaged ensemble scores against df['label_code'].
y_true = df['label_code']

# A ROC curve needs continuous scores. The thresholded 0/1 values stored in
# 'predicted_label_code' would collapse the curve to a single operating point, so
# use the averaged sigmoid outputs instead (flattened to one score per document).
y_score = np.asarray(average_predictions).reshape(-1)

# Scores are matched to labels by position; a length mismatch means they were not
# computed from the rows currently in `df`.
if len(y_score) != len(y_true):
    raise ValueError(
        f"Got {len(y_score)} scores for {len(y_true)} labelled rows; "
        "scores must be generated from exactly the rows kept in 'df'."
    )

# NOTE(review): roc_curve needs both classes in y_true for the rates to be defined -
# confirm the dataset contains positive and negative examples.
fpr, tpr, thresholds = roc_curve(y_true, y_score)

# Calculate the AUC (Area Under the Curve)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve together with the diagonal chance line
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()


In [None]:
# Persist the DataFrame, including the columns added above, to a new CSV file.

# Destination path, relative to the current working directory
output_csv_file = "true_positive_updated.csv"

# index=False leaves the row index out of the file
df.to_csv(path_or_buf=output_csv_file, index=False)

print(f"DataFrame saved to {output_csv_file}")
