In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# Load your dataset
# The path is relative to the notebook's working directory.
# NOTE(review): latin1 can decode any byte sequence, so a UTF-8 file would load
# without an error but with garbled non-ASCII characters - confirm the file's
# real encoding.
file_path = 'DiscriminatoryText (1)/DiscriminatoryText.csv'
df = pd.read_csv(file_path, encoding='latin1')

# Define preprocessing function
def preprocess_text(text):
    """Normalise a raw prompt into lowercase, letters-and-whitespace-only text.

    Steps, in order: strip URL-like tokens (anything starting with ``http``
    or ``www`` up to the next whitespace), drop every character that is not
    an ASCII letter or whitespace, then lowercase the result. Whitespace is
    left as-is, so removed tokens can leave consecutive spaces behind.

    Args:
        text: The raw text. Values that are not strings (``None``, the float
            ``NaN`` pandas produces for empty CSV cells, numbers, ...) are
            treated as missing text.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    # Empty CSV cells arrive as float NaN, which would make re.sub raise
    # TypeError; treat anything that is not a string as "no text".
    if not isinstance(text, str):
        return ''
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove non-alphabet characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    return text.lower()

# Apply preprocessing
# Overwrites the 'prompt' column with the value preprocess_text returns for
# each entry, so the raw text is no longer available in df after this line.
df['prompt'] = df['prompt'].apply(preprocess_text)

# Handle negation
def handle_negation(text):
    """Collapse "<negation> <word>" pairs into the word's antonym.

    A negation word ("not", "no", "never", "none") that is immediately
    followed by a word with a known antonym is replaced, together with that
    word, by the antonym ("not good" -> "bad"). When the following word has
    no known antonym, both words are kept unchanged, so the negation is never
    silently dropped ("not a racist" stays "not a racist").

    Args:
        text: Whitespace-separated text; expected to be lowercased already,
            because matching is case-sensitive.

    Returns:
        The rewritten text, with tokens joined by single spaces.
    """
    # List of negation words (a set, for constant-time membership tests)
    negation_words = {"not", "no", "never", "none"}
    antonyms = {
        "good": "bad", "happy": "sad", "like": "dislike", "love": "hate",
        "bad": "good", "sad": "happy", "dislike": "like", "hate": "love"
    }
    words = text.split()
    new_words = []
    i = 0
    while i < len(words):
        word = words[i]
        following = words[i + 1] if i + 1 < len(words) else None
        if word in negation_words and following in antonyms:
            # Negation + known word: emit the antonym and consume both tokens.
            new_words.append(antonyms[following])
            i += 2
        else:
            # Plain word, trailing negation, or negation before a word with no
            # antonym: keep the token so the sentence meaning is preserved.
            new_words.append(word)
            i += 1
    return ' '.join(new_words)

# Run handle_negation over the already-preprocessed prompts; the 'prompt'
# column is overwritten a second time.
df['prompt'] = df['prompt'].apply(handle_negation)

# Train / validation / test split
# 20% of the rows are held out for testing; 10% of the remaining 80% (8% of
# all rows) become the validation set, leaving 72% for training.
# random_state is fixed so the split is the same on every run.
train, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.1, random_state=42)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize a DataFrame of examples and wrap the result in a tf.data.Dataset
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Encode every prompt and pair the encodings with their labels.

    Args:
        examples: DataFrame with a 'prompt' column (text) and a
            'prompt_label' column (target value), one example per row.
        tokenizer: Object exposing ``encode_plus``; its result must provide
            'input_ids' and 'attention_mask' entries.
        max_length: Length every sequence is truncated or padded to.

    Returns:
        An unbatched ``tf.data.Dataset`` whose elements are
        ``({'input_ids': ..., 'attention_mask': ...}, label)`` pairs, in the
        row order of ``examples``.
    """
    encodings = [
        tokenizer.encode_plus(
            prompt,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
        )
        for prompt in examples['prompt']
    ]
    features = {
        'input_ids': [encoding['input_ids'] for encoding in encodings],
        'attention_mask': [encoding['attention_mask'] for encoding in encodings],
    }
    targets = list(examples['prompt_label'])
    return tf.data.Dataset.from_tensor_slices((features, targets))

# Build one tf.data.Dataset per split
train_data = convert_examples_to_tf_dataset(train, tokenizer)
val_data = convert_examples_to_tf_dataset(val, tokenizer)
test_data = convert_examples_to_tf_dataset(test, tokenizer)

# Training stream: shuffle individual examples through a 100-element buffer,
# group them into batches of 32, then repeat the whole stream twice - so one
# pass over train_data covers the training split two times.
# NOTE(review): confirm the doubling is intended on top of the epoch count
# passed to model.fit.
train_data = train_data.shuffle(100).batch(32).repeat(2)
# Validation and test streams are only batched (no shuffling), so their
# element order stays fixed between iterations.
val_data = val_data.batch(32)
test_data = test_data.batch(32)

# Load BERT model for sequence classification
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Optimizer, loss and metric handed to model.compile in the next cell
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-8)
# from_logits=True: the loss expects raw scores rather than probabilities
# (presumably the model's output head returns logits - verify).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

In [None]:
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Define callbacks
# NOTE(review): with patience=3 and the 3-epoch fit below, early stopping can
# never trigger before training ends on its own - confirm this is intended.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# save_weights_only=True is required here: the BERT classifier is a subclassed
# Keras model, and Keras cannot serialise a whole subclassed model to an HDF5
# (.h5) file - only its weights. Reload with model.load_weights('best_model.h5').
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'best_model.h5',
    save_best_only=True,
    save_weights_only=True,
    monitor='val_loss',
)

In [None]:
# Train the model with callbacks
history = model.fit(train_data, epochs=3, validation_data=val_data, callbacks=[early_stopping, model_checkpoint])

# Evaluate the model
# The scalar test loss is bound to `test_loss`, not `loss`: `loss` is the
# loss *object* passed to model.compile, and overwriting it with a float
# would break that cell if it were ever re-run.
test_loss, accuracy = model.evaluate(test_data)
print(f'Test Accuracy: {accuracy}')

# Get classification report
# test_data is not shuffled, so labels collected here line up with the
# predictions produced by iterating the same dataset again.
y_true = np.concatenate([labels for _, labels in test_data], axis=0)
# Element 0 of the prediction output is used as the per-class scores.
y_pred = np.argmax(model.predict(test_data)[0], axis=1)

print(classification_report(y_true, y_pred, target_names=['Non-Racist', 'Racist']))

In [None]:
# Evaluate on the batched test set. The name previously used here,
# `test_data_list`, is not defined anywhere in the notebook and raised
# NameError on a fresh kernel.
# The scalar goes into `test_loss` so the `loss` object used by model.compile
# is not overwritten.
test_loss, accuracy = model.evaluate(test_data)
print(f'Test Accuracy: {accuracy}')

In [None]:
# test_data is already batched (32 examples per batch), so take(200) keeps up
# to 200 *batches*, not 200 examples.
# NOTE(review): if a 200-example subset was intended, take from the unbatched
# dataset before batching instead.
test_data_subset = test_data.take(200)

# Evaluate the model with the subset of test data
# The scalar goes into `subset_loss` so the `loss` object used by
# model.compile is not overwritten.
subset_loss, accuracy = model.evaluate(test_data_subset)
print(f'Test Accuracy: {accuracy}')

In [None]:
def classify_text(text, model, tokenizer, max_length=128):
    """Predict the class id of a single piece of raw text.

    The text goes through preprocess_text and handle_negation, is encoded
    with ``tokenizer.encode_plus`` (truncated/padded to ``max_length``), and
    is passed to ``model.predict`` as a batch of one.

    Args:
        text: Raw input text.
        model: Object whose ``predict`` accepts a dict with 'input_ids' and
            'attention_mask' arrays; element 0 of its result is used as the
            per-class scores.
        tokenizer: Object exposing ``encode_plus``.
        max_length: Sequence length used for truncation and padding.

    Returns:
        The index of the highest score for the single input.
    """
    # Clean the text with both preprocessing helpers, in the same order the
    # notebook applies them to the 'prompt' column.
    cleaned = handle_negation(preprocess_text(text))
    encoded = tokenizer.encode_plus(
        cleaned,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
    )
    # Wrapping each sequence in a list adds the leading batch dimension.
    model_inputs = {
        'input_ids': np.array([encoded['input_ids']]),
        'attention_mask': np.array([encoded['attention_mask']]),
    }
    outputs = model.predict(model_inputs)
    winners = np.argmax(outputs[0], axis=1).flatten()
    return winners[0]

# Example usage
text_to_classify = "Indian food are always"
prediction = classify_text(text_to_classify, model, tokenizer)
# Class id 0 is reported as "Non-Racist"; any other id as "Racist".
print("Non-Racist" if prediction == 0 else "Racist")


In [None]:
# Save tokenizer
# Tokenizer and model are written to separate sub-directories of
# ./saved_model2 (paths are relative to the notebook's working directory).
tokenizer.save_pretrained('./saved_model2/tokenizer')

# Save model
model.save_pretrained('./saved_model2/model')