In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import gensim
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Fetch the NLTK stop-word corpus that the text-cleaning step reads from
nltk.download('stopwords')

# Initializing the variables

In [None]:
# Dataset
NEUTRAL_DATASET_PATH = '/content/datasets/training.1600000.processed.noemoticon.csv'
RADICAL_DATASET_PATH =  '/content/datasets/tweets.csv'
TRAIN_SIZE = 0.8  # fraction of the samples meant for the training split

# Reproducibility
# Seeds Python's `random`, NumPy and TensorFlow in one call so weight
# initialisation, dropout masks and batch shuffling repeat across
# Restart & Run All.
SEED = 42
keras.utils.set_random_seed(SEED)

# Text Cleaning
stemmer = PorterStemmer()

# Loading the data and preprocessing

In [None]:
# Read the radical-tweets CSV located at RADICAL_DATASET_PATH
df_radical = pd.read_csv(RADICAL_DATASET_PATH)

In [None]:
# Reduce the frame to the tweet text and tag every row as radical (label 1)
df_radical = df_radical.filter(items=['tweets'], axis=1).assign(is_radical=1)

In [None]:
# The file is read without a header row, so column names are supplied here;
# it is decoded as ISO-8859-1.
df_non_radical = pd.read_csv(
    NEUTRAL_DATASET_PATH,
    encoding='ISO-8859-1',
    names=[
        'target',
        'ids',
        'date',
        'flag',
        'user',
        'text',
    ],
)

In [None]:
# Create a neutral data set containing 50/50 positive and negative tweets.
# A fixed random_state makes the 75,000-row draws repeatable, so the rest of
# the notebook sees the same data after a kernel restart.
negative_tweets = df_non_radical[df_non_radical['target'] == 0].sample(75000, random_state=42)

positive_tweets = df_non_radical[df_non_radical['target'] == 4].sample(75000, random_state=42)

# Keep only the tweet text, rename 'text' to 'tweets' so the column matches
# the radical frame, and tag every row as non-radical (label 0).
# Chained rather than mutated in place, so re-running the cell is safe.
df_neutral = (
    pd.concat([negative_tweets, positive_tweets])
    .filter(['text'], axis=1)
    .rename(columns={'text': 'tweets'})
    .assign(is_radical=0)
)

In [None]:
# Build the stop-word lookup once: calling stopwords.words() per token rebuilds
# the whole word list every time, and a set gives constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))


def preprocess(tweet):
    """Clean a raw tweet and return its stemmed, stop-word-free tokens.

    Steps, in order:
      1. drop the literal 'ENGLISH TRANSLATION:' marker that prefixes many
         tweets in the radical dataset (case-sensitive, before lowercasing);
      2. lowercase, then replace @mentions, http/https links and every run of
         non-alphanumeric characters with a single space;
      3. remove English stop words and Porter-stem the remaining tokens.

    Args:
        tweet: The raw tweet. Non-string values are converted with ``str``;
            ``None`` and NaN (missing cells) are treated as an empty tweet.

    Returns:
        The cleaned tokens joined by single spaces ("" if nothing is left).
    """
    # Missing values would otherwise crash re.sub or turn into the token 'nan'
    if tweet is None or (isinstance(tweet, float) and tweet != tweet):
        return ""

    text = re.sub(r'ENGLISH TRANSLATION:', '', str(tweet))

    # Raw string avoids the invalid-escape warning for \S. The former
    # `http?:\S` alternative was dropped: `https?:\S+` already covers every
    # http/https link.
    text = re.sub(r'@\S+|https?:\S+|[^A-Za-z0-9]+', ' ', text.lower()).strip()

    return " ".join(
        stemmer.stem(token)
        for token in text.split()
        if token not in _ENGLISH_STOPWORDS
    )

In [None]:
# Preprocess the column containing the tweets
df_radical['tweets'] = df_radical['tweets'].apply(preprocess)
df_neutral['tweets'] = df_neutral['tweets'].apply(preprocess)

# Combine the datasets; ignore_index gives every row a unique label instead of
# carrying over the overlapping indices of the two source frames
df_all_tweets = pd.concat([df_radical, df_neutral], ignore_index=True)

# Honour the TRAIN_SIZE setting from the configuration cell (it was previously
# ignored in favour of a hard-coded test_size=0.25) and stratify on the label
# so both splits keep the same radical / non-radical ratio.
train_data, test_data = train_test_split(
    df_all_tweets,
    train_size=TRAIN_SIZE,
    random_state=42,
    stratify=df_all_tweets['is_radical'],
)

# Creating Word2Vec Features

In [None]:
# Initialize the word2vec model: 50-dimensional vectors, a context window of 5,
# a minimum word count of 7 and 4 worker threads
w2v_model = Word2Vec(vector_size=50, window=5, min_count=7, workers=4)

In [None]:
# Tokenize the training tweets so they are in the list-of-tokens format gensim expects
documents = train_data['tweets'].apply(gensim.utils.simple_preprocess)

# Build the vocabulary from the training documents only
w2v_model.build_vocab(documents)
# Words kept by the model; later used to decide which tokens have a vector
vocabulary = w2v_model.wv.index_to_key

# Train the word2vec model for 200 passes over the documents, then save it to 'w2v.model'
w2v_model.train(documents, total_examples=len(documents), epochs=200)
w2v_model.save('w2v.model')

## Testing the word embeddings using common examples

In [None]:
# Sanity check: words whose vectors are closest to "love"
w2v_model.wv.most_similar("love")

In [None]:
# Sanity check: words whose vectors are closest to "hate"
w2v_model.wv.most_similar("hate")

In [None]:
# Sanity check: words whose vectors are closest to "islam"
w2v_model.wv.most_similar("islam")

In [None]:
# Analogy check: king - man + woman, then list the words nearest to the result.
# NOTE(review): the lookups assume all three words made it into the
# vocabulary (min_count=7, stemmed text) — confirm before relying on this cell.
vec = w2v_model.wv['king'] - w2v_model.wv['man'] + w2v_model.wv['woman']
w2v_model.wv.most_similar([vec])

# Creating a word vector for each tweet

In [None]:
def tweet_to_vector(tweet, model, vocabulary):
  """Represent a tweet as the mean of its in-vocabulary word vectors.

  Args:
    tweet: Whitespace-separated text.
    model: Object exposing `vector_size` and a `wv` mapping from word to vector.
    vocabulary: Container of the words that have a vector in `model.wv`.

  Returns:
    A NumPy array of length `model.vector_size`; all zeros when none of the
    tweet's words are in `vocabulary`.
  """
  total = np.zeros(model.vector_size)
  hits = 0

  for token in tweet.split():
    # Words without an embedding contribute nothing to the average
    if token not in vocabulary:
      continue
    total += model.wv[token]
    hits += 1

  # Guard against division by zero for tweets with no known words
  if hits == 0:
    return total
  return total / hits

In [None]:
# `vocabulary` is a list, so every `word in vocabulary` check inside
# tweet_to_vector scans the whole vocabulary. A set accepts exactly the same
# words with constant-time lookups.
vocabulary_lookup = set(vocabulary)

train_data['vectors'] = train_data['tweets'].apply(lambda tweet: tweet_to_vector(tweet, w2v_model, vocabulary_lookup))
test_data['vectors'] = test_data['tweets'].apply(lambda tweet: tweet_to_vector(tweet, w2v_model, vocabulary_lookup))

# Creating BoW Features

In [None]:
# Bag-of-Words encoder: maps each tweet to counts over at most 1000 vocabulary terms
bow_vectorizer = CountVectorizer(max_features=1000)

# Learn the vocabulary from the training tweets and encode them in one step
X_train_bow = bow_vectorizer.fit_transform(train_data['tweets']).toarray()

# Encode the test tweets with the vocabulary learned above
X_test_bow = bow_vectorizer.transform(test_data['tweets']).toarray()

# ANN Model training 

In [None]:
# Stack the per-tweet vectors into 2-D feature matrices, one row per tweet
X_train_w2v, X_test_w2v = (
    np.vstack(split['vectors'].to_numpy()) for split in (train_data, test_data)
)

# Matching label arrays (1 = radical, 0 = not radical)
y_train, y_test = (
    split['is_radical'].to_numpy() for split in (train_data, test_data)
)

In [None]:
# Classifier fed with averaged Word2Vec vectors: two ReLU hidden layers, each
# followed by 50% dropout, and a 2-way softmax output
nn_model_w2v = keras.Sequential(
    layers=[
        layers.Dense(32, activation='relu', input_shape=(X_train_w2v.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(16, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(2, activation='softmax'),
    ]
)

# Learning-rate schedule: start at 0.001 and multiply by 0.96 every 1000
# optimizer steps (staircase=True makes the decay happen in discrete jumps)
initial_learning_rate = 0.001
decay_steps = 1000
decay_rate = 0.96

lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=decay_steps,
    decay_rate=decay_rate,
    staircase=True,
)

# Compile with Adam on the schedule above; labels are integer class ids, hence
# the sparse categorical cross-entropy loss
nn_model_w2v.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
    metrics=['accuracy'],
)

In [None]:
# Classifier fed with Bag-of-Words counts; same layer stack as the Word2Vec
# network, only the input width differs
nn_model_bow = keras.Sequential(
    layers=[
        layers.Dense(32, activation='relu', input_shape=(X_train_bow.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(16, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(2, activation='softmax'),
    ]
)

# Compile with its own Adam optimizer driven by the shared lr_schedule
nn_model_bow.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
    metrics=['accuracy'],
)

In [None]:
# Train the Word2Vec network for 15 epochs.
# NOTE(review): the held-out test split is passed as validation data, so the
# "validation" curves recorded in the history are test-set curves.
history_w2v = nn_model_w2v.fit(
    x=X_train_w2v,
    y=y_train,
    validation_data=(X_test_w2v, y_test),
    epochs=15,
)

In [None]:
# Train the Bag-of-Words network for 15 epochs.
# NOTE(review): the held-out test split is passed as validation data, so the
# "validation" curves recorded in the history are test-set curves.
history_bow = nn_model_bow.fit(
    x=X_train_bow,
    y=y_train,
    validation_data=(X_test_bow, y_test),
    epochs=15,
)

# ANN Model evaluation

In [None]:
# Print the layer-by-layer summary of both networks
nn_model_w2v.summary()
nn_model_bow.summary()

In [None]:
# Score the Word2Vec network on the test split and report accuracy, then loss
test_loss_w2v, test_accuracy_w2v = nn_model_w2v.evaluate(X_test_w2v, y_test, verbose=2)
print(
    "ANN with Word2Vec",
    f"Test accuracy: {test_accuracy_w2v}",
    f"Test loss: {test_loss_w2v}",
    sep="\n",
)

# Same report for the Bag-of-Words network
test_loss_bow, test_accuracy_bow = nn_model_bow.evaluate(X_test_bow, y_test, verbose=2)
print(
    "ANN with BoW",
    f"Test accuracy: {test_accuracy_bow}",
    f"Test loss: {test_loss_bow}",
    sep="\n",
)

In [None]:
# Plot training & validation accuracy and loss for both networks.
# Both figures use the same y-axis ranges so they can be compared directly;
# values outside these ranges fall off the visible area.
accuracy_ylim = (0.955, 0.985)
loss_ylim = (0.06, 0.15)


def plot_training_history(history, title, accuracy_range=None, loss_range=None):
    """Draw accuracy and loss curves side by side for one training run.

    Args:
        history: Object returned by `model.fit`; its `.history` dict must hold
            'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
        title: Figure-level title.
        accuracy_range: Optional (min, max) y-limits for the accuracy panel;
            None lets matplotlib auto-scale.
        loss_range: Optional (min, max) y-limits for the loss panel;
            None lets matplotlib auto-scale.
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    fig.suptitle(title)

    panels = (
        ('accuracy', 'Model accuracy', 'Accuracy', accuracy_range),
        ('loss', 'Model loss', 'Loss', loss_range),
    )
    for ax, (metric, panel_title, y_label, y_range) in zip(axes, panels):
        ax.plot(history.history[metric])
        ax.plot(history.history[f'val_{metric}'])
        ax.set(title=panel_title, xlabel='Epoch', ylabel=y_label)
        if y_range is not None:
            ax.set_ylim(y_range)
        ax.legend(['Train', 'Validation'], loc='upper left')

    plt.show()


plot_training_history(
    history_w2v,
    'ANN with Word2Vec Model Training and Validation Metrics',
    accuracy_range=accuracy_ylim,
    loss_range=loss_ylim,
)

plot_training_history(
    history_bow,
    'ANN with BoW Model Training and Validation Metrics',
    accuracy_range=accuracy_ylim,
    loss_range=loss_ylim,
)

In [None]:
def plot_confusion_matrix(cm, title):
    """Render a confusion matrix as an annotated heatmap.

    Args:
        cm: Square array of counts as returned by `confusion_matrix`
            (rows are true labels, columns are predicted labels).
        title: Title shown above the heatmap.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues, cbar=False)
    ax.set(xlabel="Predicted Label", ylabel="True Label", title=title)
    # Keep the row labels horizontal
    plt.yticks(rotation=0)
    plt.show()


# Confusion matrix for the Word2Vec model: the predicted class is the index of
# the larger softmax output
predictions_w2v = nn_model_w2v.predict(X_test_w2v)
predicted_classes_w2v = np.argmax(predictions_w2v, axis=1)

cm_w2v = confusion_matrix(y_test, predicted_classes_w2v)
plot_confusion_matrix(cm_w2v, "ANN with Word2Vec Confusion Matrix")

# Confusion matrix for the BoW model
predictions_bow = nn_model_bow.predict(X_test_bow)
predicted_classes_bow = np.argmax(predictions_bow, axis=1)

cm_bow = confusion_matrix(y_test, predicted_classes_bow)
# Title previously read "ANNN with BoW ..." (typo)
plot_confusion_matrix(cm_bow, "ANN with BoW Confusion Matrix")

In [None]:
# Precision, recall and F1 for the positive (radical) class, Word2Vec model
precision_w2v, recall_w2v, f1_w2v = (
    metric(y_test, predicted_classes_w2v, average='binary')
    for metric in (precision_score, recall_score, f1_score)
)

print(
    "ANN with Word2Vec:",
    f"Precision: {precision_w2v}",
    f"Recall: {recall_w2v}",
    f"F1 Score: {f1_w2v}",
    sep="\n",
)

# Same three scores for the Bag-of-Words model
precision_bow, recall_bow, f1_bow = (
    metric(y_test, predicted_classes_bow, average='binary')
    for metric in (precision_score, recall_score, f1_score)
)

print(
    "ANN with BoW:",
    f"Precision: {precision_bow}",
    f"Recall: {recall_bow}",
    f"F1 Score: {f1_bow}",
    sep="\n",
)

# Baseline Classifier

In [None]:
# Baseline that ignores the features and draws labels at random following the
# class distribution of the training labels (fit returns the estimator itself)
baseline_model = DummyClassifier(strategy='stratified', random_state=42).fit(X_train_w2v, y_train)

# Baseline labels for the test split
baseline_predictions = baseline_model.predict(X_test_w2v)

In [None]:
# Confusion matrix of the baseline predictions against the true test labels
cm_baseline = confusion_matrix(y_test, baseline_predictions)

# Draw the heatmap on a fresh 8x8 figure and label it in one go
sns.heatmap(
    cm_baseline,
    annot=True,
    fmt='d',
    cmap=plt.cm.Blues,
    cbar=False,
    ax=plt.figure(figsize=(8, 8)).gca(),
).set(
    title='Baseline Classifier Confusion Matrix',
    xlabel='Predicted Label',
    ylabel='True Label',
)
plt.show()

In [None]:
# Overall accuracy of the baseline, rounded to two decimals for display
baseline_accuracy = accuracy_score(y_test, baseline_predictions)
print(f"Baseline Classifier Accuracy: {baseline_accuracy:.2f}")

# Precision, recall and F1 for the positive (radical) class
baseline_precision, baseline_recall, baseline_f1 = (
    metric(y_test, baseline_predictions, average='binary')
    for metric in (precision_score, recall_score, f1_score)
)

print(
    "Baseline:",
    f"Precision: {baseline_precision}",
    f"Recall: {baseline_recall}",
    f"F1 Score: {baseline_f1}",
    sep="\n",
)